# Text Acquisition & Ingestion Assignment

In [1]:
import json
import requests
import feedparser
from bs4 import BeautifulSoup

### Iterate through the list of article URLs below, scraping the text from each one and saving it to a text file. 

In [2]:
articles = ['http://lite.cnn.io/en/article/h_eac18760a7a7f9a1bf33616f1c4a336d',
            'http://lite.cnn.io/en/article/h_de3f82f17d289680dd2b47c6413ebe7c',
            'http://lite.cnn.io/en/article/h_72f4dc9d6f35458a89af014b62e625ad',
            'http://lite.cnn.io/en/article/h_aa21fe6bf176071cb49e09d422c3adf0',
            'http://lite.cnn.io/en/article/h_8ad34a532921c9076cdc9d7390d2f1bc',
            'http://lite.cnn.io/en/article/h_84422c79110d9989177cfaf1c5f45fe7',
            'http://lite.cnn.io/en/article/h_d010d9580abac3a44c6181ec6fb63d58',
            'http://lite.cnn.io/en/article/h_fb11f4e9d7c5323e75b337d9e9e5e368',
            'http://lite.cnn.io/en/article/h_7b27f0b131067f8ece6238ac559670ab',
            'http://lite.cnn.io/en/article/h_8cae7f735fa9573d470f802063ceffe2',
            'http://lite.cnn.io/en/article/h_72c3668280e82576fcc2602b0fa70c14',
            'http://lite.cnn.io/en/article/h_d20658fb0e20212051cda0e0a7248c8a',
            'http://lite.cnn.io/en/article/h_56611c43d7928120d2ae21666ccc7417',
            'http://lite.cnn.io/en/article/h_bda0394e3c5ee7054ee65c022bca7695']

In [16]:
path = 'cnn_articles/'

for i, url in enumerate(articles):
    response = requests.get(url)
    soup = BeautifulSoup(response.text)
    text = soup.find('div', {'class':'afe4286c'}).text
    with open(path + f'article_{i}.txt', 'wb') as f:
        f.write(text.encode())

### Ingest the text files generated via web scraping into a corpus and print the corpus statistics.

In [30]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

import nltk
nltk.download('punkt')

doc_pattern = r'.*\.txt'
corpus = PlaintextCorpusReader(path, doc_pattern)
corpus.fileids()

[nltk_data] Downloading package punkt to /Users/abilenky/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['article_0.txt',
 'article_1.txt',
 'article_10.txt',
 'article_11.txt',
 'article_12.txt',
 'article_13.txt',
 'article_2.txt',
 'article_3.txt',
 'article_4.txt',
 'article_5.txt',
 'article_6.txt',
 'article_7.txt',
 'article_8.txt',
 'article_9.txt']

In [35]:
def corpus_stats(corpus):
    print(
        f"Corpus Statistics\n\n"
        f"Number of documents: {len(corpus.fileids())}\n\n"
        f"Number of paragraphs: {len(corpus.paras())}\n\n"
        f"Number of sentences: {len(corpus.sents())}\n\n"
        f"Number of words: {len(corpus.words())}\n\n"
        f"Vocabulary: {len(set(w.lower() for w in corpus.words()))}\n\n"
        f"Avg chars per word: {round(len(corpus.raw())/len(corpus.words()))}\n\n"
        f"Avg words per sentence: {round(len(corpus.words())/len(corpus.sents()))}\n\n"
    )
    
corpus_stats(corpus)

Corpus Statistics

Number of documents: 14

Number of paragraphs: 14

Number of sentences: 427

Number of words: 13824

Vocabulary: 2955

Avg chars per word: 5

Avg words per sentence: 32




### Parse the O'Reilly Radar RSS feed below, extract the text from each post, and save it to a text file.

The content of each post contains HTML tags. Strip those out using the same approach you used for web scraping so that only text is saved to the files.

In [36]:
feed = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [42]:
parsed = feedparser.parse(feed)

In [50]:
parsed.entries[0].summary

'2020 has been a year of great challenges for so many, but it’s not all negative. Around the world, organizations and their workforces have risen to the occasion, recognizing the importance of expanding their knowledge, taking on new tasks, and bettering themselves both personally and professionally. With the uptick in virtual conferencing, remote work, and, [&#8230;]'

### Ingest the text files generated via RSS parsing into a corpus and print the corpus statistics.

In [51]:
path = 'rss_articles/'

for i, entry in enumerate(parsed.entries):
    text = entry.summary
    with open(path + f'article_{i}.txt', 'wb') as f:
        f.write(text.encode())

In [52]:
doc_pattern = r'.*\.txt'
corpus = PlaintextCorpusReader(path, doc_pattern)

corpus_stats(corpus)

Corpus Statistics

Number of documents: 60

Number of paragraphs: 60

Number of sentences: 197

Number of words: 4515

Vocabulary: 1467

Avg chars per word: 5

Avg words per sentence: 23




### Make an API call to the Hacker News API to retrieve their Ask, Show, and Job category items. 

- URL: https://hacker-news.firebaseio.com/v0/askstories.json

In [66]:
ids = []
for cat in ['ask', 'show', 'job']:
    url = f'https://hacker-news.firebaseio.com/v0/{cat}stories.json'
    response = requests.get(url)
    print(response)
    print(f'Added {len(response.json())} from {cat}stories')
    ids.extend(response.json())

<Response [200]>
Added 98 from askstories
<Response [200]>
Added 42 from showstories
<Response [200]>
Added 60 from jobstories


In [67]:
len(ids)

200

### Once you have retrieved the item IDs from the URL above, retrieve each item by adding the item ID to the URL below, extract the item's text property, and save the text from each item to disk as its own document.

- URL: https://hacker-news.firebaseio.com/v0/item/ITEM_ID_HERE.json

The content of some items may contain HTML tags. Strip those out using the same approach you used for web scraping so that only text is saved to the files.

In [89]:
url = f'https://hacker-news.firebaseio.com/v0/item/{ids[0]}.json'
response = requests.get(url)
soup = BeautifulSoup(response.json()['text'])
soup.text

"What manual skills can one learn that pay well, and can be self-taught?'pay well' means earning (independently or in a  job) more than or equal to 60% of the average salary of a Software Engineer, in a given municipality."

In [91]:
path = 'api_articles/'

for i, id_ in enumerate(ids):
    url = f'https://hacker-news.firebaseio.com/v0/item/{id_}.json'
    response = requests.get(url)
    if 'text' in response.json().keys():
        soup = BeautifulSoup(response.json()['text'])
        text = soup.text
        with open(path + f'article_{i}.txt', 'wb') as f:
            f.write(text.encode())

### Ingest the text files generated via API into a corpus and print the corpus statistics.

In [92]:
doc_pattern = r'.*\.txt'
corpus = PlaintextCorpusReader(path, doc_pattern)

corpus_stats(corpus)

Corpus Statistics

Number of documents: 84

Number of paragraphs: 84

Number of sentences: 273

Number of words: 7851

Vocabulary: 2019

Avg chars per word: 5

Avg words per sentence: 29


