In [72]:
!pip install feedparser



# Text Acquisition & Ingestion Assignment

# New Section

In [73]:
import json
import requests
import feedparser
from bs4 import BeautifulSoup

### Iterate through the list of article URLs below, scraping the text from each one and saving it to a text file. 

In [74]:
articles = ['http://lite.cnn.io/en/article/h_eac18760a7a7f9a1bf33616f1c4a336d',
            'http://lite.cnn.io/en/article/h_de3f82f17d289680dd2b47c6413ebe7c',
            'http://lite.cnn.io/en/article/h_72f4dc9d6f35458a89af014b62e625ad',
            'http://lite.cnn.io/en/article/h_aa21fe6bf176071cb49e09d422c3adf0',
            'http://lite.cnn.io/en/article/h_8ad34a532921c9076cdc9d7390d2f1bc',
            'http://lite.cnn.io/en/article/h_84422c79110d9989177cfaf1c5f45fe7',
            'http://lite.cnn.io/en/article/h_d010d9580abac3a44c6181ec6fb63d58',
            'http://lite.cnn.io/en/article/h_fb11f4e9d7c5323e75b337d9e9e5e368',
            'http://lite.cnn.io/en/article/h_7b27f0b131067f8ece6238ac559670ab',
            'http://lite.cnn.io/en/article/h_8cae7f735fa9573d470f802063ceffe2',
            'http://lite.cnn.io/en/article/h_72c3668280e82576fcc2602b0fa70c14',
            'http://lite.cnn.io/en/article/h_d20658fb0e20212051cda0e0a7248c8a',
            'http://lite.cnn.io/en/article/h_56611c43d7928120d2ae21666ccc7417',
            'http://lite.cnn.io/en/article/h_bda0394e3c5ee7054ee65c022bca7695']

In [75]:
PATH = '/content/news_articles/'

for i, article in enumerate(articles):
  response = requests.get(article)
  content = response.text
  soup = BeautifulSoup(content)
  article_text = soup.find('div', {'class': 'afe4286c'}).get_text()

  with open(PATH + f'_post_{i}.txt', 'wb') as f:
    f.write(article_text.encode())

### Ingest the text files generated via web scraping into a corpus and print the corpus statistics.

In [98]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import nltk
nltk.download('punkt')

DOC_PATTERN = r'.*\.txt'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [99]:
def corpus_stats(corpus):
  print("Corpus Statistics")
  print("Number of Documents: ", str(len(corpus.fileids())))
  print("Number of paragraphs: ", str(len(corpus.paras())))
  print("Number of sentences: ", str(len(corpus.sents())))
  print("Number of words: ", str(len(corpus.words())))
  print("Vocabulary: ", str(len(set(w.lower for w in corpus.words()))))
  print("Avg chars per word: ", str(round(len(corpus.raw())/len(corpus.words()),1)))
  print("Avg words per sentence: ", str(round(len(corpus.words())/len(corpus.sents()),1)))

In [100]:
corpus_stats(corpus)

Corpus Statistics
Number of Documents:  14
Number of paragraphs:  14
Number of sentences:  427
Number of words:  13824
Vocabulary:  11481
Avg chars per word:  5.0
Avg words per sentence:  32.4


### Parse the O'Reilly Radar RSS feed below, extract the text from each post, and save it to a text file.

The content of each post contains HTML tags. Strip those out using the same approach you used for web scraping so that only text is saved to the files.

In [101]:
feed = 'http://feeds.feedburner.com/oreilly/radar/atom'

In [102]:
parsed = feedparser.parse(feed)

In [103]:
posts = parsed.entries

In [104]:
posts[0]["summary"]

'2020 has been a year of great challenges for so many, but it’s not all negative. Around the world, organizations and their workforces have risen to the occasion, recognizing the importance of expanding their knowledge, taking on new tasks, and bettering themselves both personally and professionally. With the uptick in virtual conferencing, remote work, and, [&#8230;]'

### Ingest the text files generated via RSS parsing into a corpus and print the corpus statistics.

In [105]:
PATH = '/content/rss/'

for i, post in enumerate(posts):
  text = post.summary

  with open(PATH + f'_post_{i}.txt', 'wb') as f:
    f.write(text.encode())

In [106]:
DOC_PATTERN = r'.*\.txt'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

corpus_stats(corpus)

Corpus Statistics
Number of Documents:  60
Number of paragraphs:  60
Number of sentences:  197
Number of words:  4515
Vocabulary:  3751
Avg chars per word:  4.9
Avg words per sentence:  22.9


### Make an API call to the Hacker News API to retrieve their Ask, Show, and Job category items. 

- URL: https://hacker-news.firebaseio.com/v0/askstories.json

In [122]:
url = 'https://hacker-news.firebaseio.com/v0/askstories.json'

response = requests.get(url).json()
response

[25562022,
 25561398,
 25560185,
 25553818,
 25558927,
 25559755,
 25554464,
 25562723,
 25559274,
 25559174,
 25560180,
 25553772,
 25540583,
 25538586,
 25556199,
 25537230,
 25555746,
 25556563,
 25559143,
 25546445,
 25557720,
 25525457,
 25541269,
 25541616,
 25557852,
 25558741,
 25525426,
 25548802,
 25530700,
 25553613,
 25533487,
 25547050,
 25541964,
 25557927,
 25552885,
 25542676,
 25557631,
 25545469,
 25551133,
 25544753,
 25538405,
 25531729,
 25546557,
 25550111,
 25545136,
 25542679,
 25551290,
 25540343,
 25541939,
 25559571,
 25542290,
 25533051,
 25538258,
 25538128,
 25541828,
 25528481,
 25535752,
 25540059,
 25544961,
 25550627,
 25528596,
 25543287,
 25526708,
 25543087,
 25542812,
 25555295,
 25530559,
 25535332,
 25528837,
 25533472,
 25525590,
 25539594,
 25539230,
 25537569,
 25535792,
 25533505,
 25534981,
 25526280,
 25533682,
 25542189,
 25549841,
 25548696,
 25527401,
 25526579,
 25545525,
 25549864,
 25525446,
 25536672,
 25539190,
 25533770,
 25527006,

### Once you have retrieved the item IDs from the URL above, retrieve each item by adding the item ID to the URL below, extract the item's text property, and save the text from each item to disk as its own document.

- URL: https://hacker-news.firebaseio.com/v0/item/ITEM_ID_HERE.json

The content of some items may contain HTML tags. Strip those out using the same approach you used for web scraping so that only text is saved to the files.

In [132]:
PATH = '/content/api/'

for id in response:
  url = f'https://hacker-news.firebaseio.com/v0/item/{id}.json'
  content = requests.get(url).json()
  if 'text' in content:
    soup = BeautifulSoup(content['text']).get_text()

    with open(PATH + f'post_{id}.txt', 'wb') as f:
      f.write(article_text.encode())


### Ingest the text files generated via API into a corpus and print the corpus statistics.

In [133]:
DOC_PATTERN = r'.*\.txt'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

corpus_stats(corpus)

Corpus Statistics
Number of Documents:  76
Number of paragraphs:  76
Number of sentences:  3572
Number of words:  102676
Vocabulary:  77767
Avg chars per word:  4.5
Avg words per sentence:  28.7
