# Extracting Information from Text Data Assignment

In [5]:
import spacy
import string
import textacy
import itertools
from nltk import pos_tag
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import tree2conlltags
from gensim.summarization import keywords
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [6]:
!pip install textacy --quiet
!pip install rake_nltk --quiet

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [8]:
PATH = '/content/drive/MyDrive/web scrap/CNN'

DOC_PATTERN = r'.*\.txt'
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [9]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

### Extract the top 5 keywords from every document in the corpus. Print them and compare the differences in keywords among the documents.

In [14]:
doc = docs[0]
[[keywords(doc, words=5, lemmatize=True).split('\n')] for doc in docs]

[[['pink', 'tour', 'star', 'pretty', 'motocross']],
 [['primary', 'patrick', 'telling', 'year']],
 [['narwhal', 'tail', 'puppy', 'unicorn', 'dogs']],
 [['states', 'democratic', 'running', 'nthe', 'bloomberg']],
 [['ukraine', 'presidents', 'rep', 'nhe', 'republican']],
 [['muslimness', 'people', 'skin', 'religion', 'white']],
 [['trump', 'said', 'new', 'media', 'republican']],
 [['said', 'brown', 'police', 'roanoke', 'jones']],
 [['trump', 'nthe', 'hotels', 'office', 'president']],
 [['keys', 'grammys', 'award', 'power', 'wanna']],
 [['americans', 'republican', 'trump', 'ukrainians', 'presidency']],
 [['said', 'student', 'told', 'pence', 'schools']],
 [['crows', 'old', 'original', 'american', 'depict']],
 [['protester', 'nthe', 'police', 'chinese', 'new']]]

### Extract the top 3 keyphrases from each document, print them, and compare the differences.

In [18]:
r = Rake()
for doc in docs:  
  r.extract_keywords_from_text(doc)
  print(r.get_ranked_phrases_with_scores()[:5])


[(32.5, 'country music association awards red carpet'), (23.0, '"\\ nthe star also praised'), (17.5, 'country star chris stapleton'), (17.25, 'school soon ," pink said'), (15.0, 'beautiful trauma tour ranks')]
[(95.9, 'former new york city mayor michael bloomberg stepped forward last week'), (54.858333333333334, 'friday .\\ npatrick could seize upon potential advantages'), (47.99166666666666, 'bid late last year .\\ nthe conversations began'), (42.05833333333333, 'progressive policies .\\ npatrick last year defended'), (29.0, 'elections process would ultimately splash back')]
[(29.5, 'facebook post .\\ nvets took x'), (28.166666666666668, 'little magical furry unicorn ," according'), (24.0, 'internet fame .\\ nthe rush'), (23.833333333333336, "dog rescue nonprofit organization mac \\'"), (19.833333333333336, 'spokesperson told cnn .\\ nnarwhal')]
[(50.666666666666664, 'recent months .\\ na new monmouth university poll'), (48.925, "prolonged delegate fight .\\ nlast week bloomberg \\'")

### Identify and extract the named entities in each document, filtering out the numeric types. Print them and compare the differences between documents.

In [19]:
import spacy
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [21]:
nlp = spacy.load("en_core_web_sm")

spacy_doc = nlp(doc)

entities = [[entity.text, entity.label_] for entity in spacy_doc.ents]
entities

[['Hong Kong', 'GPE'],
 ['8:17 AM ET', 'TIME'],
 ['November 14', 'DATE'],
 ['CNN', 'ORG'],
 ['Hong Kong', 'GPE'],
 ['Thursday', 'DATE'],
 ['evening', 'TIME'],
 ['as thousands', 'CARDINAL'],
 ['\\nThe', 'NORP'],
 ['the almost six-month-long', 'DATE'],
 ['recent days', 'DATE'],
 ['Chinese', 'NORP'],
 ['Thursday morning', 'TIME'],
 ['the Hong Kong Polytechnic University', 'ORG'],
 ['Kowloon', 'GPE'],
 ['Hong Kong Island', 'GPE'],
 ['the University of Hong Kong', 'ORG'],
 ['\\nUniversities', 'ORG'],
 ['the Chinese University of Hong Kong', 'ORG'],
 ['CUHK', 'ORG'],
 ['the New Territories', 'GPE'],
 ['several thousand', 'CARDINAL'],
 ['the third straight day', 'DATE'],
 ['CUHK', 'ORG'],
 ['Tuesday', 'DATE'],
 ['June', 'DATE'],
 ['hundreds', 'CARDINAL'],
 ['more than 1,567', 'CARDINAL'],
 ['Wednesday', 'DATE'],
 ['Thursday', 'DATE'],
 ['petrol bombs', 'PERSON'],
 ['nails.\\nSo', 'ORG'],
 ['Hong Kong', 'GPE'],
 ['CUHK', 'ORG'],
 ['around two weeks', 'DATE'],
 ['Wednesday', 'DATE'],
 ['China',

### For every document in the corpus, iterate over every sentence, extract any SVO triples, print them, and compare.

In [22]:
results = []

for sent in spacy_doc.sents:
  svo = textacy.extract.subject_verb_object_triples(sent)
  results += svo

# de-duplicate our svo results
results = list(set(results))
results

[(television channel, issued, editorial),
 (Student protesters, fortify, campus occupations),
 (protesters, continued, bringing),
 (you, can make, call),
 (you, are going, have),
 (government supporters, hurling, bricks),
 (police, fired, gas),
 (Adam Ni, told, CNN),
 (injuries\nThe protests, have taken, turn),
 (those, continued, to pour),
 (he, began, clearing),
 (protesters, seek, to breach),
 (protesters, continued, to pour),
 ("\n"We, warn, protesters),
 (who, are coerced, be),
 (thousands, prepared, to face),
 (country, will never accept, situation),
 (government, unveiled, range),
 (hundreds, firing, canisters),
 (level, has reached, heights),
 (they, had evacuated, students),
 (MTR, continued, to suspend),
 (protesters, to cross, it),
 (what, is believed, be),
 (students, have been asked, to leave),
 (those, continued, bringing),
 (protesters, hurling, bricks),
 (protesters, barricaded, themselves)]