# Extracting Information from Text Data Assignment

In [57]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [58]:
!pip install textacy --quiet
!pip install rake_nltk --quiet

In [59]:
import spacy
import string
import textacy
import itertools
from nltk import pos_tag
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import tree2conlltags
from gensim.summarization import keywords
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from spacy.lang.en import English

In [65]:
spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [60]:
PATH = '/content/drive/MyDrive/Thinkful/NLP/cnn_articles/'

DOC_PATTERN = r'.*\.txt'

corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [61]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

### Extract the top 5 keywords from every document in the corpus. Print them and compare the differences in keywords among the documents.

In [62]:
top5_keywords = [keywords(doc, words=5, lemmatize=True).split('\n') for doc in docs]

In [63]:
top5_keywords

[['pink', 'carey', 'tour', 'pretty', 'motocross'],
 ['patrick', 'primary', 'telling', 'democrats', 'cnn'],
 ['narwhal', 'tail', 'puppy', 'unicorn', 'dogs'],
 ['states', 'democratic', 'bloomberg', 'told', 'new'],
 ['republican', 'taylor', 'rep', 'presidents', 'ukraine'],
 ['muslimness', 'people', 'skin', 'white', 'religion'],
 ['news', 'trump', 'said', 'republican', 'media'],
 ['said', 'police', 'brown', 'jones', 'roanoke'],
 ['trump', 'hotels', 'office', 'profit', 'owned'],
 ['keys', 'grammys', 'award', 'power', 'wanna'],
 ['americans', 'republican', 'trump', 'ukrainians', 'investigate'],
 ['student', 'said', 'told', 'pence', 'schools'],
 ['crows', 'disney', 'american', 'old', 'movie'],
 ['protester', 'police', 'chinese', 'new', 'kong']]

### Extract the top 3 keyphrases from each document, print them, and compare the differences.

In [64]:
for doc in docs:
  r = Rake()
  r.extract_keywords_from_text(doc)
  print(r.get_ranked_phrases_with_scores()[:3])

[(32.5, 'country music association awards red carpet'), (17.166666666666664, 'school soon ," pink said'), (16.5, 'country star chris stapleton')]
[(94.03333333333333, 'former new york city mayor michael bloomberg stepped forward last week'), (30.61111111111111, 'patrick could seize upon potential advantages'), (29.0, 'elections process would ultimately splash back')]
[(28.166666666666668, 'little magical furry unicorn ," according'), (17.666666666666664, 'dog rescue nonprofit organization mac'), (13.666666666666666, 'rescue workers speculate may')]
[(40.45, 'current 2020 democrats -- clinton told bbc radio'), (37.666666666666664, 'former new york mayor michael bloomberg made'), (29.0, '2016 democratic nominee playfully tweeted back')]
[(69.41666666666667, 'money ," tweeted white house press secretary stephanie grisham'), (67.75, 'former white house homeland security adviser tom bossert summed'), (63.416666666666664, 'former national security council russia expert fiona hill said')]
[(2

### Identify and extract the named entities in each document, filtering out the numeric types. Print them and compare the differences between documents.

In [66]:
nlp = spacy.load('en_core_web_sm')
spacy_docs = [nlp(doc) for doc in docs]

In [70]:
numeric = ['CARDINAL', 'DATE', 'TIME', 'ORDINAL', 'PERCENT', 'MONEY', 'QUANTITY']
entities = [[[entity.text, entity.label_] for entity in spacy_doc.ents 
             if entity.label_ not in numeric] 
            for spacy_doc in spacy_docs]

In [71]:
entities

[[['Entertainment Tonight', 'WORK_OF_ART'],
  ['the Country Music Association Awards', 'ORG'],
  ['Carey Hart', 'PERSON'],
  ['Willow', 'PERSON'],
  ['Jameson', 'ORG'],
  ['Love Me Anyway', 'WORK_OF_ART'],
  ['Chris Stapleton', 'PERSON'],
  ['Willow', 'PERSON'],
  ['Jameson', 'ORG'],
  ['"Carey', 'ORG'],
  ['Hart', 'ORG'],
  ['Billboard', 'PERSON'],
  ['Pink', 'ORG'],
  ['Beautiful Trauma Tour', 'ORG']],
 [['Deval Patrick', 'PERSON'],
  ['Massachusetts', 'GPE'],
  ['Deval Patrick', 'PERSON'],
  ['flux', 'GPE'],
  ['CNN', 'ORG'],
  ['Patrick', 'ORG'],
  ['New Hampshire', 'GPE'],
  ['Concord', 'GPE'],
  ['CBS This Morning', 'WORK_OF_ART'],
  ['Patrick', 'PERSON'],
  ['Patrick', 'PERSON'],
  ['Alabama', 'GPE'],
  ['Arkansas', 'GPE'],
  ['New Hampshire', 'GPE'],
  ['Patrick', 'PERSON'],
  ['Massachusetts', 'GPE'],
  ['New Hampshire', 'GPE'],
  ['African American', 'NORP'],
  ['South Carolina', 'GPE'],
  ['Democratic', 'NORP'],
  ['CNN', 'ORG'],
  ['Patrick', 'PERSON'],
  ["the Democratic P

### For every document in the corpus, iterate over every sentence, extract any SVO triples, print them, and compare.

In [116]:
SVO_results = []

for doc in spacy_docs:
  temp = []
  for sent in doc.sents:
    svo = textacy.extract.subject_verb_object_triples(sent)
    temp += svo
  SVO_results.append(list(set(temp)))


In [117]:
SVO_results

[[(Pink, taking, break),
  (Jameson, 's going, to start),
  (he, follows, me),
  (she, will be taking, step),
  (star, praised, husband),
  (she, will celebrate, years)],
 [(he, planned, bid),
  (he, would be entering, race),
  (Patrick, had built, team),
  (Patrick, defended, work),
  (he, has made, decision),
  (I, 've never taken, job),
  (I, 've left, conscience),
  (Patrick, entered, sector),
  (he, could make, minute entry),
  (Massachusetts Gov. Deval Patrick, told, allies),
  (that, became, liability),
  (source, tells, CNN),
  (he, seeks, nomination),
  (he, wanted, to put),
  (Deval Patrick, tells, allies),
  (he, ruled, bid),
  (Patrick, told, friends),
  (frontrunner Democrats, have cast, interests),
  (Patrick, has missed, deadline),
  (he, was going, to jump),
  (Patrick, told, WBUR),
  (Patrick, has missed, to appear),
  (Massachusetts Gov. Deval Patrick, told, friends),
  (He, cited, it)],
 [(spokesperson, told, CNN.Narwhal),
  (who, has been named, Smurf),
  (Vets, too

#Lecture Notes

In [6]:
PATH = '/content/drive/MyDrive/Thinkful/NLP/cnn_articles/'

DOC_PATTERN = r'.*\.txt'

corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

In [7]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

In [9]:
doc = docs[0]

In [17]:
#TextRank algorithm 
keyw = keywords(doc, words=5, lemmatize=True).split('\n')
print(keyw)

['pink', 'carey', 'tour', 'year', 'pretty']


In [18]:
r = Rake()
r.extract_keywords_from_text(doc)

In [20]:
r.get_ranked_phrases_with_scores()[:5]

[(32.5, 'country music association awards red carpet'),
 (17.166666666666664, 'school soon ," pink said'),
 (16.5, 'country star chris stapleton'),
 (15.0, 'beautiful trauma tour ranks'),
 (9.5, 'star also praised')]

In [54]:
# Named Entity Recognition
spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [55]:
nlp = spacy.load('en_core_web_sm')

spacy_doc = nlp(doc)

In [27]:
type(spacy_doc)

spacy.tokens.doc.Doc

In [28]:
[print(sent) for sent in spacy_doc.sents]

Pink taking a break to focus on familyUpdated 10:18 AM ET, Thu November 14, 2019(CNN) -
Pink has been working pretty hard and it sounds like she will be taking a step back in 2020.Speaking with "Entertainment Tonight" on the Country Music Association Awards red carpet
, the singer was joined by her husband, Carey Hart, and their kids Willow, 8, and Jameson, 2.Pink was there to perform her song "
Love Me Anyway" with country star Chris Stapleton, and she  talked about how hectic things have been.
"We did two and a half years of [music] and Willow's back in school now, Jameson's going to start pre-school soon," Pink said.
"It's kind of the year of the family.
"The
star also praised her husband, with whom she will celebrate 14 years of marriage in January.
"Carey
has a lot going on as well," she said of Hart, who went from being a professional motocross competitor to racing off-road trucks.
"He's super supportive, he follows me around the world and now it's his turn.
"According
to Billboa

[None, None, None, None, None, None, None, None, None, None, None, None, None]

In [36]:
entities = [[entity.text, entity.label_] for entity in spacy_doc.ents]

In [38]:
spacy.displacy.render(spacy_doc, style='ent', jupyter=True, options={'distance': 90})

In [47]:
results = []

for sent in spacy_doc.sents:
  svo = textacy.extract.subject_verb_object_triples(sent)
  results += svo

In [48]:
results = list(set(results))

In [49]:
results

[(she, will celebrate, years),
 (she, will be taking, step),
 (Pink, taking, break),
 (star, praised, husband),
 (Jameson, 's going, to start),
 (he, follows, me)]

In [101]:
svo_results = [list(textacy.extract.subject_verb_object_triples(sent)) for sent in spacy_doc.sents 
               if list(textacy.extract.subject_verb_object_triples(sent)) != []]

In [102]:
svo_results

[[(Pink, taking, break)],
 [(she, will be taking, step)],
 [(Jameson, 's going, to start)],
 [(star, praised, husband), (she, will celebrate, years)],
 [(he, follows, me)]]

In [113]:
def extract_SVO(text):
    tuples = textacy.extract.subject_verb_object_triples(text)
    if tuples:
        tuples_to_list = list(tuples)
        tuples_list.append(tuples_to_list)