# Extracting Information from Text Data Assignment

In [1]:
import spacy
import string
import textacy
import itertools
from nltk import pos_tag
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import tree2conlltags
from gensim.summarization import keywords
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [None]:
# !pip install textacy --quiet
# !pip install rake_nltk --quiet

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [2]:
path = 'cnn_articles/'
doc_pattern = r'.*\.txt'
corpus = PlaintextCorpusReader(path, doc_pattern)
corpus

<PlaintextCorpusReader in '/Users/abilenky/projects/cnn_articles'>

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [5]:
corpus.fileids()

['article_0.txt',
 'article_1.txt',
 'article_10.txt',
 'article_11.txt',
 'article_12.txt',
 'article_13.txt',
 'article_2.txt',
 'article_3.txt',
 'article_4.txt',
 'article_5.txt',
 'article_6.txt',
 'article_7.txt',
 'article_8.txt',
 'article_9.txt']

In [6]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]
docs[0]

'Pink taking a break to focus on familyUpdated 10:18 AM ET, Thu November 14, 2019(CNN) - Pink has been working pretty hard and it sounds like she will be taking a step back in 2020.Speaking with "Entertainment Tonight" on the Country Music Association Awards red carpet, the singer was joined by her husband, Carey Hart, and their kids Willow, 8, and Jameson, 2.Pink was there to perform her song "Love Me Anyway" with country star Chris Stapleton, and she  talked about how hectic things have been. "We did two and a half years of [music] and Willow\'s back in school now, Jameson\'s going to start pre-school soon," Pink said. "It\'s kind of the year of the family."The star also praised her husband, with whom she will celebrate 14 years of marriage in January."Carey has a lot going on as well," she said of Hart, who went from being a professional motocross competitor to racing off-road trucks. "He\'s super supportive, he follows me around the world and now it\'s his turn."According to Billbo

### Extract the top 5 keywords from every document in the corpus. Print them and compare the differences in keywords among the documents.

In [8]:
keywords(docs[0], words=5, lemmatize=True).split('\n')

['pink', 'carey', 'tour', 'motocross', 'pretty']

In [9]:
for doc in docs:
    print(keywords(doc, words=5, lemmatize=True).split('\n'))
    print('----------')

['pink', 'carey', 'tour', 'pretty', 'year']
----------
['patrick', 'primary', 'telling', 'democrats', 'cnn']
----------
['narwhal', 'tail', 'puppy', 'unicorn', 'dogs']
----------
['states', 'democratic', 'bloomberg', 'told', 'new']
----------
['republican', 'taylor', 'rep', 'presidents', 'ukraine']
----------
['muslimness', 'people', 'skin', 'white', 'religion']
----------
['news', 'trump', 'said', 'republican', 'media']
----------
['said', 'police', 'brown', 'jones', 'roanoke']
----------
['trump', 'hotels', 'office', 'profit', 'owned']
----------
['keys', 'grammys', 'award', 'power', 'wanna']
----------
['americans', 'republican', 'trump', 'ukrainians', 'investigate']
----------
['student', 'said', 'told', 'pence', 'schools']
----------
['crows', 'disney', 'american', 'old', 'movie']
----------
['protester', 'police', 'chinese', 'new', 'kong']
----------


### Extract the top 3 keyphrases from each document, print them, and compare the differences.

In [13]:
r = Rake()
r.extract_keywords_from_text(docs[0])
r.get_ranked_phrases_with_scores()[:3]

[(32.5, 'country music association awards red carpet'),
 (17.166666666666664, 'school soon ," pink said'),
 (16.5, 'country star chris stapleton')]

In [14]:
for doc in docs:
    r.extract_keywords_from_text(doc)
    print(r.get_ranked_phrases_with_scores()[:3])
    print('------------------')

[(32.5, 'country music association awards red carpet'), (17.166666666666664, 'school soon ," pink said'), (16.5, 'country star chris stapleton')]
------------------
[(94.03333333333333, 'former new york city mayor michael bloomberg stepped forward last week'), (30.61111111111111, 'patrick could seize upon potential advantages'), (29.0, 'elections process would ultimately splash back')]
------------------
[(28.166666666666668, 'little magical furry unicorn ," according'), (17.666666666666664, 'dog rescue nonprofit organization mac'), (13.666666666666666, 'rescue workers speculate may')]
------------------
[(40.45, 'current 2020 democrats -- clinton told bbc radio'), (37.666666666666664, 'former new york mayor michael bloomberg made'), (29.0, '2016 democratic nominee playfully tweeted back')]
------------------
[(69.41666666666667, 'money ," tweeted white house press secretary stephanie grisham'), (67.75, 'former white house homeland security adviser tom bossert summed'), (63.41666666666

### Identify and extract the named entities in each document, filtering out the numeric types. Print them and compare the differences between documents.

In [24]:
numeric = ['PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']
nlp = spacy.load('en_core_web_sm')
spacy_doc = nlp(docs[0])
for ent in spacy_doc.ents:
    if ent.label_ not in numeric:
        print(ent.text, '-', ent.label_)
        print('-----')

10:18 AM ET - TIME
-----
November 14 - DATE
-----
Entertainment Tonight - WORK_OF_ART
-----
the Country Music Association Awards - ORG
-----
Carey Hart - PERSON
-----
Willow - PERSON
-----
8 - DATE
-----
Jameson - PERSON
-----
Love Me Anyway - WORK_OF_ART
-----
Chris Stapleton - PERSON
-----
two and a half years - DATE
-----
Willow - PERSON
-----
Jameson - PERSON
-----
Pink - ORG
-----
the year - DATE
-----
14 years - DATE
-----
January - DATE
-----
"Carey - ORG
-----
Hart - PERSON
-----


In [25]:
for doc in docs:
    spacy_doc = nlp(doc)
    for ent in spacy_doc.ents:
        if ent.label_ not in numeric:
            print(ent.text, '-', ent.label_)
    
    print('-----')
    

10:18 AM ET - TIME
November 14 - DATE
Entertainment Tonight - WORK_OF_ART
the Country Music Association Awards - ORG
Carey Hart - PERSON
Willow - PERSON
8 - DATE
Jameson - PERSON
Love Me Anyway - WORK_OF_ART
Chris Stapleton - PERSON
two and a half years - DATE
Willow - PERSON
Jameson - PERSON
Pink - ORG
the year - DATE
14 years - DATE
January - DATE
"Carey - ORG
Hart - PERSON
-----
Deval Patrick - PERSON
11:06 PM ET - TIME
November 13 - DATE
Massachusetts - GPE
Deval Patrick - PERSON
Wednesday - DATE
flux Wednesday - DATE
CNN - ORG
Patrick - ORG
New Hampshire - GPE
Thursday - DATE
Concord - GPE
CBS This Morning - WORK_OF_ART
Patrick - PERSON
Patrick - PERSON
Alabama - GPE
Arkansas - GPE
New Hampshire - GPE
Friday - DATE
Patrick - ORG
Massachusetts - GPE
New Hampshire - GPE
African American - NORP
South Carolina - GPE
Democratic - NORP
CNN - ORG
Monday - DATE
Patrick - PERSON
last-minute - TIME
the Democratic Party's - ORG
late last year - DATE
New York City - GPE
Michael Bloomberg - PE

Fox - ORG
November 15 - DATE
2019New York - GPE
CNN - ORG
Trump - PERSON
about day one - DATE
Fox News - ORG
Wednesday night - TIME
White House - ORG
Stephanie Grisham - PERSON
today - DATE
Donald Trump Jr. - PERSON
Jeff Sessions - PERSON
Fox - ORG
Wednesday - DATE
Democrats - NORP
Republicans - NORP
Republican - NORP
Democrat - NORP
every minute - TIME
Trump - PERSON
CNN - ORG
Oliver Darcy - PERSON
earlier in the day - TIME
Wednesday - DATE
Trump - PERSON
Darcy - PERSON
Trump - ORG
Fox News - ORG
Wednesday night - TIME
Tucker - PERSON
8 p.m. - TIME
Tucker Carlson - PERSON
Christmas - DATE
New Year's - EVENT
the Super Bowl - EVENT
Carlson - ORG
Wednesday - DATE
Christian Whiton - PERSON
Bill Taylor - PERSON
George Kent - PERSON
all evening - TIME
Grisham - PERSON
Trump - ORG
Later in the hour - TIME
Carlson - ORG
today - DATE
Larry O'Connor - PERSON
America - GPE
A few minutes later - TIME
Trish Regan - PERSON
the Fox Business Network - ORG
Regan - PERSON
Fox News - ORG
Carlson - ORG
W

Hong Kong - GPE
November 14 - DATE
2019Hong Kong - GPE
CNN - ORG
Hong Kong - GPE
Thursday - DATE
evening - TIME
recent days - DATE
Chinese - NORP
Thursday - DATE
morning - TIME
the Hong Kong Polytechnic University - ORG
Kowloon - GPE
Hong Kong Island - GPE
the University of Hong Kong - ORG
the Chinese University of Hong Kong - ORG
the New Territories - GPE
the third straight day - DATE
Tuesday - DATE
June - DATE
Wednesday - DATE
Thursday - DATE
Hong Kong - GPE
two weeks early - DATE
Wednesday - DATE
China - GPE
Thursday - DATE
Central district - LOC
a fourth straight day - DATE
Tai Koo - GPE
Hong Kong Island - GPE
Kowloon - LOC
Cross-Harbour Tunnel - ORG
Thursday - DATE
morning - TIME
MTR - ORG
the week - DATE
this week - DATE
Hong Kong's - GPE
Hospital Authority - ORG
Wednesday - DATE
Wednesday - DATE
70-year-old - DATE
15-year-old - DATE
Hong Kong's - GPE
Hospital Authority - ORG
Earlier in the week - DATE
Monday - DATE
this week - DATE
Chinese - NORP
Thursday - DATE
China - GPE
Hong

### For every document in the corpus, iterate over every sentence, extract any SVO triples, print them, and compare.

In [31]:
results = []
spacy_doc = nlp(docs[0])
for sent in spacy_doc.sents:
    svo = textacy.extract.subject_verb_object_triples(sent)
    results += svo
    
results = list(set(results))
for res in results:
    print(res)

(she, will be taking, step)
(star, praised, husband)
(she, will celebrate, years)
(he, follows, me)
(Jameson, 's going, to start)


In [32]:
for doc in docs:
    results = []
    spacy_doc = nlp(doc)
    for sent in spacy_doc.sents:
        svo = textacy.extract.subject_verb_object_triples(sent)
        results += svo

    results = list(set(results))
    for res in results:
        print(res)
    print('---------')

(she, will celebrate, years)
(he, follows, me)
(Jameson, 's going, to start)
(she, will be taking, step)
(star, praised, husband)
---------
(that, became, liability)
(source, tells, CNN)
(Patrick, told, WBUR)
(I, 've left, conscience)
(Patrick, defended, work)
(Patrick, told, friends)
(Gov. Deval Patrick, told, allies)
(he, has made, decision)
(he, ruled, bid)
(frontrunner Democrats, have cast, interests)
(he, would be entering, race)
(he, planned, bid)
(Patrick, has missed, deadline)
(Patrick, has missed, to appear)
(Deval Patrick, tells, allies)
(he, could make, minute entry)
(Patrick, entered, sector)
(he, was going, to jump)
(he, wanted, to put)
(Patrick, had built, team)
(he, seeks, nomination)
(Gov. Deval Patrick, told, friends)
(I, 've never taken, job)
(He, cited, it)
---------
(they, want, to monitor)
(which, took, him)
(spokesperson, told, CNN)
(which, showed, tail)
(he, got, meds)
(He, seems, other)
(it, doesn't cause, pain)
(organization, received, adoption applications)
(t

(Dumbo, featured, character)
(He, omitted, scene)
(that, feature, prejudices)
(characters, put, circus tents)
(that, feature, content)
(cartoons, do not represent, society)
(Warner Bros., used, this)
(They, may depict, some)
(Director Tim Burton, chose, to not include)
(It, may contain, depictions)
---------
(those, continued, to pour)
(Student protesters, fortify, campus occupations)
(police, have appeared, reticent)
(level, has reached, heights)
(""We, have had, talking)
(who, are coerced, be)
(students, have been asked, to leave)
(you, are going, have)
(he, began, clearing)
(you, can make, call)
(Adam Ni, told, CNN)
(schools, have suspended, classes)
(country, will never accept, situation)
(protesters, to cross, it)
(what, is believed, be)
(""We, warn, protesters)
(police, fired, gas)
(protesters, continued, to pour)
(injuriesThe protests, have taken, turn)
(they, had evacuated, students)
(thousands, prepared, to face)
(MTR, continued, to suspend)
(It, echoed, editorial)
(run televi