# Extracting Information from Text Data Assignment

In [None]:
!pip install textacy --quiet
!pip install rake_nltk --quiet

[K     |████████████████████████████████| 184kB 5.6MB/s 
[K     |████████████████████████████████| 102kB 4.1MB/s 
[K     |████████████████████████████████| 1.9MB 6.7MB/s 
[K     |████████████████████████████████| 481kB 21.5MB/s 
[?25h  Building wheel for cytoolz (setup.py) ... [?25l[?25hdone
  Building wheel for rake-nltk (setup.py) ... [?25l[?25hdone


In [None]:
import spacy
import string
import textacy
import itertools
from nltk import pos_tag
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import tree2conlltags
from gensim.summarization import keywords
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

### Read the CNN Lite plain-text article files into a corpus using NLTK's PlaintextCorpusReader.

In [None]:
# Directory holding the article files, and a regex that selects every .txt file in it.
PATH = '/content/drive/MyDrive/content/cnn_lite'
DOC_PATTERN = r'.*\.txt'

# Build the corpus reader over all files under PATH whose name matches DOC_PATTERN.
corpus = PlaintextCorpusReader(root=PATH, fileids=DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [None]:
# Raw text of every file in the corpus, in the order the reader lists the fileids.
docs = list(map(corpus.raw, corpus.fileids()))

### Extract the top 5 keywords from every document in the corpus. Print them and compare the differences in keywords among the documents.

In [None]:
# Top five lemmatized keywords per document. The extractor's result is split
# on newlines to obtain one list of keywords for each document.
keys = [
  keywords(text, words=5, lemmatize=True).split('\n')
  for text in docs
]

In [None]:
# Display the per-document keyword lists (one inner list per document).
keys

[['sanders', 'campaign', 'news', 'democrats', 'recent'],
 ['courts', 'trump', 'states', 'general', 'businesses'],
 ['politically', 'news', 'people', 'america', 'trump'],
 ['erdogan', 'said', 'trump', 'meet', 'turkish'],
 ['laurent', 'said', 'news', 'age', 'student'],
 ['schools', 'high', 'gun', 'town', 'clarita'],
 ['politically', 'people', 'american', 'guns', 'news'],
 ['trump', 'november', 'border', 'facts', 'said'],
 ['politically', 'presidents', 'likely', 'community', 'american'],
 ['visitors', 'park', 'national', 'african', 'giraffes'],
 ['livingstone', 'christmas', 'clarke', 'said', 'wham'],
 ['trump', 'taylor', 'president', 'news', 'republican'],
 ['bevin', 'general', 'kentucky', 'thursday', 'cnn'],
 ['gaynor', 'hurricane', 'carper', 'changes', 'said'],
 ['trump', 'democratic', 'republicans', 'presidency', 'news'],
 ['rehaag', 'room', 'media', 'woman', 'later'],
 ['snacks', 'hostess', 'twinkies', 'cereals', 'cnn'],
 ['kushner', 'trump', 'news', 'department', 'impeachment'],
 ['p

Many of the keywords relate to Trump and politics. Beyond that, the documents differ considerably in the locations and topics their keywords point to.

### Extract the top 3 keyphrases from each document, print them, and compare the differences.

In [None]:
def _top_phrases(text, limit=3):
  """Return the first `limit` (score, phrase) pairs RAKE ranks for `text`."""
  extractor = Rake()
  extractor.extract_keywords_from_text(text)
  return extractor.get_ranked_phrases_with_scores()[:limit]

# Three highest-ranked keyphrases, with their scores, for each document.
keys = [_top_phrases(doc) for doc in docs]

keys

[[(64.35555555555555,
   'win ," campaign manager faiz shakir told cnn last month'),
  (52.0, 'billionaire former new york city mayor michael bloomberg'),
  (33.0, 'former vice president joe biden came')],
 [(43.25, 'attorneys general offices could gather financial information'),
  (25.25, 'attorney general brian frosh called'),
  (25.0, '© 2019 cable news network')],
 [(26.015151515151516, 'america ," cnn opinion invited readers'),
  (23.0, '© 2019 cable news network'),
  (23.0, 'seen increasingly vast gaps develop')],
 [(29.42820512820513,
   'senior administration official said trump first raised'),
  (29.2, 'turkish president recep tayyip erdoğan pulled'),
  (25.0, '© 2019 cable news network')],
 [(25.0, '© 2019 cable news network'),
  (20.625, 'unusual ," said sjoerd hulshof'),
  (14.8, 'discover new things ."')],
 [(82.4, 'make quick judgments ," saugus high student lèan aguilar said'),
  (25.0, '© 2019 cable news network'),
  (24.0, 'santa clarita valley signal reported')],
 [(2

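Many of the documents have the copyright notice ("© 2019 cable news network") among their top keyphrases, so boilerplate text is crowding out phrases about the article's actual content.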

### Identify and extract the named entities in each document, filtering out the numeric types. Print them and compare the differences between documents.

In [None]:
# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")
# Parse every document once. nlp.pipe processes the texts as a batched stream,
# which is more efficient than calling nlp() on each text separately.
spacy_docs = list(nlp.pipe(docs))

In [None]:
# Entity labels treated as numeric and excluded from the results. spaCy's
# labels are upper-case, so the comparison must use e.g. 'QUANTITY'; a
# mixed-case 'Quantity' never matches and filters nothing.
NUMERIC_LABELS = {'CARDINAL', 'ORDINAL', 'QUANTITY', 'MONEY', 'PERCENT', 'DATE', 'TIME'}

entities = []  # one list of [text, label] pairs per document
disp = []      # the same pairs, flattened across all documents
for doc in spacy_docs:
  doc_entities = [
    [entity.text, entity.label_]
    for entity in doc.ents
    if entity.label_ not in NUMERIC_LABELS
  ]
  entities.append(doc_entities)
  disp += doc_entities

In [None]:
# Display the extracted [text, label] entity pairs, grouped by document.
entities

[[['Des Moines', 'GPE'],
  ['Iowa', 'GPE'],
  ['CNN', 'ORG'],
  ['Alexandria Ocasio-Cortez', 'PERSON'],
  ['Iowa', 'GPE'],
  ['Bernie Sanders', 'PERSON'],
  ['last week', 'DATE'],
  ['Vermont', 'GPE'],
  ['Council Bluffs', 'ORG'],
  ['the last six weeks', 'DATE'],
  ['Sanders', 'ORG'],
  ['second', 'ORDINAL'],
  ['Democratic', 'NORP'],
  ['Las Vegas', 'GPE'],
  ['the first night of October', 'TIME'],
  ['Sanders', 'ORG'],
  ['New Hampshire', 'GPE'],
  ['Iowa', 'GPE'],
  ['New York', 'GPE'],
  ['Minnesota', 'GPE'],
  ['a trying summer', 'DATE'],
  ['march', 'DATE'],
  ['Sanders', 'ORG'],
  ['Saturday', 'DATE'],
  ['Des Moines', 'GPE'],
  ['three', 'CARDINAL'],
  ['Drake University', 'ORG'],
  ['one', 'CARDINAL'],
  ['Ocasio-Cortez', 'ORG'],
  ['Ilhan Omar', 'PERSON'],
  ['Rashida Tlaib', 'PERSON'],
  ['last month', 'DATE'],
  ['Ohio', 'GPE'],
  ['Sanders', 'ORG'],
  ['months', 'DATE'],
  ['Democratic', 'NORP'],
  ['Sanders', 'ORG'],
  ['About two weeks', 'DATE'],
  ['CNN', 'ORG'],
  ['D

The documents have many entities in common, most notably the organizations they mention.

### For every document in the corpus, iterate over every sentence, extract any SVO triples, print them, and compare.

In [None]:
results = []
seen_texts = set()  # text form of every triple already collected

for doc in spacy_docs:
  for sent in doc.sents:
    for triple in textacy.extract.subject_verb_object_triples(sent):
      # De-duplicate on the text of the triple and keep first-seen order.
      # NOTE(review): list(set(...)) over the raw triples compares spaCy span
      # objects (document + position) rather than their text, so repeated
      # statements were kept and the output order was arbitrary - confirm
      # against the installed spaCy/textacy versions.
      text_key = tuple(str(part) for part in triple)
      if text_key not in seen_texts:
        seen_texts.add(text_key)
        results.append(triple)

In [None]:
# Display the collected subject-verb-object triples from all documents.
results

[(Patrick, defended, work),
 (Robert Butler Jr., opens, fire),
 (Dorian, brought, tornadoes),
 (Taylor, delivered, opening statement),
 (they, risked, lives),
 (he, had approached, Michael),
 (Smith, called, criticism),
 (he, accepted, gifts),
 (Taylor, shed, light),
 (children, are watching, bloodsport),
 (frontrunner Democrats, have cast, interests),
 (fathers, fought, War),
 (they, have certified, results),
 (Nathan Faris, shoots, Perrin),
 (rules, permitted, to pontificate),
 (opposition, is going, to say),
 (GOP lawmakers, used, to point),
 (who, follow, Trump),
 (shooter Darrick Evans, is given, sentence),
 (hearings, would lay, facts),
 (people, form, echo chambers),
 (student, stopped, offering),
 (there, 's going, be),
 (White House, lifted, hold),
 (people, call, it),
 (who, rescued, passenger),
 (he, pushed, Erdoğan),
 (he, leaves, office),
 (Twinkies shortage, gripped, States),
 (She, injures, people),
 (We, asked, voters),
 (guests, included, members),
 (I, 've got, arteri

The extraction returns a large number of SVO triples from the sentences across the corpus.