# Extracting Information from Text Data Assignment

In [1]:
!pip install textacy --quiet
!pip install rake_nltk --quiet

In [2]:
import spacy
import string
import textacy
import itertools
from nltk import pos_tag
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import tree2conlltags
from gensim.summarization import keywords
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [9]:
PATH = '/content/drive/MyDrive/python_for_data_scientists/DSI07/DataSets/cnn_lite'

DOC_PATTERN = r'.*\.txt'

corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [10]:
docs = [corpus.raw(fileid) for fileid in corpus.fileids()]

### Extract the top 5 keywords from every document in the corpus. Print them and compare the differences in keywords among the documents.

In [6]:
top5_keywords = [keywords(doc, words=5, lemmatize=True).split('\n') for doc in docs]
top5_keywords

[['sanders', 'campaign', 'news', 'democrats', 'recent'],
 ['courts', 'trump', 'states', 'general', 'businesses'],
 ['politically', 'news', 'people', 'america', 'trump'],
 ['erdogan', 'said', 'trump', 'meet', 'turkish'],
 ['laurent', 'said', 'news', 'age', 'student'],
 ['schools', 'high', 'gun', 'town', 'clarita'],
 ['politically', 'people', 'american', 'guns', 'news'],
 ['trump', 'november', 'border', 'facts', 'said'],
 ['politically', 'presidents', 'likely', 'community', 'american'],
 ['visitors', 'park', 'national', 'african', 'giraffes'],
 ['livingstone', 'christmas', 'clarke', 'said', 'wham'],
 ['trump', 'taylor', 'president', 'news', 'republican'],
 ['bevin', 'general', 'kentucky', 'thursday', 'cnn'],
 ['gaynor', 'hurricane', 'carper', 'changes', 'said'],
 ['trump', 'democratic', 'republicans', 'presidency', 'news'],
 ['rehaag', 'room', 'media', 'woman', 'later'],
 ['snacks', 'hostess', 'twinkies', 'cereals', 'cnn'],
 ['kushner', 'trump', 'news', 'department', 'impeachment'],
 ['p

### Extract the top 3 keyphrases from each document, print them, and compare the differences.

In [22]:
for doc in docs:
  r = Rake()
  r.extract_keywords_from_text(doc)
  print(r.get_ranked_phrases_with_scores()[:3])
  print('-------------------')

[(64.35555555555555, 'win ," campaign manager faiz shakir told cnn last month'), (52.0, 'billionaire former new york city mayor michael bloomberg'), (33.0, 'former vice president joe biden came')]
-------------------
[(43.25, 'attorneys general offices could gather financial information'), (25.25, 'attorney general brian frosh called'), (25.0, '© 2019 cable news network')]
-------------------
[(26.015151515151516, 'america ," cnn opinion invited readers'), (23.0, '© 2019 cable news network'), (23.0, 'seen increasingly vast gaps develop')]
-------------------
[(29.42820512820513, 'senior administration official said trump first raised'), (29.2, 'turkish president recep tayyip erdoğan pulled'), (25.0, '© 2019 cable news network')]
-------------------
[(25.0, '© 2019 cable news network'), (20.625, 'unusual ," said sjoerd hulshof'), (14.8, 'discover new things ."')]
-------------------
[(82.4, 'make quick judgments ," saugus high student lèan aguilar said'), (25.0, '© 2019 cable news netwo

### Identify and extract the named entities in each document, filtering out the numeric types. Print them and compare the differences between documents.

In [23]:

nlp = spacy.load('en_core_web_sm')
spacy_docs = [nlp(doc) for doc in docs]

In [24]:
numeric = ['CARDINAL', 'DATE', 'TIME', 'ORDINAL', 'PERCENT', 'MONEY', 'QUANTITY']

entities = [[[entity.text, entity.label_] for entity in spacy_doc.ents 
             if entity.label_ not in numeric] 
            for spacy_doc in spacy_docs]

In [25]:
entities

[[['Des Moines', 'GPE'],
  ['Iowa', 'GPE'],
  ['CNN', 'ORG'],
  ['Alexandria Ocasio-Cortez', 'PERSON'],
  ['Iowa', 'GPE'],
  ['Bernie Sanders', 'PERSON'],
  ['Vermont', 'GPE'],
  ['Council Bluffs', 'ORG'],
  ['Sanders', 'ORG'],
  ['Democratic', 'NORP'],
  ['Las Vegas', 'GPE'],
  ['Sanders', 'ORG'],
  ['New Hampshire', 'GPE'],
  ['Iowa', 'GPE'],
  ['New York', 'GPE'],
  ['Minnesota', 'GPE'],
  ['Sanders', 'ORG'],
  ['Des Moines', 'GPE'],
  ['Drake University', 'ORG'],
  ['Ocasio-Cortez', 'ORG'],
  ['Ilhan Omar', 'PERSON'],
  ['Rashida Tlaib', 'PERSON'],
  ['Ohio', 'GPE'],
  ['Sanders', 'ORG'],
  ['Democratic', 'NORP'],
  ['Sanders', 'ORG'],
  ['CNN', 'ORG'],
  ['Democrats', 'NORP'],
  ['New Hampshire', 'GPE'],
  ["Elizabeth Warren's", 'PERSON'],
  ['Joe Biden', 'PERSON'],
  ['South Bend', 'GPE'],
  ['Indiana', 'GPE'],
  ['Pete Buttigieg', 'PERSON'],
  ['Democrats', 'NORP'],
  ['Iowa', 'GPE'],
  ['Bernie', 'PERSON'],
  ['the White House', 'ORG'],
  ['Democratic', 'NORP'],
  ['the better 

### For every document in the corpus, iterate over every sentence, extract any SVO triples, print them, and compare.

In [26]:
SVO_results = []

for doc in spacy_docs:
  temp = []
  for sent in doc.sents:
    svo = textacy.extract.subject_verb_object_triples(sent)
    temp += svo
  SVO_results.append(list(set(temp)))

In [27]:
SVO_results

[[(that, jarred, people),
  (we, allow, to happen),
  (Sanders, delivered, punchline),
  (appeal wane, to play, foil),
  (aide, told, CNN),
  (we, tell, Trump),
  (Sanders, invited, television cameras),
  (We, don't watch, race),
  (I, know, that),
  (supporters, are saying, much),
  (he, challenged, Clinton),
  (me, say, it),
  (I, 'm going, to tell),
  (that, showed, him),
  (Mayor Pete Buttigieg, has emerged, foe),
  (staff, are saying, much),
  (Iowa caucuses, kick, primary),
  (we, tell, Trump),
  (Sanders, decided, give),
  (I, thank, you),
  (Sanders, welcomed, news),
  (nobody, would give, damn),
  (surveys, show, group jockeying),
  (I, want, to say),
  (one, touched, digits),
  (person, seems, different),
  (Sanders, welcomed, run),
  (Sanders, has charted, revival),
  (backing, bolstered, argument),
  (Sanders, is taking, source),
  (South Bend, has emerged, foe),
  (campaign, attracting, class coalition),
  (Sanders, began, bid),
  (manager Faiz Shakir, told, CNN),
  (Biden