# Extracting Information from Text Data Assignment

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import spacy
import string
import textacy
import itertools

from nltk import pos_tag
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import tree2conlltags
from gensim.summarization import keywords
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [1]:
!pip install textacy --quiet
!pip install rake_nltk --quiet

[K     |████████████████████████████████| 184kB 6.1MB/s 
[K     |████████████████████████████████| 481kB 6.9MB/s 
[K     |████████████████████████████████| 102kB 5.8MB/s 
[K     |████████████████████████████████| 1.9MB 9.7MB/s 
[?25h  Building wheel for cytoolz (setup.py) ... [?25l[?25hdone
  Building wheel for rake-nltk (setup.py) ... [?25l[?25hdone


In [5]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [10]:
path = "cnn_lite/"
doc_pattern = r".*\.txt"
corpus = PlaintextCorpusReader(path, doc_pattern)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [11]:
docs = [corpus.raw(id_) for id_ in corpus.fileids()]

In [13]:
doc = docs[0]

### Extract the top 5 keywords from every document in the corpus. Print them and compare the differences in keywords among the documents.

In [14]:
keywords(doc, words=5, lemmatize=True).split('\n')

['sanders', 'campaign', 'news', 'democrats', 'recent']

### Extract the top 3 keyphrases from each document, print them, and compare the differences.

In [20]:
from rake_nltk import Rake
def get_ranked_phrases(doc):    
    r = Rake()
    r.extract_keywords_from_text(doc)
    
    return r.get_ranked_phrases_with_scores()[:3]

top_3_key_phrases = [get_ranked_phrases(doc) for doc in docs]

In [21]:
top_3_key_phrases

[[(64.35555555555555,
   'win ," campaign manager faiz shakir told cnn last month'),
  (52.0, 'billionaire former new york city mayor michael bloomberg'),
  (33.0, 'former vice president joe biden came')],
 [(43.25, 'attorneys general offices could gather financial information'),
  (25.25, 'attorney general brian frosh called'),
  (25.0, '© 2019 cable news network')],
 [(26.015151515151516, 'america ," cnn opinion invited readers'),
  (23.0, '© 2019 cable news network'),
  (23.0, 'seen increasingly vast gaps develop')],
 [(29.42820512820513,
   'senior administration official said trump first raised'),
  (29.2, 'turkish president recep tayyip erdoğan pulled'),
  (25.0, '© 2019 cable news network')],
 [(25.0, '© 2019 cable news network'),
  (20.625, 'unusual ," said sjoerd hulshof'),
  (14.8, 'discover new things ."')],
 [(82.4, 'make quick judgments ," saugus high student lèan aguilar said'),
  (25.0, '© 2019 cable news network'),
  (24.0, 'santa clarita valley signal reported')],
 [(2

### Identify and extract the named entities in each document, filtering out the numeric types. Print them and compare the differences between documents.

In [22]:
nlp = spacy.load('en_core_web_sm')

spacy_doc = nlp(doc)

In [23]:
#for entity in spacy_doc.ents:
#    print(entity.text, '-', entity.label_)

entities = [[entity.text, entity.label_] for entity in spacy_doc.ents]
entities

[['Des Moines', 'GPE'],
 ['Iowa', 'GPE'],
 ['CNN', 'ORG'],
 ['Alexandria Ocasio-Cortez', 'PERSON'],
 ['Iowa', 'GPE'],
 ['Bernie Sanders', 'PERSON'],
 ['last week', 'DATE'],
 ['Vermont', 'GPE'],
 ['Council Bluffs', 'ORG'],
 ['the last six weeks', 'DATE'],
 ['Sanders', 'ORG'],
 ['second', 'ORDINAL'],
 ['Democratic', 'NORP'],
 ['Las Vegas', 'GPE'],
 ['the first night of October', 'TIME'],
 ['Sanders', 'ORG'],
 ['New Hampshire', 'GPE'],
 ['Iowa', 'GPE'],
 ['New York', 'GPE'],
 ['Minnesota', 'GPE'],
 ['a trying summer', 'DATE'],
 ['march', 'DATE'],
 ['Sanders', 'ORG'],
 ['Saturday', 'DATE'],
 ['Des Moines', 'GPE'],
 ['three', 'CARDINAL'],
 ['Drake University', 'ORG'],
 ['one', 'CARDINAL'],
 ['Ocasio-Cortez', 'ORG'],
 ['Ilhan Omar', 'PERSON'],
 ['Rashida Tlaib', 'PERSON'],
 ['last month', 'DATE'],
 ['Ohio', 'GPE'],
 ['Sanders', 'ORG'],
 ['months', 'DATE'],
 ['Democratic', 'NORP'],
 ['Sanders', 'ORG'],
 ['About two weeks', 'DATE'],
 ['CNN', 'ORG'],
 ['Democrats', 'NORP'],
 ['New Hampshire', '

### For every document in the corpus, iterate over every sentence, extract any SVO triples, print them, and compare.

In [25]:
import textacy

In [62]:
"""
def get_svo_triplets(sent):    
    svo = textacy.extract.subject_verb_object_triples(sent)
    return list(svo)

results = []
results = [[results.append(get_svo_triplets(sent))] for sent in spacy_doc.sents]
"""

In [66]:
results = []

for sent in spacy_doc.sents:
  svo = textacy.extract.subject_verb_object_triples(sent)
  results += svo

In [67]:
results = list(set(results))

In [68]:
results

[(I, 've got, arteries),
 (backing, bolstered, argument),
 (candidate, entered, race),
 (I, know, that),
 (he, fair, option),
 (Iowa caucuses, kick, primary),
 (campaign, puts, it),
 (that, jarred, people),
 (I, 'm going, to tell),
 (Sanders, began, bid),
 (we, tell, Trump),
 (supporters, are saying, much),
 (surveys, show, group jockeying),
 (Sanders, invited, reporters),
 (pugilist, welcomed, news),
 (that, showed, him),
 (nobody, would give, damn),
 (others, began, to migrate),
 (campaign, attracting, class coalition),
 (Sanders, remains, front),
 (he, 's taken, tone),
 (Sanders, has charted, revival),
 (he, can grow, it),
 (Cortez, capped, debut),
 (we, tell, Trump),
 (me, say, it),
 (that, welcomed, him),
 (we, allow, to happen),
 (Sanders, injected, some),
 (appeal wane, to play, foil),
 (I, want, to say),
 (CNN poll, showed, Sanders),
 (that, has vaulted, him),
 (Mayor Pete Buttigieg, has emerged, foe),
 (staff, are saying, much),
 (aide, told, CNN),
 (Sanders, delivered, punchl