# Extracting Information from Text Data Assignment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install textacy --quiet
!pip install rake_nltk --quiet

[K     |████████████████████████████████| 184kB 6.0MB/s 
[K     |████████████████████████████████| 1.9MB 23.7MB/s 
[K     |████████████████████████████████| 102kB 9.4MB/s 
[K     |████████████████████████████████| 481kB 41.8MB/s 
[?25h  Building wheel for cytoolz (setup.py) ... [?25l[?25hdone
  Building wheel for rake-nltk (setup.py) ... [?25l[?25hdone


In [4]:
import spacy
import string
import textacy
import itertools
from nltk import pos_tag
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import tree2conlltags
from gensim.summarization import keywords
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [109]:
PATH = '/content/drive/MyDrive/python_for_data_scientists/cnn_lite'
DOC_PATTERN = r'.*\.txt'
cnn = PlaintextCorpusReader(PATH , DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [110]:
docs = [cnn.raw(fileid) for fileid in cnn.fileids()]

In [111]:
len(docs)

57

### Extract the top 5 keywords from every document in the corpus. Print them and compare the differences in keywords among the documents.

In [112]:
key_words = [keywords(doc, words=5, lemmatize=True).split('\n') for doc in docs]

In [113]:
key_words[:5]

[['sanders', 'campaign', 'news', 'democrats', 'recent'],
 ['courts', 'trump', 'states', 'general', 'businesses'],
 ['politically', 'news', 'people', 'america', 'trump'],
 ['erdogan', 'said', 'trump', 'meet', 'turkish'],
 ['laurent', 'said', 'news', 'student', 'age']]

In [114]:
len(key_words)

57

### Extract the top 3 keyphrases from each document, print them, and compare the differences.

In [115]:
r = Rake()
r.extract_keywords_from_text(docs[0])
r.get_ranked_phrases_with_scores()[:3]

[(64.35555555555555,
  'win ," campaign manager faiz shakir told cnn last month'),
 (52.0, 'billionaire former new york city mayor michael bloomberg'),
 (33.0, 'former vice president joe biden came')]

In [116]:
key_phrases = []

for doc in docs:
  r.extract_keywords_from_text(doc)
  key_phrases.append(r.get_ranked_phrases()[:3])

In [117]:
# list comprehension goes here
#key_phrases_ = []

In [118]:
key_phrases[:5]

[['win ," campaign manager faiz shakir told cnn last month',
  'billionaire former new york city mayor michael bloomberg',
  'former vice president joe biden came'],
 ['attorneys general offices could gather financial information',
  'attorney general brian frosh called',
  '© 2019 cable news network'],
 ['america ," cnn opinion invited readers',
  '© 2019 cable news network',
  'seen increasingly vast gaps develop'],
 ['senior administration official said trump first raised',
  'turkish president recep tayyip erdoğan pulled',
  '© 2019 cable news network'],
 ['© 2019 cable news network',
  'unusual ," said sjoerd hulshof',
  'discover new things ."']]

In [119]:
len(key_phrases)

57

### Identify and extract the named entities in each document, filtering out the numeric types. Print them and compare the differences between documents.

In [120]:
spacy.cli.download('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [121]:
nlp = spacy.load('en_core_web_sm')

spacy_docs = [nlp(doc) for doc in docs] 

In [122]:
filter = ['PERSON','NORP','FAC','ORG','GPE','LOC','PRODUCT','EVENT','WORK_OF_ART','LAW','LANGUAGE']

In [123]:
entities = [[[entity.text, entity.label_] for entity in spacy_doc.ents if entity.label_ in filter] for spacy_doc in spacy_docs]

In [124]:
entities[:5]

[[['Des Moines', 'GPE'],
  ['Iowa', 'GPE'],
  ['CNN', 'ORG'],
  ['Alexandria Ocasio-Cortez', 'PERSON'],
  ['Iowa', 'GPE'],
  ['Bernie Sanders', 'PERSON'],
  ['Vermont', 'GPE'],
  ['Council Bluffs', 'ORG'],
  ['Sanders', 'ORG'],
  ['Democratic', 'NORP'],
  ['Las Vegas', 'GPE'],
  ['Sanders', 'ORG'],
  ['New Hampshire', 'GPE'],
  ['Iowa', 'GPE'],
  ['New York', 'GPE'],
  ['Minnesota', 'GPE'],
  ['Sanders', 'ORG'],
  ['Des Moines', 'GPE'],
  ['Drake University', 'ORG'],
  ['Ocasio-Cortez', 'ORG'],
  ['Ilhan Omar', 'PERSON'],
  ['Rashida Tlaib', 'PERSON'],
  ['Ohio', 'GPE'],
  ['Sanders', 'ORG'],
  ['Democratic', 'NORP'],
  ['Sanders', 'ORG'],
  ['CNN', 'ORG'],
  ['Democrats', 'NORP'],
  ['New Hampshire', 'GPE'],
  ["Elizabeth Warren's", 'PERSON'],
  ['Joe Biden', 'PERSON'],
  ['South Bend', 'GPE'],
  ['Indiana', 'GPE'],
  ['Pete Buttigieg', 'PERSON'],
  ['Democrats', 'NORP'],
  ['Iowa', 'GPE'],
  ['Bernie', 'PERSON'],
  ['the White House', 'ORG'],
  ['Democratic', 'NORP'],
  ['the better 

In [125]:
len(entites)

57

### For every document in the corpus, iterate over every sentence, extract any SVO triples, print them, and compare.

In [126]:
SVO = []
for spacy_doc in spacy_docs:
  spacy_doc_sents = spacy_doc.sents 
  results = []
  for sent in spacy_doc_sents:
    svo = textacy.extract.subject_verb_object_triples(sent)
    results += svo
  SVO.append(results)

In [127]:
# list comprehension goes here
# SVO_ = []

In [128]:
SVO = [list(set(SVOa)) for SVOa in SVO]

In [129]:
SVO[:5]

[[(staff, are saying, much),
  (Sanders, entered, primary),
  (campaign, attracting, class coalition),
  (Sanders, invited, television cameras),
  (Sanders, injected, some),
  (person, seems, different),
  (I, use, profane word),
  (supporters, are saying, much),
  (pugilist, welcomed, run),
  (me, say, it),
  (we, allow, to happen),
  (surveys, show, group jockeying),
  (South Bend, has emerged, foe),
  (we, tell, Trump),
  (campaign, puts, it),
  (I, know, that),
  (he, challenged, Clinton),
  (Sanders, decided, to stick),
  (Sanders, is taking, source),
  (that, showed, him),
  (candidate, entered, race),
  (one, touched, digits),
  (Sanders, welcomed, run),
  (that, jarred, people),
  (pugilist, welcomed, news),
  (Sanders, has charted, revival),
  (nobody, would give, damn),
  (manager Faiz Shakir, told, CNN),
  (Sanders, began, bid),
  (others, began, to migrate),
  (Sanders, invited, reporters),
  (Biden, offers, contrast),
  (Sanders, remains, front),
  (I, thank, you),
  (he, 

In [130]:
len(SVO)

57