# Extracting Information from Text Data Assignment

In [29]:
import spacy
import string
import textacy
import itertools
from nltk import pos_tag
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import tree2conlltags
from gensim.summarization import keywords
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import pandas as pd
from itertools import groupby

### Read the CNN Lite plain text file articles into a corpus using the NLTK's PlaintextCorpusReader.

In [4]:
file_location2 = r'C:\Users\Adam Parente\Documents\Python Scripts\thinkful_assignments\example_text_files\cnn'

DOC_PATTERN = r'.*\.txt'


corpus = PlaintextCorpusReader(file_location2, DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [7]:
file_ids = corpus.fileids()
text_list = []
for x in file_ids:
    text = corpus.words(x)
    text_list.append(' '.join(text))
text_df = pd.DataFrame({'id':file_ids,'text':text_list})

### Extract the top 5 keywords from every document in the corpus. Print them and compare the differences in keywords among the documents.

In [9]:
key_word_list = []
for x in text_list:
    key_words = keywords(x, words=5, lemmatize=True).split('\n')
    key_word_list.append(key_words)

In [10]:
key_word_list

[['pink', 'cnn', 'carey', 'tour', 'reserved'],
 ['patrick', 'primary', 'cnn', 'news', 'telling'],
 ['narwhal', 'tail', 'puppy', 'cnn', 'unicorn'],
 ['states', 'democratic', 'news', 'bloomberg', 'cnn'],
 ['republican', 'taylor', 'presidents', 'rep', 'ukraine'],
 ['muslimness', 'people', 'skin', 'white', 'religion'],
 ['news', 'trump', 'said', 'republican', 'media'],
 ['jones said', 'brown', 'police', 'roanoke'],
 ['trump', 'hotels', 'office', 'profit', 'owned'],
 ['keys', 'grammys', 'award', 'reserved', 'wanna'],
 ['americans', 'republican', 'trump', 'ukrainians', 'investigate'],
 ['student', 'said', 'told', 'pence', 'schools'],
 ['cnn', 'crows', 'disney', 'american', 'old'],
 ['protester', 'police', 'news', 'chinese', 'kong']]

### Extract the top 3 keyphrases from each document, print them, and compare the differences.

In [11]:
key_phrases_list = []
for x in text_list:
    r = Rake()
    r.extract_keywords_from_text(x)
    key_phrases = r.get_ranked_phrases_with_scores()
    key_phrases_list.append(key_phrases)

In [14]:
key_phrases_list[5]

[(25.0, '© 2019 cable news network'),
 (20.77777777777778, 'support various international policies --'),
 (16.0, 'protesters stormed john f'),
 (16.0, 'executive order banning entry'),
 (15.5, 'president donald trump issued'),
 (15.333333333333334, 'every pearl jam song'),
 (14.5, 'words would always ring'),
 (13.833333333333334, 'gently educating people —'),
 (13.277777777777779, 'wildly misunderstood -- religion'),
 (13.0, 'people would judge us'),
 (12.94047619047619, 'american muslim girl editor'),
 (12.833333333333334, 'even basic american girls'),
 (12.666666666666666, 'actually pray every day'),
 (11.44047619047619, 'american muslim girl ."'),
 (9.166666666666666, 'day trump instituted'),
 (9.0, 'turner broadcasting system'),
 (9.0, 'syria ), thousands'),
 (9.0, 'rarely attend mosque'),
 (9.0, 'new york city'),
 (9.0, 'nadine jolie courtney'),
 (9.0, 'indefinitely halting refugees'),
 (9.0, 'done anything wrong'),
 (8.94047619047619, 'american muslim girl'),
 (8.833333333333334,

### Identify and extract the named entities in each document, filtering out the numeric types. Print them and compare the differences between documents.

In [16]:
nlp = spacy.load('en_core_web_sm')
entity_list = []
for x in text_list:
    spacy_doc = nlp(x)
    entities = [[entity.text,entity.label_] for entity in spacy_doc.ents]
    entity_list.append(entities)

In [19]:
entity_list[0]

[['CNN', 'ORG'],
 ['2020', 'DATE'],
 ['Entertainment Tonight', 'WORK_OF_ART'],
 ['the Country Music Association Awards', 'ORG'],
 ['Carey Hart', 'PERSON'],
 ['8', 'DATE'],
 ['Jameson', 'ORG'],
 ['2', 'CARDINAL'],
 ['Love Me Anyway', 'WORK_OF_ART'],
 ['Chris Stapleton', 'PERSON'],
 ['two and a half years', 'DATE'],
 ['Willow', 'PERSON'],
 ['Jameson', 'PERSON'],
 ['the year', 'DATE'],
 ['14 years', 'DATE'],
 ['January', 'DATE'],
 ['Carey', 'PERSON'],
 ['Hart', 'PERSON'],
 ['Billboard', 'PERSON'],
 ['10th', 'ORDINAL'],
 ['more than $ 397 million', 'MONEY'],
 ['Cable News Network', 'ORG'],
 ['Turner Broadcasting System , Inc', 'ORG'],
 ['CNN', 'ORG'],
 ['CNN', 'ORG']]

### Use NLTK's RegexpParser to extract all instances of two or more consecutive nouns (of any tense).

In [38]:
grammar = r'NP: {<NN><NN>}'
def extract_phrases(doc, grammar):
    chunker = RegexpParser(grammar)
    results = []
    for sent in sent_tokenize(doc):
        tokenized = word_tokenize(sent)
        tagged = pos_tag(tokenized)
        cleaned = [tag for tag in tagged if not tag[1] in string.punctuation]
        
        chunks = tree2conlltags(chunker.parse(cleaned))
        
        phrases = [''.join(word for word, pos, chunk in group)
                      for key, group in groupby(chunks, lambda term: term[-1] != 'O') if key]
        
        results += phrases
        results = list(set(results))
    return results

In [41]:
GRAMMAR = r'NP: {<NN><NN>}'
nouns_list = []
for x in text_list:
    nouns_list.append(extract_phrases(x,GRAMMAR))



### For every document in the corpus, iterate over every sentence, extract any SVO triples, print them, and compare.

In [49]:
svo_list = []
for x in text_list:
    results = []
    spacy_doc = nlp(x)
    for sent in spacy_doc.sents:
        svo = textacy.extract.subject_verb_object_triples(sent)
        results += svo
    results = list(set(results))
    svo_list.append(results)

In [55]:
svo_list

[[(s, going, to start),
  (he, follows, me),
  (she, will celebrate, years),
  (she, will be taking, step),
  (it, s, turn),
  (star, praised, husband)],
 [(he, regrets, not moving),
  (he, seeks, nomination),
  (I, never taken, job),
  (Patrick, had upped, involvement),
  (Mayor Michael Bloomberg, stepped, to tease),
  (He, cited, it),
  (Patrick, has missed, to appear),
  (Democrats, have cast, interests),
  (he, ruled, bid),
  (he, would be entering, race),
  (he, could make, minute entry),
  (Deval Patrick, told, allies),
  (Deval Patrick, tells, allies),
  (Patrick, weighing, contest),
  (source, tells, CNN),
  (Patrick, had built, team),
  (Patrick, entered, sector),
  (that, became, liability),
  (Patrick, told, WBUR),
  (Patrick, told, friends),
  (Patrick, defended, work),
  (he, wanted, to put),
  (he, was going, to jump),
  (he, has made, decision),
  (Patrick, has missed, deadline),
  (he, planned, bid),
  (ve, never taken, job),
  (Deval Patrick, told, friends),
  (ve, lef