# Extracting Information from Text Data Assignment

In [3]:
import spacy
import string
import textacy
import itertools
from nltk import pos_tag
from rake_nltk import Rake
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import tree2conlltags
from gensim.summarization import keywords
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

In [4]:
!pip install textacy --quiet
!pip install rake_nltk --quiet

In [5]:
# Mount Google Drive into the Colab runtime at /content/drive; the article directory
# read by the corpus cell lives underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Read the CNN Lite plain text file articles into a corpus using NLTK's PlaintextCorpusReader.

In [6]:
# Directory (inside the mounted Drive) that holds the article text files.
PATH = '/content/drive/MyDrive/News_Articles/'

# Regex selecting which files under PATH belong to the corpus: any name ending in ".txt".
DOC_PATTERN = r'.*\.txt'
# Corpus reader over every file under PATH whose name matches DOC_PATTERN.
corpus = PlaintextCorpusReader(PATH, DOC_PATTERN)

### Iterate through the fileids in the corpus, extract the raw text of each document, and store them in a list.

In [7]:
# Raw text of every file in the corpus, one string per file, in `corpus.fileids()` order.
docs = list(map(corpus.raw, corpus.fileids()))

### Extract the top 5 keywords from every document in the corpus. Print them and compare the differences in keywords among the documents.

In [8]:
# Top-5 keywords with their scores for each article, in the same order as `docs`,
# computed with gensim's `keywords` extractor (already imported in the import cell).
# The result gets its own name so the imported `keywords` function is not overwritten
# by a list and stays callable after this cell has run.
top_keywords = [
    keywords(doc, words=5, split=True, scores=True, lemmatize=True)
    for doc in docs
]
top_keywords

[[('pink', 0.36103630830551936),
  ('carey', 0.23215481937929475),
  ('tour', 0.20476977920324893),
  ('pretty', 0.17656845417580172),
  ('year', 0.1765684541758015)],
 [('patrick', 0.3836932720517404),
  ('primary', 0.24487197026440186),
  ('telling', 0.1703411842462005),
  ('democrats', 0.1517592786917821),
  ('cnn', 0.14403265361714826)],
 [('narwhal', 0.3133366003525194),
  ('tail', 0.28283883626002426),
  ('puppy', 0.2316379679310094),
  ('unicorn', 0.17906622339930195),
  ('dogs', 0.16681170704255696)],
 [('states', 0.24817654471361467),
  ('democratic', 0.2303527895986974),
  ('bloomberg', 0.18915630281964302),
  ('told', 0.1572750181872013),
  ('new', 0.14556798040890168)],
 [('republican', 0.22113260970811982),
  ('taylor', 0.19257575461465587),
  ('rep', 0.18799403904568687),
  ('presidents', 0.18758001961203277),
  ('ukraine', 0.17089777047719126)],
 [('muslimness', 0.43299695783129016),
  ('people', 0.17524395460296813),
  ('skin', 0.17327231624954875),
  ('white', 0.137121

### Extract the top 3 keyphrases from each document, print them, and compare the differences.

In [9]:
# Single RAKE extractor built with default settings; rake() below reuses this one
# instance for every document.
r = Rake()

def rake(x, top_n=3):
  """Print the highest-ranked RAKE keyphrases of each document.

  Args:
    x: iterable of document strings.
    top_n: how many of the top-ranked (score, phrase) pairs to print per
      document; defaults to 3, the value that was previously hard-coded.

  Returns:
    None. For every document, the list of its first `top_n` ranked
    (score, phrase) pairs is printed, followed by a dashed separator line.
  """
  for doc in x:
    # The module-level extractor `r` holds the phrases of the most recent text only,
    # so each document is extracted and printed before the next one is processed.
    r.extract_keywords_from_text(doc)
    print(r.get_ranked_phrases_with_scores()[:top_n])
    print('------------------')

In [10]:
# Print the top-ranked RAKE keyphrases for every article in the corpus.
rake(docs)

[(32.5, 'country music association awards red carpet'), (17.166666666666664, 'school soon ," pink said'), (16.5, 'country star chris stapleton')]
------------------
[(94.03333333333333, 'former new york city mayor michael bloomberg stepped forward last week'), (30.61111111111111, 'patrick could seize upon potential advantages'), (29.0, 'elections process would ultimately splash back')]
------------------
[(28.166666666666668, 'little magical furry unicorn ," according'), (17.666666666666664, 'dog rescue nonprofit organization mac'), (13.666666666666666, 'rescue workers speculate may')]
------------------
[(40.45, 'current 2020 democrats -- clinton told bbc radio'), (37.666666666666664, 'former new york mayor michael bloomberg made'), (29.0, '2016 democratic nominee playfully tweeted back')]
------------------
[(69.41666666666667, 'money ," tweeted white house press secretary stephanie grisham'), (67.75, 'former white house homeland security adviser tom bossert summed'), (63.41666666666

### Identify and extract the named entities in each document, filtering out the numeric types. Print them and compare the differences between documents.

In [11]:
# spaCy entity labels that denote numeric values; entities carrying one of these labels
# are excluded from the named-entity listing. 'PERCENT' is part of the model's numeric
# label set as well, so it is filtered together with the others.
numerics = ['DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']

In [12]:
# Load the spaCy "en_core_web_sm" pipeline once; the extraction functions below call
# this shared `nlp` object on every document.
nlp = spacy.load("en_core_web_sm")

def extract_ents(x):
  """Print the non-numeric named entities found in each document.

  Every document in `x` is run through the module-level `nlp` pipeline. Each
  entity whose label is not listed in `numerics` is recorded as a
  [text, label] pair; the list of pairs is printed per document, followed
  by a dashed separator line. Nothing is returned.
  """
  for text in x:
    kept = []
    for ent in nlp(text).ents:
      # Skip entity types that were declared numeric.
      if ent.label_ in numerics:
        continue
      kept.append([ent.text, ent.label_])
    print(kept)
    print('-------------------------')

In [13]:
# Print the non-numeric named entities of every article in the corpus.
extract_ents(docs)

[['Entertainment Tonight', 'WORK_OF_ART'], ['the Country Music Association Awards', 'ORG'], ['Carey Hart', 'PERSON'], ['Willow', 'PERSON'], ['Jameson', 'ORG'], ['Love Me Anyway', 'WORK_OF_ART'], ['Chris Stapleton', 'PERSON'], ['Willow', 'PERSON'], ['Jameson', 'ORG'], ['"Carey', 'ORG'], ['Hart', 'ORG'], ['Billboard', 'PERSON'], ['Pink', 'ORG'], ['Beautiful Trauma Tour', 'ORG']]
-------------------------
[['Deval Patrick', 'PERSON'], ['Massachusetts', 'GPE'], ['Deval Patrick', 'PERSON'], ['flux', 'GPE'], ['CNN', 'ORG'], ['Patrick', 'ORG'], ['New Hampshire', 'GPE'], ['Concord', 'GPE'], ['CBS This Morning', 'WORK_OF_ART'], ['Patrick', 'PERSON'], ['Patrick', 'PERSON'], ['Alabama', 'GPE'], ['Arkansas', 'GPE'], ['New Hampshire', 'GPE'], ['Patrick', 'PERSON'], ['Massachusetts', 'GPE'], ['New Hampshire', 'GPE'], ['African American', 'NORP'], ['South Carolina', 'GPE'], ['Democratic', 'NORP'], ['CNN', 'ORG'], ['Patrick', 'PERSON'], ["the Democratic Party's", 'ORG'], ['New York City', 'GPE'], ['Mi

### For every document in the corpus, iterate over every sentence, extract any SVO triples, print them, and compare.

In [14]:
def extract_svo(x):
  """Print the distinct subject-verb-object triples found in each document.

  Every document in `x` is parsed with the module-level `nlp` pipeline and
  textacy's SVO extractor is applied sentence by sentence. Duplicate triples
  are dropped while keeping the order in which they first appear, so the
  printed list follows the text and is the same on every run (a plain `set`
  would scramble the order).

  Args:
    x: iterable of document strings.

  Returns:
    None. One list of triples is printed per document, followed by a dashed
    separator line.
  """
  for doc in x:
    spacy_doc = nlp(doc)
    triples = (
        triple
        for sent in spacy_doc.sents
        for triple in textacy.extract.subject_verb_object_triples(sent)
    )
    # dict keys keep insertion order, which gives an order-preserving de-duplication.
    results = list(dict.fromkeys(triples))
    print(results)
    print('------------------------')

In [15]:
# Print the subject-verb-object triples extracted from every article in the corpus.
extract_svo(docs)

[(she, will celebrate, years), (Pink, taking, break), (star, praised, husband), (he, follows, me), (she, will be taking, step), (Jameson, 's going, to start)]
------------------------
[(he, ruled, bid), (Patrick, told, friends), (Massachusetts Gov. Deval Patrick, told, friends), (Patrick, entered, sector), (Patrick, has missed, deadline), (I, 've never taken, job), (Patrick, told, WBUR), (he, seeks, nomination), (frontrunner Democrats, have cast, interests), (He, cited, it), (he, wanted, to put), (source, tells, CNN), (Patrick, has missed, to appear), (Deval Patrick, tells, allies), (Massachusetts Gov. Deval Patrick, told, allies), (he, could make, minute entry), (that, became, liability), (he, was going, to jump), (he, would be entering, race), (he, has made, decision), (I, 've left, conscience), (Patrick, had built, team), (Patrick, defended, work), (he, planned, bid)]
------------------------
[(which, took, him), (who, has been named, Smurf), (they, want, to monitor), (organization,