# Week 7 : Information Extraction

In [31]:
#Special module written for this class
#This provides access to data and to helper functions from previous weeks
#Make sure you update it before starting this notebook
import lucem_illud #pip install -U git+https://github.com/Computational-Content-Analysis-2018/lucem_illud.git

#All these packages need to be installed from pip
#For NLP
import nltk

import numpy as np #For arrays
import pandas as pd #Gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import seaborn #Makes the graphics look nicer

#Displays the graphs
import graphviz #You also need to install the command line graphviz

#These are from the standard library
import os.path
import zipfile
import subprocess
import io
import tempfile

import lucem_illud.stanford as stanford

import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# Note that this also hides deprecation and data-quality warnings from the
# libraries used below.
warnings.filterwarnings('ignore')
%matplotlib inline


# Exercise 1

In the cells immediately following, perform POS tagging on a meaningful (but modest) subset of a corpus associated with your final project. Examine the list of words associated with at least three different parts of speech. Consider conditional frequencies (e.g., adjectives associated with nouns of interest or adverbs with verbs of interest). What do these distributions suggest about your corpus?

In [9]:
# import data
# Reads the corpus from week7.csv in the current working directory.
# NOTE(review): later outputs contain tokens such as 'BeyoncÌ©' and '\x89ÛÒ',
# which suggests the text is not cleanly ISO-8859-1 -- confirm the file's real
# encoding (or clean the source) before trusting token counts.
df = pd.read_csv("week7.csv",encoding = "ISO-8859-1")

In [32]:
# Give the loaded frame a corpus-specific name. This is an alias, not a copy,
# so the column added below is also visible through `df`.
redditTopScores = df
# Split each document into sentences, then each sentence into word tokens,
# yielding a list of token lists per row.
redditTopScores['sentences'] = redditTopScores['text'].apply(
    lambda doc: list(map(nltk.word_tokenize, nltk.sent_tokenize(doc)))
)


In [12]:
%%time
# POS-tag each document: the tagger receives that row's list of token lists.
redditTopScores['POS_sents'] = redditTopScores['sentences'].apply(stanford.postTagger.tag_sents)

CPU times: user 65.1 ms, sys: 194 ms, total: 259 ms
Wall time: 38 s


In [13]:
# Inspect the tagger output: one row per document, each holding sentences as
# lists of (token, tag) pairs.
redditTopScores['POS_sents']

0    [[(``, ``), (I, PRP), (was, VBD), (delighted, ...
1    [[(During, IN), (their, PRP$), (first, JJ), (1...
2    [[(Last, JJ), (week, NN), (,, ,), (Ron, NNP), ...
3    [[(``, ``), (I, PRP), (certainly, RB), (do, VB...
4    [[(Musical-theater, NN), (productions, NNS), (...
5    [[(During, IN), (the, DT), (past, JJ), (few, J...
6    [[(For, IN), (years, NNS), (,, ,), (Def, NNP),...
7    [[(On, IN), (February, NNP), (12th, JJ), (,, ,...
8    [[(``, ``), (I, PRP), ('ve, VBP), (never, RB),...
9    [[(As, IN), (it, PRP), (turns, VBZ), (out, RP)...
Name: POS_sents, dtype: object

In [14]:
# Count how often each token is tagged with the target POS tag across the corpus.
countTarget = 'NN'
targetCounts = {}
# Flatten documents -> sentences -> (token, tag) pairs into a single stream.
taggedTokens = (
    pair
    for entry in redditTopScores['POS_sents']
    for sentence in entry
    for pair in sentence
)
for ent, kind in taggedTokens:
    if kind == countTarget:
        targetCounts[ent] = targetCounts.get(ent, 0) + 1
# Most frequent first; the sort is stable, so ties keep first-seen order.
sortedTargets = sorted(targetCounts.items(), key = lambda pair: pair[1], reverse = True)
sortedTargets[:20]

[('band', 55),
 ('album', 38),
 ('\x89ÛÒ', 35),
 ('music', 35),
 ('time', 26),
 ('year', 23),
 ('nbsp', 23),
 (')', 22),
 ('song', 22),
 ('show', 21),
 ('thing', 18),
 ('rock', 17),
 ('lot', 17),
 ('amp', 17),
 ('way', 17),
 ('something', 15),
 ('everything', 14),
 ('record', 14),
 (']', 13),
 ('tour', 12)]

In [15]:
# Same tally as for nouns, this time for tokens tagged 'VB'.
countTarget = 'VB'
targetCounts = {}
for entry in redditTopScores['POS_sents']:
    for sentence in entry:
        for ent, kind in sentence:
            if kind == countTarget:
                targetCounts[ent] = targetCounts.get(ent, 0) + 1
# Order by descending frequency (stable, so equal counts keep insertion order).
sortedTargets = list(targetCounts.items())
sortedTargets.sort(key = lambda pair: pair[1], reverse = True)
sortedTargets[:20]

[('be', 48),
 ('do', 34),
 ('have', 27),
 ('go', 20),
 ('make', 16),
 ('say', 15),
 ('get', 15),
 ('know', 15),
 ('play', 13),
 ('sing', 9),
 ('come', 8),
 ('see', 8),
 ('want', 8),
 ('think', 6),
 ('take', 6),
 ('keep', 6),
 ('give', 6),
 ('work', 5),
 ('look', 4),
 ('try', 4)]

In [19]:
# Collect the distinct tokens tagged NTarget that immediately precede Word
# (Word is matched case-insensitively; the preceding token keeps its casing).
NTarget = 'JJ'
Word = 'music'
NResults = {
    ent1
    for entry in redditTopScores['POS_sents']
    for sentence in entry
    # zip stops at the shorter input, so this pairs every token with its successor.
    for (ent1, kind1), (ent2, kind2) in zip(sentence, sentence[1:])
    if kind1 == NTarget and ent2.lower() == Word
}

print(NResults)

{'own', 'Dead', 'good', 'commercial', 'subscription-based', 'contemporary', 'rough'}


# Sanity-check the Stanford POS tagger against the hand-tagged Penn Treebank
# sample that ships with NLTK, using the first `sampleSize` sentences.
treeBank = nltk.corpus.treebank
sampleSize = 30
# Fetch the gold tags once; the original re-called tagged_sents() for every
# token inside the loop, repeating the same corpus lookup.
goldTags = treeBank.tagged_sents()[:sampleSize]
stanfordTags = stanford.postTagger.tag_sents(treeBank.sents()[:sampleSize])

NumDiffs = 0
total = 0
# Assumes the tagger returns one (word, tag) pair per input token, so the two
# sequences line up position by position.
for stanfordSent, goldSent in zip(stanfordTags, goldTags):
    for (word, stanfordTag), (goldWord, goldTag) in zip(stanfordSent, goldSent):
        # '-NONE-' marks Treebank null/trace elements, which the Stanford tagger
        # cannot produce. Leave them out of BOTH the error count and the
        # denominator so they do not inflate the score.
        if goldTag == '-NONE-':
            continue
        total += 1
        if stanfordTag != goldTag:
            print("Word: {}  \tStanford: {}\tTreebank: {}".format(word, stanfordTag, goldTag))
            NumDiffs += 1

# The figure reported is per-token agreement with the gold tags (i.e. accuracy).
if total:
    print("The Precision is {:.3f}%".format((total-NumDiffs)/total * 100))
else:
    print("No comparable tokens found")

# Exercise 2

In the cells immediately following, perform NER on a (modest) subset of your corpus of interest. List all of the different kinds of entities tagged. What does their distribution suggest about the focus of your corpus? For a subset of your corpus, tally at least one type of named entity and calculate the Precision, Recall and F-score for the NER classification just performed (using your own hand-codings as "ground truth").

In [25]:
# Run named-entity recognition on each document's tokenised sentences.
redditTopScores['classified_sents'] = redditTopScores['sentences'].apply(stanford.nerTagger.tag_sents)

In [26]:
# Inspect the NER output: sentences as lists of (token, label) pairs, where the
# label 'O' appears on tokens such as 'During' and 'the' and 'PERSON' on names.
redditTopScores['classified_sents']

0    [[(``, O), (I, O), (was, O), (delighted, O), (...
1    [[(During, O), (their, O), (first, O), (10, O)...
2    [[(Last, O), (week, O), (,, O), (Ron, PERSON),...
3    [[(``, O), (I, O), (certainly, O), (do, O), (n...
4    [[(Musical-theater, O), (productions, O), (ten...
5    [[(During, O), (the, O), (past, O), (few, O), ...
6    [[(For, O), (years, O), (,, O), (Def, PERSON),...
7    [[(On, O), (February, O), (12th, O), (,, O), (...
8    [[(``, O), (I, O), ('ve, O), (never, O), (felt...
9    [[(As, O), (it, O), (turns, O), (out, O), (,, ...
Name: classified_sents, dtype: object

In [27]:
# Frequency of every token in the NER output. The label is ignored here, so
# punctuation and function words are counted too.
entityCounts = {}
for entry in redditTopScores['classified_sents']:
    for sentence in entry:
        for ent, kind in sentence:
            entityCounts.setdefault(ent, 0)
            entityCounts[ent] += 1
# Highest counts first; ties stay in first-seen order.
sortedEntities = sorted(entityCounts.items(), key = lambda pair: pair[1], reverse = True)
sortedEntities[:10]

[(',', 908),
 ('.', 714),
 ('the', 619),
 ('to', 376),
 ('and', 340),
 ('a', 330),
 ('``', 323),
 ('of', 274),
 ('in', 251),
 ("''", 229)]

In [28]:
# Tokens that occur exactly twice in the corpus.
[token for token, count in sortedEntities if count == 2]

['streak',
 'Hook',
 'busy',
 'Ron',
 'mistakes',
 'BeyoncÌ©',
 'added',
 'strings',
 'rarely',
 'earliest',
 'upset',
 'loop',
 'landed',
 'suddenly',
 'meetings',
 'Good',
 'Shakedown',
 'hold',
 'concussions',
 'Into',
 'associated',
 'everyone',
 'nomination',
 'light',
 'lots',
 'reflected',
 'terms',
 'crack',
 'nowhere',
 'current',
 'stop',
 'Robert',
 'While',
 'produced',
 'signing',
 'Keystone',
 'taken',
 'accompanied',
 'characters',
 'nice',
 'buy',
 'dressing',
 'core',
 'offers',
 'sign',
 'Hong',
 'response',
 'Beers',
 'Youngs',
 'possible',
 'stream',
 'president',
 'publishing',
 'instrument',
 'agreed',
 'tunes',
 'mother',
 'air',
 'job',
 'starts',
 'working',
 'Night',
 'Kong',
 'drive',
 'former',
 'happy',
 'understood',
 'dollar',
 'press',
 '1978',
 'Jay-Z',
 'football',
 'door',
 'numbers',
 'energy',
 'Seven',
 'love',
 'sings',
 'Now',
 'familiar',
 'choose',
 'foresight',
 'Words',
 'wonderful',
 'led',
 'bassist',
 'heart',
 'categories',
 'Live',
 'hon

In [29]:
# Count only tokens that received an entity label, i.e. anything other than 'O'.
nonObjCounts = {}
labelledTokens = (
    pair
    for entry in redditTopScores['classified_sents']
    for sentence in entry
    for pair in sentence
)
for ent, kind in labelledTokens:
    if kind != 'O':
        nonObjCounts[ent] = nonObjCounts.get(ent, 0) + 1
# Descending by count, ties in first-seen order.
sortedNonObj = sorted(nonObjCounts.items(), key = lambda pair: pair[1], reverse = True)
sortedNonObj[:10]

[('Garcia', 20),
 ('Malcolm', 14),
 ('Hutchence', 13),
 ('INXS', 12),
 ('Andrew', 11),
 ('Murphy', 11),
 ('Mann', 11),
 ('Bennett', 11),
 ('Steadman', 11),
 ('Jerry', 11)]

In [30]:
# Count tokens labelled ORGANIZATION. Counting is per token, so a multi-word
# name contributes one entry per word (e.g. 'Rolling' and 'Stone' separately).
OrgCounts = {}
for entry in redditTopScores['classified_sents']:
    for sentence in entry:
        for ent, kind in sentence:
            if kind == 'ORGANIZATION':
                try:
                    OrgCounts[ent] += 1
                except KeyError:
                    # First time this token is seen as an organisation.
                    OrgCounts[ent] = 1
# Most frequent organisation tokens first.
sortedOrgs = list(OrgCounts.items())
sortedOrgs.sort(key = lambda pair: pair[1], reverse = True)
sortedOrgs[:10]

[('INXS', 12),
 ('Academy', 8),
 ('Rolling', 8),
 ('Stone', 7),
 ('Recording', 7),
 ('Grammys', 4),
 ('Gold', 4),
 ('Green', 4),
 ('Ocean', 3),
 ('Marcus', 2)]

# Exercise 3

In the cells immediately following, parse a (modest) subset of your corpus of interest. How deep are the phrase structure and dependency parse trees nested? How does parse depth relate to perceived sentence complexity? What are five things you can extract from these parses for subsequent analysis? (e.g., nouns collocated in a noun phrase; adjectives that modify a noun; etc.) Capture these sets of things for a focal set of words (e.g., "Bush", "Obama", "Trump"). What do they reveal about the roles that these entities are perceived to play in the social world inscribed by your texts?