In [2]:
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.collocations import *

import numpy as np 
from matplotlib import pyplot as plt 

%matplotlib inline

stop_words = stopwords.words('english')

#Corpus-specific stopwords: add any additional words you want removed to the list below
#(leave the list empty to use only NLTK's English stopword list).
stop_words.extend(['states'])

#NOTE: this is an absolute path on the author's machine; change it to the location of your own corpus file.
#The `with` block guarantees the file handle is closed as soon as the text has been read.
with open('/Users/asg/Dropbox/00-UCLA/Courses/DH199-s20/Corpus/shawnee/shawnee_chron.txt') as corpus_file:
    file = corpus_file.read()

tokens = word_tokenize(file)

#Make all words lowercase
tokens_lower = [w.lower() for w in tokens]

#Remove stopwords, punctuation, and numbers.
#Membership is tested against a set so each lookup is constant-time instead of a scan of the whole list;
#`stop_words` itself stays a list so it can still be extended above.
stop_word_set = set(stop_words)
content = [w for w in tokens_lower if w.isalpha() and w not in stop_word_set]

In [3]:
#NLTK's Porter stemmer is used in this example because it keeps more of each original word intact;
#the Lancaster stemmer is more aggressive and strips additional letters.
porter = nltk.PorterStemmer()

#Stem every cleaned token, preserving the original order.
stemmed = list(map(porter.stem, content))

#To see how the stemmer behaves, uncomment the next line to print the stemmed tokens.
#print(stemmed)

<h2>Collocations</h2>



In [4]:
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder

#Count adjacent word pairs in the cleaned token list, then display the 15 pairs
#that score highest under the likelihood-ratio association measure.
biagram_collocation = BigramCollocationFinder.from_words(words=content)
biagram_collocation.nbest(score_fn=BigramAssocMeasures.likelihood_ratio, n=15)

[('fort', 'pitt'),
 ('de', 'peyster'),
 ('six', 'nations'),
 ('arent', 'de'),
 ('humble', 'servant'),
 ('beaver', 'creek'),
 ('new', 'orleans'),
 ('halted', 'refresh'),
 ('dear', 'sir'),
 ('detroit', 'mi'),
 ('main', 'body'),
 ('du', 'vernet'),
 ('obedient', 'humble'),
 ('two', 'hundred'),
 ('alexander', 'mckee')]

In [5]:
from nltk.metrics import TrigramAssocMeasures
from nltk.collocations import TrigramCollocationFinder

#Count consecutive word triples in the cleaned token list, then display the 15 triples
#that score highest under the likelihood-ratio association measure.
trigram_collocation = TrigramCollocationFinder.from_words(words=content)
trigram_collocation.nbest(score_fn=TrigramAssocMeasures.likelihood_ratio, n=15)

[('reached', 'fort', 'pitt'),
 ('fort', 'pitt', 'refused'),
 ('treaty', 'fort', 'pitt'),
 ('commandant', 'fort', 'pitt'),
 ('fort', 'pitt', 'commissioners'),
 ('miles', 'fort', 'pitt'),
 ('dii', 'fort', 'pitt'),
 ('fort', 'pitt', 'nous'),
 ('mackentoy', 'fort', 'pitt'),
 ('population', 'fort', 'pitt'),
 ('left', 'fort', 'pitt'),
 ('accompany', 'fort', 'pitt'),
 ('betwixt', 'fort', 'pitt'),
 ('fort', 'pitt', 'lest'),
 ('fort', 'pitt', 'medium')]

<h3>Spanning Intervening Words</h3>

In [6]:
#Pair each word with the words that follow it inside a 5-token window, so a bigram
#may span intervening words rather than only adjacent ones.
finder = BigramCollocationFinder.from_words(words=content, window_size=5)

#Discard pairs that co-occur fewer than 2 times.
finder.apply_freq_filter(2)

#Display the 10 remaining pairs with the highest likelihood-ratio score.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder.nbest(bigram_measures.likelihood_ratio, 10)

[('fort', 'pitt'),
 ('de', 'peyster'),
 ('humble', 'servant'),
 ('obedient', 'servant'),
 ('six', 'nations'),
 ('sir', 'servant'),
 ('sir', 'humble'),
 ('beaver', 'creek'),
 ('arent', 'de'),
 ('halted', 'refresh')]

In [16]:
#The following code keeps only the bigrams that contain the keyword 'man' and ranks them by PMI.
#It reuses `finder` from the "Spanning Intervening Words" cell; on a fresh Restart & Run All that
#finder was built with window_size=5 and a frequency filter of 2, so the pairs below MAY span
#intervening words. Build a new finder with BigramCollocationFinder.from_words(content) instead
#if you want strictly adjacent pairs.
#NOTE: apply_ngram_filter changes `finder` in place, so every later cell that uses `finder`
#only sees bigrams containing the keyword.

bigram_measures = nltk.collocations.BigramAssocMeasures()


def kw_filter(*w):
    """Return True for n-grams that should be removed, i.e. those that do not contain 'man'."""
    return 'man' not in w


finder.apply_ngram_filter(kw_filter)
finder.nbest(bigram_measures.pmi,15)

[('man', 'requiring'),
 ('remarkable', 'man'),
 ('airs', 'man'),
 ('avas', 'man'),
 ('citizen', 'man'),
 ('courage', 'man'),
 ('delegate', 'man'),
 ('deserving', 'man'),
 ('dreading', 'man'),
 ('eights', 'man'),
 ('essay', 'man'),
 ('foolish', 'man'),
 ('goes', 'man'),
 ('inadvertently', 'man'),
 ('losing', 'man')]

### Notice that the code above identifies the top 15 bigrams using the <b><a href="https://medium.com/dataseries/understanding-pointwise-mutual-information-in-nlp-e4ef75ecb57a">pointwise mutual information</a></b> (PMI) association measure. You can select a different measure to score the bigrams, which changes which bigrams are ranked highest.

#### Notice the difference between the results above and the results below, which use NLTK's <em>mutual-information-like</em> (<code>mi_like</code>) and <em>likelihood ratio</em> association measures. In this case, these two measures give results that are very similar to each other, but quite different from the results of the PMI measure used above.

In [17]:
#The following code keeps only the bigrams that contain the keyword 'man' and ranks them with
#NLTK's mi_like measure.
#It reuses `finder` from the "Spanning Intervening Words" cell; on a fresh Restart & Run All that
#finder was built with window_size=5 and a frequency filter of 2, so the pairs below MAY span
#intervening words.
#NOTE: apply_ngram_filter changes `finder` in place. If an earlier cell already applied this same
#keyword filter, applying it again removes nothing further; it is repeated here so that this cell
#also works when run directly after the cell that builds `finder`.

bigram_measures = nltk.collocations.BigramAssocMeasures()


def kw_filter(*w):
    """Return True for n-grams that should be removed, i.e. those that do not contain 'man'."""
    return 'man' not in w


finder.apply_ngram_filter(kw_filter)
finder.nbest(bigram_measures.mi_like,15)

[('every', 'man'),
 ('single', 'man'),
 ('young', 'man'),
 ('man', 'requiring'),
 ('remarkable', 'man'),
 ('man', 'column'),
 ('man', 'died'),
 ('man', 'whites'),
 ('kissingua', 'man'),
 ('man', 'allmost'),
 ('man', 'domestic'),
 ('man', 'stuart'),
 ('man', 'woman'),
 ('one', 'man'),
 ('man', 'nation')]

In [18]:
#The following code keeps only the bigrams that contain the keyword 'man' and ranks them by
#likelihood ratio.
#It reuses `finder` from the "Spanning Intervening Words" cell; on a fresh Restart & Run All that
#finder was built with window_size=5 and a frequency filter of 2, so the pairs below MAY span
#intervening words.
#NOTE: apply_ngram_filter changes `finder` in place. If an earlier cell already applied this same
#keyword filter, applying it again removes nothing further; it is repeated here so that this cell
#also works when run directly after the cell that builds `finder`.

bigram_measures = nltk.collocations.BigramAssocMeasures()


def kw_filter(*w):
    """Return True for n-grams that should be removed, i.e. those that do not contain 'man'."""
    return 'man' not in w


finder.apply_ngram_filter(kw_filter)
finder.nbest(bigram_measures.likelihood_ratio,15)

[('every', 'man'),
 ('young', 'man'),
 ('single', 'man'),
 ('remarkable', 'man'),
 ('man', 'requiring'),
 ('man', 'column'),
 ('man', 'died'),
 ('man', 'whites'),
 ('man', 'nation'),
 ('man', 'allmost'),
 ('man', 'domestic'),
 ('man', 'stuart'),
 ('man', 'woman'),
 ('kissingua', 'man'),
 ('man', 'horse')]

### To learn more about the various types of co-occurrence and collocation identification, as well as the differences between association measures, read through the following slides and consider the questions embedded in them with regard to your own project.

For even more details, read through <a href="https://drive.google.com/open?id=1Ji1dKprM9ufK5pmMZEGZLkN-_l9IcmqD">Manning & Schütze, "Collocations," (draft, 1999).</a> I've found their draft document to be much more detailed than their published version.

In [15]:
#There is a known bug when using the suggested method (IPython.display.IFrame) in Jupyter Notebooks, but
#the following code will actually render the desired iframe in notebooks without a problem.

import html
import IPython.display  #import the submodule explicitly so IPython.display is guaranteed to be loaded

url = 'https://docs.google.com/presentation/d/e/2PACX-1vQPv3LUx0o9VfANY81eNbgbRr3RkS6STA4yXDbc72AyNKMWOEXcWQol4oO6JaZAFiumt8feN2TSR8wc/embed?start=false&loop=false&delayms=3000'

#The URL contains '=' and '&', which are not valid inside an unquoted HTML attribute value,
#so the attributes are quoted and the URL is HTML-escaped (the browser decodes '&amp;' back to '&').
iframe = '<iframe src="' + html.escape(url, quote=True) + '" width="960" height="569"></iframe>'
IPython.display.HTML(iframe)

