In [1]:
import nltk;
import string;
import math;
import csv;

# Collocations

*Benjamin Bray*

(this notebook requires [NLTK](http://www.nltk.org/))

### Text Preprocessing

First, we preprocess the text document by
- converting to lowercase
- removing punctuation
- counting unigrams and bigrams
- saving unigram and bigram counts to a file

In [2]:
# read text file
text_path = "data/crime-and-punishment.txt";
with open(text_path) as f:
    text_raw = f.read().lower();

# remove punctuation
translate_table = dict((ord(char), None) for char in string.punctuation);
text_raw = text_raw.translate(translate_table);

# tokenize
tokens = nltk.word_tokenize(text_raw);
bigrams = nltk.bigrams(tokens);

# unigram/bigram frequencies
unigram_counts = nltk.FreqDist(tokens);
bigram_counts = nltk.FreqDist(bigrams);

# write to file
unigram_path = text_path + ".unigrams";
bigram_path = text_path + ".bigrams";

with open(unigram_path, "w") as f:
    writer = csv.writer(f);
    filtered = [ (w,c) for w,c in unigram_counts.items() if c > 1];
    writer.writerows(filtered);
    
with open(bigram_path, "w") as f:
    writer = csv.writer(f);
    filtered = [ (b[0], b[1],c) for b,c in bigram_counts.items() if c > 3];
    writer.writerows(filtered);

### Most Common Words & Phrases

Here are the top few most common words:

In [3]:
unigram_counts.most_common(20)

[('the', 7807),
 ('and', 6902),
 ('to', 5266),
 ('he', 4657),
 ('a', 4568),
 ('i', 3939),
 ('you', 3807),
 ('of', 3806),
 ('in', 3188),
 ('it', 2973),
 ('that', 2913),
 ('was', 2820),
 ('his', 2115),
 ('at', 2064),
 ('her', 1823),
 ('but', 1780),
 ('not', 1778),
 ('with', 1706),
 ('for', 1648),
 ('she', 1628)]

Below are the most commmon word pairs.  These aren't collocations!

In [4]:
bigram_counts.most_common(20)

[(('in', 'the'), 778),
 (('of', 'the'), 598),
 (('he', 'was'), 505),
 (('he', 'had'), 498),
 (('to', 'the'), 488),
 (('on', 'the'), 479),
 (('i', 'am'), 460),
 (('at', 'the'), 459),
 (('it', 'was'), 413),
 (('that', 'he'), 335),
 (('you', 'are'), 326),
 (('to', 'be'), 308),
 (('in', 'a'), 307),
 (('do', 'you'), 292),
 (('with', 'a'), 264),
 (('did', 'not'), 256),
 (('was', 'a'), 249),
 (('for', 'the'), 246),
 (('at', 'once'), 244),
 (('and', 'he'), 241)]

### Collocations

To find collocations, we sort pairs of words by their **pointwise mutual information**,
$$
\mathrm{pmi}(x;y) = \log \frac{p(x,y)}{p(x)p(y)}
$$

In [5]:
# compute pmi
pmi_bigrams = [];

for bigram,_ in bigram_counts.most_common(1000):
    w1, w2 = bigram;
    
    # compute pmi
    actual = bigram_counts[bigram];
    expected = unigram_counts[w1] * unigram_counts[w2];
    pmi = math.log( actual / expected );
    
    pmi_bigrams.append( (w1, w2, pmi) );

# sort pmi
pmi_sorted = sorted(pmi_bigrams, key=lambda x: x[2], reverse=True);

Here are the top 30 collocations according to PMI:

In [6]:
pmi_sorted[:30]

[('nikodim', 'fomitch', -3.1780538303479458),
 ('andrey', 'semyonovitch', -3.1780538303479458),
 ('dmitri', 'prokofitch', -3.871201010907891),
 ('sofya', 'semyonovna', -4.330733340286331),
 ('marfa', 'petrovna', -4.37158498596076),
 ('rodion', 'romanovitch', -4.574710978503383),
 ('avdotya', 'romanovna', -4.74493212836325),
 ('pulcheria', 'alexandrovna', -4.820281565605037),
 ('great', 'deal', -5.2805000013568755),
 ('good', 'heavens', -5.509550266836412),
 ('katerina', 'ivanovnas', -5.569434422125123),
 ('ilya', 'petrovitch', -5.636573724962751),
 ('pyotr', 'petrovitch', -5.665343459987014),
 ('katerina', 'ivanovna', -5.731418449229828),
 ('amalia', 'ivanovna', -5.87493073085203),
 ('make', 'haste', -5.961996125397379),
 ('each', 'other', -6.009777970852694),
 ('head', 'clerk', -6.170751200206783),
 ('old', 'woman', -6.17264389932316),
 ('any', 'case', -6.267612970702526),
 ('sat', 'down', -6.283876793935164),
 ('long', 'ago', -6.322955453378991),
 ('sit', 'down', -6.343232735746065),

Just for fun, here are the bottom 30 collocations according to PMI.  These are the word pairs that occur together **less frequently** than expected:

In [7]:
pmi_sorted[-30:]

[('was', 'that', -12.663334382870111),
 ('you', 'that', -12.666187451852519),
 ('and', 'for', -12.691536152637951),
 ('her', 'he', -12.696268979639559),
 ('have', 'the', -12.711844297245333),
 ('had', 'he', -12.768049453586553),
 ('you', 'to', -12.770571828604202),
 ('it', 'you', -12.776187676580868),
 ('as', 'the', -12.792253213656782),
 ('you', 'and', -12.80749714102439),
 ('it', 'it', -12.903611192637879),
 ('but', 'to', -12.917900767413236),
 ('and', 'it', -12.94507302854542),
 ('that', 'in', -12.953045343511224),
 ('you', 'you', -12.962832988148833),
 ('to', 'a', -12.98852390615963),
 ('had', 'the', -13.071124656426408),
 ('that', 'a', -13.071564998120076),
 ('you', 'in', -13.092869971424403),
 ('to', 'it', -13.132365958569206),
 ('to', 'that', -13.178669267029216),
 ('it', 'a', -13.205281749465744),
 ('you', 'he', -13.224987596565148),
 ('and', 'to', -13.331055399808484),
 ('i', 'to', -13.35120094156068),
 ('that', 'and', -13.597629435749182),
 ('you', 'a', -13.62690506999492),
 

# Reading from CSV

Here I'm just testing out reading from the CSV files I created:

In [8]:
unigram_path = "data/crime-and-punishment.txt.unigrams";
bigram_path = "data/crime-and-punishment.txt.bigrams";

with open(unigram_path) as f:
    reader = csv.reader(f);
    unigrams = { row[0] : int(row[1]) for row in csv.reader(f)}
    
with open(bigram_path) as f:
    reader = csv.reader(f);
    bigrams = { (row[0],row[1]) : int(row[2]) for row in csv.reader(f)}

In [9]:
bigrams

{('to', 'hell'): 5,
 ('room', 'the'): 13,
 ('people', 'to'): 4,
 ('the', 'assistant'): 10,
 ('so', 'soon'): 5,
 ('get', 'out'): 5,
 ('and', 'seemed'): 8,
 ('the', 'murder'): 24,
 ('gazed', 'at'): 20,
 ('stairs', 'to'): 5,
 ('discuss', 'it'): 4,
 ('me', 'all'): 5,
 ('an', 'axe'): 12,
 ('to', 'treat'): 4,
 ('in', 'very'): 7,
 ('i', 'look'): 9,
 ('why', 'who'): 4,
 ('now', 'not'): 4,
 ('a', 'time'): 26,
 ('among', 'them'): 9,
 ('direction', 'of'): 9,
 ('and', 'never'): 4,
 ('last', 'week'): 6,
 ('the', 'palais'): 6,
 ('he', 'turned'): 47,
 ('known', 'it'): 4,
 ('on', 'her'): 46,
 ('what', 'made'): 12,
 ('and', 'out'): 4,
 ('and', 'for'): 35,
 ('to', 'zossimov'): 5,
 ('him', 'but'): 41,
 ('why', 'he'): 20,
 ('a', 'cup'): 7,
 ('wont', 'have'): 5,
 ('her', 'work'): 4,
 ('that', 'minute'): 7,
 ('go', 'on'): 32,
 ('her', 'voice'): 10,
 ('possibility', 'of'): 7,
 ('so', 'isnt'): 5,
 ('your', 'eyes'): 5,
 ('asked', 'in'): 10,
 ('why', 'is'): 14,
 ('for', 'half'): 8,
 ('knew', 'it'): 10,
 ('on', 