In [1]:
from collections import defaultdict
import numpy as np

In [34]:
def fancySplit(sents):
    """Tokenize each sentence on whitespace, wrapped in boundary markers.

    Every sentence string gets "SENT " prepended and " *" appended before it
    is split, so each returned token list starts with 'SENT' and ends with '*'.

    Args:
        sents: iterable of sentence strings.

    Returns:
        A list with one token list per input sentence.
    """
    tokenized = []
    for sentence in sents:
        padded = "SENT " + sentence + " *"
        tokenized.append(padded.split())
    return tokenized

def words(sents):
    """Turn every token of every sentence into a 1-tuple (unigram).

    Args:
        sents: iterable of token sequences.

    Returns:
        A list of lists; each inner list holds one `(token,)` tuple per token.
    """
    unigram_sents = []
    for sent in sents:
        unigram_sents.append([(tok,) for tok in sent])
    return unigram_sents

def bigrams(sents):
    """Build the adjacent token pairs of each sentence.

    Args:
        sents: iterable of token lists.

    Returns:
        A list of lists; each inner list holds the `(token_i, token_i+1)`
        tuples of one sentence (empty when the sentence has fewer than two
        tokens).
    """
    # Pairing a sentence with itself shifted by one yields consecutive pairs;
    # zip stops at the shorter sequence, so no index arithmetic is needed.
    return [list(zip(sent, sent[1:])) for sent in sents]

def trigrams(sents):
    """Build the consecutive token triples of each sentence.

    Args:
        sents: iterable of token lists.

    Returns:
        A list of lists; each inner list holds the
        `(token_i, token_i+1, token_i+2)` tuples of one sentence (empty when
        the sentence has fewer than three tokens).
    """
    # Zipping the sentence with its one- and two-step shifts yields every
    # window of three consecutive tokens.
    return [list(zip(sent, sent[1:], sent[2:])) for sent in sents]

def getTokens(sents):
    """Tokenize sentences and build their unigrams, bigrams, and trigrams.

    The n-grams are built with a local helper rather than through the
    module-level names `words`, `bigrams`, and `trigrams`. Those names are
    looked up at call time, so rebinding them to data (as happens when this
    function's result is unpacked into variables of the same names) would make
    any later call fail with "'list' object is not callable".

    Args:
        sents: iterable of sentence strings, passed to `fancySplit`.

    Returns:
        A tuple `(w, b, t)` of per-sentence lists of 1-tuples, 2-tuples, and
        3-tuples of tokens, respectively.
    """
    def _ngrams(tokens, n):
        # All length-n windows over one token list, as tuples; the range is
        # empty when the list has fewer than n tokens.
        return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

    # Tokenize the input sentences
    print("Tokenizing sentences...")
    toks = fancySplit(sents)

    # Create words, bigrams, and trigrams
    print("Creating unigrams, bigrams, and trigrams...")
    w = [_ngrams(sent, 1) for sent in toks]
    b = [_ngrams(sent, 2) for sent in toks]
    t = [_ngrams(sent, 3) for sent in toks]

    return (w, b, t)

def getCounts(toks):
    """Count how many times each item occurs in `toks`.

    Args:
        toks: iterable of hashable items.

    Returns:
        A `defaultdict(int)` mapping each distinct item to its number of
        occurrences.
    """
    tally = defaultdict(int)
    for item in toks:
        tally[item] = tally[item] + 1
    return tally

In [35]:
# Read the sample corpus; each line of the file is treated as one sentence.
with open('data/Sample1.txt', 'r') as ifile:
    corpus = list(ifile)

# NOTE: these names rebind the helper functions words/bigrams/trigrams defined
# above; after this cell they refer to the per-sentence n-gram lists instead.
words, bigrams, trigrams = getTokens(corpus)

Tokenizing sentences...
Creating unigrams, bigrams, and trigrams...


In [44]:
# Flatten the per-sentence unigram lists into one corpus-wide list, then
# count each unigram and record the total number of tokens.
flatWords = []
for sublist in words:
    flatWords.extend(sublist)
totalWords = len(flatWords)
unigram_counts = getCounts(flatWords)

In [62]:
# Inspect the first 50 flattened unigram tuples (boundary markers included).
flatWords[:50]

[('SENT',),
 ('The',),
 ('Fulton',),
 ('County',),
 ('Grand',),
 ('Jury',),
 ('said',),
 ('Friday',),
 ('an',),
 ('investigation',),
 ('of',),
 ("Atlanta's",),
 ('recent',),
 ('primary',),
 ('election',),
 ('produced',),
 ('``',),
 ('no',),
 ('evidence',),
 ("''",),
 ('that',),
 ('any',),
 ('irregularities',),
 ('took',),
 ('place',),
 ('.',),
 ('*',),
 ('SENT',),
 ('The',),
 ('jury',),
 ('further',),
 ('said',),
 ('in',),
 ('term-end',),
 ('presentments',),
 ('that',),
 ('the',),
 ('City',),
 ('Executive',),
 ('Committee',),
 (',',),
 ('which',),
 ('had',),
 ('over-all',),
 ('charge',),
 ('of',),
 ('the',),
 ('election',),
 (',',),
 ('``',)]

In [46]:
# Count of the 'SENT' start marker; fancySplit prepends one per input sentence.
unigram_counts[('SENT',)]

5000

In [47]:
# Total number of tokens in the corpus, including the 'SENT' and '*' markers.
totalWords

131102

In [48]:
# Log2 relative frequency of every unigram: log2(count / total tokens).
# dict.items() is used because .iteritems() exists only on Python 2;
# .items() behaves the same here on both Python 2 and Python 3.
unigram_p = {
    unigram: np.log2(float(count) / totalWords)
    for unigram, count in unigram_counts.items()
}

In [50]:
# Log2 relative frequency of the 'SENT' start marker.
unigram_p[('SENT',)]

-4.7126177893568162

In [51]:
# Flatten the per-sentence bigram lists into one corpus-wide list and count
# how often each bigram occurs.
flatBigrams = []
for sublist in bigrams:
    flatBigrams.extend(sublist)
bigram_counts = getCounts(flatBigrams)

In [52]:
# Inspect the first bigram of the corpus.
flatBigrams[0]

('SENT', 'The')

In [53]:
# Number of times 'The' directly follows the 'SENT' start marker.
bigram_counts[('SENT', 'The')]

819

In [54]:
# Log2 conditional estimate for each bigram:
#   log2( count(w1, w2) / count(w1) ).
# dict.items() is used because .iteritems() exists only on Python 2;
# .items() behaves the same here on both Python 2 and Python 3.
bigram_p = {}
for k, v in bigram_counts.items():
    # k[:1] is the 1-tuple of the bigram's first word, i.e. its key in
    # unigram_counts.
    u_Count = unigram_counts[k[:1]]
    bigram_p[k] = np.log2(float(v) / u_Count)

In [55]:
# Log2 of count('SENT', 'The') / count('SENT').
bigram_p[('SENT', 'The')]

-2.6099927379084407

In [56]:
# Flatten the per-sentence trigram lists into one corpus-wide list and count
# how often each trigram occurs.
flatTrigrams = []
for sublist in trigrams:
    flatTrigrams.extend(sublist)
trigram_counts = getCounts(flatTrigrams)

In [57]:
# Inspect the first trigram of the corpus.
flatTrigrams[0]

('SENT', 'The', 'Fulton')

In [58]:
# Number of occurrences of the trigram ('SENT', 'The', 'Fulton').
trigram_counts[('SENT','The','Fulton')]

1

In [59]:
# Log2 conditional estimate for each trigram:
#   log2( count(w1, w2, w3) / count(w1, w2) ).
# dict.items() is used because .iteritems() exists only on Python 2;
# .items() behaves the same here on both Python 2 and Python 3.
trigram_p = {}
for k, v in trigram_counts.items():
    # k[:2] is the trigram's leading word pair, i.e. its key in bigram_counts.
    b_Count = bigram_counts[k[:2]]
    trigram_p[k] = np.log2(float(v) / b_Count)

In [60]:
# Log2 of count('SENT', 'The', 'Fulton') / count('SENT', 'The').
trigram_p[('SENT','The','Fulton')]

-9.6777196416410085