In [217]:
import nltk
from nltk.corpus import brown
from nltk import bigrams, ngrams, trigrams
import re
import numpy as np
from itertools import chain
from collections import Counter
import math


print(brown.sents())
sent = [[y.lower() for y in x] for x in brown.sents()[0:40000]]
sent = [[re.sub('[^A-Za-z]', '', y) for y in x if re.sub('[^A-Za-z]', '', y)!=''] for x in sent]

print(sent[0], len(sent))

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]
['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', 'atlantas', 'recent', 'primary', 'election', 'produced', 'no', 'evidence', 'that', 'any', 'irregularities', 'took', 'place'] 40000


In [218]:
unigrams = list(chain.from_iterable(sent))
unigram_counts=Counter(unigrams)
unigram_counts['_'] = 2*40000
unigram_total=len(unigrams)
for word in unigram_counts:
    unigram_counts[word]/=unigram_total
#print(unigram_counts)

In [219]:
def bigram_model(sentences):
    model={}
    for sent in sentences:
         for w1,w2 in ngrams(sent,2, pad_left=True,pad_right=True):
            if w1 not in model:
                model[w1]={}
            if w2 not in model[w1]:
                model[w1][w2]=0
            model[w1][w2]+=1
    for w1 in model:
        tot_count=float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2]/=tot_count
     
    return model

bigram_counts= bigram_model(sent)
#print(bigram_counts)

In [220]:
def trigram_model(sentences):
    model={}
    for sent in sentences:
         for w1,w2,w3 in ngrams(sent,3, pad_left=True,pad_right=True):
            if (w1,w2) not in model:
                model[(w1,w2)]={}
            if w3 not in model[(w1,w2)]:
                model[(w1,w2)][w3]=0
            model[(w1,w2)][w3]+=1
    for (w1,w2) in model:
        tot_count=float(sum(model[(w1,w2)].values()))
        for w3 in model[(w1,w2)]:
            model[(w1,w2)][w3]/=tot_count
     
    return model

trigram_counts= trigram_model(sent)
#print(trigram_counts)

In [221]:
#unigrams counts

print("TOP 10 UNIGRAMS:")
for key, value in unigram_counts.most_common(10):
    print(key)
    

TOP 10 UNIGRAMS:
_
the
of
and
to
a
in
is
that
for


In [222]:
#bigram counts
print ("TOP 10 BIGRAMS")
bcounts = {}
for key, value in bigram_counts.items():
    s1 = key
    if(s1==None):
        s1='_'
    p = unigram_counts[s1]
    for k, v in value.items():
        s2 = k
        if(s2==None):
            s2='_'
        bcounts[(s1,s2)]= v*p
bitems = [(v, k) for k, v in bcounts.items()]
bitems.sort()
bitems.reverse()
bitems = [(k, v) for v, k in bitems]
for k,v in bitems[0:10]:
    print(k)

TOP 10 BIGRAMS
('_', 'the')
('of', 'the')
('in', 'the')
('_', 'in')
('_', 'it')
('to', 'the')
('_', 'he')
('_', 'this')
('_', 'but')
('_', 'a')


In [223]:
#trigram counts
print ("TOP 10 TRIGRAMS")
tcounts = {}
for key, value in trigram_counts.items():
    s1 = key[0]
    if(s1==None):
        s1='_'
    s2 = key[1]
    if(s2==None):
        s2='_'
    p = bcounts[(s1,s2)]*unigram_counts[s1]
    for k, v in value.items():
        s3 = k
        if(s3==None):
            s3='_'
        tcounts[(s1,s2,s3)] = p*v
titems = [(v, k) for k, v in tcounts.items()]
titems.sort()
titems.reverse()
titems = [(k, v) for v, k in titems]
for k,v in titems[0:10]:
    print(k)

TOP 10 TRIGRAMS
('_', 'it', 'is')
('_', 'in', 'the')
('_', 'it', 'was')
('_', 'this', 'is')
('_', 'there', 'is')
('_', 'he', 'was')
('_', 'on', 'the')
('_', 'but', 'the')
('_', 'there', 'are')
('the', 'united', 'states')


In [238]:

f = open("test_examples.txt",'r')
out = f.readlines() # will append in the list out
lines = []
for line in out:
    lines.append(line.split())

lines = [[y.lower() for y in x] for x in lines]
lines = [[re.sub('[^A-Za-z]', '', y) for y in x if re.sub('[^A-Za-z]', '', y)!=''] for x in lines]
print(lines)
print("\nUNIGRAM model")
for line in lines:
    loghd1 = 2*math.log(unigram_counts['_'],2)
    try:
        for w in line:
            loghd1 += math.log(unigram_counts[w],2)
        print(loghd1,"\t" ,2**(-loghd1/len(line)))
    except KeyError:
        print("Unigram Not found")

print("\nBIGRAM model")
for line in lines:
    line.insert(0, '_')
    line.insert(len(line),'_')
for line in lines:
    loghd = math.log(unigram_counts['_'],2)
    try:
        for i in range(len(line)-1):
            loghd += math.log(bcounts[(line[i], line[i+1])])
        print(loghd,"\t" ,2**(-loghd/len(line)))
    except KeyError:
        print("Bigram Not found")
        
print("\nTRIGRAM model")
for line in lines:
    line.insert(0, '_')
    line.insert(len(line),'_')
for line in lines:
    loghd = 2*math.log(unigram_counts['_'],2)
    try:
        for i in range(len(line)-2):
            loghd += math.log(tcounts[(line[i], line[i+1], line[i+2])])
        print(loghd,"\t" ,2**(-loghd/len(line)))
    except KeyError:
        print("Trigram Not found")

[['he', 'lived', 'a', 'good', 'life'], ['the', 'man', 'was', 'happy'], ['the', 'person', 'was', 'good'], ['the', 'girl', 'was', 'sad'], ['he', 'won', 'the', 'war']]

UNIGRAM model
-53.72305858992935 	 1715.7358223779738
-41.15377306182752 	 1250.6334248191617
-40.1854180524399 	 1057.4359118874631
-45.825409974027565 	 2809.996257215713
-42.087205551464166 	 1470.204765618504

BIGRAM model
-65.00823474424979 	 624.644126176937
-51.4958293385688 	 383.38184854556533
-53.65179779938966 	 491.8130818840422
Bigram Not found
-51.08920948382118 	 365.7890929215067

TRIGRAM model
Trigram Not found
Trigram Not found
Trigram Not found
Trigram Not found
-106.09974288697859 	 9826.540369216864
