In [33]:
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
import re

# import and preprocess the corpus
def preprocess(filename):
    output = ""
    with open(filename, "r", encoding="unicode-escape") as corpus:
        sentence = corpus.read().split('\n')
        for line in sentence:
            output += " " + line
    return output
            
text = preprocess("corpus-2.2.txt")

# remove punctuation
text = re.sub(r'[^\w\s]', '', text)

# make sure all spacing is one white space
text = ' '.join(text.split())

['Musharrafs',
 'Last',
 'Act',
 'General',
 'Musharraf',
 'appeared',
 'on',
 'the',
 'national',
 'scene',
 'on',
 'October',
 '12',
 '1999',
 'when',
 'he',
 'ousted',
 'an',
 'elected',
 'government',
 'and',
 'announced',
 'an',
 'ambitious',
 'nationbuilding',
 'project',
 'Many',
 'Pakistanis',
 'disillusioned',
 'with',
 'Pakistans',
 'political',
 'class',
 'remained',
 'mute',
 'thinking',
 'that',
 'he',
 'might',
 'deliver',
 'The',
 'September',
 '11',
 '2001',
 'terrorist',
 'attacks',
 'on',
 'America',
 'brought',
 'Musharraf',
 'into',
 'the',
 'international',
 'limelight',
 'as',
 'he',
 'agreed',
 'to',
 'ditch',
 'the',
 'Taliban',
 'and',
 'support',
 'the',
 'United',
 'Statesled',
 'war',
 'on',
 'terror',
 'Musharraf',
 'clamped',
 'down',
 'on',
 'some',
 'religious',
 'militants',
 'operating',
 'inside',
 'Pakistan',
 'and',
 'also',
 'on',
 'those',
 'fighting',
 'Indian',
 'forces',
 'in',
 'Kashmir',
 'As',
 'a',
 'result',
 'Pakistan',
 'was',
 'rewarded

In [34]:
from nltk.util import ngrams


# get ngrams
def get_ngrams(corpus, n):
    ngram = ngrams(corpus.split(), n)
    return [i for i in ngram]
# get ngram frequencies as a dictionary
def ngram_freq(ngram_list):
    freq = {}
    for ngram in ngram_list:
        if ngram in freq:
            freq[ngram] += 1
        else:
            freq[ngram] = 1
    return freq

# get a list of bigrams
bigram_list = get_ngrams(text, 2)

# get a list of trigrams
trigram_list = get_ngrams(text, 3)

bigram_freq = ngram_freq(bigram_list)
trigram_freq = ngram_freq(trigram_list)

In [35]:
# regular probability
vocabulary = len(trigram_list)
probs={}
for trigram_value, trigram_count in trigram_freq.items():
    probs[trigram_value] = trigram_count / bigram_freq[trigram_value[:-1]]

{('Musharrafs', 'Last', 'Act'): 1.0,
 ('Last', 'Act', 'General'): 1.0,
 ('Act', 'General', 'Musharraf'): 1.0,
 ('General', 'Musharraf', 'appeared'): 0.1111111111111111,
 ('Musharraf', 'appeared', 'on'): 1.0,
 ('appeared', 'on', 'the'): 0.4,
 ('on', 'the', 'national'): 0.0016366612111292963,
 ('the', 'national', 'scene'): 0.005952380952380952,
 ('national', 'scene', 'on'): 1.0,
 ('scene', 'on', 'October'): 1.0,
 ('on', 'October', '12'): 0.125,
 ('October', '12', '1999'): 0.5,
 ('12', '1999', 'when'): 1.0,
 ('1999', 'when', 'he'): 0.14285714285714285,
 ('when', 'he', 'ousted'): 0.007575757575757576,
 ('he', 'ousted', 'an'): 0.5,
 ('ousted', 'an', 'elected'): 1.0,
 ('an', 'elected', 'government'): 0.21428571428571427,
 ('elected', 'government', 'and'): 0.10526315789473684,
 ('government', 'and', 'announced'): 0.005952380952380952,
 ('and', 'announced', 'an'): 0.14285714285714285,
 ('announced', 'an', 'ambitious'): 0.16666666666666666,
 ('an', 'ambitious', 'nationbuilding'): 0.045454545454

In [41]:
# plus one smoothing
add_one_probs={}
for trigram_value, trigram_count in trigram_freq.items():
    add_one_probs[trigram_value] = (trigram_count + 1) / (bigram_freq[trigram_value[:-1]] + vocabulary)

In [44]:
import numpy as np
# prob of sentence
def sentence_prob(sentence):
    sentence_trigrams = get_ngrams(sentence, 3)
    prob_list = []
    for i in sentence_trigrams:
        prob_list.append(add_one_probs[i])
    return np.prod(prob_list)

sentence_prob('this is a test')

5.515789681589914e-11

In [55]:
# calculate perplexity
def get_perp(sentence):
    sentence_trigrams = get_ngrams(sentence, 3)
    log_list = []
    for i in sentence_trigrams:
        try:
            log_list.append(np.log2(add_one_probs[i]))
        except KeyError:
            log_list.append(0)
            continue
    return (sum(log_list))/len(sentence_trigrams)

In [56]:
from nltk.tokenize import sent_tokenize

test = preprocess("test-2.2.txt")

tokenized_test = sent_tokenize(test)

for i in tokenized_test:
    print(get_perp(i))

-4.559848682138508
-5.978471917084863
-7.3762999337096
-4.937548132239096
0.0
-0.9822961708788477
-5.499176778368703
-6.097299204716596
-4.576311455084519
-2.9964734515740092
0.0
-2.5259044394027512
-2.1575882138299773
-5.765669144827017
-2.9291074440008846
-9.093461585706747
-7.002931376744497
-3.709058527178596
-5.199403235134223
-9.662193234671136
-4.007537635800398
0.0
-3.884122296263707
-5.362501404848392
-13.74246984446396
