#### Imports

In [1]:
from nltk.corpus import treebank
import pickle



In [3]:
# custom written code

from ngrams import add_unk_tokens, training_ngrams, test_ngrams #for laplace model
from laplace_model import laplace_model
from perplexity import perplexity

#### Penn Treebank

In [2]:
# Number of leading corpus files used for training; every remaining file is
# held out for testing (150 train / 49 test on the 199-file NLTK sample).
TRAIN_FILE_COUNT = 150


def load_treebank_sentences(file_ids):
    """Return the sentences of the given treebank files as lists of tokens.

    Tokens that contain '*' are dropped (presumably Penn Treebank trace /
    null-element markers -- TODO confirm against the corpus documentation).

    Parameters
    ----------
    file_ids : iterable of str
        Corpus file identifiers, as returned by ``treebank.fileids()``.

    Returns
    -------
    list of list of str
        One token list per sentence, in corpus order.
    """
    return [
        [token for token in sentence if '*' not in token]
        for file_id in file_ids
        for sentence in treebank.sents(file_id)
    ]


# Fetch the file list once instead of on every loop iteration.
treebank_file_ids = treebank.fileids()  # len(treebank.fileids()) = 199

train_treebank = load_treebank_sentences(treebank_file_ids[:TRAIN_FILE_COUNT])
test_treebank = load_treebank_sentences(treebank_file_ids[TRAIN_FILE_COUNT:])

#### 3-gram language model with Laplace smoothing

In [4]:
# Work on per-sentence copies so the raw token lists in train_treebank /
# test_treebank stay intact for the before/after printouts further down.
train_sentences = [list(tokens) for tokens in train_treebank]
test_sentences = [list(tokens) for tokens in test_treebank]

In [5]:
# Replace rare training tokens with <UNK> (per the original note: tokens that
# appear fewer than 3 times). No threshold is passed here, so the cut-off is
# whatever add_unk_tokens uses internally.
train_sentences = add_unk_tokens(train_sentences)

In [6]:
# Set of distinct tokens occurring in the training sentences; it is handed to
# test_ngrams in the testing phase.
vocabulary = {token for sentence in train_sentences for token in sentence}

In [7]:
# Order of the n-grams extracted in this cell (3 = trigrams).
ngram_order = 3

# 3-grams of the training sentences
train_trigrams = training_ngrams(ngram_order, train_sentences)

# 3-grams of the test sentences - tokens not included in the vocabulary are replaced by <UNK>
test_trigrams = test_ngrams(vocabulary, ngram_order, test_sentences)

In [8]:
# (number of training 3-grams, number of test 3-grams)
len(train_trigrams), len(test_trigrams)

(85951, 14452)

In [9]:
# Example from the first training sentence: the raw tokens, the same sentence
# after <UNK> replacement, and then the first 20 training 3-grams.
for sample_sentence in (train_treebank[0], train_sentences[0]):
    print(sample_sentence, '\n')
print(train_trigrams[:20])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 

['<UNK>', '<UNK>', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 

[['<BOS>', '<BOS>', '<UNK>'], ['<BOS>', '<UNK>', '<UNK>'], ['<UNK>', '<UNK>', ','], ['<UNK>', ',', '61'], [',', '61', 'years'], ['61', 'years', 'old'], ['years', 'old', ','], ['old', ',', 'will'], [',', 'will', 'join'], ['will', 'join', 'the'], ['join', 'the', 'board'], ['the', 'board', 'as'], ['board', 'as', 'a'], ['as', 'a', 'nonexecutive'], ['a', 'nonexecutive', 'director'], ['nonexecutive', 'director', 'Nov.'], ['director', 'Nov.', '29'], ['Nov.', '29', '.'], ['29', '.', '<EOS>'], ['.', '<EOS>', '<EOS>']]


In [10]:
# Example of the 3-grams extracted from the first *test* sentence: the raw
# tokens, the sentence as currently held in test_sentences, and the first 39
# test 3-grams.
# NOTE(review): the recorded output shows <UNK> inside test_sentences[0] even
# though test_sentences is never reassigned in this notebook -- presumably
# test_ngrams replaces out-of-vocabulary tokens in place; confirm in ngrams.
# NOTE(review): the first test 3-gram in the recorded output starts directly
# with sentence tokens (no <BOS> padding), unlike the training 3-grams -- verify
# that this asymmetry is intended.
print(test_treebank[0], '\n')
print(test_sentences[0], '\n')
print(test_trigrams[:39])

['Intelogic', 'Trace', 'Inc.', ',', 'San', 'Antonio', ',', 'Texas', ',', 'said', '0', 'it', 'bought', '2.7', 'million', 'shares', ',', 'or', 'about', '18', '%', ',', 'of', 'its', 'common', 'stock', 'from', 'an', 'unaffiliated', 'shareholder', 'for', '$', '3.625', 'a', 'share', ',', 'or', '$', '9.9', 'million', '.'] 

['<UNK>', '<UNK>', 'Inc.', ',', 'San', '<UNK>', ',', 'Texas', ',', 'said', '0', 'it', 'bought', '<UNK>', 'million', 'shares', ',', 'or', 'about', '18', '%', ',', 'of', 'its', 'common', 'stock', 'from', 'an', '<UNK>', 'shareholder', 'for', '$', '<UNK>', 'a', 'share', ',', 'or', '$', '<UNK>', 'million', '.'] 

[['<UNK>', '<UNK>', 'Inc.'], ['<UNK>', 'Inc.', ','], ['Inc.', ',', 'San'], [',', 'San', '<UNK>'], ['San', '<UNK>', ','], ['<UNK>', ',', 'Texas'], [',', 'Texas', ','], ['Texas', ',', 'said'], [',', 'said', '0'], ['said', '0', 'it'], ['0', 'it', 'bought'], ['it', 'bought', '<UNK>'], ['bought', '<UNK>', 'million'], ['<UNK>', 'million', 'shares'], ['million', 'shares', ','

In [12]:
# Build the Laplace-smoothed model from the training 3-grams. laplace_model
# returns two values, unpacked here; judging by the names, the probabilities
# learned for seen 3-grams and the probability given to unseen ones -- confirm
# in the laplace_model module.
laplace_learned_probs, laplace_unseen_prob = laplace_model(train_trigrams)

In [13]:
# Persist both model outputs, as a two-element list, to laplace_model.pickle
# in the current working directory.
with open('laplace_model.pickle', 'wb') as model_file:
    pickle.dump([laplace_learned_probs, laplace_unseen_prob], model_file)

In [21]:
# Evaluate the Laplace model on the test 3-grams.
# NOTE(review): N is the number of test *sentences* (len(test_treebank)), not
# the number of test 3-grams/tokens. The recorded result (~8e32) is implausibly
# large for a perplexity, which may mean the normalisation should be per
# n-gram, e.g. N = len(test_trigrams) -- confirm against the definition used
# inside perplexity() before changing it.
laplace_perplexity = perplexity(learned_distribution = laplace_learned_probs,
                               unseen_prob = laplace_unseen_prob,
                               ngrams = test_trigrams,
                               N = len(test_treebank))

In [22]:
# Display the perplexity value returned by perplexity() for the test 3-grams.
laplace_perplexity

8.065682401595497e+32