<a href="https://colab.research.google.com/github/vggls/language_models/blob/main/notebooks/3_gram_model_with_Laplace_smoothing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Imports

In [None]:
import pickle
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import math
#import string #string.punctuation contains punctuation symbols

In [None]:
# When running on Google Colab, run this cell as well: it downloads the NLTK
# 'treebank' corpus package that the rest of the notebook reads from.
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [None]:
from nltk.corpus import treebank

#### Dataset and Preprocessing

In [None]:
# project-local helper modules (preprocessing.py and laplace_model.py must be on the import path)
from preprocessing import lower, add_unk_tokens_for_training, replace_with_unk_for_testing, create_ngrams
from laplace_model import count_n_grams, laplace_model, perplexity_ngram_model

In [None]:
# Penn Treebank sample from NLTK, split by file position: the first
# N_TRAIN_FILES files are the training split, the following files up to
# N_TOTAL_FILES are the test split.
symbols_to_remove = set(['-LRB-', '-RRB-', '-LSB-', '-RSB-', '-LCB-', '-RCB-']) # bracket placeholder tokens
N_TRAIN_FILES = 175
N_TOTAL_FILES = 199


def clean_sentence(sentence):
    """Return `sentence` without the tokens the model should never see.

    A token is dropped when it contains '*', contains the escaped slash
    sequence (backslash followed by '/'), or is one of `symbols_to_remove`.
    The same rule is applied to BOTH splits; previously the test split kept
    escaped-slash tokens that had been removed from training.
    """
    return [
        token
        for token in sentence
        # '\\/' is the two-character string backslash + '/'. The original
        # spelling '\/' has the same value but is an invalid escape sequence
        # that triggers a warning on recent Python versions.
        if '*' not in token and '\\/' not in token and token not in symbols_to_remove
    ]


def load_split(file_ids):
    """Return the cleaned sentences of every corpus file in `file_ids`, in order."""
    return [
        clean_sentence(sentence)
        for file_id in file_ids
        for sentence in treebank.sents(file_id)
    ]


# Fetch the file list once instead of on every loop iteration.
treebank_file_ids = treebank.fileids()

train_treebank = load_split(treebank_file_ids[:N_TRAIN_FILES])
test_treebank = load_split(treebank_file_ids[N_TRAIN_FILES:N_TOTAL_FILES])

# Sentence counts are unaffected by token filtering, but token-level outputs
# recorded further down predate the unified test filter; re-run to refresh them.
len(train_treebank), len(test_treebank)

(3576, 338)

In [None]:
# Run the project's `lower` normalisation over both splits (train first, then
# test) so they share the same casing convention. The original note describes
# it as lowering the first letter of each token.
train_tokenized_sentences, test_tokenized_sentences = (
    lower(split) for split in (train_treebank, test_treebank)
)

In [None]:
# Insert the <unk> token into the training data.
# Per the original note, every token that appears fewer than 3 times is
# replaced with <unk>; the threshold lives inside the helper, which is not
# shown in this notebook.
# NOTE: this rebinds train_tokenized_sentences, so every later cell sees the
# <unk>-substituted sentences rather than the merely lowercased ones.
train_tokenized_sentences = add_unk_tokens_for_training(train_tokenized_sentences)

In [None]:
# Vocabulary: the distinct tokens left in the training sentences once the
# <unk> substitution has been applied.
vocabulary = {token for sentence in train_tokenized_sentences for token in sentence}
len(vocabulary)

3466

In [None]:
# Which special tokens are already part of the vocabulary at this point?
# Shown in the order <unk>, <bos>, <eos>.
tuple(special in vocabulary for special in ('<unk>', '<bos>', '<eos>'))

(True, False, False)

In [None]:
# Insert the <unk> token into the test data, using the training vocabulary as
# the reference (presumably any test token missing from `vocabulary` becomes
# <unk>; confirm against preprocessing.replace_with_unk_for_testing).
# NOTE: this rebinds test_tokenized_sentences for all later cells.
test_tokenized_sentences = replace_with_unk_for_testing(vocabulary, test_tokenized_sentences)

In [None]:
# `create_ngrams` adds the <bos> / <eos> boundary tokens and returns the
# n-grams. Training needs both orders (2-grams and 3-grams); the test split is
# only evaluated on 3-grams.
train_bigrams, train_trigrams = (
    create_ngrams(order, train_tokenized_sentences) for order in (2, 3)
)
test_trigrams = create_ngrams(3, test_tokenized_sentences)

tuple(len(grams) for grams in (train_bigrams, train_trigrams, test_trigrams))

(90375, 93951, 8663)

In [None]:
# Walk through the first training sentence: raw tokens, preprocessed tokens,
# then its 2-grams and 3-grams. The sentence has 18 tokens, so with boundary
# padding it yields 19 bigrams and 20 trigrams, hence the slice lengths.
for preview in (train_treebank[0], train_tokenized_sentences[0], train_bigrams[:19]):
    print(preview, '\n')
print(train_trigrams[:20])

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 

['<unk>', '<unk>', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.'] 

[['<bos>', '<unk>'], ['<unk>', '<unk>'], ['<unk>', ','], [',', '61'], ['61', 'years'], ['years', 'old'], ['old', ','], [',', 'will'], ['will', 'join'], ['join', 'the'], ['the', 'board'], ['board', 'as'], ['as', 'a'], ['a', 'nonexecutive'], ['nonexecutive', 'director'], ['director', 'nov.'], ['nov.', '29'], ['29', '.'], ['.', '<eos>']] 

[['<bos>', '<bos>', '<unk>'], ['<bos>', '<unk>', '<unk>'], ['<unk>', '<unk>', ','], ['<unk>', ',', '61'], [',', '61', 'years'], ['61', 'years', 'old'], ['years', 'old', ','], ['old', ',', 'will'], [',', 'will', 'join'], ['will', 'join', 'the'], ['join', 'the', 'board'], ['the', 'board', 'as'], ['board', 'as', 'a'], ['as', 'a', 'nonexecutive'], ['a', 'nonexecutive', 'dire

In [None]:
# Walk through the first test sentence: raw tokens, preprocessed tokens, then
# its 3-grams. The sentence has 29 tokens, so with boundary padding it yields
# 31 trigrams, hence the slice length.
for preview in (test_treebank[0], test_tokenized_sentences[0]):
    print(preview, '\n')
print(test_trigrams[:31])

['Xerox', 'Corp.', 'has', 'told', 'employees', 'in', 'its', 'Crum', '&', 'Forster', 'personal', 'insurance', 'operations', 'that', 'it', 'is', 'laying', 'off', 'about', '300', 'people', ',', 'or', '25', '%', 'of', 'the', 'staff', '.'] 

['<unk>', 'corp.', 'has', 'told', 'employees', 'in', 'its', '<unk>', '&', '<unk>', 'personal', 'insurance', 'operations', 'that', 'it', 'is', '<unk>', 'off', 'about', '300', 'people', ',', 'or', '25', '%', 'of', 'the', 'staff', '.'] 

[['<bos>', '<bos>', '<unk>'], ['<bos>', '<unk>', 'corp.'], ['<unk>', 'corp.', 'has'], ['corp.', 'has', 'told'], ['has', 'told', 'employees'], ['told', 'employees', 'in'], ['employees', 'in', 'its'], ['in', 'its', '<unk>'], ['its', '<unk>', '&'], ['<unk>', '&', '<unk>'], ['&', '<unk>', 'personal'], ['<unk>', 'personal', 'insurance'], ['personal', 'insurance', 'operations'], ['insurance', 'operations', 'that'], ['operations', 'that', 'it'], ['that', 'it', 'is'], ['it', 'is', '<unk>'], ['is', '<unk>', 'off'], ['<unk>', 'off',

#### Model

In [None]:
# Frequency tables for the training 2-grams and 3-grams, computed in that order.
bigrams_counts, trigrams_counts = (
    count_n_grams(grams) for grams in (train_bigrams, train_trigrams)
)

In [None]:
# Persist both count tables in one pickle file, stored as the two-element
# list [bigram counts, trigram counts].
with open('ngrams_counts.pickle', 'wb') as pickle_file:
    pickle.dump([bigrams_counts, trigrams_counts], pickle_file)

#### Perplexity

In [None]:
# Perplexity of the trigram model on the test trigrams (presumably with Laplace
# smoothing, going by the laplace_model module name). The bigram counts are
# passed as the (n-1)-gram context counts and the trigram counts as the n-gram counts.
# NOTE(review): `vocabulary` does not contain '<eos>' (see the membership check
# earlier in the notebook), yet '<eos>' is a token the model has to predict at
# the end of every sentence. Confirm whether perplexity_ngram_model accounts
# for it internally or whether vocab_size should be len(vocabulary) + 1.
perplexity_ngram_model(nminus1_grams_counts=bigrams_counts,
                       n_grams_counts=trigrams_counts,
                       test_n_grams=test_trigrams,
                       vocab_size=len(vocabulary))

1082.933692249023