# Week 3: N-gram language models

In [1]:
from nltk.corpus import brown
import random
import math
import pandas as pd
from collections import Counter
import numpy as np
eps = np.finfo(float).eps
random.seed(123)

The Brown Corpus comes preprocessed via word tokenization.

In [2]:
dataset = brown.words()
len(dataset)

1161192

For the purpose of experimentation, let's create a train/test split of the dataset.

In [3]:
train_data = dataset[:1000000]
test_data = dataset[1000000:]

### Train uni-gram language model

Let's now start by implementing a bag-of-words, or our unigram model.

In [5]:
def get_unigram_vocabulary(dataset):
    types = list(set(dataset))
    return types

def unigram_lm(sequence_tokens, vocabulary):
    BoW = {t: 0 for t in vocabulary}
    counts = dict(Counter(sequence_tokens))
    total = sum(counts.values())
    for token in BoW:
        if token in counts:
            BoW[token] = counts[token]/total + eps
        else:
            BoW[token] = eps
    return BoW

Let's fit our unigram model to our dataset!

In [6]:
brown_unigrams = unigram_lm(train_data, get_unigram_vocabulary(dataset))

In [8]:
brown_unigrams["horse"]

7.400000000022204e-05

### Train an bi-gram language model

Now let's write a function that returns a bigram model. The first step is a function that returns the set of possible bigrams in our dataset.

In [9]:
def get_bigram_vocabulary(dataset):
    bigram_types = []
    pad_token = "[PAD]"
    ## TO DO
    prev_token = pad_token
    for token in dataset:
        bigram_types.append((prev_token, token))
        prev_token = token
    bigram_types = list(set(bigram_types))
    ##
    return bigram_types

In [10]:
bigrams = get_bigram_vocabulary(dataset)
len(bigrams)

455268

Now that we have a way to get the set of bigram types lets write the bigram model (don't forget to implement smoothing by adding eps to all probability values):

In [13]:
def bigram_lm(train_data, dataset):
    bigrams = get_bigram_vocabulary(dataset)
    unigrams = get_unigram_vocabulary(dataset)+["[PAD]"]
    bigram_counts = {t: 0 for t in bigrams}
    unigram_counts = {t: 0 for t in unigrams}
    unigram_counts["[PAD]"] = 1
    bigram_probs = dict()
    ## TO DO
    sequence_tokens = ["[PAD]"] + train_data
    for i in range(len(sequence_tokens)-1):
        bigram = (sequence_tokens[i], sequence_tokens[i+1])
        bigram_counts[bigram] += 1
        unigram_counts[sequence_tokens[i]] +=1
    for bigram in bigram_counts:
        if bigram_counts[bigram] > 0:
            bigram_probs[bigram] = bigram_counts[bigram]/unigram_counts[bigram[0]] + eps
        else:
            bigram_probs[bigram] = eps
    ##
    return bigram_probs

Let's fit a bigram model to the brown corpus.

In [14]:
brown_bigrams = bigram_lm(train_data, dataset)

### Train a tri-gram language model

Lets now repeat these steps but for a trigram model.

In [18]:
def get_trigram_vocabulary(dataset):
    trigram_types = []
    pad_token = "[PAD]"
    ## TO DO
    prev_token_1 = pad_token
    prev_token_2 = pad_token
    for token in dataset:
        trigram_types.append((prev_token_2, prev_token_1, token))
        prev_token_2 = prev_token_1
        prev_token_1 = token
    trigram_types = list(set(trigram_types))
    ##
    return trigram_types

In [19]:
def trigram_lm(train_data, dataset):
    trigrams = get_trigram_vocabulary(dataset)
    bigrams = get_bigram_vocabulary(dataset)+[("[PAD]","[PAD]")]
    trigram_counts = {t: 0 for t in trigrams}
    bigram_counts = {t: 0 for t in bigrams}
    trigram_probs = dict()
    ## TO DO
    sequence_tokens = ["[PAD]", "[PAD]"] + train_data
    for i in range(len(sequence_tokens)-2):
        trigram = (sequence_tokens[i], sequence_tokens[i+1], sequence_tokens[i+2])
        bigram = (sequence_tokens[i], sequence_tokens[i+1])
        trigram_counts[trigram] += 1
        bigram_counts[bigram] += 1
    for trigram in trigram_counts:
        if trigram_counts[trigram] > 0:
            trigram_probs[trigram] = trigram_counts[trigram]/bigram_counts[(trigram[0],trigram[1])] + eps
        else:
            trigram_probs[trigram] = eps
    ##
    return trigram_probs

Let's fit a trigram model to the brown corpus.

In [20]:
brown_trigrams = trigram_lm(train_data, dataset)

### Compare the perplexity of the test data of each model

Which of these models performs best at representing the test data distribution? Write a function that takes a fitted ngram model and a test dataset and returns the perplexity of that dataset. 

Since the probability of the test data is the product of the probabilities of the ngrams which compose it, it is a very small number and we risk running into a floating-point error when trying to compute it. Thus, we should calculate perplexity in log base 2 space. Here is the formula.

$PP(W) = 2^{-\frac{1}{n}\log P(W)}$

In [21]:
def get_perplexity(ngram_lm, test_data):
    if type(list(ngram_lm.keys())[0]) is tuple :
        ngram_size = len(list(ngram_lm.keys())[0])
    else:
        ngram_size = 1
    perplexity = 0.0
    n = len(test_data)+(ngram_size-1)
    ## TO DO
    cum_prob = 0.0
    for i in range(len(test_data)-(ngram_size+1)):
        if ngram_size > 1: 
            ngram = tuple(test_data[i:(i+ngram_size)])
        else:
            ngram = test_data[i]
        prob = ngram_lm[ngram]
        cum_prob = cum_prob + math.log(prob)
    perplexity = 2**((-1/n)*cum_prob)
    ##
    return perplexity

In [22]:
get_perplexity(brown_trigrams, train_data)

3.4270349833895764

In [23]:
def compare_perplexity_scores(models, dataset):
    results = [get_perplexity(lm, dataset) for lm in models]
    return results

models = [brown_unigrams, brown_bigrams, brown_trigrams]
train_perplexity = compare_perplexity_scores(models, train_data)
test_perplexity = compare_perplexity_scores(models, test_data)

results = {'models':['unigram','bigram','trigram'],
 'train_perplexity':train_perplexity,
 'test_perplexity':test_perplexity}

df_results = pd.DataFrame(data=results)
df_results

Unnamed: 0,models,train_perplexity,test_perplexity
0,unigram,144.94642,255.1061
1,bigram,20.556295,21667.69
2,trigram,3.427035,119037900.0


What do you notice about these results? Why might that be?