# Evaluating an N-Gram Language Model



In [1]:
from n_gram import NGramLM

# Special marker tokens. NOTE(review): the names suggest begin-of-sequence,
# end-of-sequence and out-of-vocabulary markers — confirm against n_gram.
BOS = '<BOS>'
EOS = '<EOS>'
OOV = '<OOV>'

# Load the pre-built n-gram language models, one pickle file per order (n1..n5).
# NOTE(review): every model receives the same second argument, 0.01 — presumably
# a smoothing parameter; confirm against NGramLM.
model_unigram = NGramLM('arthur-conan-doyle.tok.train.n1.pkl', 0.01, verbose=True)
model_bigram = NGramLM('arthur-conan-doyle.tok.train.n2.pkl', 0.01)
model_trigram = NGramLM('arthur-conan-doyle.tok.train.n3.pkl', 0.01)
model_4gram = NGramLM('arthur-conan-doyle.tok.train.n4.pkl', 0.01)
model_5gram = NGramLM('arthur-conan-doyle.tok.train.n5.pkl', 0.01)

Now it's time to see how well these models fit our data! We'll use perplexity as the evaluation metric, and it's up to you to implement it below.

Recall the formula for perplexity from the lecture:

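$$
\text{perplexity} = 2^{-\frac{1}{n}\sum_{i=1}^{n} \log_2 P(w_i \mid w_{<i})}
$$

where $n$ is the total number of tokens being scored and $w_{<i}$ denotes the tokens preceding $w_i$ in its sentence.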

Hint: you'll want to use the [`math.log2`](https://docs.python.org/3/library/math.html#math.log2) function

In [2]:
import math


def perplexity(model: "NGramLM", texts: list[tuple[str, ...]]) -> float:
    """Compute the per-token perplexity of ``model`` over ``texts``.

    Implements ``2 ** (-(1 / n) * sum(log2(P(w_i | w_<i))))`` where ``n`` is the
    total number of tokens in ``texts`` and the context ``w_<i`` passed to
    ``model.get_prob`` is every token preceding ``w_i`` in the same text.

    Args:
        model: Language model exposing ``get_prob(context, word)``.
        texts: Tokenized texts, each a tuple of token strings.

    Returns:
        The perplexity as a float, or ``math.inf`` when the model assigns a
        probability of exactly zero to at least one token.

    Raises:
        ValueError: If ``texts`` contains no tokens, since perplexity is
            undefined for empty input.
    """
    n_word = sum(len(text) for text in texts)
    if n_word == 0:
        raise ValueError('perplexity is undefined for texts containing no tokens')

    # Query the model once per token, in corpus order.
    probs = [
        model.get_prob(text[:i], text[i])
        for text in texts
        for i in range(len(text))
    ]

    # math.log2(0) raises a domain error; a zero-probability token means the
    # model rules the data out entirely, which is infinite perplexity.
    if any(p == 0 for p in probs):
        return math.inf

    res = sum(math.log2(p) for p in probs)

    return math.pow(2, -1 / n_word * res)


# Score two sample sentences with verbose mode on so each probability lookup is
# printed. The try/finally guarantees verbose mode is switched off again even
# if scoring raises, so later cells are not flooded with per-token output.
model_unigram.verbose = True
try:
    print(perplexity(model_unigram, [('My', 'dear', 'Watson', '.'), ('Come', 'over', 'here', '!')]))
finally:
    model_unigram.verbose = False

0.0006603 -> My
5.955e-05 My -> dear
5.955e-05 My dear -> Watson
5.955e-05 My dear Watson -> .
0.0001687 -> Come
5.955e-05 Come -> over
5.955e-05 Come over -> here
5.955e-05 Come over here -> !
10914.060522177839


In [3]:
# Tests: the rounded perplexity of the single sentence ('My', 'dear', 'Watson')
# under the unigram, bigram, and trigram models must equal the expected values.
assert round(perplexity(model_unigram, [('My', 'dear', 'Watson')])) == 7531
assert round(perplexity(model_bigram, [('My', 'dear', 'Watson')])) == 24
assert round(perplexity(model_trigram, [('My', 'dear', 'Watson')])) == 521