In [23]:
import itertools

from nltk.lm import Vocabulary
from nltk.lm.models import KneserNeyInterpolated, Laplace, MLE, LanguageModel, WittenBellInterpolated
from nltk.util import ngrams

In [6]:
N = 3
train_filename = "wiki-en-train.word"
test_filename = "wiki-en-test.word"

In [4]:
def load_sentences(filename: str) -> list:
    sentences = []
    with open(filename, "rt") as f:
        for line in f:
            words = ["__BOS__"] + line.lower().strip().split() + ["__EOS__"]
            sentences.append(words)
    return sentences

In [7]:
train_sentences = load_sentences(train_filename)
print(f"the number of sentences in train dataset: {len(train_sentences)}")
test_sentences = load_sentences(test_filename)
print(f"the number of sentences in test dataset : {len(test_sentences)}")

the number of sentences in train dataset: 1301
the number of sentences in test dataset : 171


In [10]:
vocabulary = Vocabulary(itertools.chain.from_iterable(train_sentences))

In [24]:
def get_dataset(sentences: list, n: int=3) -> list:
    return [ngrams(sentence, n) for sentence in sentences]

def get_model(sentences: list, n: int=3, model_type: str="mle") -> LanguageModel:
    dataset = get_dataset(sentences, n)

    if model_type == "mle":
        model = MLE(order=N, vocabulary=vocabulary)
    elif model_type == "laplace":
        model = Laplace(order=N, vocabulary=vocabulary)
    elif model_type == "kneser_ney":
        model = KneserNeyInterpolated(order=N, vocabulary=vocabulary)
    elif model_type == "witten_bell":
        model = WittenBellInterpolated(order=N, vocabulary=vocabulary)
    else:
        model = Laplace(order=N, vocabulary=vocabulary)
    
    model.fit(dataset)
    return model


In [25]:
mle_model = get_model(train_sentences, N, "mle")
lp_model = get_model(train_sentences, N, "laplace")
kn_model = get_model(train_sentences, N, "kneser_ney")
wb_model = get_model(train_sentences, N, "witten_bell")

In [21]:
def evaluate(model: LanguageModel, sentences: list, n: int=3):
    dataset = [[word for word in sentence] for sentence in get_dataset(sentences, n)]
    pp = model.perplexity(dataset)
    print(f"{model.__class__.__name__} perplexity: {pp}")

In [26]:
evaluate(mle_model, test_sentences, N)
evaluate(lp_model, test_sentences, N)
evaluate(kn_model, test_sentences, N)
evaluate(wb_model, test_sentences, N)

MLE perplexity: inf
Laplace perplexity: 4702.999999999891
KneserNeyInterpolated perplexity: 4702.999999999891
WittenBellInterpolated perplexity: inf
