In [8]:
import itertools
from typing import Tuple

from nltk.lm import Vocabulary
from nltk.util import ngrams

In [2]:
N = 3
train_filename = "wiki-en-train.word"

In [3]:
sentences = []
with open(train_filename, "rt") as f:
    for line in f:
        words = ["__BOS__"] + line.lower().split() + ["__EOS__"]
        sentences.append(words)

In [39]:
from nltk.lm.models import LanguageModel

def show(model: LanguageModel, vocab: Vocabulary, context: Tuple[str], top_n: int=5):
    scores = []
    for word in model.context_counts(model.vocab.lookup(context)):
        scores.append([word, model.score(word, context)])
    scores = sorted(scores, key=lambda x: x[1])[::-1]

    print(f"Model: {model.__class__}")
    print(f"The number of candidates: {len(scores)}")
    print(f"Show top {top_n} probabilities:")
    for score in scores[:top_n]:
        print(f"P({score[0]}|{','.join(context)})={score[1]}")
    print()

In [40]:
vocabulary = Vocabulary(itertools.chain.from_iterable(sentences))
context = ("natural", "language")

In [41]:
# MLE
from nltk.lm.models import MLE
mle = MLE(order=N, vocabulary=vocabulary)
text_ngram = [ngrams(sentence, N) for sentence in sentences]
mle.fit(text_ngram)

In [42]:
show(mle, vocabulary, context)

Model: <class 'nltk.lm.models.MLE'>
The number of candidates: 23
Show top 5 probabilities:
P(processing|natural,language)=0.4057971014492754
P(understanding|natural,language)=0.17391304347826086
P(generation|natural,language)=0.07246376811594203
P(input|natural,language)=0.057971014492753624
P(parsing|natural,language)=0.028985507246376812



In [43]:
# Witten Bell Smoothing
from nltk.lm.models import WittenBellInterpolated
wlm = WittenBellInterpolated(order=N, vocabulary=vocabulary)
text_ngram = [ngrams(sentence, N) for sentence in sentences]
wlm.fit(text_ngram)

In [44]:
show(wlm, vocabulary, context)

Model: <class 'nltk.lm.models.WittenBellInterpolated'>
The number of candidates: 23
Show top 5 probabilities:
P(processing|natural,language)=0.4055270709744075
P(understanding|natural,language)=0.17379731613188892
P(generation|natural,language)=0.07241554838828705
P(input|natural,language)=0.05793243871062964
P(parsing|natural,language)=0.02896621935531482



In [45]:
# Kneser-Ney smoothing
from nltk.lm.models import KneserNeyInterpolated
klm = KneserNeyInterpolated(order=N, vocabulary=vocabulary)
text_ngram = [ngrams(sentence, N) for sentence in sentences]
klm.fit(text_ngram)

In [46]:
show(klm, vocabulary, context)

Model: <class 'nltk.lm.models.KneserNeyInterpolated'>
The number of candidates: 23
Show top 5 probabilities:
P(processing|natural,language)=0.4043549137614905
P(understanding|natural,language)=0.172470855790476
P(generation|natural,language)=0.07102158042815718
P(input|natural,language)=0.05652882680496877
P(parsing|natural,language)=0.027543319558591958

