In [2]:
from nltk.corpus import brown
corpus_words = [w.lower() for w in brown.words()]
print(len(corpus_words), corpus_words[:10])

corpus_sents = brown.sents()
print(len(corpus_sents), corpus_sents[:10])

words = corpus_words[:10]

1161192 ['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of']
57340 [['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Ma

# ngram and data smooth algorithm

Practice ngram models in nltk and several smooth algorithms, including the simplest additive smooth, Knerser Ney algorithm.

## N-gram 
+ Using **_ngrams function_** in nltk to extract n-grams tuple from given word tokens.

In [3]:
from nltk.util import ngrams,bigrams, trigrams
print("Results for ngrams function")
print("n=1: ", list(ngrams(words, n=1)))
print("\nn=2: ", list(ngrams(words, n=2)))
print("\nn=3: ", list(ngrams(words, n=3)))

print("\nn=2 with left and right padding:")
print(list(ngrams(words, n=2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')))

print("\nResults for bigrams function")
print(list(bigrams(words)))

print("\nResults for trigrams function")
print(list(trigrams(words)))

Results for ngrams function
n=1:  [('the',), ('fulton',), ('county',), ('grand',), ('jury',), ('said',), ('friday',), ('an',), ('investigation',), ('of',)]

n=2:  [('the', 'fulton'), ('fulton', 'county'), ('county', 'grand'), ('grand', 'jury'), ('jury', 'said'), ('said', 'friday'), ('friday', 'an'), ('an', 'investigation'), ('investigation', 'of')]

n=3:  [('the', 'fulton', 'county'), ('fulton', 'county', 'grand'), ('county', 'grand', 'jury'), ('grand', 'jury', 'said'), ('jury', 'said', 'friday'), ('said', 'friday', 'an'), ('friday', 'an', 'investigation'), ('an', 'investigation', 'of')]

n=2 with left and right padding:
[('<s>', 'the'), ('the', 'fulton'), ('fulton', 'county'), ('county', 'grand'), ('grand', 'jury'), ('jury', 'said'), ('said', 'friday'), ('friday', 'an'), ('an', 'investigation'), ('investigation', 'of'), ('of', '</s>')]

Results for bigrams function
[('the', 'fulton'), ('fulton', 'county'), ('county', 'grand'), ('grand', 'jury'), ('jury', 'said'), ('said', 'friday'), (

## Data Smooth in NLTK
Using **_N-gram Module_** and related modules in NLTK to build language model with different data smooth algorithms. Because Knerser-Ney module in NLTK can only be used for trigrams, we extract the trigrams here.

Other open source language model tools for Knerser-Ney algorithm can be referenced [http://smithamilli.com/blog/kneser-ney/](http://smithamilli.com/blog/kneser-ney/).

In [77]:
# brown_grams = [gram for sent in corpus_sents[:1000] for gram in ngrams(sent[:-2], 3, pad_left=True, left_pad_symbol='BOS', pad_right=True, right_pad_symbol='EOS')]
brown_grams = [gram for sent in corpus_sents[:1000] for gram in ngrams(sent[:-2], 3)]
print(brown_grams[:10])

from nltk.probability import FreqDist
fdist = FreqDist(brown_grams)

[('The', 'Fulton', 'County'), ('Fulton', 'County', 'Grand'), ('County', 'Grand', 'Jury'), ('Grand', 'Jury', 'said'), ('Jury', 'said', 'Friday'), ('said', 'Friday', 'an'), ('Friday', 'an', 'investigation'), ('an', 'investigation', 'of'), ('investigation', 'of', "Atlanta's"), ('of', "Atlanta's", 'recent')]


In [78]:
def sort_samples_by_prob(probdist):
    sample_prob = dict((s, probdist.prob(s)) for s in probdist.samples())
    return sorted(sample_prob.items(), key=lambda x: x[1], reverse=True)

In [79]:
# maximum likelihood
from nltk.probability import MLEProbDist
mle = MLEProbDist(fdist)
sample_prob = sort_samples_by_prob(mle)
print("top 20 highest probability:")
for t in sample_prob[:20]: print(t)
print("top 10 lowest probability:")
for t in sample_prob[-11:]: print(t)

top 20 highest probability:
((',', 'he', 'said'), 0.0008783004885546468)
(('the', 'United', 'States'), 0.0008234067080199813)
(("''", ',', 'he'), 0.0007136191469506505)
(('said', ',', '``'), 0.0007136191469506505)
(('he', 'said', ','), 0.0007136191469506505)
(("''", ',', 'the'), 0.0006587253664159851)
(('is', 'expected', 'to'), 0.0004940440248119889)
(('chairman', 'of', 'the'), 0.00038425646374265796)
(('one', 'of', 'the'), 0.00038425646374265796)
(('session', 'of', 'the'), 0.00038425646374265796)
((',', 'the', 'jury'), 0.00038425646374265796)
(('in', 'the', 'state'), 0.00038425646374265796)
(('some', 'of', 'the'), 0.00038425646374265796)
(('of', 'the', 'Republican'), 0.00038425646374265796)
(('that', 'the', 'United'), 0.00032936268320799255)
(('Mr.', 'Hawksley', 'said'), 0.00032936268320799255)
(('president', 'of', 'the'), 0.00032936268320799255)
(('the', 'sales', 'tax'), 0.00032936268320799255)
((',', 'it', 'was'), 0.00032936268320799255)
(('``', 'This', 'is'), 0.00027446890267332713

In [86]:
# addictive smooth
add_fdist = dict(fdist)
print(fdist[('the', 'United', 'States')])
add_fdist = FreqDist({key: value+1 for key,value in add_fdist.items()})
print(add_fdist[('the', 'United', 'States')])

mle = MLEProbDist(add_fdist)
sample_prob = sort_samples_by_prob(mle)
print("top 20 highest probability:")
for t in sample_prob[:20]: print(t)
print("top 10 lowest probability:")
for t in sample_prob[-11:]: print(t)

15
16
top 20 highest probability:
((',', 'he', 'said'), 0.0004801310475329737)
(('the', 'United', 'States'), 0.00045188804473691644)
(("''", ',', 'he'), 0.00039540203914480187)
(('he', 'said', ','), 0.00039540203914480187)
(('said', ',', '``'), 0.00039540203914480187)
(("''", ',', 'the'), 0.0003671590363487446)
(('is', 'expected', 'to'), 0.0002824300279605728)
(('one', 'of', 'the'), 0.00022594402236845822)
(('in', 'the', 'state'), 0.00022594402236845822)
(('session', 'of', 'the'), 0.00022594402236845822)
(('chairman', 'of', 'the'), 0.00022594402236845822)
((',', 'the', 'jury'), 0.00022594402236845822)
(('some', 'of', 'the'), 0.00022594402236845822)
(('of', 'the', 'Republican'), 0.00022594402236845822)
(('that', 'the', 'United'), 0.00019770101957240093)
(('Mr.', 'Hawksley', 'said'), 0.00019770101957240093)
(('the', 'sales', 'tax'), 0.00019770101957240093)
(('president', 'of', 'the'), 0.00019770101957240093)
((',', 'it', 'was'), 0.00019770101957240093)
(('``', 'This', 'is'), 0.0001694580

可以观察到，进行了加性平滑后高概率的前20个元组的概率都下降了将近一半。说明训练集中词汇表的词汇量较多，从而导致加上的1被平均了很多。而出现概率小的那些词组的概率则上升了一些。并且，这些说明了加法平滑方法的”劫“的程度受到词汇量影响很大。

In [87]:
# Good Turing Smooth
from nltk.probability import SimpleGoodTuringProbDist
fdist = FreqDist(fdist)
gtp = SimpleGoodTuringProbDist(fdist)
print("discount = ", gtp.discount())
sample_prob = sort_samples_by_prob(gtp)
print("top 20 highest probability:")
for t in sample_prob[:20]: print(t, fdist[t[0]], fdist.Nr(fdist[t[0]]), fdist.Nr(fdist[t[0]]+1))
print("top 10 lowest probability:")
for t in sample_prob[-11:]: print(t, fdist[t[0]], fdist.Nr(fdist[t[0]]), fdist.Nr(fdist[t[0]]+1))

discount =  0.32453303447839277
top 20 highest probability:
((',', 'he', 'said'), 0.000709406063961109) 16 1 0
(('the', 'United', 'States'), 0.0006583934318078581) 15 1 1
(("''", ',', 'he'), 0.0005567072141643199) 13 3 0
(('he', 'said', ','), 0.0005567072141643199) 13 3 0
(('said', ',', '``'), 0.0005567072141643199) 13 3 0
(("''", ',', 'the'), 0.0005060835398362218) 12 1 3
(('is', 'expected', 'to'), 0.00035558725507260815) 9 1 0
(('one', 'of', 'the'), 0.00025720007638937444) 7 7 0
(('in', 'the', 'state'), 0.00025720007638937444) 7 7 0
(('session', 'of', 'the'), 0.00025720007638937444) 7 7 0
(('chairman', 'of', 'the'), 0.00025720007638937444) 7 7 0
((',', 'the', 'jury'), 0.00025720007638937444) 7 7 0
(('some', 'of', 'the'), 0.00025720007638937444) 7 7 0
(('of', 'the', 'Republican'), 0.00025720007638937444) 7 7 0
(('that', 'the', 'United'), 0.00020904794161914772) 6 5 7
(('Mr.', 'Hawksley', 'said'), 0.00020904794161914772) 6 5 7
(('the', 'sales', 'tax'), 0.00020904794161914772) 6 5 7
(('

从上述结果中可以观察到，对于出现频次高的词组，其概率相比最大似然有所下降，但是下降幅度远小于加法平滑方法。对于出现频次低的词组，其概率相比最大似然也下降了，并且下降的幅度非常大。这是因为出现次数为2的词组的次数是出现次数为1的词组的次数的百分之一（537/16489)。说明：古德图灵平滑方法中词的概率受到$N_{r+1}$的影响很大。但是我认为古德图灵方法只是利用了齐夫定律缩小词的个数，但是$N_r$和$N_{r+1}$之间没有可解释关系，即词出现的次数为什么要受到出现次数比其多1的词的个数影响？这个问题没有解释。

In [88]:
# Witten Bell smooth
from nltk.probability import WittenBellProbDist
wb = WittenBellProbDist(fdist)
sample_prob = sort_samples_by_prob(wb)
print("top 20 highest probability:")
for t in sample_prob[:20]: print(t)
print("top 10 lowest probability:")
for t in sample_prob[-11:]: print(t)

ZeroDivisionError: division by zero

In [89]:
from nltk.probability import KneserNeyProbDist
knerser_ney = KneserNeyProbDist(fdist)
sample_prob = sort_samples_by_prob(knerser_ney)
print("top 20 highest probability:")
for t in sample_prob[:20]: print(t)
print("top 10 lowest probability:")
for t in sample_prob[-11:]: print(t)

top 20 highest probability:
(('chairman', 'of', 'the'), 0.8928571428571429)
(('the', 'need', 'for'), 0.85)
(('precinct', 'of', 'the'), 0.8125)
(('the', 'Citizens', 'Group'), 0.8125)
(('a', 'number', 'of'), 0.8125)
(('the', 'full', 'amount'), 0.8125)
(('aged', 'care', 'plan'), 0.8125)
(('be', 'able', 'to'), 0.8125)
((',', 'according', 'to'), 0.8125)
(('the', 'United', 'States'), 0.7916666666666666)
(('session', 'of', 'the'), 0.78125)
(('to', 'attend', 'the'), 0.75)
(('the', 'views', 'of'), 0.75)
(('study', 'of', 'the'), 0.75)
(('there', 'would', 'be'), 0.75)
(('a', 'series', 'of'), 0.75)
((',', 'chairman', 'of'), 0.75)
(('the', 'alliance', ','), 0.75)
(('headed', 'by', 'the'), 0.75)
(('the', 'sales', 'tax'), 0.75)
top 10 lowest probability:
(('of', 'the', 'Kennedy'), 0.0012195121951219512)
(('of', 'the', 'concessionaires'), 0.0012195121951219512)
(('of', 'the', 'twenty'), 0.0012195121951219512)
(('of', 'the', 'avenue'), 0.0012195121951219512)
(('of', 'the', 'board'), 0.00121951219512195

从上面的结果可以观察到，经过Knerser Ney平滑之后，所有词组的概率都有了大幅度的提升。并且值得注意的是，对于出现频率高的词组，前20个词组中那些在句子边缘的词组已经被平滑掉了，现在是一些语义较强的词组。

## Language Model in NLTK
在nltk中有一个专门的lm模块用于语言模型，包括预处理，n-gram模型以及各种平滑方法。

In [79]:
# prepare data. Here we only use the first 100 sentences from brown corpora.
train_text = corpus_sents[:100]
print("a train text", train_text[:10])
n = 2 # bigrams

# Transform the text into bigrams.
from nltk.util import bigrams, everygrams
from nltk.lm.preprocessing import pad_both_ends, flatten 

# padding one sentence for bigrams and extract bigrams
pad_sent_bigrams = bigrams(pad_both_ends(train_text[0], n=n))
# print(list(pad_sent_bigrams))

# padding one sentence for bigrams and extract unigram, bigrams
all_grams = everygrams(list(pad_both_ends(train_text[0], n=n)))
# print(list(all_grams))

# padding all sentences and combine them as a list
pad_sents = flatten(pad_both_ends(sent, n=n) for sent in train_text[:2])
# print(list(pad_sents))

# All above can be simplify to one function
# from nltk.lm.preprocessing import padded_everygram_pipeline
# train_grams, pad_words = padded_everygram_pipeline(n, train_text)
# train_grams: list of everygrams from padded train_text 
# pad_words: flat stream of padded train_text


# print(id(train_grams))
# train_l = list(train_grams)
# print(id(train_l), len(train_l))
# print(id(train_grams))
# train_m = list(train_grams)
# print(id(train_m), len(train_m))
# 112218747912
# 112219322888 100
# 112218747912
# 112216366664 0
# !!! Cannot convert generator

# Because of the above reason, we need to reproduce train_grams and pad_words for every language model

a train text [['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.'], ['``', 'Only', 'a', 'relative', 'handful', 'of',

In [129]:
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import bigrams
def test_lm(model):
    test_word = "the" # ('the', 128) 
    print("score of test word {} is: {}".format(test_word, model.score(test_word)))
    
    test_word = "chairman" # ('chairman', 2), probability of word with different smooth methods
    print("score of test word {} is: {}".format(test_word, model.score(test_word)))
    
    test_tuple = ("Fulton", "County") # probability of word `States` given context `United`
    print("score of {} given {} is: {}".format(test_tuple[1], test_tuple[0], 
                                               model.score(test_tuple[1], [test_tuple[0]])))
    
    
    # test words don't occur in training corpus
    test_word = "alliance"
    print("score of not exist word {} is: {}".format(test_word, model.score(test_word)))
          
    test_tuple = ("New", "Year")
    print("score of not exist {} given {} is: {}".format(test_tuple[1], test_tuple[0], 
                                               model.score(test_tuple[1], [test_tuple[0]])))
    
    test_sent = corpus_sents[150] # probability of a sentence, i.e., the total probability 
                                  # of n-gram tuples from this sentence
    print("test sentence is: ", test_sent)
    not_exist_words = [word for word in test_sent if model.counts[word] == 0]
    print("not_exist_words: ", not_exist_words)
    test_tokens = bigrams(pad_both_ends(test_sent, 2))
    print("entropy of test sentence is: ", model.entropy(test_tokens)) # cross-entropy
    print("perplexity of test sentence is: ", model.perplexity(test_tokens))
# 由于困惑度计算的时候除以两个词共同出现的次数，从而导致除0错误。？？？？？

In [138]:
from nltk.lm.preprocessing import padded_everygram_pipeline
train_grams, pad_words = padded_everygram_pipeline(n, train_text)

# Maximum Likelihood Model
from nltk.lm import MLE
mle = MLE(n) # 2 is the highest order
mle.fit(train_grams, pad_words)
print(mle.vocab)
test_lm(mle)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 863 items>
score of test word the is: 0.05186385737439222
score of test word chairman is: 0.0008103727714748784
score of County given Fulton is: 0.42857142857142855
score of not exist word alliance is: 0.0
score of not exist Year given New is: 0
test sentence is:  ['Opponents', 'generally', 'argued', 'that', 'the', 'ballot', "couldn't", 'give', 'enough', 'information', 'about', 'tax', 'proposals', 'for', 'the', 'voters', 'to', 'make', 'an', 'intelligent', 'choice', '.']
not_exist_words:  ['Opponents', 'argued', "couldn't", 'enough', 'information', 'proposals', 'intelligent', 'choice']
entropy of test sentence is:  inf


ZeroDivisionError: division by zero

最大似然对于未出现在训练集中的词语的计数为0，从而导致了计算熵的时候对数的底数为0。

In [139]:
from nltk.lm.preprocessing import padded_everygram_pipeline
train_grams, pad_words = padded_everygram_pipeline(n, train_text)

# Laplace (add-one) Smooth Model
from nltk.lm import Laplace
lm = Laplace(n)
lm.fit(train_grams, pad_words)
test_lm(lm)

score of test word the is: 0.0387271089762834
score of test word chairman is: 0.0009006304413089162
score of County given Fulton is: 0.00798175598631699
score of not exist word alliance is: 0.0003002101471029721
score of not exist Year given New is: 0.0011587485515643105
test sentence is:  ['Opponents', 'generally', 'argued', 'that', 'the', 'ballot', "couldn't", 'give', 'enough', 'information', 'about', 'tax', 'proposals', 'for', 'the', 'voters', 'to', 'make', 'an', 'intelligent', 'choice', '.']
not_exist_words:  ['Opponents', 'argued', "couldn't", 'enough', 'information', 'proposals', 'intelligent', 'choice']
entropy of test sentence is:  9.180327718582245


ZeroDivisionError: division by zero

In [140]:
from nltk.lm.preprocessing import padded_everygram_pipeline
train_grams, pad_words = padded_everygram_pipeline(n, train_text)

# Lidstone Smooth Model
from nltk.lm import Lidstone
lm = Lidstone(order=n, gamma=0.75)
lm.fit(train_grams, pad_words)
test_lm(lm)

score of test word the is: 0.04132894631249499
score of test word chairman is: 0.0008827541930824172
score of County given Fulton is: 0.010207939508506616
score of not exist word alliance is: 0.00024075114356793194
score of not exist Year given New is: 0.0011587485515643105
test sentence is:  ['Opponents', 'generally', 'argued', 'that', 'the', 'ballot', "couldn't", 'give', 'enough', 'information', 'about', 'tax', 'proposals', 'for', 'the', 'voters', 'to', 'make', 'an', 'intelligent', 'choice', '.']
not_exist_words:  ['Opponents', 'argued', "couldn't", 'enough', 'information', 'proposals', 'intelligent', 'choice']
entropy of test sentence is:  9.12161165507209


ZeroDivisionError: division by zero

In [141]:
from nltk.lm.preprocessing import padded_everygram_pipeline
train_grams, pad_words = padded_everygram_pipeline(n, train_text)

# Witten Bell Smooth Model
from nltk.lm import WittenBellInterpolated
lm = WittenBellInterpolated(n)
lm.fit(train_grams, pad_words)
test_lm(lm)

score of test word the is: 0.05186385737439222
score of test word chairman is: 0.0008103727714748784
score of County given Fulton is: 0.42732021301227135
score of not exist word alliance is: 0.0
score of not exist Year given New is: 0.0
test sentence is:  ['Opponents', 'generally', 'argued', 'that', 'the', 'ballot', "couldn't", 'give', 'enough', 'information', 'about', 'tax', 'proposals', 'for', 'the', 'voters', 'to', 'make', 'an', 'intelligent', 'choice', '.']
not_exist_words:  ['Opponents', 'argued', "couldn't", 'enough', 'information', 'proposals', 'intelligent', 'choice']
entropy of test sentence is:  inf


ZeroDivisionError: division by zero

In [143]:
from nltk.lm.preprocessing import padded_everygram_pipeline
train_grams, pad_words = padded_everygram_pipeline(n, train_text)

# Kneser-Ney Smooth Model
from nltk.lm import KneserNeyInterpolated
lm = KneserNeyInterpolated(n)
lm.fit(train_grams, pad_words)
test_lm(lm)

score of test word the is: 0.0011587485515643105
score of test word chairman is: 0.0011587485515643105
score of County given Fulton is: 0.42148650885614963
score of not exist word alliance is: 0.0011587485515643105


ZeroDivisionError: float division by zero

| Models         | the(128) | chairman(2) | Fulton County(6) | alliance(not exist) | New Year(not exist) | sentence(has not exist words) entropy | sentence(has not exist words) perplexity |
| -------------- | -------- | ----------- | ---------------- | ------------------- | ------------------- | ------------------------------------- | ---------------------------------------- |
| MLE            | 0.051863 | 0.000810    | 0.428571         | 0.0                 | 0                   | inf                                   | ZeroDivisionError                        |
| Laplace        | 0.038727 | 0.000900    | 0.007981         | 0.000300            | 0.001158            | 9.180327                              | ZeroDivisionError                        |
| Lidstone(0.75) | 0.041328 | 0.000882    | 0.010207         | 0.000240            | 0.0011587           | 9.121611                              | ZeroDivisionError                        |
| Witten Bell    | 0.051863 | 0.000810    | 0.427320         | 0.0                 | 0.0                 | inf                                   | ZeroDivisionError                        |
| Kneser Ney     | 0.001158 | 0.001158    | 0.421486         | 0.001158            | ZeroDivisionError   | ZeroDivisionError                     | ZeroDivisionError                        |



从上述汇总结果可以观察到：
+ 对于出现在训练集中，且次数较多的词语`the`，使用平滑方法之后其概率都有所下降。对于不同的平滑方法，Lidstone方法比Laolace方法下降幅度小，这是因为gamma参数值小，从而导致除数略小；Witten Bell方法没有下降；Kneserver Ney方法平滑程度最大，下降至0.001158。
+ 对于出现在训练集中，且次数较小的词语（chairman），使用平滑方法之后其概率有所上升。
+ 对于句子的entropy来说，由于句子中出现了频次为0的词语，导致MLE方法中entropy为inf，这是正常的。使用了Laplace和Lidstone方法之后，其entropy有值，并且Lidstone的entropy比Laplace小。Witten Bell平滑方法的entropy为什么也是inf，不是使用了平滑了吗？？？另外，Kneser Ney方法又为何出现了除0错误，其内部如何实现？？？
+ 对于句子的perplexity来说，由于计数为0，根据perplexity的计算公式导致了除0错误？？？
+ 对于Kneser Ney方法来说，不管是出现频次高（the），频次低（chairman）或者没有出现的单词（alliance），其对应score都相同，难道是nltk内部实现方法有问题？？

另外，由于一些平滑方法使用了低阶语法的信息，所以我们在构建模型的时候需要输入everygrams???

## 其他语言模型工具
### SRILM
[SRILM](http://www.speech.sri.com/projects/srilm/)由SRI实验室开发，用来构建和应用统计语言模型，主要用于语音识别，统计标注和切分，以及机器翻译。使用c++语言实现。语言模型使用的是ngram模型，包括最大似然及一些平滑方法（Good-Turing，Katz回退，插值Kneser-Ney）。

[http://www.52nlp.cn/language-model-training-tools-srilm-details](http://www.52nlp.cn/language-model-training-tools-srilm-details)
[https://www.cnblogs.com/welen/p/7593222.html](https://www.cnblogs.com/welen/p/7593222.html)

### IRSLTM
[IRSLTM](https://github.com/irstlm-team/irstlm)由Trento FBK-IRST实验室开发，主要用于在大规模数据上训练语言模型。IRSLTM通过采用训练子语言模型最终融合的方法，达到减少内存消耗，提高训练速度的目的：
+ 在训练语料上统计带词频词汇表；
+ 按照词频均衡的原则将词汇表划分为若干个子词汇表；
+ 对各个子词汇表统计 n-gram,这些 n-gram 必须以词汇表中的词汇开头；
+ 根据第四步的统计结果,建立多个子语言模型；
+ 把所有的子语言模型融合成最终语言模型；

[http://www.52nlp.cn/language-modeling-toolkit-irstlm-installation-and-trial-noting](http://www.52nlp.cn/language-modeling-toolkit-irstlm-installation-and-trial-noting)
