# N-Gram Language Model

In [117]:
import numpy as np
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from nltk.util import ngrams

In [118]:
text = open('at.txt', 'r').read()
sent_tokenize_list = sent_tokenize(text)

In [114]:
train_, test_ =  train_test_split(sent_tokenize_list, test_size = 0.2, shuffle = True)

In [119]:
#I process the sentence tokenized list. All sentences are first tokenized into words. 
#<s> and </s> characters are added to mark begin and end of each sentence.
#All sentences are then put in a single list called vocabullary.
ls = []
sent_tokenize_list = [token.lower() for token in train_]
vocabulary = []
for sentence in sent_tokenize_list:
    sentence = "< "+sentence+" >"
    words = word_tokenize(sentence)
    for word in words:
        if word == ',' or word == '.':
            pass
        elif word == '<':
            vocabulary.append("<s>")
        elif word == '>':
            vocabulary.append("</s>")
        else:
            vocabulary.append(word)       
print(len(vocabulary))

178675


In [120]:
#This is preprocessing step for MLE computation.
#For unigram and bigram inbuilt NgramCounter function is used.
#For trigram and quadgram a dictionary of all possible trigrams and quadgrams and 
#corresponding counts of each is storred.
text_unigrams = [ngrams(vocabulary, 1)]
text_bigrams = [ngrams(vocabulary, 2)]
from nltk.lm import NgramCounter
unigram_counts = NgramCounter(text_unigrams)
bigram_counts = NgramCounter(text_bigrams)
trigram_dict = {}
for i in range(len(vocabulary)-2):
    string_ = vocabulary[i]+" "+vocabulary[i+1]+" "+vocabulary[i+2] 
    if string_ not in trigram_dict:
        trigram_dict[string_] = 1
    else:
        trigram_dict[string_] += 1
quadgram_dict = {}
for i in range(len(vocabulary)-3):
    string_ = vocabulary[i]+" "+vocabulary[i+1]+" "+vocabulary[i+2]+" "+vocabulary[i+3] 
    if string_ not in quadgram_dict:
        quadgram_dict[string_] = 1
    else:
        quadgram_dict[string_] += 1

In [121]:
#Here MLE for unigram, bigram, trigram and quadgram is calculated.
def unigramMLE(phrase):
    return unigram_counts[phrase]/len(vocabulary)
def bigramMLE(phrase):
    phrase_list = phrase.split()
    return bigram_counts[[phrase_list[0]]][phrase_list[1]]/unigram_counts[phrase_list[0]]
def trigram(phrase):
    phrase_list = phrase.split()
    return trigram_dict[phrase]/bigram_counts[[phrase_list[0]]][phrase_list[1]]
def quadgram(phrase):
    phrase_list = phrase.split()
    str_ = " "
    return quadgram_dict[phrase]/trigram_dict[str_.join(phrase_list[:-1])]

In [122]:
#Demonstration for all four MLEs.
print(quadgram("goldman sachs and citibank"))
print(bigramMLE("america great"))
print(trigram("i will do"))
print(unigramMLE("america"))

1.0
0.25153374233128833
0.021505376344086023
0.000912270882887925


In [177]:
#
def generate(n):
    #n-gram model will be used to generate text. For unigram n=1.
    mle_dict = {}
    # Making MLE dictionary to store all n-1 grams and corresponding next words. Hard coded.
    for i in range(len(vocabulary)):
        try:
            string_=""
            for j in range(i,i+n-1):
                if vocabulary[j]=="</s>":
                    continue
                string_ = string_+vocabulary[j]+" "
            string_ = string_.strip()
            #A list of dictionaries corresponding to each n-1 gram.
            if string_ not in mle_dict.keys():
                mle_dict[string_] = {vocabulary[i+n-1]:1}
            else:
                if vocabulary[i+n-1] not in mle_dict[string_]:
                    mle_dict[string_][vocabulary[i+n-1]] = 1
                else:
                    mle_dict[string_][vocabulary[i+n-1]] += 1
        except Exception as e:
            break
    st_list = []
    for key in mle_dict:
        ct = sum(mle_dict[key].values())
        if key[0:3] == "<s>":
            st_list.append((key, ct))
        for i in mle_dict[key]:
            mle_dict[key][i] /= ct
    ct = 0
    generated_sentence = ""
    gen_prob = []
    for i in st_list:
        ct+=i[1]
    for i in st_list:   
        gen_prob.append(i[1]/ct)
    # Using multinomial to predict next word with probabilities provided.
    a = np.random.multinomial(1, gen_prob, size=1)
    index = list(a[0]).index(1)
    generated_sentence += st_list[index][0] + " "
    last_ngram_pick = st_list[index][0]
    while('</s>' not in generated_sentence[-6:-1] and len(generated_sentence)<150):
        gen_prob = []
        ct = sum(mle_dict[last_ngram_pick].values())
        for key in mle_dict[last_ngram_pick]:
            gen_prob.append(mle_dict[last_ngram_pick][key]/ct)
        a = np.random.multinomial(1, gen_prob, size=1)
        index = list(a[0]).index(1)
        generated_sentence += list(mle_dict[last_ngram_pick].keys())[index] + " "
    print("a generated sentence",generated_sentence)
#     print(mle_dict
    return mle_dict

In [178]:
mle_dict = generate(4)

a generated sentence <s> we have a some tremendous no no to to no people people cities an $ to to a interest borders a an very great – 2,000 – people a great a these third 


In [183]:
import math
# This function returns perplexity for ngram model. input n; prints perplexity; 
def perplexity_(n):
    mle_dict = generate(n)
    # First we pre process test set.
    ls = []
    sent_tokenize_list = [token.lower() for token in test_]

    final_perplexity = 0
    for sentence in sent_tokenize_list:
        
        sentence = "!"+sentence+"!"
        #We tokenize sentence
        words = word_tokenize(sentence)
        words[0] = "<s>"
        words[-1]="</s>"
#  
        for i in range(len(words)):
                perplexity = 1
                #Intializing proplexity by 1.
#             try:
                last_ngram_pick = " ".join(words[i:n-1+i])
                if last_ngram_pick not in mle_dict:
                    #If provided ngram doesn't exist in the train set.
                    perplexity*= 1/len(mle_dict)
                else:
                    ct = sum(mle_dict[last_ngram_pick].values())
                    for key in mle_dict[last_ngram_pick]:
                        if key == words[i+n-1]:
                            # Using log to operate in extreme values.
                            perplexity += math.log2(mle_dict[last_ngram_pick][key]/ct)
                
        final_perplexity += (2**(perplexity*-1/i))

    final_perplexity = final_perplexity
    print("final_perplexity: ",final_perplexity)
perplexity_(50)        

a generated sentence <s> i am totally in favor of vaccines but i want smaller doses over a longer period of time <s> so we have isis taking a lot of oil <s> now let ’ s take some questions <s> it ’ s just gross incompetence at 
final_perplexity:  3298.9986398257684


In [None]:
sentence from trigram: 
    <s> but they the you when don it then likewise we don those so a what when there 
    they they with the you – you you i bill i you we i we he carrier likewise 
sentence from bigram:
    <s> the you i the except so we you the i we but sometimes all and well you you i well 
    and how a so …it so but i i club people i and you i but our but </s>
sentence from quadgram:
    <s> and you adam the the don by i we that steve i somebody i you it then maybe you 
    what there we then i people i she then what we then you you our here 

Readability of trigram and quadgram is much better but still makes no sense. 
Speech corpus is comparatively small corpus, hence conventional approach is very less sensible.
    
Perplexity of trigram model: 3298
    