

# Build a Language Model Example to Understand how it works



!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install scikit-learn
!pip install nltk
!pip install ipywidgets
!pip install -U dill
!pip3 install requests
!pip3 install -U spacy

In [1]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import collections
# Matplotlib 3.6 renamed the bundled seaborn style to 'seaborn-v0_8' and newer
# releases no longer ship the old 'seaborn' name, so prefer the new name when
# it is available and fall back to the legacy one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
%matplotlib inline

In [2]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

In [3]:
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.lm.models import MLE
from nltk.lm.models import KneserNeyInterpolated

In [4]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize 

def generate_sent(model, num_words, char_seed=None, random_seed=42):
    """
    Generate text from an ngram language model and return it as one string.

    Start-of-sentence pad tokens are skipped and generation output is cut off
    at the first end-of-sentence pad token.

    :param model: An ngram language model.
    :param num_words: Max no. of words to generate.
    :param char_seed: Tokens used as the preceding context; forwarded to
        ``model.generate`` as ``text_seed``. ``None`` means no context.
    :param random_seed: Seed value for random.
    :return: The kept tokens joined into a string by ``detokenize``.
    """
    generated = model.generate(num_words, text_seed=char_seed, random_seed=random_seed)
    # NLTK returns a bare string instead of a list when num_words == 1; wrap it
    # so the loop below sees a single token rather than its characters.
    if isinstance(generated, str):
        generated = [generated]

    content = []
    for token in generated:
        if token == '<s>':
            continue
        if token in ('</s>', '< /s>'):
            break
        content.append(token)
    return detokenize(content)

In [7]:
def reverse_text(text):
    """Return ``text`` with its space-separated pieces in reverse order.

    The string is split on single space characters only, so consecutive
    spaces yield empty pieces that are preserved when the pieces are joined
    back together with single spaces.
    """
    pieces = text.split(' ')
    pieces.reverse()
    return ' '.join(pieces)

In [5]:
## Understanding how n-gram models work - a made-up English example to fully grasp the idea and play with it

In [6]:
# How n-gram models work: a toy corpus of three short English sentences.
s = "A cat is in the house. The cat is black in color. The house is green in color"
s = s.lower()

# Plain whitespace tokenization.
simple_tokens = s.split()

print(simple_tokens)

# Frequency distribution over the bigrams of the whitespace tokens.
gut_ngrams = nltk.ngrams(simple_tokens, 2)
freq_dist = nltk.FreqDist(gut_ngrams)
print("freq_dist:", freq_dist)

# Split into sentences, word-tokenize each one, and prepare the training
# everygrams plus the padded sentence stream for a 5-gram model.
tokens = list(map(word_tokenize, sent_tokenize(s)))
this_train_data, this_padded_sent_list = padded_everygram_pipeline(5, tokens)

# Both pipeline results are one-shot iterators: printing them consumes them,
# after which fitting the model no longer works. Leave this False to fit the
# model, or set it to True to inspect the data instead.
print_details = False

if print_details:
    for ngramlize_sent in this_train_data:
        print(list(ngramlize_sent))
        print()

    print('----')
    # The padded sentences themselves (i.e. the tokens with padding added).
    print(list(this_padded_sent_list))
else:
    print("Fitting the model...")
    this_lm = MLE(5)  # pentagram (5-gram) model
    this_lm.fit(this_train_data, this_padded_sent_list)

    print(this_lm.vocab)

    # Generate from progressively longer seed contexts.
    for _seed_tokens in (
        ['cat'],
        ['house'],
        ['house', 'is'],
        ['house', 'is', 'green'],
        ['house', 'is', 'green', 'in'],
    ):
        print(generate_sent(this_lm, num_words=20, char_seed=_seed_tokens, random_seed=48))

['a', 'cat', 'is', 'in', 'the', 'house.', 'the', 'cat', 'is', 'black', 'in', 'color.', 'the', 'house', 'is', 'green', 'in', 'color']
freq_dist: <FreqDist with 16 samples and 17 outcomes>
Fitting the model...
<Vocabulary with cutoff=1 unk_label='<UNK>' and 13 items>
is black in color.
is green in color
green in color
in color
color


In [17]:
# How n-gram models work: reversed n-gram example.
# The left-to-right sentence would be "A cat is in the house"; here we start
# from its right-to-left word order and flip it back with reverse_text.
s = "house the in is cat A"  # R to L text

s = reverse_text(s.lower())

print(s)

# Plain whitespace tokenization.
simple_tokens = s.split()

print(simple_tokens)

# Frequency distribution over the bigrams of the whitespace tokens.
gut_ngrams = nltk.ngrams(simple_tokens, 2)
freq_dist = nltk.FreqDist(gut_ngrams)
print("freq_dist:", freq_dist)

# Split into sentences, word-tokenize each one, and prepare the training
# everygrams plus the padded sentence stream for a 5-gram model.
tokens = list(map(word_tokenize, sent_tokenize(s)))
this_train_data, this_padded_sent_list = padded_everygram_pipeline(5, tokens)

# Both pipeline results are one-shot iterators: printing them consumes them,
# after which fitting the model no longer works. Leave this False to fit the
# model, or set it to True to inspect the data instead.
print_details = False

if print_details:
    for ngramlize_sent in this_train_data:
        print(list(ngramlize_sent))
        print()

    print('----')
    # The padded sentences themselves (i.e. the tokens with padding added).
    print(list(this_padded_sent_list))
else:
    print("Fitting the model...")
    this_lm = MLE(5)  # pentagram (5-gram) model
    this_lm.fit(this_train_data, this_padded_sent_list)

    print(this_lm.vocab)

    # Same seed contexts as the English example, plus two single-word seeds.
    for _seed_tokens in (
        ['cat'],
        ['house'],
        ['house', 'is'],
        ['house', 'is', 'green'],
        ['house', 'is', 'green', 'in'],
        ['in'],
        ['color'],
    ):
        print(generate_sent(this_lm, num_words=20, char_seed=_seed_tokens, random_seed=48))

a cat is in the house
['a', 'cat', 'is', 'in', 'the', 'house']
freq_dist: <FreqDist with 5 samples and 5 outcomes>
Fitting the model...
<Vocabulary with cutoff=1 unk_label='<UNK>' and 9 items>
is in the house

in the house
a cat is in the house
the house
the house
a cat is in the house
