# Indus Valley Script- Text Analysis for Decipherment 

# Language Models

Dataset was created as a csv file from ICIT web site from raw html files of ICIT code for each for the Text
Data labels were changes and a linearized copy of the original text was added

### Input:
Pickled data file from Input Data Processing

### Output:
Pickled model files from Language Models


!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install scikit-learn
!pip install nltk
!pip install ipywidgets
!pip install -U dill
!pip3 install requests
!pip3 install -U spacy

In [1]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import collections
import random
import traceback
import pickle

plt.style.use(style='seaborn')
%matplotlib inline

In [2]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from collections import defaultdict
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.lm.models import MLE
from nltk.lm.models import KneserNeyInterpolated

In [3]:
random_seed = 8
seed = 8
CONST_INITIAL = "Initial"
CONST_TERMINAL = "Terminal"
CONST_MEDIAL = "Medial"
CONST_NL = 10

In [4]:
"""" UnPickle the dataframes """

orig_sign_df = pd.read_pickle('pickle/orig_sign_df.pkl')
print("Original sign df: \n", orig_sign_df)

orig_df = pd.read_pickle('pickle/upd_orig_df.pkl')
print("Updated original text df: \n", orig_df)

df = pd.read_pickle('pickle/clean_df.pkl')
print("Cleaned text df: \n", df )

df_unclear = pd.read_pickle('pickle/unclear_df.pkl')
print("Unclear text df: \n", df_unclear)

df_multi_line = pd.read_pickle('pickle/multi_line_df.pkl')
print("Multi-line text df: \n", df_multi_line)

df_all_x = pd.read_pickle('pickle/all_x.pkl')
df_all_y = pd.read_pickle('pickle/all_y.pkl')

df_all_x_rev = pd.read_pickle('pickle/all_x_rev.pkl')
df_all_y_rev = pd.read_pickle('pickle/all_y_rev.pkl')

df_train_x = pd.read_pickle('pickle/train_x.pkl')
df_train_y = pd.read_pickle('pickle/train_y.pkl')

df_train_x_rev = pd.read_pickle('pickle/train_x_rev.pkl')
df_train_y_rev = pd.read_pickle('pickle/train_y_rev.pkl')

df_test_x = pd.read_pickle('pickle/test_x.pkl')
df_test_y= pd.read_pickle('pickle/test_y.pkl')

df_test_x_rev = pd.read_pickle('pickle/test_x_rev.pkl')
df_test_y_rev = pd.read_pickle('pickle/test_y_rev.pkl')

df_all = pd.read_pickle('pickle/all_df.pkl')
print("all df: \n", df_all)

df_train = pd.read_pickle('pickle/train_df.pkl')
print("train df: \n", df_train)

df_test = pd.read_pickle('pickle/test_df.pkl')
print("test df: \n", df_test)

df_all_rev = pd.read_pickle('pickle/all_rev_df.pkl')
print("all_rev df: \n", df_all_rev)

df_train_rev = pd.read_pickle('pickle/train_rev_df.pkl')
print("train_rev df: \n", df_train_rev)

df_test_rev = pd.read_pickle('pickle/test_rev_df.pkl')
print("test rev df: \n", df_test_rev)


Original sign df: 
     id_sign sign_class set   graph       type        image variants  \
0         1        SIM  01  stroke     stroke  sign001.jpg        1   
1         2        MKR  01  stroke     stroke  sign002.jpg        1   
2         3        SIM  01  stroke     stroke  sign003.jpg        1   
3         4        SIM  01  stroke     stroke  sign004.jpg        1   
4         5        SIM  01  stroke     stroke  sign005.jpg        1   
..      ...        ...  ..     ...        ...          ...      ...   
704     952        CMX  71  animal  uncertain  sign952.jpg        1   
705     953        CMX  71  animal       Pict  sign953.jpg        1   
706     956        SIM  71       -    att.d.e  sign956.jpg        1   
707     957        CMX  71       -  uncertain  sign957.jpg        1   
708     958        CMX  71       -  uncertain  sign958.jpg        1   

          function ligatur value frequency comment  
0    NUM, ITM, SHN       -     -       227       -  
1    ITM, SHN, EMS   

In [5]:
def reverse_text(text):
    # first split the string into chars
    chars = text.split(' ')

    # then reverse the split string list and join with a space
    reversed_text = ' '.join(reversed(chars))
    return reversed_text

In [6]:
# Get ngram as list given a text (pass direction_of_string as "R/L" or "L/R")
def get_ngrams_as_list(data,direction_of_string,num):
    
    if(direction_of_string=="R/L"):
        # We need to convert R/L text to L/R to be able to get ngrams using nltk
        data_string = reverse_text(data)
    
    else: data_string= data
    
    n_grams =  ngrams(nltk.word_tokenize(data_string), num)
    return  [ ' '.join(grams) for grams in n_grams]

## n-gram Models

In [7]:
""" Tokenize the text

If we need to generate ngrams from it from r to l text, ngrams would be in opposite direction, so
use reversed text to generate tokenized_text (l to r) and regular text to generate reverse_tokenized_text (r to l)
"""   
"""For all data"""
tokenized_text_all = list(df_all_x[df_all_x.l_to_r_text!=''].l_to_r_text.apply(word_tokenize))
reverse_tokenized_text_all = list(df_all_x_rev[df_all_x_rev.reversed_text!=''].reversed_text.apply(word_tokenize))


"""For Train data"""
tokenized_text = list(df_train_x[df_train_x.l_to_r_text!=''].l_to_r_text.apply(word_tokenize))
reverse_tokenized_text = list(df_train_x_rev[df_train_x_rev.reversed_text!=''].reversed_text.apply(word_tokenize))


#print("tokenized_text:",tokenized_text)
#print("Rev tokenized_text:",reverse_tokenized_text)

In [8]:
"""Preprocess the tokenized text for n-grams language modeling
Do this for all data an train data
"""
import array as arr
model_name_list = ["MLE","KneserNeyInterpolated", "Laplace", "Lidstone","StupidBackoff", "WittenBellInterpolated"]

all_data_list_fwd_unigram = [None,None, None, None, None,None]
all_padded_sents_list_fwd_unigram = [None,None, None, None, None,None]
all_data_list_rev_unigram = [None,None, None, None, None,None]
all_padded_sents_list_rev_unigram = [None,None, None, None, None,None]
train_data_list_fwd_unigram = [None,None, None, None, None,None]
padded_sents_list_fwd_unigram = [None,None, None, None, None,None]
train_data_list_rev_unigram = [None,None, None, None, None,None]
padded_sents_list_rev_unigram = [None,None, None, None, None,None]

all_data_list_fwd_bigram = [None,None, None, None, None,None]
all_padded_sents_list_fwd_bigram = [None,None, None, None, None,None]
all_data_list_rev_bigram = [None,None, None, None, None,None]
all_padded_sents_list_rev_bigram = [None,None, None, None, None,None]
train_data_list_fwd_bigram = [None,None, None, None, None,None]
padded_sents_list_fwd_bigram = [None,None, None, None, None,None]
train_data_list_rev_bigram = [None,None, None, None, None,None]
padded_sents_list_rev_bigram = [None,None, None, None, None,None]

all_data_list_fwd_trigram = [None,None, None, None, None,None]
all_padded_sents_list_fwd_trigram = [None,None, None, None, None,None]
all_data_list_rev_trigram = [None,None, None, None, None,None]
all_padded_sents_list_rev_trigram  = [None,None, None, None, None,None]
train_data_list_fwd_trigram = [None,None, None, None, None,None]
padded_sents_list_fwd_trigram = [None,None, None, None, None,None]
train_data_list_rev_trigram = [None,None, None, None, None,None]
padded_sents_list_rev_trigram = [None,None, None, None, None,None]

all_data_list_fwd_quadgram = [None,None, None, None, None,None]
all_padded_sents_list_fwd_quadgram = [None,None, None, None, None,None]
all_data_list_rev_quadgram = [None,None, None, None, None,None]
all_padded_sents_list_rev_quadgram = [None,None, None, None, None,None]
train_data_list_fwd_quadgram = [None,None, None, None, None,None]
padded_sents_list_fwd_quadgram = [None,None, None, None, None,None]
train_data_list_rev_quadgram = [None,None, None, None, None,None]
padded_sents_list_rev_quadgram = [None,None, None, None, None,None]

all_data_list_fwd_pentagram = [None,None, None, None, None,None]
all_padded_sents_list_fwd_pentagram = [None,None, None, None, None,None]
all_data_list_rev_pentagram = [None,None, None, None, None,None]
all_padded_sents_list_rev_pentagram = [None,None, None, None, None,None]
train_data_list_fwd_pentagram = [None,None, None, None, None,None]
padded_sents_list_fwd_pentagram = [None,None, None, None, None,None]
train_data_list_rev_pentagram = [None,None, None, None, None,None]
padded_sents_list_rev_pentagram = [None,None, None, None, None,None]

all_data_list_fwd_hexagram = [None,None, None, None, None,None]
all_padded_sents_list_fwd_hexagram = [None,None, None, None, None,None]
all_data_list_rev_hexagram = [None,None, None, None, None,None]
all_padded_sents_list_rev_hexagram = [None,None, None, None, None,None]
train_data_list_fwd_hexagram = [None,None, None, None, None,None]
padded_sents_list_fwd_hexagram = [None,None, None, None, None,None]
train_data_list_rev_hexagram = [None,None, None, None, None,None]
padded_sents_list_rev_hexagram = [None,None, None, None, None,None]

all_data_list_fwd_septagram = [None,None, None, None, None,None]
all_padded_sents_list_fwd_septagram = [None,None, None, None, None,None]
all_data_list_rev_septagram = [None,None, None, None, None,None]
all_padded_sents_list_rev_septagram = [None,None, None, None, None,None]
train_data_list_fwd_septagram = [None,None, None, None, None,None]
padded_sents_list_fwd_septagram = [None,None, None, None, None,None]
train_data_list_rev_septagram = [None,None, None, None, None,None]
padded_sents_list_rev_septagram = [None,None, None, None, None,None]

train_data_rev_list = [None,None, None, None, None,None]
padded_sents_rev_list = [None,None, None, None, None,None]


for index in range (0,6):

    all_data_list_fwd_unigram[index], all_padded_sents_list_fwd_unigram[index] = padded_everygram_pipeline(1, tokenized_text)
    all_data_list_rev_unigram[index], all_padded_sents_list_rev_unigram[index] = padded_everygram_pipeline(1, reverse_tokenized_text)
    train_data_list_fwd_unigram[index], padded_sents_list_fwd_unigram[index] = padded_everygram_pipeline(1, tokenized_text)
    train_data_list_rev_unigram[index], padded_sents_list_rev_unigram[index] = padded_everygram_pipeline(1, reverse_tokenized_text)
    
    all_data_list_fwd_bigram[index], all_padded_sents_list_fwd_bigram[index] = padded_everygram_pipeline(2, tokenized_text)
    all_data_list_rev_bigram[index], all_padded_sents_list_rev_bigram[index] = padded_everygram_pipeline(2, reverse_tokenized_text)
    train_data_list_fwd_bigram[index], padded_sents_list_fwd_bigram[index] = padded_everygram_pipeline(2, tokenized_text)
    train_data_list_rev_bigram[index], padded_sents_list_rev_bigram[index] = padded_everygram_pipeline(2, reverse_tokenized_text)
    
    all_data_list_fwd_trigram[index], all_padded_sents_list_fwd_trigram[index] = padded_everygram_pipeline(3, tokenized_text)
    all_data_list_rev_trigram[index], all_padded_sents_list_rev_trigram[index] = padded_everygram_pipeline(3, reverse_tokenized_text)
    train_data_list_fwd_trigram[index], padded_sents_list_fwd_trigram[index] = padded_everygram_pipeline(3, tokenized_text)
    train_data_list_rev_trigram[index], padded_sents_list_rev_trigram[index] = padded_everygram_pipeline(3, reverse_tokenized_text)
    
    all_data_list_fwd_quadgram[index], all_padded_sents_list_fwd_quadgram[index] = padded_everygram_pipeline(4, tokenized_text)
    all_data_list_rev_quadgram[index], all_padded_sents_list_rev_quadgram[index] = padded_everygram_pipeline(4, reverse_tokenized_text)
    train_data_list_fwd_quadgram[index], padded_sents_list_fwd_quadgram[index] = padded_everygram_pipeline(4, tokenized_text)
    train_data_list_rev_quadgram[index], padded_sents_list_rev_quadgram[index] = padded_everygram_pipeline(4, reverse_tokenized_text)

    all_data_list_fwd_pentagram[index], all_padded_sents_list_fwd_pentagram[index] = padded_everygram_pipeline(5, tokenized_text)
    all_data_list_rev_pentagram[index], all_padded_sents_list_rev_pentagram[index] = padded_everygram_pipeline(5, reverse_tokenized_text)
    train_data_list_fwd_pentagram[index], padded_sents_list_fwd_pentagram[index] = padded_everygram_pipeline(5, tokenized_text)
    train_data_list_rev_pentagram[index], padded_sents_list_rev_pentagram[index] = padded_everygram_pipeline(5, reverse_tokenized_text)

    all_data_list_fwd_hexagram[index], all_padded_sents_list_fwd_hexagram[index] = padded_everygram_pipeline(6, tokenized_text)
    all_data_list_rev_hexagram[index], all_padded_sents_list_rev_hexagram[index] = padded_everygram_pipeline(6, reverse_tokenized_text)
    train_data_list_fwd_hexagram[index], padded_sents_list_fwd_hexagram[index] = padded_everygram_pipeline(6, tokenized_text)
    train_data_list_rev_hexagram[index], padded_sents_list_rev_hexagram[index] = padded_everygram_pipeline(6, reverse_tokenized_text)
    
    all_data_list_fwd_septagram[index], all_padded_sents_list_fwd_septagram[index] = padded_everygram_pipeline(7, tokenized_text)
    all_data_list_rev_septagram[index], all_padded_sents_list_rev_septagram[index] = padded_everygram_pipeline(7, reverse_tokenized_text)
    train_data_list_fwd_septagram[index], padded_sents_list_fwd_septagram[index] = padded_everygram_pipeline(7, tokenized_text)
    train_data_list_rev_septagram[index], padded_sents_list_rev_septagram[index] = padded_everygram_pipeline(7, reverse_tokenized_text)

    
print_train_data_details= False
#If you iterate through this, the iterator is done with and model 
# fitting won't work subsequently
# so set print_train_data_details= False before trying the actual model

# Example
if(print_train_data_details):
    for ngramlize_sent in train_data_list_fwd_quadgram[0]:
        print(list(ngramlize_sent))
        print()
    print('#############')
    list(padded_sents_list_fwd_quadgram[0])
    
if(print_train_data_details):
    for ngramlize_sent in train_data_list_rev_quadgram[0]:
        print(list(ngramlize_sent))
        print()
    print('#############')
    list(padded_sents_list_rev_quadgram[0])
    

In [9]:
"""For All data: Unigram, Bigram, Trigram, Quadgram, Pentagram, Hexagram Models for both fwd text and reverse tex with the following
models. Ignoring AbsoluteDiscountingInterpolated model
"""
from nltk.lm.models import MLE
from nltk.lm.models import AbsoluteDiscountingInterpolated
from nltk.lm.models import KneserNeyInterpolated
from nltk.lm.models import Laplace
from nltk.lm.models import Lidstone
from nltk.lm.models import StupidBackoff
from nltk.lm.models import WittenBellInterpolated

gamma=0.75

model_MLE_list_fwd_all = []
model_KneserNeyInterpolated_list_fwd_all = []
model_Laplace_list_fwd_all = []
model_Lidstone_list_fwd_all = []
model_StupidBackoff_list_fwd_all = []
model_WittenBellInterpolated_list_fwd_all = []


model_MLE_list_rev_all = []
model_KneserNeyInterpolated_list_rev_all = []
model_Laplace_list_rev_all = []
model_Lidstone_list_rev_all = []
model_StupidBackoff_list_rev_all = []
model_WittenBellInterpolated_list_rev_all= []

for index in range(1, 8):
    model_MLE_list_fwd_all.append(MLE(index))
    model_KneserNeyInterpolated_list_fwd_all.append(KneserNeyInterpolated(index))
    model_Laplace_list_fwd_all.append(Laplace(index))
    model_Lidstone_list_fwd_all.append(Lidstone(gamma, index))
    model_StupidBackoff_list_fwd_all.append(StupidBackoff(index, index))
    model_WittenBellInterpolated_list_fwd_all.append(WittenBellInterpolated(index))
    
    
for index in range(1, 8):
    model_MLE_list_rev_all.append(MLE(index))
    model_KneserNeyInterpolated_list_rev_all.append(KneserNeyInterpolated(index))
    model_Laplace_list_rev_all.append(Laplace(index))
    model_Lidstone_list_rev_all.append(Lidstone(gamma, index))
    model_StupidBackoff_list_rev_all.append(StupidBackoff(index, index))
    model_WittenBellInterpolated_list_rev_all.append(WittenBellInterpolated(index))
    
models_list_fwd_unigram_all = [model_MLE_list_fwd_all[0] ,model_KneserNeyInterpolated_list_fwd_all[0] ,model_Laplace_list_fwd_all[0] , model_Lidstone_list_fwd_all[0] , model_StupidBackoff_list_fwd_all[0],model_WittenBellInterpolated_list_fwd_all[0]]
models_list_rev_unigram_all = [model_MLE_list_rev_all[0] ,model_KneserNeyInterpolated_list_rev_all[0] ,model_Laplace_list_rev_all[0] , model_Lidstone_list_rev_all[0] , model_StupidBackoff_list_rev_all[0], model_WittenBellInterpolated_list_rev_all[0]]

models_list_fwd_bigram_all = [model_MLE_list_fwd_all[1] ,model_KneserNeyInterpolated_list_fwd_all[1] ,model_Laplace_list_fwd_all[1] , model_Lidstone_list_fwd_all[1] , model_StupidBackoff_list_fwd_all[1],model_WittenBellInterpolated_list_fwd_all[1]]
models_list_rev_bigram_all = [model_MLE_list_rev_all[1] ,model_KneserNeyInterpolated_list_rev_all[1] ,model_Laplace_list_rev_all[1] , model_Lidstone_list_rev_all[1] , model_StupidBackoff_list_rev_all[1], model_WittenBellInterpolated_list_rev_all[1]]

models_list_fwd_trigram_all = [model_MLE_list_fwd_all[2] ,model_KneserNeyInterpolated_list_fwd_all[2] ,model_Laplace_list_fwd_all[2] , model_Lidstone_list_fwd_all[2] , model_StupidBackoff_list_fwd_all[2],model_WittenBellInterpolated_list_fwd_all[2]]
models_list_rev_trigram_all = [model_MLE_list_rev_all[2] ,model_KneserNeyInterpolated_list_rev_all[2] ,model_Laplace_list_rev_all[2] , model_Lidstone_list_rev_all[2] , model_StupidBackoff_list_rev_all[2],model_WittenBellInterpolated_list_rev_all[2]]

models_list_fwd_quadgram_all = [model_MLE_list_fwd_all[3] ,model_KneserNeyInterpolated_list_fwd_all[3] ,model_Laplace_list_fwd_all[3] , model_Lidstone_list_fwd_all[3] , model_StupidBackoff_list_fwd_all[3],model_WittenBellInterpolated_list_fwd_all[3]]
models_list_rev_quadgram_all = [model_MLE_list_rev_all[3] ,model_KneserNeyInterpolated_list_rev_all[3] ,model_Laplace_list_rev_all[3] , model_Lidstone_list_rev_all[3] , model_StupidBackoff_list_rev_all[3],model_WittenBellInterpolated_list_rev_all[3]]

models_list_fwd_pentagram_all = [model_MLE_list_fwd_all[4] ,model_KneserNeyInterpolated_list_fwd_all[4] ,model_Laplace_list_fwd_all[4] , model_Lidstone_list_fwd_all[4] , model_StupidBackoff_list_fwd_all[4],model_WittenBellInterpolated_list_fwd_all[4]]
models_list_rev_pentagram_all = [model_MLE_list_rev_all[4] ,model_KneserNeyInterpolated_list_rev_all[4] ,model_Laplace_list_rev_all[4] , model_Lidstone_list_rev_all[4] , model_StupidBackoff_list_rev_all[4],model_WittenBellInterpolated_list_rev_all[4]]

models_list_fwd_hexagram_all = [model_MLE_list_fwd_all[5] ,model_KneserNeyInterpolated_list_fwd_all[5] ,model_Laplace_list_fwd_all[5] , model_Lidstone_list_fwd_all[5] , model_StupidBackoff_list_fwd_all[5],model_WittenBellInterpolated_list_fwd_all[5]]
models_list_rev_hexagram_all = [model_MLE_list_rev_all[5] ,model_KneserNeyInterpolated_list_rev_all[5] ,model_Laplace_list_rev_all[5] , model_Lidstone_list_rev_all[5] , model_StupidBackoff_list_rev_all[5],model_WittenBellInterpolated_list_rev_all[5]]

models_list_fwd_septagram_all = [model_MLE_list_fwd_all[6] ,model_KneserNeyInterpolated_list_fwd_all[6] ,model_Laplace_list_fwd_all[6] , model_Lidstone_list_fwd_all[6] , model_StupidBackoff_list_fwd_all[6],model_WittenBellInterpolated_list_fwd_all[6]]
models_list_rev_septagram_all = [model_MLE_list_rev_all[6] ,model_KneserNeyInterpolated_list_rev_all[6] ,model_Laplace_list_rev_all[6] , model_Lidstone_list_rev_all[6] , model_StupidBackoff_list_rev_all[6],model_WittenBellInterpolated_list_rev_all[6]]



In [10]:
"""For Train data: Unigram, Bigram, Trigram, Quadgram, Pentagram, Hexagram Models for both fwd text and reverse tex with the following
models. Ignoring AbsoluteDiscountingInterpolated model
"""
from nltk.lm.models import MLE
from nltk.lm.models import AbsoluteDiscountingInterpolated
from nltk.lm.models import KneserNeyInterpolated
from nltk.lm.models import Laplace
from nltk.lm.models import Lidstone
from nltk.lm.models import StupidBackoff
from nltk.lm.models import WittenBellInterpolated

gamma=0.75

model_MLE_list_fwd = []
model_KneserNeyInterpolated_list_fwd = []
model_Laplace_list_fwd = []
model_Lidstone_list_fwd = []
model_StupidBackoff_list_fwd = []
model_WittenBellInterpolated_list_fwd= []


model_MLE_list_rev = []
model_KneserNeyInterpolated_list_rev = []
model_Laplace_list_rev = []
model_Lidstone_list_rev = []
model_StupidBackoff_list_rev = []
model_WittenBellInterpolated_list_rev= []

for index in range(1, 8):
    model_MLE_list_fwd.append(MLE(index))
    model_KneserNeyInterpolated_list_fwd.append(KneserNeyInterpolated(index))
    model_Laplace_list_fwd.append(Laplace(index))
    model_Lidstone_list_fwd.append(Lidstone(gamma, index))
    model_StupidBackoff_list_fwd.append(StupidBackoff(index, index))
    model_WittenBellInterpolated_list_fwd.append(WittenBellInterpolated(index))
    
    
for index in range(1, 8):
    model_MLE_list_rev.append(MLE(index))
    model_KneserNeyInterpolated_list_rev.append(KneserNeyInterpolated(index))
    model_Laplace_list_rev.append(Laplace(index))
    model_Lidstone_list_rev.append(Lidstone(gamma, index))
    model_StupidBackoff_list_rev.append(StupidBackoff(index, index))
    model_WittenBellInterpolated_list_rev.append(WittenBellInterpolated(index))
    
models_list_fwd_unigram = [model_MLE_list_fwd[0] ,model_KneserNeyInterpolated_list_fwd[0] ,model_Laplace_list_fwd[0] , model_Lidstone_list_fwd[0] , model_StupidBackoff_list_fwd[0],model_WittenBellInterpolated_list_fwd[0]]
models_list_rev_unigram = [model_MLE_list_rev[0] ,model_KneserNeyInterpolated_list_rev[0] ,model_Laplace_list_rev[0] , model_Lidstone_list_rev[0] , model_StupidBackoff_list_rev[0], model_WittenBellInterpolated_list_rev[0]]

models_list_fwd_bigram = [model_MLE_list_fwd[1] ,model_KneserNeyInterpolated_list_fwd[1] ,model_Laplace_list_fwd[1] , model_Lidstone_list_fwd[1] , model_StupidBackoff_list_fwd[1],model_WittenBellInterpolated_list_fwd[1]]
models_list_rev_bigram = [model_MLE_list_rev[1] ,model_KneserNeyInterpolated_list_rev[1] ,model_Laplace_list_rev[1] , model_Lidstone_list_rev[1] , model_StupidBackoff_list_rev[1], model_WittenBellInterpolated_list_rev[1]]

models_list_fwd_trigram = [model_MLE_list_fwd[2] ,model_KneserNeyInterpolated_list_fwd[2] ,model_Laplace_list_fwd[2] , model_Lidstone_list_fwd[2] , model_StupidBackoff_list_fwd[2],model_WittenBellInterpolated_list_fwd[2]]
models_list_rev_trigram = [model_MLE_list_rev[2] ,model_KneserNeyInterpolated_list_rev[2] ,model_Laplace_list_rev[2] , model_Lidstone_list_rev[2] , model_StupidBackoff_list_rev[2],model_WittenBellInterpolated_list_rev[2]]

models_list_fwd_quadgram = [model_MLE_list_fwd[3] ,model_KneserNeyInterpolated_list_fwd[3] ,model_Laplace_list_fwd[3] , model_Lidstone_list_fwd[3] , model_StupidBackoff_list_fwd[3],model_WittenBellInterpolated_list_fwd[3]]
models_list_rev_quadgram = [model_MLE_list_rev[3] ,model_KneserNeyInterpolated_list_rev[3] ,model_Laplace_list_rev[3] , model_Lidstone_list_rev[3] , model_StupidBackoff_list_rev[3],model_WittenBellInterpolated_list_rev[3]]

models_list_fwd_pentagram = [model_MLE_list_fwd[4] ,model_KneserNeyInterpolated_list_fwd[4] ,model_Laplace_list_fwd[4] , model_Lidstone_list_fwd[4] , model_StupidBackoff_list_fwd[4],model_WittenBellInterpolated_list_fwd[4]]
models_list_rev_pentagram = [model_MLE_list_rev[4] ,model_KneserNeyInterpolated_list_rev[4] ,model_Laplace_list_rev[4] , model_Lidstone_list_rev[4] , model_StupidBackoff_list_rev[4],model_WittenBellInterpolated_list_rev[4]]

models_list_fwd_hexagram = [model_MLE_list_fwd[5] ,model_KneserNeyInterpolated_list_fwd[5] ,model_Laplace_list_fwd[5] , model_Lidstone_list_fwd[5] , model_StupidBackoff_list_fwd[5],model_WittenBellInterpolated_list_fwd[5]]
models_list_rev_hexagram = [model_MLE_list_rev[5] ,model_KneserNeyInterpolated_list_rev[5] ,model_Laplace_list_rev[5] , model_Lidstone_list_rev[5] , model_StupidBackoff_list_rev[5],model_WittenBellInterpolated_list_rev[5]]

models_list_fwd_septagram = [model_MLE_list_fwd[6] ,model_KneserNeyInterpolated_list_fwd[6] ,model_Laplace_list_fwd[6] , model_Lidstone_list_fwd[6] , model_StupidBackoff_list_fwd[6],model_WittenBellInterpolated_list_fwd[6]]
models_list_rev_septagram = [model_MLE_list_rev[6] ,model_KneserNeyInterpolated_list_rev[6] ,model_Laplace_list_rev[6] , model_Lidstone_list_rev[6] , model_StupidBackoff_list_rev[6],model_WittenBellInterpolated_list_rev[6]]



In [11]:
def fit_and_train_models(models_list,model_type, this_train_data_list,this_padded_sents_list):
    for index in range (0,len(models_list)):
        models_list[index].fit(this_train_data_list[index], this_padded_sents_list[index])
        print("Fit:",model_name_list[index], type, "Order:", models_list[index].order, models_list[index].vocab)

In [12]:
print("Fitting the models for All data")
fit_and_train_models(models_list_fwd_unigram_all ,"fwd", all_data_list_fwd_unigram,all_padded_sents_list_fwd_unigram)
fit_and_train_models(models_list_rev_unigram_all , "rev", all_data_list_rev_unigram,all_padded_sents_list_rev_unigram)

fit_and_train_models(models_list_fwd_bigram_all ,"fwd", all_data_list_fwd_bigram,all_padded_sents_list_fwd_bigram)
fit_and_train_models(models_list_rev_bigram_all ,"rev", all_data_list_rev_bigram,all_padded_sents_list_rev_bigram)

fit_and_train_models(models_list_fwd_trigram_all ,"fwd", all_data_list_fwd_trigram,all_padded_sents_list_fwd_trigram)
fit_and_train_models(models_list_rev_trigram_all , "rev",all_data_list_rev_trigram,all_padded_sents_list_rev_trigram)

fit_and_train_models(models_list_fwd_quadgram_all , "fwd",all_data_list_fwd_quadgram,all_padded_sents_list_fwd_quadgram)
fit_and_train_models(models_list_rev_quadgram_all ,"rev", all_data_list_rev_quadgram,all_padded_sents_list_rev_quadgram)

fit_and_train_models(models_list_fwd_pentagram_all ,"fwd", all_data_list_fwd_pentagram,all_padded_sents_list_fwd_pentagram)
fit_and_train_models(models_list_rev_pentagram_all ,"rev", all_data_list_rev_pentagram,all_padded_sents_list_rev_pentagram)

fit_and_train_models(models_list_fwd_hexagram_all , "fwd",all_data_list_fwd_hexagram,all_padded_sents_list_fwd_hexagram)
fit_and_train_models(models_list_rev_hexagram_all , "rev",all_data_list_rev_hexagram,all_padded_sents_list_rev_hexagram)

fit_and_train_models(models_list_fwd_septagram_all ,"fwd", all_data_list_fwd_septagram,all_padded_sents_list_fwd_septagram)
fit_and_train_models(models_list_rev_septagram_all , "rev", all_data_list_rev_septagram,all_padded_sents_list_rev_septagram)


Fitting the models for All data
Fit: MLE <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: KneserNeyInterpolated <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: Laplace <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: Lidstone <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: StupidBackoff <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: WittenBellInterpolated <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: MLE <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: KneserNeyInterpolated <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: Laplace <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: Lidstone <class 'type'> Order: 1 <Vocabulary with cutoff=

Fit: Lidstone <class 'type'> Order: 7 <Vocabulary with cutoff=1 unk_label='<UNK>' and 583 items>
Fit: StupidBackoff <class 'type'> Order: 7 <Vocabulary with cutoff=1 unk_label='<UNK>' and 583 items>
Fit: WittenBellInterpolated <class 'type'> Order: 7 <Vocabulary with cutoff=1 unk_label='<UNK>' and 583 items>


In [13]:
print("Fitting the models for Train data")

fit_and_train_models(models_list_fwd_unigram ,"fwd", train_data_list_fwd_unigram,padded_sents_list_fwd_unigram)
fit_and_train_models(models_list_rev_unigram , "rev", train_data_list_rev_unigram,padded_sents_list_rev_unigram)

fit_and_train_models(models_list_fwd_bigram ,"fwd", train_data_list_fwd_bigram,padded_sents_list_fwd_bigram)
fit_and_train_models(models_list_rev_bigram ,"rev", train_data_list_rev_bigram,padded_sents_list_rev_bigram)

fit_and_train_models(models_list_fwd_trigram ,"fwd", train_data_list_fwd_trigram,padded_sents_list_fwd_trigram)
fit_and_train_models(models_list_rev_trigram , "rev",train_data_list_rev_trigram,padded_sents_list_rev_trigram)

fit_and_train_models(models_list_fwd_quadgram , "fwd",train_data_list_fwd_quadgram,padded_sents_list_fwd_quadgram)
fit_and_train_models(models_list_rev_quadgram ,"rev", train_data_list_rev_quadgram,padded_sents_list_rev_quadgram)

fit_and_train_models(models_list_fwd_pentagram ,"fwd", train_data_list_fwd_pentagram,padded_sents_list_fwd_pentagram)
fit_and_train_models(models_list_rev_pentagram ,"rev", train_data_list_rev_pentagram,padded_sents_list_rev_pentagram)

fit_and_train_models(models_list_fwd_hexagram , "fwd",train_data_list_fwd_hexagram,padded_sents_list_fwd_hexagram)
fit_and_train_models(models_list_rev_hexagram , "rev",train_data_list_rev_hexagram,padded_sents_list_rev_hexagram)

fit_and_train_models(models_list_fwd_septagram ,"fwd", train_data_list_fwd_septagram,padded_sents_list_fwd_septagram)
fit_and_train_models(models_list_rev_septagram , "rev", train_data_list_rev_septagram,padded_sents_list_rev_septagram)


Fitting the models for Train data
Fit: MLE <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: KneserNeyInterpolated <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: Laplace <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: Lidstone <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: StupidBackoff <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: WittenBellInterpolated <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: MLE <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: KneserNeyInterpolated <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: Laplace <class 'type'> Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 581 items>
Fit: Lidstone <class 'type'> Order: 1 <Vocabulary with cutof

Fit: Lidstone <class 'type'> Order: 7 <Vocabulary with cutoff=1 unk_label='<UNK>' and 583 items>
Fit: StupidBackoff <class 'type'> Order: 7 <Vocabulary with cutoff=1 unk_label='<UNK>' and 583 items>
Fit: WittenBellInterpolated <class 'type'> Order: 7 <Vocabulary with cutoff=1 unk_label='<UNK>' and 583 items>


In [14]:
"""Pickle all the trained Language Models"""
def pickle_models(models_list, model_type, data_type):
    for index in range (0,len(models_list)):
        file_name = "pickle/" + model_name_list[index]+ "_" + model_type + "_" + data_type + "_" + str(models_list[index].order) +".pkl"
        pickle.dump(models_list[index],open(file_name, 'wb'))
        print("Pickled",file_name)

pickle_models(models_list_fwd_unigram_all,"fwd","all")
pickle_models(models_list_rev_unigram_all,"rev","all")

pickle_models(models_list_fwd_bigram_all,"fwd","all")
pickle_models(models_list_rev_bigram_all,"rev","all")

pickle_models(models_list_fwd_trigram_all,"fwd","all")
pickle_models(models_list_rev_trigram_all,"rev","all")

pickle_models(models_list_fwd_quadgram_all,"fwd","all")
pickle_models(models_list_rev_quadgram_all,"rev","all")

pickle_models(models_list_fwd_pentagram_all,"fwd","all")
pickle_models(models_list_rev_pentagram_all,"rev","all")

pickle_models(models_list_fwd_hexagram_all,"fwd","all")
pickle_models(models_list_rev_hexagram_all,"rev","all")

pickle_models(models_list_fwd_septagram_all,"fwd","all")
pickle_models(models_list_rev_septagram_all,"rev","all")


pickle_models(models_list_fwd_unigram,"fwd","train")
pickle_models(models_list_rev_unigram,"rev","train")

pickle_models(models_list_fwd_bigram,"fwd","train")
pickle_models(models_list_rev_bigram,"rev","train")

pickle_models(models_list_fwd_trigram,"fwd","train")
pickle_models(models_list_rev_trigram,"rev","train")

pickle_models(models_list_fwd_quadgram,"fwd","train")
pickle_models(models_list_rev_quadgram,"rev","train")

pickle_models(models_list_fwd_pentagram,"fwd","train")
pickle_models(models_list_rev_pentagram,"rev","train")

pickle_models(models_list_fwd_hexagram,"fwd","train")
pickle_models(models_list_rev_hexagram,"rev","train")

pickle_models(models_list_fwd_septagram,"fwd","train")
pickle_models(models_list_rev_septagram,"rev","train")


Pickled pickle/MLE_fwd_all_1.pkl
Pickled pickle/KneserNeyInterpolated_fwd_all_1.pkl
Pickled pickle/Laplace_fwd_all_1.pkl
Pickled pickle/Lidstone_fwd_all_1.pkl
Pickled pickle/StupidBackoff_fwd_all_1.pkl
Pickled pickle/WittenBellInterpolated_fwd_all_1.pkl
Pickled pickle/MLE_rev_all_1.pkl
Pickled pickle/KneserNeyInterpolated_rev_all_1.pkl
Pickled pickle/Laplace_rev_all_1.pkl
Pickled pickle/Lidstone_rev_all_1.pkl
Pickled pickle/StupidBackoff_rev_all_1.pkl
Pickled pickle/WittenBellInterpolated_rev_all_1.pkl
Pickled pickle/MLE_fwd_all_2.pkl
Pickled pickle/KneserNeyInterpolated_fwd_all_2.pkl
Pickled pickle/Laplace_fwd_all_2.pkl
Pickled pickle/Lidstone_fwd_all_2.pkl
Pickled pickle/StupidBackoff_fwd_all_2.pkl
Pickled pickle/WittenBellInterpolated_fwd_all_2.pkl
Pickled pickle/MLE_rev_all_2.pkl
Pickled pickle/KneserNeyInterpolated_rev_all_2.pkl
Pickled pickle/Laplace_rev_all_2.pkl
Pickled pickle/Lidstone_rev_all_2.pkl
Pickled pickle/StupidBackoff_rev_all_2.pkl
Pickled pickle/WittenBellInterpolate

## Initial Terminal Character Model

In [15]:
# Build Model for relationship between Initial and Terminal characters
# This can be a bigram model. Pick a reasonably good model
# Remove all characters other than initial and terminal and then tokenize
tokenized_text_temp = list(df_train_x[df_train_x.l_to_r_text!=''].l_to_r_text.apply(word_tokenize))

#print(tokenized_text_temp)
      
tokenized_text_it = []
for i in range(len(tokenized_text_temp)):
    
    l= tokenized_text_temp[i]
    #single character text, ignore it
    if(len(l)>1):
        del l[1:len(l)-1]
        l[0],l[1] = l[1], l[0]  #swap
        tokenized_text_it.append(l)
        
#print(tokenized_text_it)

""" Instantiate the model"""
k=2
model_it_bigram_kn = KneserNeyInterpolated(k) #Bigram model
train_data_it, padded_sents_it = padded_everygram_pipeline(k, tokenized_text_it)


print_train_data_details= False
#If you iterate through this, the iterator is done with and model 
# fitting won't work subsequently
# so set print_train_data_details= False before trying the actual model

if(print_train_data_details):
    for ngramlize_sent in train_data_it:
        print(list(ngramlize_sent))
        print()
    print('#############')
    list(padded_sents_it)
    
model_it_bigram_kn.fit(train_data_it, padded_sents_it)
    
print(model_it_bigram_kn.vocab)
print(model_it_bigram_kn.counts)
print(model_it_bigram_kn.generate(1, ['804'], 8))
print(model_it_bigram_kn.generate(1, ['621'], 8))

<Vocabulary with cutoff=1 unk_label='<UNK>' and 437 items>
<NgramCounter with 2 ngram orders and 13125 ngrams>
231
817


In [16]:
"""Pickle this model"""
pickle.dump(model_it_bigram_kn,open("pickle/KneserNeyInterpolated_it_2.pkl", 'wb'))
print("Pickled","KneserNeyInterpolated_it_2.pkl")

Pickled KneserNeyInterpolated_it_2.pkl
