# Indus Valley Scripts - ICIT coded Text Analysis for Decipherment

# Language Model Development

Dataset was created as a csv file from ICIT web site from raw html files for each for the Text
Data labels were changes and a linearized copy of the original text was added

icit_text_text_corpus.csv


!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install scikit-learn
!pip install nltk
!pip install ipywidgets
!pip install -U dill
!pip3 install requests
!pip3 install -U spacy

In [1]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import collections

plt.style.use(style='seaborn')
%matplotlib inline

In [2]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

In [3]:
drop_duplicate_texts = False

# Set the filters on data here
filter_by_site = False
filter_by_keywords = False
filter_by_text_length= False

site = 'Mohenjo-daro'
#site = 'Harappa'
keyword = "Bull1"

num_rows_text_corpus= 4999

In [4]:
# Read the signs
orig_sign_df=pd.read_csv('../../IndusCorpusUtils/data/icit_corpus/icit_sign_corpus.csv',dtype=str)
# set the max columns to none
pd.set_option('display.max_columns', None)

orig_sign_df

Unnamed: 0,id_sign,sign_class,set,graph,type,image,variants,function,ligatur,value,frequency,comment
0,1,SIM,01,stroke,stroke,sign001.jpg,1,"NUM, ITM, SHN",-,-,227,-
1,2,MKR,01,stroke,stroke,sign002.jpg,1,"ITM, SHN, EMS",-,-,865,-
2,3,SIM,01,stroke,stroke,sign003.jpg,1,"NUM, SHN",-,-,260,-
3,4,SIM,01,stroke,stroke,sign004.jpg,1,"NUM, SHN",-,-,99,-
4,5,SIM,01,stroke,stroke,sign005.jpg,1,"NUM, SHN",-,-,49,-
...,...,...,...,...,...,...,...,...,...,...,...,...
704,952,CMX,71,animal,uncertain,sign952.jpg,1,LFS,-,-,1,-
705,953,CMX,71,animal,Pict,sign953.jpg,1,LFS,-,-,1,-
706,956,SIM,71,-,att.d.e,sign956.jpg,1,LOG,-,-,2,-
707,957,CMX,71,-,uncertain,sign957.jpg,1,LOG,-,-,2,-


In [5]:
# Read the Text Corpus
orig_df=pd.read_csv('../../IndusCorpusUtils/data/icit_corpus/icit_text_text_corpus.csv',dtype=str, nrows=num_rows_text_corpus)
# set the max columns to none
pd.set_option('display.max_columns', None)

orig_df

Unnamed: 0,icit_id,site,keywords,text_class,lines,direction,text,signs,complete,alignment,sign height,text_images,linearized_text,standardized_text
0,1,Alamgirpur,,SS,1,L/R,+410-017+,2,Y,Unordered,Unequal,,410 017,410 017
1,2,Alamgirpur,,SS,1,L/R,+410-017+,2,Y,,,,410 017,410 017
2,3,Alamgirpur,,SC,1,L/R,+405-017+,2,Y,,,,405 017,405 017
3,4,Allahdino,,??,1,,+220-000+,1,N,,,,220 000,000 220
4,5,Allahdino,Bull,UC,1,R/L,+740-235+,2,Y,,,,740 235,235 740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4994,4064,Harappa,,UC,1,,+000[,0,N,Indefinable,Indefinable,,000[,000[
4995,4065,Harappa,,VN,1,R/L,]700-032[,2,?,,,,700 032[,032[ 700
4996,4065,Harappa,,UC,1,R/L,]000-000[,0,N,,,,000 000[,000[ 000
4997,4066,Harappa,,UC,1,R/L,+368-000+,1,N,,,,368 000,000 368


In [6]:
#Reverse text and add that as a new column
# Add text length as a column
list_reversed_text = []
for text in orig_df[orig_df.standardized_text!=''].standardized_text:
    # Tokenize to words
    # first split the string into chars
    chars = text.split(' ')
    length = len(chars)

    # then reverse the split string list and join with a space
    reversed_text = ' '.join(reversed(chars))
    list_reversed_text.append(reversed_text)
    
orig_df['reversed_text']= list_reversed_text
orig_df['text_length']= length

print("Dataframe has ", len(orig_df.index), " rows")

Dataframe has  4999  rows


In [7]:
# Retain texts that are only wanted

#remove the values where the text is unclear
df = orig_df[orig_df['standardized_text'].str.contains('000') == False] 

print("After removing unclear texts, we have ", len(df.index), " rows")

if(drop_duplicate_texts):
    #Remove out duplicate inplace
    df.drop_duplicates(subset ="text",
                         keep = False, inplace = True)

    print("After removing duplicate texts, we have ", len(df.index), " rows")


#keep only the values that does not have multi-line text
df = df[df['text'].str.contains('/') == False] 

print("After removing multi-line text, we have ", len(df.index), " rows")


#keep only the values where the direction is known (if direction does not have a /, i,e. L/R or R/L)
#Btw standardized_text is Left to right as in English
df = df[df['direction'].str.contains('/') == True] 

print("After keeping only text with known direction, we have ", len(df.index), " rows")

#Remove Multipart texts that have [ or ]
df = df[df['standardized_text'].str.contains("\[") == False] 
df = df[df['standardized_text'].str.contains("\]") == False] 

print("After keeping only text without multipart, we have ", len(df.index), " rows")


After removing unclear texts, we have  3945  rows
After removing multi-line text, we have  3866  rows
After keeping only text with known direction, we have  3250  rows
After keeping only text without multipart, we have  3040  rows


In [8]:
if(filter_by_site==True):
    #keep only the values that matches the provided site
    df = df[df['site'].str.contains(site) == True] 
    print("After filtering by site ", site, " it has ", len(df.index), " rows")

if(filter_by_keywords==True):
     #keep only the values that matches the provided keyword
    df = df[df['keywords'].str.contains(keyword) == True] 
    print("After filtering by keywords ", keyword, " it has ", len(df.index), " rows")


In [9]:
#Keep the items with unclear text in another dataframe
df_unclear = orig_df[orig_df['standardized_text'].str.contains('000') == True]

#Note: Lot of the text with unclear text have direction empty

### Feature Extraction

In [10]:
y=df['site'].values
y.shape

# y axis is still the same
y_rev=df['site'].values
y_rev.shape

(3040,)

In [11]:
x=df['standardized_text'].values
x.shape

x_rev=df['reversed_text'].values
x_rev.shape

(3040,)

### Train-test split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
(x_train,x_test,y_train,y_test)=train_test_split(x,y,test_size=0.1, random_state=43)

(x_rev_train,x_rev_test,y_rev_train,y_rev_test)=train_test_split(x_rev,y_rev,test_size=0.1, random_state=43)

In [14]:
#Train data, fwd and reverse
df_train_x=pd.DataFrame(x_train)
df_train_x=df_train_x.rename(columns={0:'standardized_text'})

df_train_y=pd.DataFrame(y_train)
df_train_y=df_train_y.rename(columns={0:'site'})

df_train_x_rev=pd.DataFrame(x_rev_train)
df_train_x_rev=df_train_x_rev.rename(columns={0:'reversed_text'})

df_train_y_rev=pd.DataFrame(y_rev_train)
df_train_y_rev=df_train_y_rev.rename(columns={0:'site'})

#Test data, fwd and reverse
df_test_x=pd.DataFrame(x_test)
df_test_x=df_test_x.rename(columns={0:'standardized_text'})

df_test_y=pd.DataFrame(y_test)
df_test_y=df_test_y.rename(columns={0:'site'})

df_test_x_rev=pd.DataFrame(x_rev_test)
df_test_x_rev=df_test_x_rev.rename(columns={0:'reversed_text'})

df_test_y_rev=pd.DataFrame(y_rev_test)
df_test_y_rev=df_test_y_rev.rename(columns={0:'site'})

In [15]:
df_train=pd.concat([df_train_x,df_train_y],axis=1)
print(df_train.head())

df_test=pd.concat([df_test_x,df_test_y],axis=1)
print(df_test.head())


df_train_rev=pd.concat([df_train_x_rev,df_train_y_rev],axis=1)
print(df_train_rev.head())

df_test_rev=pd.concat([df_test_x_rev,df_test_y_rev],axis=1)
print(df_test_rev.head())

             standardized_text          site
0  220 017 585 095 520 032 407  Mohenjo-daro
1      032 172 002 705 033 520       Harappa
2              861 002 035 405       Harappa
3                      700 034       Harappa
4                  840 032 740       Harappa
         standardized_text     site
0  590 540 002 605 760 740   Lothal
1          240 921 070 520  Harappa
2          590 368 134 388  Harappa
3          817 002 048 740   Lothal
4                  090 700  Harappa
                 reversed_text          site
0  407 032 520 095 585 017 220  Mohenjo-daro
1      520 033 705 002 172 032       Harappa
2              405 035 002 861       Harappa
3                      034 700       Harappa
4                  740 032 840       Harappa
             reversed_text     site
0  740 760 605 002 540 590   Lothal
1          520 070 921 240  Harappa
2          388 134 368 590  Harappa
3          740 048 002 817   Lothal
4                  700 090  Harappa


### n-gram Models

In [16]:
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.lm.models import MLE
from nltk.lm.models import KneserNeyInterpolated

In [17]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize 

def generate_sent(model, num_words,char_seed, random_seed=42):
    """
    :param model: An ngram language model.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    """
    content = []
    try:
        if(model==None): print("No Model Passed **********")
        for token in model.generate(num_words, text_seed=char_seed, random_seed=random_seed):
            if token == '<s>':
                continue
            if token == '</s>'  or token == '< /s>' :
                break
            content.append(token)
    except Exception as e:
        print("Exception:", e.__class__, "in the generate_sent")
        traceback.print_exc()
    return detokenize(content)

In [18]:
#Tokenize the text

tokenized_text = list(df_train_x[df_train_x.standardized_text!=''].standardized_text.apply(word_tokenize))
reverse_tokenized_text = list(df_train_x_rev[df_train_x_rev.reversed_text!=''].reversed_text.apply(word_tokenize))

#print("tokenized_text:",tokenized_text)

In [19]:
# Preprocess the tokenized text for n-grams language modeling

import array as arr

model_name_list = ["MLE","KneserNeyInterpolated", "Laplace", "Lidstone","StupidBackoff", "WittenBellInterpolated"]
#model_name_list = ["MLE","KneserNeyInterpolated", "Laplace", "Lidstone","StupidBackoff"]

train_data_list_fwd_bigram = [None,None, None, None, None,None]
padded_sents_list_fwd_bigram = [None,None, None, None, None,None]
train_data_list_rev_bigram = [None,None, None, None, None,None]
padded_sents_list_rev_bigram = [None,None, None, None, None,None]

train_data_list_fwd_trigram = [None,None, None, None, None,None]
padded_sents_list_fwd_trigram = [None,None, None, None, None,None]
train_data_list_rev_trigram = [None,None, None, None, None,None]
padded_sents_list_rev_trigram = [None,None, None, None, None,None]

train_data_list_fwd_quadgram = [None,None, None, None, None,None]
padded_sents_list_fwd_quadgram = [None,None, None, None, None,None]
train_data_list_rev_quadgram = [None,None, None, None, None,None]
padded_sents_list_rev_quadgram = [None,None, None, None, None,None]

train_data_rev_list = [None,None, None, None, None,None]
padded_sents_rev_list = [None,None, None, None, None,None]


for index in range (0,6):
    train_data_list_fwd_bigram[index], padded_sents_list_fwd_bigram[index] = padded_everygram_pipeline(2, tokenized_text)
    train_data_list_rev_bigram[index], padded_sents_list_rev_bigram[index] = padded_everygram_pipeline(2, reverse_tokenized_text)
    
    train_data_list_fwd_trigram[index], padded_sents_list_fwd_trigram[index] = padded_everygram_pipeline(3, tokenized_text)
    train_data_list_rev_trigram[index], padded_sents_list_rev_trigram[index] = padded_everygram_pipeline(3, reverse_tokenized_text)
    
    train_data_list_fwd_quadgram[index], padded_sents_list_fwd_quadgram[index] = padded_everygram_pipeline(4, tokenized_text)
    train_data_list_rev_quadgram[index], padded_sents_list_rev_quadgram[index] = padded_everygram_pipeline(4, reverse_tokenized_text)

    
print_train_data_details= False
#If you iterate through this, the iterator is done with and model 
# fitting won't work subsequently
# so set print_train_data_details= False before trying the actual model

if(print_train_data_details):
    for ngramlize_sent in train_data_list_fwd_trigram[0]:
        print(list(ngramlize_sent))
        print()
    print('#############')
    list(padded_sents_list_fwd_trigram[0])
    

In [20]:
# Train Bigram, Trigram Models for both fwd text and reverse tex with the following
# models. Ignoring AbsoluteDiscountingInterpolated model
from nltk.lm.models import MLE
from nltk.lm.models import AbsoluteDiscountingInterpolated
from nltk.lm.models import KneserNeyInterpolated
from nltk.lm.models import Laplace
from nltk.lm.models import Lidstone
from nltk.lm.models import StupidBackoff
from nltk.lm.models import WittenBellInterpolated

gamma=1
order=1

model_MLE_list_fwd = []
model_KneserNeyInterpolated_list_fwd = []
model_Laplace_list_fwd = []
model_Lidstone_list_fwd = []
model_StupidBackoff_list_fwd = []
model_WittenBellInterpolated_list_fwd= []


model_MLE_list_rev = []
model_KneserNeyInterpolated_list_rev = []
model_Laplace_list_rev = []
model_Lidstone_list_rev = []
model_StupidBackoff_list_rev = []
model_WittenBellInterpolated_list_rev= []

for index in range(2, 5):
    model_MLE_list_fwd.append(MLE(index))
    model_KneserNeyInterpolated_list_fwd.append(KneserNeyInterpolated(index))
    model_Laplace_list_fwd.append(Laplace(index))
    model_Lidstone_list_fwd.append(Lidstone(index, gamma))
    model_StupidBackoff_list_fwd.append(StupidBackoff(index, order))
    model_WittenBellInterpolated_list_fwd.append(WittenBellInterpolated(index))
    
    
for index in range(2, 5):
    model_MLE_list_rev.append(MLE(index))
    model_KneserNeyInterpolated_list_rev.append(KneserNeyInterpolated(index))
    model_Laplace_list_rev.append(Laplace(index))
    model_Lidstone_list_rev.append(Lidstone(index, gamma))
    model_StupidBackoff_list_rev.append(StupidBackoff(index, order))
    model_WittenBellInterpolated_list_rev.append(WittenBellInterpolated(index))
    

models_list_fwd_bigram = [model_MLE_list_fwd[0] ,model_KneserNeyInterpolated_list_fwd[0] ,model_Laplace_list_fwd[0] , model_Lidstone_list_fwd[0] , model_StupidBackoff_list_fwd[0],model_WittenBellInterpolated_list_fwd[0]]
models_list_rev_bigram = [model_MLE_list_rev[0] ,model_KneserNeyInterpolated_list_rev[0] ,model_Laplace_list_rev[0] , model_Lidstone_list_rev[0] , model_StupidBackoff_list_rev[0], model_WittenBellInterpolated_list_rev[0]]

models_list_fwd_trigram = [model_MLE_list_fwd[1] ,model_KneserNeyInterpolated_list_fwd[1] ,model_Laplace_list_fwd[1] , model_Lidstone_list_fwd[1] , model_StupidBackoff_list_fwd[1],model_WittenBellInterpolated_list_fwd[1]]
models_list_rev_trigram = [model_MLE_list_rev[1] ,model_KneserNeyInterpolated_list_rev[1] ,model_Laplace_list_rev[1] , model_Lidstone_list_rev[1] , model_StupidBackoff_list_rev[1],model_WittenBellInterpolated_list_rev[1]]

models_list_fwd_quadgram = [model_MLE_list_fwd[2] ,model_KneserNeyInterpolated_list_fwd[2] ,model_Laplace_list_fwd[2] , model_Lidstone_list_fwd[2] , model_StupidBackoff_list_fwd[2],model_WittenBellInterpolated_list_fwd[2]]
models_list_rev_quadgram = [model_MLE_list_rev[2] ,model_KneserNeyInterpolated_list_rev[2] ,model_Laplace_list_rev[2] , model_Lidstone_list_rev[2] , model_StupidBackoff_list_rev[2],model_WittenBellInterpolated_list_rev[2]]



In [21]:
def fit_and_train_models(name, models_list, train_data_list,padded_sents_list):
    for index in range (0,len(models_list)):
        models_list[index].fit(train_data_list[index], padded_sents_list[index])
        #print("Fit & Train:", name, model_name_list[index], models_list[index].vocab)

In [22]:
fit_and_train_models("Fwd Bigram Model:", models_list_fwd_bigram , train_data_list_fwd_bigram,padded_sents_list_fwd_bigram)
fit_and_train_models("Rev Bigram Model:", models_list_rev_bigram , train_data_list_rev_bigram,padded_sents_list_rev_bigram)

fit_and_train_models("Fwd Trigram Model:", models_list_fwd_trigram , train_data_list_fwd_trigram,padded_sents_list_fwd_trigram)
fit_and_train_models("Rev Trigram Model:", models_list_rev_trigram , train_data_list_rev_trigram,padded_sents_list_rev_trigram)

fit_and_train_models("Fwd Quadgram Model:", models_list_fwd_quadgram , train_data_list_fwd_quadgram,padded_sents_list_fwd_quadgram)
fit_and_train_models("Rev Quadgram Model:", models_list_rev_quadgram , train_data_list_rev_quadgram,padded_sents_list_rev_quadgram)

    

In [23]:
#Check one of the models and play with it
k=4
model = KneserNeyInterpolated(k) 
model_rev = KneserNeyInterpolated(k)
train_data, padded_sents = padded_everygram_pipeline(k, tokenized_text)
train_data_rev, padded_sents_rev = padded_everygram_pipeline(k, reverse_tokenized_text)

model.fit(train_data, padded_sents)
print(model.vocab)

model_rev.fit(train_data_rev, padded_sents_rev)
print(model_rev.vocab)


print(model.vocab.lookup(tokenized_text[0]))
print(model.counts)

#'standardized_text' : "634 368 002 061 717 390"
    
print("count of 390:", model.counts['390'])
print(model.counts[['717']]['390'])
print(model.counts[['368', '002']]['061'])
print(model.counts[['002', '061']]['717'])
print(model.score('390'))
# lm.score("b", ["a"]) what is the chance that “b” is preceded by “a”.
print("---", model.score('390', ['717']))

print(model.score('390', '717'.split()))  # P('390'|'717) Given 415 occurs what is the prob of 390
print(model.score('061', '717'.split()))  # P('740'|'390)
print(model.score('368', '002 061'.split()))  # P('368|'002 061')
print(model.score('002', '0061 717'.split()))

print("Entropy and Perplexity")

test = [('634', '368'), ('002', '061')]
print(model.entropy(test))
print(model.perplexity(test))

<Vocabulary with cutoff=1 unk_label='<UNK>' and 561 items>
<Vocabulary with cutoff=1 unk_label='<UNK>' and 561 items>
('220', '017', '585', '095', '520', '032', '407')
<NgramCounter with 4 ngram orders and 92928 ngrams>
count of 390: 173
1
1
1
0.015281173594132029
--- 0.03446527211808385
0.03446527211808385
0.0003395816354251562
2.6568404089797736e-05
0.03584623743547949
Entropy and Perplexity
2.7312032638949706
6.640092153325539


In [24]:
print("-----Train Data--------------------")

print("-----RtoL: Beginning char----- Correct Ans: 634")
# this one user reverse model and reverse string
print("Send one character")
print(generate_sent(model_rev,1,['368'],42))
print("Send two characters")
print(generate_sent(model_rev,1,['002','368'],42))
print("Send three characters")
print(generate_sent(model_rev,1,['061','002', '368'],42))

print("-----RtoL: Medial char----- Correct Ans: 061")
print("Send one character")
print(generate_sent(model,1,['002'],42))
print("Send two characters")
print(generate_sent(model,1,['368','002'],42))
print("Send three characters")
print(generate_sent(model,1,['634','368', '002'],42))

print("-----RtoL: Terminal char----- Correct Ans:390")
print("Send one character")
print(generate_sent(model,1,['717'],42))
print("Send two characters")
print(generate_sent(model,1,['061','717'],42))
print("Send three characters")
print(generate_sent(model,1,['002','061','717'],42))

print("-----Test Data--------------------")
print("-----RtoL: Medial char----- Correct Ans: 176")
print("Send one character")
print(generate_sent(model,1,['032'],42))
print("Send two characters")
print(generate_sent(model,1,['002','032'],42))
print("Send three characters")
print(generate_sent(model,1,['861','002', '032'],42))

-----Train Data--------------------
-----RtoL: Beginning char----- Correct Ans: 634
Send one character
8 1 7
Send two characters
5 5 0
Send three characters
6 3 4
-----RtoL: Medial char----- Correct Ans: 061
Send one character
4 1 5
Send two characters
4 0 1
Send three characters
0 6 1
-----RtoL: Terminal char----- Correct Ans:390
Send one character
7 4 0
Send two characters
3 9 0
Send three characters
3 9 0
-----Test Data--------------------
-----RtoL: Medial char----- Correct Ans: 176
Send one character
7 4 0
Send two characters
2 2 0
Send three characters
2 2 0


In [25]:
def pack_list(first_param, second_param, third_param):
    packed_list = []
    
    if(third_param!=-1):
        packed_list.append(third_param)
        
    if(second_param!=-1):
        packed_list.append(second_param)
        
    if(first_param!=-1):
        packed_list.append(first_param)
  
    return packed_list

In [26]:
def get_list_token(beginning, j, list_tokens, index_unclear_signs, k,l,m):
    one_before=-1
    two_before=-1
    three_before=-1
    param =[]
    last_token_index = len(list_tokens)-1 
    #print(terminal, j, list_tokens, index_unclear_signs, k,l,m)
    
    try:
        if(beginning):     
            # You need to send reverse of the characters
            if(index_unclear_signs[j]+k<=last_token_index):
                one_before = list_tokens[index_unclear_signs[j]+k]

            if(index_unclear_signs[j]+l<=last_token_index):
                two_before = list_tokens[index_unclear_signs[j]+l]
            
            if(index_unclear_signs[j]+m<=last_token_index):
                three_before = list_tokens[index_unclear_signs[j]+m]
                
        else:
            if(index_unclear_signs[j]+k>=0):
                one_before= list_tokens[index_unclear_signs[j]+k]

            if(index_unclear_signs[j]+l>=0):
                two_before = list_tokens[index_unclear_signs[j]+l]
                
            if(index_unclear_signs[j]+m>=0):
                three_before = list_tokens[index_unclear_signs[j]+m]


        param = pack_list(one_before, two_before, three_before)
    except Exception as e:
        print("Exception:", e.__class__, "occurred in get_list_token.")
    return param

In [27]:
verbose_debug = False

def find_unclear_characters(model1, model2, df_Items, seed=8):
    #For each of the text with unclear character go through it
    
    ans=-1
    unclear_chars = []
    
    for text in a :
        try:
            #Identify the position of the unclear text and get its immediate neighbors
            list_tokens = nltk.word_tokenize(text)
            last_token_index = len(list_tokens)-1  
            index_unclear_signs = []
            param =[]

            for i in range(0, last_token_index+1) :
                    if(list_tokens[i]=='000'):
                        index_unclear_signs.append(i)

            if(verbose_debug): print("Text:Index of Unclear signs:", str(text) + ": " + str(index_unclear_signs))

            #assuming one unclear sign in a text. TBD: Extend this later to more than one unclear sign
            j=0
            if(index_unclear_signs[j]==0):

                #print("L to R: Beginning char is unclear")
                
                param = get_list_token(True, j, list_tokens, index_unclear_signs, 1,2,3)
                if(verbose_debug): print("L to R: Beginning char is unclear. Sending: ", param , " to generate next char")
                #Use reverse model
                ans = generate_sent(model2, 1, param , random_seed=seed)
                unclear_chars.append(ans)

            elif(index_unclear_signs[j]==last_token_index):
                #print("L to R: Terminal char is unclear")
                param = get_list_token(False, j, list_tokens, index_unclear_signs, -1,-2,-3)
                if(verbose_debug): print("L to R: Terminal char is unclear. Sending: ", param , " to generate next char")
                try:
                    ans = generate_sent(model1, 1, param, random_seed=seed)
                except Exception as e:
                    print("Exception:", e.__class__, "find_unclear_characters:generate_sent")
                unclear_chars.append(ans)

            else:

                #Not proceeding if more than one char is unclear

                if(len(index_unclear_signs)>1):
                    #print("Many chars are unclear in the text, not able to decipher the text. Moving on ...\n")
                    continue
                
                #print("L to R: One of the middle char is unclear")
                
                param = get_list_token(False,j, list_tokens, index_unclear_signs, -1,-2,-3)
                if(verbose_debug): print("L to R: One of the middle char is unclear. Sending: ", param , " to generate next char")
                ans = generate_sent(model1, 1, param, random_seed=seed)
                unclear_chars.append(ans)
        except Exception as e:
            if(verbose_debug): print("Exception:", e.__class__, "find_unclear_characters.")
            
    return  unclear_chars


In [28]:
CONST_BEGINNING = "Beginning"
CONST_TERMINAL = "Terminal"
CONST_MEDIAL = "Medial"


In [29]:
def get_group_for_sign(id_sign):
    for graph in orig_sign_df[orig_sign_df.id_sign==id_sign].graph :
        return(graph)

In [30]:
def add_answers(text,answer_list, answer, type_unclear_char):
    dict_row = {'text':text, 'len_text':len(text), 'answer':answer, 'type': type_unclear_char}
    answer_list.append(dict_row)

In [31]:
def add_wrong_answers(wrong_answer_list, text, predicted_answer,predicted_answer_group, correct_answer,correct_answer_group, type_unclear_char):
    chars = text.split(' ')
    dict_row = {'text':text, 'len_text':len(chars),'pred_answer':predicted_answer, 'pred_answer_group': predicted_answer_group, 'correct_answer': correct_answer,'correct_answer_group': correct_answer_group, 'type': type_unclear_char}
    wrong_answer_list.append(dict_row)
    

In [32]:
def check_answers(ans, test_correct_answers):

    try:
        beg_hit,ter_hit, med_hit,total_hit=0,0,0,0
        beg_count,ter_count,med_count,total_count=0,0,0,0
        beg_percent,ter_percent,med_percent,total_percent=0,0,0,0
        wrong_answer_list = []
        
        i=0
        for answers in ans:
            correct_ans = test_correct_answers[i].get('answer')
            correct_ans_type = test_correct_answers[i].get('type')
            correct_ans_text = test_correct_answers[i].get('text')
            
            this_ans =answers.replace(" ", "")
            type_unclear_char = correct_ans_type
            
            if(this_ans==correct_ans):
                if(type_unclear_char==CONST_BEGINNING):
                    beg_hit= beg_hit+1
                    beg_count= beg_count+1
                elif(type_unclear_char==CONST_TERMINAL):
                    ter_hit= ter_hit+1
                    ter_count= ter_count+1
                elif(type_unclear_char==CONST_MEDIAL):
                    med_hit= med_hit+1
                    med_count= med_count+1
                    
                total_hit=total_hit+1
                total_count= total_count+1
            else:
                if(type_unclear_char==CONST_BEGINNING):
                    beg_count= beg_count+1
                elif(type_unclear_char==CONST_TERMINAL):
                    ter_count= ter_count+1
                elif(type_unclear_char==CONST_MEDIAL):
                    med_count= med_count+1
                    
                total_count= total_count+1
                
                add_wrong_answers(wrong_answer_list, correct_ans_text, this_ans, get_group_for_sign(this_ans), correct_ans, get_group_for_sign(correct_ans),type_unclear_char)
                
            i=i+1
        
        if(beg_count>0):
            beg_percent = (beg_hit/beg_count)*100
        else:
            beg_percent = -1
            
        if(ter_count>0):
            ter_percent = (ter_hit/ter_count)*100
        else:
            ter_percent = -1
            
        if(med_count>0):
            med_percent = (med_hit/med_count)*100
        else:
            med_percent = -1
            
        if(total_count>0):
            total_percent = (total_hit/total_count)*100
        else:
            total_percent = -1
            
        
    except Exception as e:
            print("Exception:", e.__class__, "in check_answers")
            traceback.print_exc()

    return beg_percent,ter_percent,med_percent,total_percent, wrong_answer_list

In [33]:
def reverse_single_text(text):
    list_reversed_text = []
    # Tokenize to words
    # first split the string into chars
    chars = text.split(' ')

    # then reverse the split string list and join with a space
    reversed_text = ' '.join(reversed(chars))


In [34]:
def reverse_text(a):
    list_reversed_text = []
    for text in a :
        # Tokenize to words
        # first split the string into chars
        chars = text.split(' ')

        # then reverse the split string list and join with a space
        reversed_text = ' '.join(reversed(chars))
        list_reversed_text.append(reversed_text)
    return list_reversed_text 
    

## TESTING
1. Test 1: Use random sample of Training data but one character is made unclear: Take a few samples from Training data, make some characters as 000 (unclear). Do it in begining, terminal and medial regions of the text and see if the model is able to figure out the unclear characters
2. Test 2: Use Testing data: Take the Testing data and make some charcters as 000 (unclear). Do it in begining, terminal and medial regions of the text to test and see if the model is able to figure out the unclear characters
3. Test 3: Real Unclear texts: Use the actual texts with unclear data and what the model is able to come up with for the unclear characters

## Test 0 - Manual test


In [35]:
print("-----Test Data--------------------")

print("-----RtoL: Terminal char----- Correct Ans: 520")
print("Send three characters")
print(generate_sent(model,1,['060','705', '033'],42))

-----Test Data--------------------
-----RtoL: Terminal char----- Correct Ans: 520
Send three characters
9 2 3


In [36]:
# Function for Data Preparation
# Data preparation for testing
# Take n rows from given set, convert a known sign to unclear sign and produce a dataframe

import random

def prepare_data(a,max_text_chars,min_text_chars,max_num_of_rows, seed):

    list_changed_texts = []
    list_changed_reversed_text = []
    test_correct_answers= []
    row_count=0
    ls_made_up_row = []
    random.seed(seed)

    for text in a:
        # Tokenize to words, first split the string into chars
        chars = text.split(' ')
        new_text = chars

        if(len(chars)<=max_text_chars):
            if(len(chars)>min_text_chars):
                #randomly pick an index in the tokenized_text and change it to unclear
                r = random.randrange(0, len(chars))
            else: r=0

            if(r==0): type_unclear_char = CONST_BEGINNING 
            elif(r==len(chars)-1): type_unclear_char = CONST_TERMINAL
            else: type_unclear_char = CONST_MEDIAL

            add_answers(text, test_correct_answers, chars[r], type_unclear_char)

            new_text[r]= '000'
            # then join with a space
            changed_text = ' '.join((new_text))

            made_up_row= {'site' : 'fake_site',
               'changed_reversed_text'  : reverse_single_text(changed_text),
               'changed_text' : changed_text}

            ls_made_up_row.append(made_up_row)


            row_count=row_count+1
            if(row_count>=max_num_of_rows): break


    df_made_up = pd.DataFrame(ls_made_up_row)
    
    return df_made_up, test_correct_answers

In [37]:
# Function for running a test
def run_test(test_name,a, a_rev,check_the_answers, test_correct_answers, try_reverse,wrong_answer_details_verbose, seed):
    
    verbose_debug= False

    # Try unclear texts in fwd direction
    print("_____________________________")
    print("_____ Running ", test_name, "_________")
    print("_____________________________")


    beg_hit,term_hit, med_hit, total_hit=0,0,0,0
    beg_percent,ter_percent,med_percent,total_percent=0,0,0,0
    wrong_answer_list = []

    try_reverse = False

    try:
        print("\n***********************Trying unclear texts in forward direction: Trigram Models************************")
        for index in range(0, len(model_name_list)):

            print("\n****************Model Name:", model_name_list[index])
            ans = find_unclear_characters(models_list_fwd_trigram[index], models_list_rev_trigram[index],a, seed)
            
            print("Fwd_Trigram_model:", model_name_list[index])
            if(check_the_answers==True):
                beg_percent,ter_percent,med_percent,total_percent, wrong_answer_list = check_answers(ans,test_correct_answers)

                print("beg%:",round(beg_percent,2), " ter%:",round(ter_percent,2), " med%:", round(med_percent,2), " tot%:", round(total_percent,2)," and a total of ", round((total_percent/100)*len(test_correct_answers)), " out of", len(test_correct_answers))
                if(wrong_answer_details_verbose): print("\n Wrong Answers:", wrong_answer_list)
            else:
                print("Answers:", ans)
    except Exception as e:
                if(verbose_debug): print("Exception:", e.__class__, "Test 1")
                #traceback.print_exc()


    if(try_reverse):
        # Try unclear strings in reverse
        print("\n_____Trying unclear texts in reverse:______")
        try:
            for index in range(0, len(model_name_list)):
                print("\n****************Model Name:", model_name_list[index])
                ans = find_unclear_characters(models_list_rev_trigram[index], models_list_rev_trigram[index],a_rev, 8)
                
                print("\nRev_Trigram_model:", model_name_list[index]) 
                if(check_the_answers==True):
                    beg_percent,ter_percent,med_percent,total_percent, wrong_answer_list = check_answers(ans,test_correct_answers)
                    print(" beg%:",round(beg_percent,2), " ter%:",round(ter_percent,2), " med%:", round(med_percent,2), " tot%:", round(total_percent,2)," and a total of ", round((total_percent/100)*len(test_correct_answers)), " out of", len(test_correct_answers))
                    if(wrong_answer_details_verbose): print("\n Wrong Answers:", wrong_answer_list)
                else:
                    print("Answers:", ans)
        except Exception as e:
                    print("Exception:", e.__class__)


    try:
        print("\n***********************Trying unclear texts in forward direction: Quadgram Models************************")
        a = df_made_up_from_train[df_made_up_from_train.changed_text!=''].changed_text
        for index in range(0, len(model_name_list)):
            print("\n****************Model Name:", model_name_list[index])
            ans = find_unclear_characters(models_list_fwd_quadgram[index], models_list_rev_quadgram[index],a, 8)
            
            print("\nFwd_Quadgram_model:",model_name_list[index])
            if(check_the_answers==True):
                beg_percent,ter_percent,med_percent,total_percent, wrong_answer_list = check_answers(ans,test_correct_answers)

                print("beg%:",round(beg_percent,2), " ter%:",round(ter_percent,2), " med%:", round(med_percent,2), " tot%:", round(total_percent,2)," and a total of ", round((total_percent/100)*len(test_correct_answers)), " out of", len(test_correct_answers))
                if(wrong_answer_details_verbose): print("\n Wrong Answers:", wrong_answer_list)
            else:
                print("Answers:", ans)
    except Exception as e:
                print("Exception:", e.__class__)

    if(try_reverse):
        # Try unclear strings in reverse
        print("\n_____Trying unclear texts in reverse:______")
        try:
            a= df_made_up_from_train[df_made_up_from_train.changed_reversed_text!=''].changed_reversed_text
            for index in range(0, len(model_name_list)):
                print("\n****************Model Name:", model_name_list[index])
                ans = find_unclear_characters(models_list_rev_quadgram[index], models_list_rev_quadgram[index],a_rev, 8)
                
                print("\nRev_Quadgram_model:", model_name_list[index])
                if(check_the_answers==True):
                    beg_percent,ter_percent,med_percent,total_percent, wrong_answer_list = check_answers(ans,test_correct_answers)
                    print("beg%:",round(beg_percent,2), " ter%:",round(ter_percent,2), " med%:", round(med_percent,2), " tot%:", round(total_percent,2)," and a total of ", round((total_percent/100)*len(test_correct_answers)), " out of", len(test_correct_answers))
                    if(wrong_answer_details_verbose): print("\n Wrong Answers:", wrong_answer_list)
                else:
                    print("Answers:", ans)

        except Exception as e:
                print("Exception:", e.__class__)
                
    return ans


# Test 1

In [38]:
# Test1
# Data preparation for this test
# Take n rows from train set, convert a known sign to unclear sign and produce a dataframe

min_text_chars = 1
max_text_chars = 40
max_num_of_rows=30
seed=10
    
df_made_up_from_train, test1_correct_answers =prepare_data(df_train_x[df_train_x.standardized_text!=''].standardized_text, max_text_chars,min_text_chars,max_num_of_rows, seed)

if(verbose_debug):
    print(df_made_up_from_train.changed_text)
    print("test1_correct_answers: \n", test1_correct_answers)


In [39]:
# Test1
# Run the test
a= df_made_up_from_train[df_made_up_from_train.changed_text!=''].changed_text
a_rev= df_made_up_from_train[df_made_up_from_train.changed_reversed_text!=''].changed_reversed_text
check_the_answers = True
try_reverse = False
wrong_answer_details_verbose = False
seed = 8

ans = run_test("Test-1",a, a_rev, check_the_answers, test1_correct_answers, try_reverse, wrong_answer_details_verbose, seed, )
    

_____________________________
_____ Running  Test-1 _________
_____________________________

***********************Trying unclear texts in forward direction: Trigram Models************************

****************Model Name: MLE
Fwd_Trigram_model: MLE
beg%: 50.0  ter%: 66.67  med%: 40.0  tot%: 53.33  and a total of  16  out of 30

****************Model Name: KneserNeyInterpolated
Fwd_Trigram_model: KneserNeyInterpolated
beg%: 50.0  ter%: 58.33  med%: 40.0  tot%: 50.0  and a total of  15  out of 30

****************Model Name: Laplace
Fwd_Trigram_model: Laplace
beg%: 50.0  ter%: 58.33  med%: 40.0  tot%: 50.0  and a total of  15  out of 30

****************Model Name: Lidstone
Fwd_Trigram_model: Lidstone
beg%: 37.5  ter%: 50.0  med%: 50.0  tot%: 46.67  and a total of  14  out of 30

****************Model Name: StupidBackoff
Fwd_Trigram_model: StupidBackoff
beg%: 50.0  ter%: 66.67  med%: 40.0  tot%: 53.33  and a total of  16  out of 30

****************Model Name: WittenBellInterpolated

## Test 2

In [40]:
# Test 2
# Data preparation for this test
# Take n rows from test set, convert a known sign to unclear sign and produce a dataframe

verbose_debug = False
min_text_chars = 1
max_text_chars = 40
max_num_of_rows=300
seed=11
    
df_made_up_from_test, test2_correct_answers =prepare_data(df_test_x[df_test_x.standardized_text!=''].standardized_text, max_text_chars,min_text_chars,max_num_of_rows,seed)

if(verbose_debug):
    print(df_made_up_from_test.changed_text)
    print("test2_correct_answers: \n", test2_correct_answers)


In [41]:
# Test2
# Run the test
a= df_made_up_from_test[df_made_up_from_test.changed_text!=''].changed_text
a_rev= df_made_up_from_test[df_made_up_from_test.changed_reversed_text!=''].changed_reversed_text
check_the_answers = True
try_reverse = False
wrong_answer_details_verbose = False
seed = 8

ans = run_test("Test-2",a, a_rev, check_the_answers, test2_correct_answers, try_reverse, wrong_answer_details_verbose, seed)

_____________________________
_____ Running  Test-2 _________
_____________________________

***********************Trying unclear texts in forward direction: Trigram Models************************

****************Model Name: MLE
Fwd_Trigram_model: MLE
beg%: 24.27  ter%: 28.57  med%: 28.32  tot%: 27.0  and a total of  81  out of 300

****************Model Name: KneserNeyInterpolated
Fwd_Trigram_model: KneserNeyInterpolated
beg%: 22.33  ter%: 25.0  med%: 30.09  tot%: 26.0  and a total of  78  out of 300

****************Model Name: Laplace
Fwd_Trigram_model: Laplace
beg%: 26.21  ter%: 28.57  med%: 29.2  tot%: 28.0  and a total of  84  out of 300

****************Model Name: Lidstone
Fwd_Trigram_model: Lidstone
beg%: 23.3  ter%: 28.57  med%: 30.09  tot%: 27.33  and a total of  82  out of 300

****************Model Name: StupidBackoff
Fwd_Trigram_model: StupidBackoff
beg%: 24.27  ter%: 28.57  med%: 28.32  tot%: 27.0  and a total of  81  out of 300

****************Model Name: WittenBellI

## Test 3

In [42]:
# Test3
# Run the test
a = df_unclear[df_unclear.standardized_text!=''].standardized_text
a= df_unclear[df_unclear.reversed_text!=''].reversed_text
try_reverse = False
check_the_answers = False
wrong_answer_details_verbose = False
seed = 8

ans = run_test("Test-3",a, a_rev, check_the_answers, None, try_reverse, wrong_answer_details_verbose, seed)

_____________________________
_____ Running  Test-3 _________
_____________________________

***********************Trying unclear texts in forward direction: Trigram Models************************

****************Model Name: MLE
Fwd_Trigram_model: MLE
Answers: ['2 3 3', '0 9 6', '0 0 2', '2 2 6', '2 2 0', '4 0 7', '3 9 0', '1 7 6', '</ s>', '3 9 0', '4 0 0', '3 2 6', '3 9 0', '2 3 3', '3 9 0', '7 4 0', '0 3 2', '3 9 0', '2 9 9', '3 9 0', '0 6 1', '4 1 5', '2 2 0', '3 5 0', '2 3 3', '0 3 3', '4 0 0', '4 0 0', '</ s>', '4 0 0', '1 5 8', '8 3 2', '0 0 2', '5 0 3', '2 2 0', '4 0 0', '</ s>', '5 2 0', '0 3 2', '0 0 2', '1 0 4', '4 0 0', '7 0 0', '0 0 1', '3 9 0', '1 0 0', '0 3 3', '2 2 0', '3 9 0', '0 6 1', '3 9 0', '3 9 0', '1 7 6', '0 0 2', '4 0 0', '4 0 0', '4 0 0', '3 9 0', '3 9 0', '7 0 0', '7 0 5', '3 3 6', '3 9 0', '3 9 0', '7 4 0', '3 9 0', '2 2 0', '1 5 3', '0 3 3', '0 3 2', '4 0 0', '7 4 0', '3 9 0', '3 9 0', '3 3 5', '3 9 0', '0 0 2', '3 9 0', '3 9 0', '3 5 0', '1 7 6', '0 0 2'

Fwd_Trigram_model: KneserNeyInterpolated
Answers: ['1 7 6', '0 9 6', '0 6 0', '0 6 0', '2 2 0', '1 5 1', '1 3 1', '1 7 6', '3 9 0', '1 3 1', '2 4 0', '0 3 6', '1 3 1', '1 7 6', '1 3 1', '7 4 0', '0 3 2', '1 3 1', '2 9 9', '1 3 1', '0 6 1', '4 1 5', '0 3 3', '3 5 0', '1 7 6', '0 3 3', '2 4 0', '4 0 0', '</ s>', '2 4 0', '1 5 3', '8 3 2', '0 0 2', '3 5 2', '0 6 0', '2 4 0', '</ s>', '5 2 0', '0 3 2', '0 0 2', '0 4 9', '2 4 0', '7 0 0', '0 3 1', '1 3 1', '1 0 0', '0 3 3', '0 6 0', '1 3 1', '0 6 0', '1 3 1', '1 3 1', '1 7 6', '0 0 2', '2 4 0', '2 4 0', '2 4 0', '1 3 1', '1 3 1', '7 0 0', '5 0 3', '1 4 0', '1 3 1', '1 3 1', '7 4 0', '1 3 1', '2 2 0', '1 5 3', '0 3 3', '0 3 2', '2 4 0', '2 3 5', '1 3 1', '1 3 1', '2 2 0', '1 3 1', '0 0 2', '1 3 1', '1 3 1', '3 5 0', '1 7 6', '0 0 2', '3 9 0', '5 2 0', '0 3 6', '3 9 0', '1 3 1', '1 3 1', '0 3 6', '8 2 4', '0 0 2', '1 5 1', '5 0 3', '5 0 3', '2 4 0', '1 3 1', '1 3 1', '2 4 0', '2 2 0', '0 4 9', '1 3 1', '1 3 1', '3 8 2', '0 3 2', '1 7 6', '2 2

Fwd_Trigram_model: Laplace
Answers: ['2 3 2', '0 9 6', '0 0 2', '1 9 1', '2 2 0', '3 9 0', '3 9 0', '1 7 6', '7 4 0', '3 9 0', '4 0 0', '2 2 0', '3 9 0', '2 3 2', '3 9 0', '7 4 0', '0 3 2', '3 9 0', '2 9 9', '3 9 0', '0 6 1', '4 1 5', '1 6 8', '3 5 0', '2 3 2', '0 3 3', '4 0 0', '4 0 0', '</ s>', '4 0 0', '1 5 8', '8 3 2', '0 0 2', '4 9 5', '1 5 4', '4 0 0', '</ s>', '5 2 0', '0 3 2', '0 0 2', '0 4 9', '4 0 0', '7 0 0', '0 0 1', '3 9 0', '1 0 0', '0 3 3', '2 2 0', '3 9 0', '0 6 1', '3 9 0', '3 9 0', '1 7 6', '0 0 2', '4 0 0', '4 0 0', '4 0 0', '3 9 0', '3 9 0', '7 0 0', '7 0 0', '2 4 0', '3 9 0', '3 9 0', '7 4 0', '3 9 0', '2 2 0', '1 5 3', '0 3 3', '0 3 2', '4 0 0', '2 4 0', '3 9 0', '3 9 0', '2 5 2', '3 9 0', '0 0 2', '3 9 0', '3 9 0', '3 5 0', '1 7 6', '0 0 2', '</ s>', '5 2 0', '2 2 0', '3 9 0', '3 9 0', '3 9 0', '2 2 0', '8 2 4', '0 0 2', '3 9 0', '7 0 0', '7 0 0', '4 0 0', '3 9 0', '3 9 0', '4 0 0', '6 2 1', '0 4 9', '3 9 0', '3 9 0', '3 8 2', '0 3 2', '1 7 6', '1 7 6', '1 5 6', 

Fwd_Trigram_model: Lidstone
Answers: ['2 2 2', '0 9 6', '0 6 0', '1 7 6', '1 4 0', '1 5 1', '3 6 8', '1 7 6', '4 4 0', '3 6 8', '4 0 0', '1 2 5', '3 6 8', '2 2 2', '3 6 8', '7 4 0', '0 3 2', '3 6 8', '2 9 9', '3 6 8', '0 6 1', '0 0 2', '1 6 8', '3 5 0', '2 2 2', '0 3 3', '4 0 0', '2 4 0', '</ s>', '4 0 0', '1 5 6', '8 3 2', '0 0 2', '4 4 0', '0 6 3', '4 0 0', '</ s>', '5 2 0', '0 3 2', '0 3 1', '0 4 9', '4 0 0', '7 0 0', '0 0 2', '3 6 8', '1 0 0', '0 3 3', '1 5 6', '3 6 8', '0 6 1', '3 6 8', '3 6 8', '1 7 6', '0 0 2', '4 0 0', '4 0 0', '4 0 0', '3 6 8', '3 6 8', '7 0 0', '7 0 0', '2 2 0', '3 6 8', '3 6 8', '7 4 0', '3 6 8', '2 2 0', '1 5 3', '0 3 3', '0 3 2', '4 0 0', '2 3 6', '3 6 8', '3 6 8', '2 4 0', '3 6 8', '0 0 2', '3 6 8', '3 6 8', '3 5 0', '1 7 6', '0 0 2', '8 0 8', '5 2 0', '1 2 5', '3 9 0', '3 6 8', '3 6 8', '1 2 5', '8 2 4', '0 0 2', '3 6 8', '7 0 0', '7 0 0', '4 0 0', '3 6 8', '3 6 8', '4 0 0', '4 0 0', '0 6 1', '3 6 8', '3 6 8', '3 8 2', '0 3 2', '1 7 6', '1 7 6', '1 5 6',

Fwd_Trigram_model: StupidBackoff
Answers: ['2 3 3', '0 9 6', '0 0 2', '2 2 6', '2 2 0', '4 0 7', '3 9 0', '1 7 6', '</ s>', '3 9 0', '4 0 0', '3 2 6', '3 9 0', '2 3 3', '3 9 0', '7 4 0', '0 3 2', '3 9 0', '2 9 9', '3 9 0', '0 6 1', '4 1 5', '2 2 0', '3 5 0', '2 3 3', '0 3 3', '4 0 0', '4 0 0', '</ s>', '4 0 0', '1 5 8', '8 3 2', '0 0 2', '5 0 3', '2 2 0', '4 0 0', '</ s>', '5 2 0', '0 3 2', '0 0 2', '1 0 4', '4 0 0', '7 0 0', '0 0 1', '3 9 0', '1 0 0', '0 3 3', '2 2 0', '3 9 0', '0 6 1', '3 9 0', '3 9 0', '1 7 6', '0 0 2', '4 0 0', '4 0 0', '4 0 0', '3 9 0', '3 9 0', '7 0 0', '7 0 5', '3 3 6', '3 9 0', '3 9 0', '7 4 0', '3 9 0', '2 2 0', '1 5 3', '0 3 3', '0 3 2', '4 0 0', '7 4 0', '3 9 0', '3 9 0', '3 3 5', '3 9 0', '0 0 2', '3 9 0', '3 9 0', '3 5 0', '1 7 6', '0 0 2', '</ s>', '5 2 0', '3 2 6', '3 9 0', '3 9 0', '3 9 0', '3 2 6', '8 2 4', '0 0 2', '3 9 0', '7 0 5', '7 0 5', '4 0 0', '3 9 0', '3 9 0', '4 0 0', '</ s>', '0 4 8', '3 9 0', '3 9 0', '3 8 2', '0 3 2', '1 7 6', '1 7 6', '1 

Fwd_Trigram_model: WittenBellInterpolated
Answers: ['2 3 3', '0 9 6', '0 0 2', '2 2 6', '2 3 1', '4 0 7', '3 9 0', '1 7 6', '</ s>', '3 9 0', '4 0 0', '3 2 6', '3 9 0', '2 3 3', '3 9 0', '7 4 0', '0 3 2', '3 9 0', '2 9 9', '3 9 0', '0 6 1', '4 1 5', '2 2 0', '3 5 0', '2 3 3', '0 3 3', '4 0 0', '4 0 0', '</ s>', '4 0 0', '1 5 8', '8 3 2', '0 0 2', '5 0 3', '2 2 0', '4 0 0', '</ s>', '5 2 0', '0 3 2', '0 0 2', '1 0 4', '4 0 0', '7 0 0', '0 0 1', '3 9 0', '1 0 0', '0 3 3', '2 2 0', '3 9 0', '0 9 0', '3 9 0', '3 9 0', '1 7 6', '0 0 2', '4 0 0', '4 0 0', '4 0 0', '3 9 0', '3 9 0', '7 0 0', '7 0 5', '3 3 7', '3 9 0', '3 9 0', '7 4 0', '3 9 0', '2 2 0', '1 5 3', '0 3 3', '0 3 3', '4 0 0', '7 4 0', '3 9 0', '3 9 0', '3 3 5', '3 9 0', '0 0 2', '3 9 0', '3 9 0', '3 5 0', '1 7 6', '0 0 2', '</ s>', '5 2 0', '3 2 6', '</ s>', '3 9 0', '3 9 0', '3 2 6', '8 2 4', '0 0 2', '3 9 0', '7 0 5', '7 0 5', '4 0 0', '3 9 0', '3 9 0', '4 0 0', '</ s>', '0 4 8', '3 9 0', '3 9 0', '3 8 2', '0 3 2', '1 7 6', '1 


Fwd_Quadgram_model: MLE
Answers: ['2 3 3', '0 9 6', '0 0 2', '2 2 6', '2 2 0', '4 0 7', '5 2 0', '1 7 6', '</ s>', '5 2 0', '4 0 0', '3 2 6', '5 2 0', '2 3 3', '5 2 0', '7 4 0', '0 3 2', '5 2 0', '2 9 9', '5 2 0', '0 6 1', '4 1 5', '2 2 0', '3 5 0', '2 3 3', '0 3 3', '4 0 0', '4 0 0', '</ s>', '4 0 0', '1 5 8', '8 3 2', '0 0 2', '5 0 3', '2 2 0', '4 0 0', '</ s>', '5 2 0', '0 3 2', '0 0 2', '1 0 4', '4 0 0', '7 0 0', '0 0 1', '5 2 0', '1 0 0', '0 3 3', '2 2 0', '5 2 0', '0 6 1', '5 2 0', '5 2 0', '1 7 6', '0 0 2', '4 0 0', '4 0 0', '4 0 0', '5 2 0', '5 2 0', '7 0 0', '7 0 5', '3 3 6', '5 2 0', '5 2 0', '7 4 0', '5 2 0', '2 2 0', '1 5 3', '0 3 3', '0 3 2', '4 0 0', '7 4 0', '5 2 0', '5 2 0', '3 3 5', '5 2 0', '0 0 2', '5 2 0', '5 2 0', '3 5 0', '1 7 6', '0 0 2', '</ s>', '5 2 0', '3 2 6', '3 9 0', '5 2 0', '5 2 0', '3 2 6', '8 2 4', '0 0 2', '5 2 0', '7 0 5', '7 0 5', '4 0 0', '5 2 0', '5 2 0', '4 0 0', '</ s>', '0 4 8', '5 2 0', '5 2 0', '3 8 2', '0 3 2', '1 7 6', '1 7 6', '1 5 6', '4


Fwd_Quadgram_model: KneserNeyInterpolated
Answers: ['1 7 6', '0 9 6', '0 6 0', '0 6 0', '2 2 0', '1 5 1', '1 3 1', '1 7 6', '3 9 0', '1 3 1', '2 4 0', '0 3 6', '1 3 1', '1 7 6', '1 3 1', '7 4 0', '0 3 2', '1 3 1', '2 9 9', '1 3 1', '0 6 1', '0 0 2', '0 3 3', '3 5 0', '1 7 6', '0 3 3', '2 4 0', '2 4 0', '</ s>', '2 4 0', '1 5 3', '8 3 2', '0 0 2', '3 5 2', '0 6 0', '2 4 0', '</ s>', '5 2 0', '0 3 2', '0 0 2', '0 4 9', '2 4 0', '7 0 0', '0 3 1', '1 3 1', '1 0 0', '0 3 3', '0 6 0', '1 3 1', '0 6 0', '1 3 1', '1 3 1', '1 0 0', '0 0 2', '2 4 0', '2 4 0', '2 4 0', '1 3 1', '1 3 1', '7 0 0', '5 0 3', '1 4 0', '1 3 1', '1 3 1', '7 4 0', '1 3 1', '2 2 0', '1 5 3', '0 3 3', '0 3 2', '2 4 0', '2 3 5', '1 3 1', '1 3 1', '2 2 0', '1 3 1', '0 0 2', '1 3 1', '1 3 1', '3 5 2', '1 0 0', '0 0 2', '3 9 0', '5 2 0', '0 3 6', '3 9 0', '1 3 1', '1 3 1', '0 3 6', '8 2 4', '0 0 2', '1 5 1', '5 0 3', '5 0 3', '2 4 0', '1 3 1', '1 3 1', '2 4 0', '2 2 0', '0 4 9', '1 3 1', '1 3 1', '3 8 2', '0 3 2', '1 0 0', '2


Fwd_Quadgram_model: Laplace
Answers: ['2 3 2', '0 9 6', '0 0 2', '1 9 1', '2 2 0', '3 9 0', '5 0 1', '1 7 6', '7 4 0', '5 0 1', '4 0 0', '2 2 0', '5 0 1', '2 3 2', '5 0 1', '7 4 0', '0 3 2', '5 0 1', '2 9 9', '5 0 1', '0 6 1', '4 1 5', '1 6 8', '3 5 0', '2 3 2', '0 3 3', '4 0 0', '4 0 0', '</ s>', '4 0 0', '1 5 8', '8 3 2', '0 0 2', '4 9 5', '1 5 4', '4 0 0', '</ s>', '5 2 0', '0 3 2', '0 0 2', '0 4 9', '4 0 0', '7 0 0', '0 0 1', '5 0 1', '1 0 0', '0 3 3', '2 2 0', '5 0 1', '0 6 1', '5 0 1', '5 0 1', '1 7 6', '0 0 2', '4 0 0', '4 0 0', '4 0 0', '5 0 1', '5 0 1', '7 0 0', '7 0 0', '2 4 0', '5 0 1', '5 0 1', '7 4 0', '5 0 1', '2 2 0', '1 5 3', '0 3 3', '0 3 2', '4 0 0', '2 4 0', '5 0 1', '5 0 1', '2 5 2', '5 0 1', '0 0 2', '5 0 1', '5 0 1', '3 5 0', '1 7 6', '0 0 2', '</ s>', '5 2 0', '2 2 0', '3 9 0', '5 0 1', '5 0 1', '2 2 0', '8 2 4', '0 0 2', '5 0 1', '7 0 0', '7 0 0', '4 0 0', '5 0 1', '5 0 1', '4 0 0', '6 2 1', '0 4 9', '5 0 1', '5 0 1', '3 8 2', '0 3 2', '1 7 6', '1 7 6', '1 5 6'


Fwd_Quadgram_model: Lidstone
Answers: ['2 2 2', '0 9 6', '0 6 0', '1 0 0', '1 4 0', '1 5 1', '4 1 5', '1 7 6', '4 0 0', '4 1 5', '4 0 0', '1 2 5', '4 1 5', '2 2 2', '4 1 5', '7 4 0', '0 3 2', '4 1 5', '2 9 9', '4 1 5', '0 6 1', '0 0 2', '0 6 0', '3 5 0', '2 2 2', '0 3 3', '4 0 0', '2 4 0', '</ s>', '4 0 0', '1 5 6', '8 3 2', '0 0 2', '3 5 2', '0 6 3', '4 0 0', '</ s>', '5 2 0', '0 3 2', '0 5 5', '0 4 9', '4 0 0', '7 0 0', '0 3 1', '4 1 5', '1 0 0', '0 3 3', '1 5 6', '4 1 5', '0 9 0', '4 1 5', '4 1 5', '1 7 6', '0 0 2', '4 0 0', '4 0 0', '4 0 0', '4 1 5', '4 1 5', '7 0 0', '6 0 5', '1 7 5', '4 1 5', '4 1 5', '1 0 1', '4 1 5', '2 2 0', '1 5 3', '0 3 3', '0 3 2', '4 0 0', '2 3 5', '4 1 5', '4 1 5', '2 4 0', '4 1 5', '0 0 2', '4 1 5', '4 1 5', '3 5 0', '1 7 6', '0 0 2', '6 9 0', '5 2 0', '1 2 5', '3 9 0', '4 1 5', '4 1 5', '1 2 5', '8 2 4', '0 0 2', '4 1 5', '6 0 5', '6 0 5', '4 0 0', '4 1 5', '4 1 5', '4 0 0', '4 0 0', '0 7 0', '4 1 5', '4 1 5', '3 8 2', '0 3 2', '1 7 6', '1 7 6', '1 5 6


Fwd_Quadgram_model: StupidBackoff
Answers: ['2 3 3', '0 9 6', '0 0 2', '2 2 6', '2 2 0', '4 0 7', '5 2 0', '1 7 6', '</ s>', '5 2 0', '4 0 0', '3 2 6', '5 2 0', '2 3 3', '5 2 0', '7 4 0', '0 3 2', '5 2 0', '2 9 9', '5 2 0', '0 6 1', '4 1 5', '2 2 0', '3 5 0', '2 3 3', '0 3 3', '4 0 0', '4 0 0', '</ s>', '4 0 0', '1 5 8', '8 3 2', '0 0 2', '5 0 3', '2 2 0', '4 0 0', '</ s>', '5 2 0', '0 3 2', '0 0 2', '1 0 4', '4 0 0', '7 0 0', '0 0 1', '5 2 0', '1 0 0', '0 3 3', '2 2 0', '5 2 0', '0 6 1', '5 2 0', '5 2 0', '1 7 6', '0 0 2', '4 0 0', '4 0 0', '4 0 0', '5 2 0', '5 2 0', '7 0 0', '7 0 5', '3 3 6', '5 2 0', '5 2 0', '7 4 0', '5 2 0', '2 2 0', '1 5 3', '0 3 3', '0 3 2', '4 0 0', '7 4 0', '5 2 0', '5 2 0', '3 3 5', '5 2 0', '0 0 2', '5 2 0', '5 2 0', '3 5 0', '1 7 6', '0 0 2', '</ s>', '5 2 0', '3 2 6', '3 9 0', '5 2 0', '5 2 0', '3 2 6', '8 2 4', '0 0 2', '5 2 0', '7 0 5', '7 0 5', '4 0 0', '5 2 0', '5 2 0', '4 0 0', '</ s>', '0 4 8', '5 2 0', '5 2 0', '3 8 2', '0 3 2', '1 7 6', '1 7 6', '


Fwd_Quadgram_model: WittenBellInterpolated
Answers: ['2 3 3', '0 9 6', '0 0 2', '2 3 5', '2 3 1', '4 0 7', '5 2 0', '1 7 6', '</ s>', '5 2 0', '4 0 0', '3 9 0', '5 2 0', '2 3 3', '5 2 0', '7 4 0', '0 3 2', '5 2 0', '2 9 9', '5 2 0', '0 6 1', '4 1 5', '2 2 0', '3 5 0', '2 3 3', '0 3 3', '4 0 0', '4 0 0', '</ s>', '4 0 0', '1 5 8', '8 3 2', '0 0 2', '5 0 3', '2 2 0', '4 0 0', '</ s>', '5 2 0', '0 3 2', '0 0 2', '1 0 4', '4 0 0', '7 0 0', '0 0 1', '5 2 0', '1 0 0', '0 3 3', '2 2 0', '5 2 0', '0 9 0', '5 2 0', '5 2 0', '1 7 6', '0 0 2', '4 0 0', '4 0 0', '4 0 0', '5 2 0', '5 2 0', '7 0 0', '7 0 5', '3 3 7', '5 2 0', '5 2 0', '7 4 0', '5 2 0', '2 2 0', '1 5 3', '0 3 3', '0 3 3', '4 0 0', '7 4 0', '5 2 0', '5 2 0', '3 3 5', '5 2 0', '0 0 2', '5 2 0', '5 2 0', '3 5 0', '1 7 6', '0 0 2', '</ s>', '5 2 0', '3 9 0', '</ s>', '5 2 0', '5 2 0', '3 9 0', '8 2 4', '0 0 2', '5 2 0', '7 0 5', '7 0 5', '4 0 0', '5 2 0', '5 2 0', '4 0 0', '</ s>', '0 4 8', '5 2 0', '5 2 0', '3 8 2', '0 3 2', '1 7 6', '

In [43]:
# Actual Unclear texts
# Try unclear texts in fwd direction

print("_____________________________")
print("_____ Running TEST-3_________")
print("_____________________________")

print("_____Trying unclear texts in forward direction:_______")

try:
    fwd_quadgram_model_hit = [0,0,0,0,0]
    a = df_unclear[df_unclear.standardized_text!=''].standardized_text
    for index in range(0, len(model_name_list)):
        print("***********Model Name*****************:", model_name_list[index])
        ans = find_unclear_characters(models_list_fwd_quadgram[index], models_list_rev_quadgram[index],a, 8)
        print("Fwd_Quadgram_model:", model_name_list[index], " identified", "Answers:", ans)
except Exception as e:
    print("Exception:", e.__class__)


 # Try unclear strings in reverse
print("_____Trying unclear texts in reverse:______")
    
try:
    rev_quadgram_model_hit = [0,0,0,0,0]
    a= df_unclear[df_unclear.reversed_text!=''].reversed_text
    for index in range(0, len(model_name_list)):
        print("***********Model Name*****************:", model_name_list[index])
        ans = find_unclear_characters(models_list_rev_quadgram[index], models_list_rev_quadgram[index],a, 8)
        print("Rev_Quadgram_model:", model_name_list[index], " identified", "Answers:", ans)

except Exception as e:
    print("Exception:", e.__class__)

_____________________________
_____ Running TEST-3_________
_____________________________
_____Trying unclear texts in forward direction:_______
***********Model Name*****************: MLE
Fwd_Quadgram_model: MLE  identified Answers: ['0 3 2', '2 2 0', '</ s>', '3 9 0', '1 0 0', '5 2 0', '0 3 2', '5 2 0', '0 4 9', '</ s>', '5 2 0', '0 3 3', '5 2 0', '6 3 2', '7 4 0', '2 3 3', '7 4 1', '5 2 0', '2 3 3', '2 3 3', '0 1 3', '4 0 0', '2 2 0', '5 2 0', '2 4 0', '5 2 0', '1 7 6', '5 2 0', '0 4 8', '</ s>', '7 4 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '3 5 3', '0 3 2', '5 2 0', '5 2 0', '5 2 0', '</ s>', '7 4 0', '5 2 0', '0 9 0', '3 2 6', '3 3 6', '0 6 0', '5 2 0', '5 2 0', '2 3 3', '5 2 0', '4 0 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '1 7 5', '5 5 0', '2 3 1', '5 2 0', '2 2 0', '2 2 0', '5 2 0', '5 2 0', '1 7 6', '0 1 7', '5 2 0', '</ s>', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '</ s>', '5 2 0', '5 2 0', '0 0 2', '0 6 1', '0 3 3', '0 0 2', '5 2 0', '5 2 0', '0 3 3', '8 2 4', 

Fwd_Quadgram_model: KneserNeyInterpolated  identified Answers: ['0 3 2', '2 2 0', '6 2 6', '3 9 0', '1 0 0', '1 3 1', '0 3 2', '1 3 1', '0 4 9', '</ s>', '1 3 1', '0 3 3', '1 3 1', '6 3 2', '7 4 0', '1 7 6', '7 4 1', '1 3 1', '2 3 3', '2 3 3', '0 1 3', '2 4 0', '1 4 0', '1 5 1', '2 4 0', '1 3 1', '2 2 0', '5 2 0', '0 6 1', '3 9 0', '1 5 4', '1 3 1', '1 5 1', '1 3 1', '1 3 1', '3 5 0', '0 3 2', '1 3 1', '1 3 1', '1 3 1', '4 0 0', '7 4 0', '1 3 1', '0 9 0', '0 3 6', '1 4 0', '2 3 3', '1 3 1', '1 3 1', '1 7 6', '1 3 1', '3 5 0', '1 3 1', '1 3 1', '1 3 1', '1 4 2', '1 5 1', '1 7 5', '5 4 0', '2 3 1', '1 3 1', '0 6 0', '2 2 0', '1 5 1', '1 3 1', '2 2 0', '0 1 7', '1 3 1', '3 9 0', '1 3 1', '1 3 1', '1 3 1', '1 3 1', '1 3 1', '</ s>', '1 3 1', '1 3 1', '0 0 2', '0 6 1', '0 3 3', '0 0 2', '1 3 1', '1 3 1', '0 3 3', '8 2 4', '3 9 0', '1 3 1', '1 4 2', '1 4 2', '1 3 1', '1 3 1', '0 5 5', '0 6 6', '2 7 7', '1 3 1', '1 3 1', '1 3 1', '1 0 0', '3 8 2', '1 7 6', '1 3 1', '2 4 0', '1 3 1', '1 3 1', 

Fwd_Quadgram_model: Laplace  identified Answers: ['0 3 2', '2 2 0', '6 8 3', '3 9 0', '1 0 0', '5 0 1', '0 3 2', '5 0 1', '0 4 9', '</ s>', '5 0 1', '0 3 3', '5 0 1', '6 3 2', '7 4 0', '2 3 2', '7 4 1', '5 0 1', '2 3 3', '2 3 3', '0 1 3', '4 0 0', '1 9 3', '5 0 1', '2 4 0', '5 0 1', '1 7 6', '5 2 0', '0 4 8', '7 4 0', '5 5 0', '5 0 1', '5 0 1', '5 0 1', '5 0 1', '3 5 0', '0 3 2', '5 0 1', '5 0 1', '5 0 1', '9 9 9', '7 4 0', '5 0 1', '0 9 0', '2 2 0', '2 4 0', '0 6 0', '5 0 1', '5 0 1', '2 3 2', '5 0 1', '3 5 0', '5 0 1', '5 0 1', '5 0 1', '5 2 0', '5 0 1', '1 7 5', '5 4 0', '2 3 1', '5 0 1', '1 5 4', '2 2 0', '5 0 1', '5 0 1', '1 7 6', '0 1 7', '5 0 1', '7 4 0', '5 0 1', '5 0 1', '5 0 1', '5 0 1', '5 0 1', '</ s>', '5 0 1', '5 0 1', '0 0 2', '0 6 1', '0 3 3', '0 0 2', '5 0 1', '5 0 1', '0 3 3', '8 2 4', '</ s>', '5 0 1', '5 2 0', '5 2 0', '5 0 1', '5 0 1', '0 5 5', '0 6 6', '3 2 4', '5 0 1', '5 0 1', '5 0 1', '1 7 6', '3 8 2', '2 3 2', '5 0 1', '4 0 0', '5 0 1', '5 0 1', '4 0 0', '5 0 

Fwd_Quadgram_model: Lidstone  identified Answers: ['0 3 2', '1 4 0', '4 1 6', '3 9 0', '1 0 0', '4 1 5', '0 3 1', '4 1 5', '0 5 5', '</ s>', '4 1 5', '0 3 3', '4 1 5', '6 3 2', '7 4 0', '2 2 2', '7 4 1', '4 1 5', '2 3 3', '2 3 3', '0 1 3', '4 0 0', '1 4 0', '4 1 5', '2 4 1', '4 1 5', '1 7 6', '5 2 0', '0 4 8', '4 0 0', '1 5 4', '4 1 5', '4 1 5', '4 1 5', '4 1 5', '0 6 0', '0 3 2', '4 1 5', '4 1 5', '4 1 5', '7 0 0', '7 4 0', '4 1 5', '0 9 0', '1 2 5', '1 7 5', '0 6 0', '4 1 5', '4 1 5', '2 2 2', '4 1 5', '2 6 9', '4 1 5', '4 1 5', '4 1 5', '2 5 5', '4 1 5', '1 7 5', '3 6 8', '2 3 1', '4 1 5', '0 6 3', '2 2 0', '4 1 5', '4 1 5', '1 7 6', '0 1 7', '4 1 5', '4 0 0', '4 1 5', '4 1 5', '4 1 5', '4 1 5', '4 1 5', '3 5 3', '4 1 5', '4 1 5', '0 0 2', '0 6 1', '0 3 3', '0 0 2', '4 1 5', '4 1 5', '0 3 3', '8 2 4', '6 9 0', '4 1 5', '2 5 5', '2 5 5', '4 1 5', '4 1 5', '0 5 5', '0 6 6', '2 2 0', '4 1 5', '4 1 5', '4 1 5', '1 7 6', '3 8 2', '2 2 2', '4 1 5', '4 0 0', '4 1 5', '4 1 5', '4 0 0', '4 1

Fwd_Quadgram_model: StupidBackoff  identified Answers: ['0 3 2', '2 2 0', '</ s>', '3 9 0', '1 0 0', '5 2 0', '0 3 2', '5 2 0', '0 4 9', '</ s>', '5 2 0', '0 3 3', '5 2 0', '6 3 2', '7 4 0', '2 3 3', '7 4 1', '5 2 0', '2 3 3', '2 3 3', '0 1 3', '4 0 0', '2 2 0', '5 2 0', '2 4 0', '5 2 0', '1 7 6', '5 2 0', '0 4 8', '</ s>', '7 4 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '3 5 3', '0 3 2', '5 2 0', '5 2 0', '5 2 0', '</ s>', '7 4 0', '5 2 0', '0 9 0', '3 2 6', '3 3 6', '0 6 0', '5 2 0', '5 2 0', '2 3 3', '5 2 0', '4 0 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '1 7 5', '5 5 0', '2 3 1', '5 2 0', '2 2 0', '2 2 0', '5 2 0', '5 2 0', '1 7 6', '0 1 7', '5 2 0', '</ s>', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '</ s>', '5 2 0', '5 2 0', '0 0 2', '0 6 1', '0 3 3', '0 0 2', '5 2 0', '5 2 0', '0 3 3', '8 2 4', '</ s>', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '0 5 5', '0 6 6', '4 3 5', '5 2 0', '5 2 0', '5 2 0', '1 7 6', '3 8 2', '2 3 3', '5 2 0', '4 0 0', '5 2 0', '5 2 0', '</ s>',

Fwd_Quadgram_model: WittenBellInterpolated  identified Answers: ['0 3 2', '2 3 1', '</ s>', '3 9 0', '1 0 0', '5 2 0', '0 3 2', '5 2 0', '0 4 9', '</ s>', '5 2 0', '0 3 3', '5 2 0', '6 3 2', '7 4 0', '2 3 3', '7 4 1', '5 2 0', '2 3 3', '2 4 0', '0 1 3', '4 0 0', '2 2 0', '5 2 0', '2 4 1', '5 2 0', '1 7 6', '5 2 0', '0 3 2', '</ s>', '7 4 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '3 5 3', '0 3 2', '5 2 0', '5 2 0', '5 2 0', '</ s>', '7 4 0', '5 2 0', '0 9 0', '3 9 0', '3 3 7', '0 6 0', '5 2 0', '5 2 0', '2 3 3', '5 2 0', '4 0 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '1 7 5', '5 5 0', '2 3 1', '5 2 0', '2 2 0', '2 2 0', '5 2 0', '5 2 0', '1 7 6', '0 1 7', '5 2 0', '</ s>', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '</ s>', '5 2 0', '5 2 0', '0 0 2', '0 6 1', '0 3 3', '0 0 2', '5 2 0', '5 2 0', '0 3 3', '8 2 4', '</ s>', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '5 2 0', '0 5 5', '0 6 6', '4 3 5', '5 2 0', '5 2 0', '5 2 0', '1 7 6', '3 8 2', '2 3 3', '5 2 0', '4 0 0', '5 2 0', '5 2 0',

Rev_Quadgram_model: MLE  identified Answers: ['0 3 2', '</ s>', '0 0 2', '2 2 6', '2 2 0', '0 3 2', '5 2 0', '1 7 6', '7 4 0', '5 2 0', '1 7 6', '0 3 3', '5 2 0', '0 3 2', '5 2 0', '7 4 0', '0 3 2', '5 2 0', '7 4 1', '5 2 0', '8 6 1', '4 1 5', '2 2 0', '3 5 0', '0 3 2', '2 2 0', '1 7 6', '0 4 9', '1 7 6', '1 7 6', '1 5 8', '8 3 2', '</ s>', '5 0 3', '2 2 0', '1 7 6', '7 4 0', '3 5 3', '0 3 2', '0 0 2', '0 6 0', '1 7 6', '</ s>', '0 0 1', '5 2 0', '2 2 6', '0 3 3', '2 2 0', '5 2 0', '0 6 1', '5 2 0', '5 2 0', '1 7 6', '0 0 2', '1 7 6', '1 7 6', '1 7 6', '5 2 0', '5 2 0', '</ s>', '7 0 5', '1 7 5', '5 2 0', '5 2 0', '0 3 3', '5 2 0', '2 2 0', '0 3 3', '0 3 3', '0 3 2', '1 7 6', '7 4 0', '5 2 0', '5 2 0', '0 1 7', '5 2 0', '3 4 2', '5 2 0', '5 2 0', '3 5 0', '1 7 6', '1 9 0', '</ s>', '5 2 0', '0 3 3', '0 0 2', '5 2 0', '5 2 0', '0 3 3', '8 2 4', '</ s>', '5 2 0', '7 0 5', '7 0 5', '1 7 6', '5 2 0', '5 2 0', '1 7 6', '0 7 0', '8 1 7', '5 2 0', '5 2 0', '3 8 2', '0 3 2', '1 7 6', '1 7 6', 

Rev_Quadgram_model: KneserNeyInterpolated  identified Answers: ['0 3 2', '6 2 6', '0 6 0', '0 6 0', '2 2 0', '0 3 2', '1 3 1', '1 7 6', '1 2 5', '1 3 1', '2 2 0', '0 3 3', '1 3 1', '0 3 2', '1 3 1', '7 4 0', '0 3 2', '1 3 1', '7 4 1', '1 3 1', '8 6 1', '0 0 2', '0 3 3', '3 5 0', '0 3 2', '1 4 0', '2 2 0', '0 7 0', '1 0 1', '2 2 0', '1 5 3', '8 3 2', '3 9 0', '3 5 2', '0 6 0', '2 2 0', '1 2 5', '3 5 0', '0 3 2', '0 0 2', '0 6 0', '2 2 0', '4 0 0', '0 3 1', '1 3 1', '0 6 0', '0 3 3', '0 6 0', '1 3 1', '0 6 0', '1 3 1', '1 3 1', '2 2 0', '0 0 2', '2 2 0', '2 2 0', '2 2 0', '1 3 1', '1 3 1', '4 0 0', '5 0 3', '1 7 5', '1 3 1', '1 3 1', '0 7 0', '1 3 1', '0 6 0', '0 3 3', '0 3 3', '0 3 2', '2 2 0', '2 3 5', '1 3 1', '1 3 1', '7 0 3', '1 3 1', '3 4 2', '1 3 1', '1 3 1', '3 5 2', '2 2 0', '1 9 0', '3 9 0', '5 2 0', '0 3 3', '0 0 2', '1 3 1', '1 3 1', '0 3 3', '8 2 4', '3 9 0', '1 3 1', '5 0 3', '5 0 3', '2 2 0', '1 3 1', '1 3 1', '2 2 0', '0 7 2', '8 1 7', '1 3 1', '1 3 1', '3 8 2', '0 3 2', 

Rev_Quadgram_model: Laplace  identified Answers: ['0 3 2', '6 8 3', '0 0 2', '1 9 1', '2 2 0', '0 3 2', '5 0 1', '1 7 6', '7 4 0', '5 0 1', '1 7 6', '0 3 3', '5 0 1', '0 3 2', '5 0 1', '7 4 0', '0 3 2', '5 0 1', '7 4 1', '5 0 1', '8 6 1', '4 1 5', '1 6 8', '3 5 0', '0 3 2', '1 9 3', '1 7 6', '0 6 1', '1 7 6', '1 7 6', '1 5 8', '8 3 2', '</ s>', '4 9 5', '1 5 4', '1 7 6', '7 4 0', '3 5 0', '0 3 2', '0 0 2', '0 6 0', '1 7 6', '9 9 9', '0 0 1', '5 0 1', '1 9 1', '0 3 3', '2 2 0', '5 0 1', '0 6 1', '5 0 1', '5 0 1', '1 7 6', '0 0 2', '1 7 6', '1 7 6', '1 7 6', '5 0 1', '5 0 1', '9 9 9', '7 0 0', '1 7 5', '5 0 1', '5 0 1', '0 3 3', '5 0 1', '1 5 4', '0 3 3', '0 3 3', '0 3 2', '1 7 6', '2 4 0', '5 0 1', '5 0 1', '0 1 7', '5 0 1', '3 4 2', '5 0 1', '5 0 1', '3 5 0', '1 7 6', '1 9 0', '</ s>', '5 2 0', '0 3 3', '0 0 2', '5 0 1', '5 0 1', '0 3 3', '8 2 4', '</ s>', '5 0 1', '7 0 0', '7 0 0', '1 7 6', '5 0 1', '5 0 1', '1 7 6', '0 7 0', '8 1 7', '5 0 1', '5 0 1', '3 8 2', '0 3 2', '1 7 6', '1 7 

Rev_Quadgram_model: Lidstone  identified Answers: ['0 3 2', '4 1 6', '0 6 0', '1 0 0', '1 4 0', '0 3 1', '4 1 5', '1 7 6', '3 5 1', '4 1 5', '1 7 6', '0 3 3', '4 1 5', '0 3 2', '4 1 5', '7 4 0', '0 3 2', '4 1 5', '7 4 1', '4 1 5', '8 6 1', '0 0 2', '0 6 0', '3 5 0', '0 3 2', '1 4 0', '1 7 6', '0 6 1', '1 7 6', '1 7 6', '1 5 6', '8 3 2', '6 9 0', '3 5 2', '0 6 3', '1 7 6', '3 5 1', '0 6 0', '0 3 2', '0 5 5', '0 5 5', '1 7 6', '7 0 0', '0 3 1', '4 1 5', '1 0 0', '0 3 3', '1 5 6', '4 1 5', '0 9 0', '4 1 5', '4 1 5', '1 7 6', '0 0 2', '1 7 6', '1 7 6', '1 7 6', '4 1 5', '4 1 5', '7 0 0', '6 0 5', '1 7 5', '4 1 5', '4 1 5', '0 3 3', '4 1 5', '0 6 3', '0 3 3', '0 3 3', '0 3 2', '1 7 6', '2 3 5', '4 1 5', '4 1 5', '0 3 1', '4 1 5', '3 4 2', '4 1 5', '4 1 5', '3 5 0', '1 7 6', '1 9 0', '6 9 0', '5 2 0', '0 3 3', '0 0 2', '4 1 5', '4 1 5', '0 3 3', '8 2 4', '6 9 0', '4 1 5', '6 0 5', '6 0 5', '1 7 6', '4 1 5', '4 1 5', '1 7 6', '0 7 0', '8 1 7', '4 1 5', '4 1 5', '3 8 2', '0 3 2', '1 7 6', '1 7

Rev_Quadgram_model: StupidBackoff  identified Answers: ['0 3 2', '</ s>', '0 0 2', '2 2 6', '2 2 0', '0 3 2', '5 2 0', '1 7 6', '7 4 0', '5 2 0', '1 7 6', '0 3 3', '5 2 0', '0 3 2', '5 2 0', '7 4 0', '0 3 2', '5 2 0', '7 4 1', '5 2 0', '8 6 1', '4 1 5', '2 2 0', '3 5 0', '0 3 2', '2 2 0', '1 7 6', '0 4 9', '1 7 6', '1 7 6', '1 5 8', '8 3 2', '</ s>', '5 0 3', '2 2 0', '1 7 6', '7 4 0', '3 5 3', '0 3 2', '0 0 2', '0 6 0', '1 7 6', '</ s>', '0 0 1', '5 2 0', '2 2 6', '0 3 3', '2 2 0', '5 2 0', '0 6 1', '5 2 0', '5 2 0', '1 7 6', '0 0 2', '1 7 6', '1 7 6', '1 7 6', '5 2 0', '5 2 0', '</ s>', '7 0 5', '1 7 5', '5 2 0', '5 2 0', '0 3 3', '5 2 0', '2 2 0', '0 3 3', '0 3 3', '0 3 2', '1 7 6', '7 4 0', '5 2 0', '5 2 0', '0 1 7', '5 2 0', '3 4 2', '5 2 0', '5 2 0', '3 5 0', '1 7 6', '1 9 0', '</ s>', '5 2 0', '0 3 3', '0 0 2', '5 2 0', '5 2 0', '0 3 3', '8 2 4', '</ s>', '5 2 0', '7 0 5', '7 0 5', '1 7 6', '5 2 0', '5 2 0', '1 7 6', '0 7 0', '8 1 7', '5 2 0', '5 2 0', '3 8 2', '0 3 2', '1 7 6',

Rev_Quadgram_model: WittenBellInterpolated  identified Answers: ['0 3 2', '</ s>', '0 0 2', '2 3 5', '2 3 1', '0 3 2', '5 2 0', '1 7 6', '7 4 0', '5 2 0', '1 7 6', '0 3 3', '5 2 0', '0 3 2', '5 2 0', '7 4 0', '0 3 2', '5 2 0', '7 4 1', '5 2 0', '8 6 1', '4 1 5', '2 2 0', '3 5 0', '0 3 2', '2 2 0', '1 7 6', '0 6 1', '1 7 6', '1 7 6', '1 5 8', '8 3 2', '</ s>', '5 0 3', '2 2 0', '1 7 6', '7 4 0', '3 5 3', '0 3 2', '0 0 2', '4 4 0', '1 7 6', '</ s>', '0 0 1', '5 2 0', '2 3 5', '0 3 3', '2 2 0', '5 2 0', '0 9 0', '5 2 0', '5 2 0', '1 7 6', '0 0 2', '1 7 6', '1 7 6', '1 7 6', '5 2 0', '5 2 0', '</ s>', '7 0 5', '1 7 5', '5 2 0', '5 2 0', '0 3 3', '5 2 0', '2 2 0', '0 3 3', '0 3 3', '0 3 3', '1 7 6', '7 4 0', '5 2 0', '5 2 0', '0 1 7', '5 2 0', '3 4 2', '5 2 0', '5 2 0', '3 5 0', '1 7 6', '1 9 0', '</ s>', '5 2 0', '0 3 3', '0 0 2', '5 2 0', '5 2 0', '0 3 3', '8 2 4', '</ s>', '5 2 0', '7 0 5', '7 0 5', '1 7 6', '5 2 0', '5 2 0', '1 7 6', '0 7 0', '8 1 7', '5 2 0', '5 2 0', '3 8 2', '0 3 2',

## Other Tests

In [44]:
def flatten(t):
    return [item for sublist in t for item in sublist]

i=0
for text in df[df.standardized_text!=''].standardized_text:
    #print(flatten(nltk.ngrams(text,2)))
    i=i+1

In [45]:
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
bi_dict = dict()
bg_measures = BigramAssocMeasures()
for text in df[df.standardized_text!=''].standardized_text:
    words = nltk.word_tokenize(text)
    print(words)
    bi_finder = BigramCollocationFinder.from_words(words)
    bi_finder
    bi_collocs = bi_finder.nbest(bg_measures.likelihood_ratio, 10)
    #print(bi_collocs)
    for colloc in bi_collocs:
        print(colloc)
        bi_dict[colloc] += 1

['410', '017']
('410', '017')


KeyError: ('410', '017')

In [None]:

unique_frequencies = dict()
total_frequencies = dict()
for text in df[df.standardized_text!=''].standardized_text:
    words = nltk.word_tokenize(text)
    fdist = nltk.FreqDist(words)
    for word, freq in fdist.most_common(50):
        total_frequencies[word] += freq # total count
        unique_frequencies[word] += 1 # unique count

In [None]:
bigrams_series = (pd.Series(flatten(nltk.ngrams(tokenized_text, 2))).value_counts())[:10]
trigrams_series = (pd.Series(flatten(nltk.ngrams(tokenized_text, 3))).value_counts())[:10]
quadgrams_series = (pd.Series(flatten(nltk.ngrams(tokenized_text, 4))).value_counts())[:10]

In [None]:
bigrams_series.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
bigrams_series.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
plt.title('10 Most Frequently Occuring Bigrams')
plt.ylabel('Bigram')
plt.xlabel('# of Occurances')