# Indus Valley Scripts - ICIT coded Text Analysis for Decipherment

# Language Model Development

Dataset was created as a csv file from ICIT web site from raw html files for each for the Text
Data labels were changes and a linearized copy of the original text was added

icit_text_text_corpus.csv


!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install scikit-learn
!pip install nltk
!pip install ipywidgets
!pip install -U dill
!pip3 install requests
!pip3 install -U spacy

In [1]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import collections
import random
import traceback

plt.style.use(style='seaborn')
%matplotlib inline

In [2]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from collections import defaultdict

In [3]:
drop_duplicate_texts = True

# Set the filters on data here
filter_by_site = False
filter_by_keywords = False
filter_by_text_length= False

site = 'Mohenjo-daro'
#site = 'Harappa'
#site = 'Dholavira'
#site = 'Rakhigarhi'
#keyword = "Bull"
#keyword = "Gaur"

min_text_length=1
max_text_length=50

num_rows_text_corpus= 4999

In [4]:
# Read the signs
orig_sign_df=pd.read_csv('../../IndusCorpusUtils/data/icit_corpus/icit_sign_corpus.csv',dtype=str)
# set the max columns to none
pd.set_option('display.max_columns', None)

orig_sign_df

Unnamed: 0,id_sign,sign_class,set,graph,type,image,variants,function,ligatur,value,frequency,comment
0,1,SIM,01,stroke,stroke,sign001.jpg,1,"NUM, ITM, SHN",-,-,227,-
1,2,MKR,01,stroke,stroke,sign002.jpg,1,"ITM, SHN, EMS",-,-,865,-
2,3,SIM,01,stroke,stroke,sign003.jpg,1,"NUM, SHN",-,-,260,-
3,4,SIM,01,stroke,stroke,sign004.jpg,1,"NUM, SHN",-,-,99,-
4,5,SIM,01,stroke,stroke,sign005.jpg,1,"NUM, SHN",-,-,49,-
...,...,...,...,...,...,...,...,...,...,...,...,...
704,952,CMX,71,animal,uncertain,sign952.jpg,1,LFS,-,-,1,-
705,953,CMX,71,animal,Pict,sign953.jpg,1,LFS,-,-,1,-
706,956,SIM,71,-,att.d.e,sign956.jpg,1,LOG,-,-,2,-
707,957,CMX,71,-,uncertain,sign957.jpg,1,LOG,-,-,2,-


In [5]:
# Read the Text Corpus
orig_df=pd.read_csv('../../IndusCorpusUtils/data/icit_corpus/icit_text_text_corpus.csv',dtype=str, nrows=num_rows_text_corpus)
# set the max columns to none
pd.set_option('display.max_columns', None)


In [6]:
#Reverse text and add that as a new column
# Add text length as a column
list_reversed_text = []
for text in orig_df[orig_df.l_to_r_text!=''].l_to_r_text:
    # Tokenize to words
    # first split the string into chars
    chars = text.split(' ')
    length = len(chars)
    # then reverse the split string list and join with a space
    reversed_text = ' '.join(reversed(chars))
    list_reversed_text.append(reversed_text)
    
orig_df['reversed_text']= list_reversed_text #same as r_to_l text
orig_df['text_length']= orig_df['l_to_r_text'].str.len().div(3).round()

print("Dataframe has ", len(orig_df.index), " rows")

orig_df.head()

Dataframe has  4999  rows


Unnamed: 0,icit_id,site,keywords,text_class,lines,direction,text,signs,complete,alignment,sign height,text_images,linearized_text,l_to_r_text,r_to_l_text,reversed_text,text_length
0,1,Alamgirpur,,SS,1,L/R,+410-017+,2,Y,Unordered,Unequal,,410 017,410 017,017 410,017 410,2.0
1,2,Alamgirpur,,SS,1,L/R,+410-017+,2,Y,,,,410 017,410 017,017 410,017 410,2.0
2,3,Alamgirpur,,SC,1,L/R,+405-017+,2,Y,,,,405 017,405 017,017 405,017 405,2.0
3,4,Allahdino,,??,1,,+220-000+,1,N,,,,220 000,000 220,000 220,220 000,2.0
4,5,Allahdino,Bull,UC,1,R/L,+740-235+,2,Y,,,,740 235,235 740,740 235,740 235,2.0


In [7]:
# Retain texts that are only wanted

#remove the values where the text is unclear
df = orig_df[orig_df['l_to_r_text'].str.contains('000') == False] 

print("After removing unclear texts, we have ", len(df.index), " rows")

if(drop_duplicate_texts):
    #Remove out duplicate inplace
    df = df.drop_duplicates(subset ="text",
                         keep = False, inplace = False)

    print("After removing duplicate texts, we have ", len(df.index), " rows")


#keep only the values that does not have multi-line text
df = df[df['text'].str.contains('/') == False] 

print("After removing multi-line text, we have ", len(df.index), " rows")


#keep only the values where the direction is known (if direction does not have a /, i,e. L/R or R/L)
#Btw standardized_text is Left to right as in English
df = df[df['direction'].str.contains('/') == True] 

print("After keeping only text with known direction, we have ", len(df.index), " rows")

#Remove Multipart texts that have [ or ]
df = df[df['text'].str.contains("\[") == False] 
df = df[df['text'].str.contains("\]") == False] 

print("After keeping only text without multipart, we have ", len(df.index), " rows")

After removing unclear texts, we have  3945  rows
After removing duplicate texts, we have  2130  rows
After removing multi-line text, we have  2053  rows
After keeping only text with known direction, we have  1813  rows
After keeping only text without multipart, we have  1566  rows


In [8]:
# of those whose direction is know print out L/R and L/R text count
df_l_r = df[df['direction'].str.contains('L/R') == True] 

print("L/R texts: ", len(df_l_r.index))

df_r_l = df[df['direction'].str.contains('R/L') == True]

print("R/L texts: ", len(df_r_l.index))

L/R texts:  58
R/L texts:  1502


In [9]:
if(filter_by_site==True):
    #keep only the values that matches the provided site
    df = df[df['site'].str.contains(site) == True] 
    print("After filtering by site ", site, " it has ", len(df.index), " rows")

if(filter_by_keywords==True):
     #keep only the values that matches the provided keyword
    df = df[df['keywords'].str.contains(keyword) == True] 
    print("After filtering by keywords ", keyword, " it has ", len(df.index), " rows")

if(filter_by_text_length==True):
    df = df[(df['text_length'] > min_text_length) & (df['text_length']< max_text_length)]
    print("After filtering by text_length ",  " it has ", len(df.index), " rows")
    
print(df)


     icit_id          site keywords text_class lines direction  \
4          5     Allahdino     Bull         UC     1       R/L   
6          7     Allahdino    Bull1         SC     1       R/L   
7          8     Allahdino    Bull1         PP     1       R/L   
8          9     Allahdino    Bull1         LP     1       R/L   
9         10     Allahdino    Bull1         SS     1       R/L   
...      ...           ...      ...        ...   ...       ...   
4949    4030  Mohenjo-daro      NaN         SC     1       R/L   
4952    4032  Mohenjo-daro      NaN         UC     1       R/L   
4959    4039  Mohenjo-daro      NaN         SC     1       R/L   
4967    4047  Mohenjo-daro      NaN         SC     1       R/L   
4985    4059       Harappa      NaN         SP     1       R/L   

                           text signs complete alignment sign height  \
4                     +740-235+     2        Y       NaN         NaN   
6             +368-390-125-033+     4        Y    Linear       

In [10]:
#Keep the items with unclear text in another dataframe
df_unclear = orig_df[orig_df['l_to_r_text'].str.contains('000') == True]

#Note: Lot of the text with unclear text have direction empty

### Feature Extraction

In [11]:
y=df['site'].values
y.shape

# y axis is still the same
y_rev=df['site'].values
y_rev.shape

(1566,)

In [12]:
x=df['l_to_r_text'].values
x.shape

x_rev=df['reversed_text'].values
x_rev.shape

(1566,)

### Train-test split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
(x_train,x_test,y_train,y_test)=train_test_split(x,y,test_size=0.1, random_state=43)

(x_rev_train,x_rev_test,y_rev_train,y_rev_test)=train_test_split(x_rev,y_rev,test_size=0.1, random_state=43)

In [15]:
#Train data, fwd and reverse
df_train_x=pd.DataFrame(x_train)
df_train_x=df_train_x.rename(columns={0:'l_to_r_text'})

df_train_y=pd.DataFrame(y_train)
df_train_y=df_train_y.rename(columns={0:'site'})

#rev
df_train_x_rev=pd.DataFrame(x_rev_train)
df_train_x_rev=df_train_x_rev.rename(columns={0:'reversed_text'})

df_train_y_rev=pd.DataFrame(y_rev_train)
df_train_y_rev=df_train_y_rev.rename(columns={0:'site'})

#Test data, fwd and reverse
df_test_x=pd.DataFrame(x_test)
df_test_x=df_test_x.rename(columns={0:'l_to_r_text'})

df_test_y=pd.DataFrame(y_test)
df_test_y=df_test_y.rename(columns={0:'site'})

#rev
df_test_x_rev=pd.DataFrame(x_rev_test)
df_test_x_rev=df_test_x_rev.rename(columns={0:'reversed_text'})

df_test_y_rev=pd.DataFrame(y_rev_test)
df_test_y_rev=df_test_y_rev.rename(columns={0:'site'})

In [16]:
df_train=pd.concat([df_train_x,df_train_y],axis=1)
print(df_train.head())

df_test=pd.concat([df_test_x,df_test_y],axis=1)
print(df_test.head())


df_train_rev=pd.concat([df_train_x_rev,df_train_y_rev],axis=1)
print(df_train_rev.head())

df_test_rev=pd.concat([df_test_x_rev,df_test_y_rev],axis=1)
print(df_test_rev.head())

                   l_to_r_text          site
0  850 092 741 838 798 740 621  Mohenjo-daro
1                  820 798 740       Harappa
2              255 204 540 740       Harappa
3                  231 233 804  Mohenjo-daro
4      140 900 390 741 838 740  Mohenjo-daro
                   l_to_r_text          site
0                  746 130 400  Mohenjo-daro
1      861 002 705 255 740 090  Mohenjo-daro
2              244 065 880 820  Mohenjo-daro
3  747 717 095 595 001 142 617  Mohenjo-daro
4                      097 161  Mohenjo-daro
                 reversed_text          site
0  621 740 798 838 741 092 850  Mohenjo-daro
1                  740 798 820       Harappa
2              740 540 204 255       Harappa
3                  804 233 231  Mohenjo-daro
4      740 838 741 390 900 140  Mohenjo-daro
                 reversed_text          site
0                  400 130 746  Mohenjo-daro
1      090 740 255 705 002 861  Mohenjo-daro
2              820 880 065 244  Mohenjo-daro
3  617 142

### n-gram Models

In [17]:
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.lm.models import MLE
from nltk.lm.models import KneserNeyInterpolated

In [18]:
random_seed = 8
seed = 8
CONST_BEGINNING = "Beginning"
CONST_TERMINAL = "Terminal"
CONST_MEDIAL = "Medial"
CONST_NL = 10

In [19]:
def reverse_text(text):
    # first split the string into chars
    chars = text.split(' ')

    # then reverse the split string list and join with a space
    reversed_text = ' '.join(reversed(chars))
    return reversed_text

In [20]:
# Get ngram as list given a text (pass direction_of_string as "R/L" or "L/R")
def get_ngrams_as_list(data,direction_of_string,num):
    
    if(direction_of_string=="R/L"):
        # We need to convert R/L text to L/R to be able to get ngrams using nltk
        data_string = reverse_text(data)
    
    else: data_string= data
    
    n_grams =  ngrams(nltk.word_tokenize(data_string), num)
    return  [ ' '.join(grams) for grams in n_grams]

# Text Positional Ngram Analysis Model

In [21]:
from nltk.tokenize import word_tokenize,sent_tokenize
from typing import Dict
from bisect import bisect
from itertools import accumulate

class PositionalNgramModel:
    text_norm_position_unigrams_freq_dict=None
    df_text_norm_position_unigrams_freq=None
    df_text_norm_position_unigrams_prob=None
    text_norm_position_unigrams_maxValueIndex_dict = None
    text_norm_position_unigrams_unigrams_tuple = ()
    
    text_norm_position_bigrams_freq_dict = None
    df_text_norm_position_bigrams_freq=None
    df_text_norm_position_bigrams_prob=None
    text_norm_position_bigrams_maxValueIndex_dict = None
    
    text_norm_position_trigrams_freq_dict = None
    df_text_norm_position_trigrams_freq=None
    df_text_norm_position_trigrams_prob=None
    text_norm_position_trigrams_maxValueIndex_dict = None
    
    text_norm_position_quadgrams_freq_dict= None
    df_text_norm_position_quadgrams_freq=None
    df_text_norm_position_quadgrams_prob=None
    text_norm_position_bigrams_maxValueIndex_dict = None
    

    def _random_generator(self, seed_or_generator):
        if isinstance(seed_or_generator, random.Random):
            return seed_or_generator
        return random.Random(seed_or_generator)
    
    def _weighted_choice(self, population, weights, random_generator=None):
        """Like random.choice, but with weights.
        Heavily inspired by python 3.6 `random.choices`.
        """
        if not population:
            raise ValueError("Can't choose from empty population")
        if len(population) != len(weights):
            raise ValueError("The number of weights does not match the population")
        cum_weights = list(accumulate(weights))
        total = cum_weights[-1]
        threshold = random_generator.random()
        return population[bisect(cum_weights, total * threshold)]
    
    # To get a Positional Number for a unigram, bigram, trigram and Quadgram
    # l = Length of the text
    # s = starting position of the first character of the ngram
    # wp = weighted position = int(s/l)
    # Get the normalized position
    # Since the text passed is L/R,start index i from length of the text
    def get_norm_position(self, text,direction_of_string, search_string):
        try:
            num_chars, minp, maxp,w = 0,-1,-1,0
            sp=-1
            index=-1
            pos=-1

            #ngrams which are used as search_strings are generated from left to right, so text need to be changed to L/R

            if(direction_of_string=="R/L"):
                new_text = reverse_text(text)
            else: new_text = text


            l_pos= len(new_text )

            chars = new_text .split(' ')
            num_chars = len(chars)
            w= num_chars/CONST_NL


            pos = new_text.find(search_string)
            if(pos<0):  #not found
                return sp, minp, maxp, w
            else: 
                #sp = round((l_pos-pos)/4)
                sp = round(pos/4 +1)

            minp= int((sp-1)* (CONST_NL/num_chars) +1)
            maxp = int(sp*CONST_NL/num_chars)

            #print("text, search_string", new_text ,",", search_string)
            #print("l_pos, pos, sp, minp, maxp", l_pos, pos,sp, minp, maxp)

            if(minp<1): minp=1
            elif(minp>CONST_NL): minp = CONST_NL

            if(maxp<1): maxp=1
            elif(maxp>CONST_NL): maxp = CONST_NL

        except Exception as e:
            print("Exception:", e.__class__, "get_norm_position")
            traceback.print_exc()

        return sp,minp, maxp, w


    def get_text_norm_position_ngrams_freq(self,a,direction_of_string, n):

        average_allocation= True # Average out minp and maxp and allocate to the histogram. Don't care about weight

        text_norm_position_ngrams_freq=defaultdict(dict[int, int])
        norm_pos =-1
        i=0
        sp=-1

        try:
            #Fill the values with zeros in decreasing order so that we can keep same order in graph
            for text in a:
              for word in get_ngrams_as_list(text,direction_of_string, n):
                for j in range (10,0, -1):
                    text_norm_position_ngrams_freq[word][j]=0

            for text in a:
              for word in get_ngrams_as_list(text,direction_of_string, n):
                beginning_char = word #Look for position of beginning char of the ngram
                sp,minp, maxp, w = self.get_norm_position(text,direction_of_string, beginning_char)

                if(minp<0 or maxp<0): break #No match


                if(average_allocation):
                    norm_pos = round((minp+ maxp)/2)
                    #print("word, norm_pos", word, norm_pos)
                    val = text_norm_position_ngrams_freq[word].get(norm_pos,0)
                    text_norm_position_ngrams_freq[word][norm_pos]= val+ w
                else:

                    if(minp!=maxp):
                        val = text_norm_position_ngrams_freq[word].get(minp,0)
                        text_norm_position_ngrams_freq[word][minp]= val+ w

                        val = text_norm_position_ngrams_freq[word].get(maxp,0)
                        text_norm_position_ngrams_freq[word][maxp]= val +w
                    else:
                        val = text_norm_position_ngrams_freq[word].get(minp,0)
                        text_norm_position_ngrams_freq[word][minp]= val+ w

                #if(beginning_char=="740"): print(i,":Text:",text, "Char", beginning_char, "sp, minp, maxp, w, norm_pos", sp, minp, maxp, w, norm_pos)

                #if(beginning_char=="817 002"): print(i,":Text:",text, "Char:", beginning_char, sp, minp, maxp, w, norm_pos)

                i=i+1

        except Exception as e:
                print("Exception:", e.__class__, "get_text_norm_position_ngrams_freq")
                traceback.print_exc()

        return  text_norm_position_ngrams_freq

    # Can be generatilzed to any N later
    def fit(self, a, direction_of_string):
        try:
            print(a)

            self.text_norm_position_unigrams_freq_dict=defaultdict(dict[int, int])
            self.text_norm_position_unigrams_freq_dict=self.get_text_norm_position_ngrams_freq(a,direction_of_string, 1)
            self.df_text_norm_position_unigrams_freq=pd.DataFrame(self.text_norm_position_unigrams_freq_dict)

            self.df_text_norm_position_unigrams_prob = self.df_text_norm_position_unigrams_freq.div(self.df_text_norm_position_unigrams_freq.sum(axis=1), axis=0)
            self.df_text_norm_position_unigrams_prob = self.df_text_norm_position_unigrams_prob.assign(max_value=self.df_text_norm_position_unigrams_prob.values.max(1))
            df_text_norm_position_unigrams_maxValueIndex = self.df_text_norm_position_unigrams_prob.idxmax(axis = 1)
            self.text_norm_position_unigrams_maxValueIndex_dict = df_text_norm_position_unigrams_maxValueIndex.to_dict()

            # Build the unigrams tuple
            for col in self.df_text_norm_position_unigrams_prob:
                self.text_norm_position_unigrams_unigrams_tuple =  self.text_norm_position_unigrams_unigrams_tuple + (col,)

            sorted(self.text_norm_position_unigrams_unigrams_tuple)

            #print(self.df_text_norm_position_unigrams_freq)
            #print(self.df_text_norm_position_unigrams_prob)
            #print(self.text_norm_position_unigrams_maxValueIndex_dict)

            #Bigram
            self.text_norm_position_bigrams_freq_dict=defaultdict(dict[int, int])
            self.text_norm_position_bigrams_freq_dict=self.get_text_norm_position_ngrams_freq(a,direction_of_string,2)
            self.df_text_norm_position_bigrams_freq=pd.DataFrame(self.text_norm_position_bigrams_freq_dict)
            self.df_text_norm_position_bigrams_prob = self.df_text_norm_position_bigrams_freq.div(self.df_text_norm_position_bigrams_freq.sum(axis=1), axis=0)
            #print(self.df_text_norm_position_bigrams_freq)

            #Trigram
            self.text_norm_position_trigrams_freq_dict=defaultdict(dict[int, int])
            self.text_norm_position_trigrams_freq_dict=self.get_text_norm_position_ngrams_freq(a,direction_of_string,3)
            self.df_text_norm_position_trigrams_freq=pd.DataFrame(self.text_norm_position_trigrams_freq_dict)
            self.df_text_norm_position_trigrams_prob = self.df_text_norm_position_trigrams_freq.div(self.df_text_norm_position_trigrams_freq.sum(axis=1), axis=0)
            #print(self.df_text_norm_position_trigrams_freq)

            #Quadgram
            self.text_norm_position_quadgrams_freq_dict=defaultdict(dict[int, int])
            self.text_norm_position_quadgrams_freq_dict=self.get_text_norm_position_ngrams_freq(a,direction_of_string,4)
            self.df_text_norm_position_quadgrams_freq=pd.DataFrame(self.text_norm_position_quadgrams_freq_dict)
            self.df_text_norm_position_quadgrams_prob = self.df_text_norm_position_quadgrams_freq.div(self.df_text_norm_position_quadgrams_freq.sum(axis=1), axis=0)
            #print(df_text_norm_position_quadgrams_freq)
        
        except Exception as e:
            print("Exception:", e.__class__, "fit")
            traceback.print_exc()
        
        return self.df_text_norm_position_unigrams_prob, self.df_text_norm_position_bigrams_prob, self.df_text_norm_position_trigrams_prob,self.df_text_norm_position_quadgrams_prob

    def get_text_norm_position_unigrams_char_with_max_prob(self, position):
        try:
            char= self.text_norm_position_unigrams_maxValueIndex_dict[position]
            print("Max Prob for Position:", position, "is for character:",char )
        
        except Exception as e:
            print("Exception:", e.__class__, "get_text_norm_position_unigrams_char_with_max_prob")
            traceback.print_exc()
        return char
    
    def get_text_norm_position_unigrams_char_prob(self, position, word):
       
        prob=0
        try:
            prob = self.df_text_norm_position_unigrams_prob.loc[position,word]
        except Exception as e:
            print("Exception:", e.__class__, "get_text_norm_position_unigrams_char_prob")
            traceback.print_exc()
        return prob
    
    def get_text_norm_position_unigrams_char_with_weighted_prob(self, position):

        try:
            random_generator = self._random_generator(random_seed)
            #print(tuple(self.get_text_norm_position_unigrams_char_prob(position, w) for w in  self.text_norm_position_unigrams_unigrams_tuple)) 

            return self._weighted_choice(
                    self.text_norm_position_unigrams_unigrams_tuple,
                    tuple(self.get_text_norm_position_unigrams_char_prob(position, w) for w in  self.text_norm_position_unigrams_unigrams_tuple),
                    random_generator,
                )
        except Exception as e:
            print("Exception:", e.__class__, "get_text_norm_position_unigrams_char_with_weighted_prob")
            traceback.print_exc()
    
    def find_characters(self, a, seed):
        #For each of the text with unclear character go through it
        ans=-1
        out_char = None
        out_char_list = []
        try:
            for text in a :
                print("Finding Unclear character for", text)
                chars = text.split(' ')
                index = len(chars) - chars.index("000")
                if(index<0):  #not found
                    return 
                else: 
                    # Find the char with highest probablity for this position
                    print(index)
                    out_char = self.get_text_norm_position_unigrams_char_with_max_prob(index)
                    print("Index, out_char:", index, out_char)
                    out_char_list.append(out_char)
    
        except Exception as e:
            print("Exception:", e.__class__, "find_characters")
            
        return  out_char_list

## Text Positional Analysis - Basic check

In [22]:
#a= df_train_x_rev[df_train_x_rev.reversed_text!=''].reversed_text

a= df_train_x[df_train_x.l_to_r_text!=''].l_to_r_text
posNgramModel = PositionalNgramModel()

#Train model with xtrain data
df_text_norm_position_unigrams_prob, df_text_norm_position_bigrams_prob, df_text_norm_position_trigrams_prob,df_text_norm_position_quadgrams_prob = posNgramModel.fit(a, "L/R")

#print(posNgramModel.get_text_norm_position_unigrams_char_prob(10, '720'))

print(posNgramModel.get_text_norm_position_unigrams_char_with_weighted_prob(1))
print(posNgramModel.get_text_norm_position_unigrams_char_with_weighted_prob(10))

0       850 092 741 838 798 740 621
1                       820 798 740
2                   255 204 540 740
3                       231 233 804
4           140 900 390 741 838 740
                   ...             
1404                692 060 003 426
1405                590 407 100 740
1406                    051 063 220
1407                        221 158
1408            371 407 001 142 617
Name: l_to_r_text, Length: 1409, dtype: object
920
740


## ModelWithFilling for ngram

In [23]:
from bisect import bisect
from itertools import accumulate
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize 

class ModelWithFilling:

    def __init__(self, this_model, posNgramModel=None, fill_using_position=True):
        self.model = this_model
        self.posNgramModel = posNgramModel
        self.fill_using_position = fill_using_position
        if(not this_model or not posNgramModel): print("Need non Empty Model for creating instance of ModelWithFilling")
        
    def _random_generator(self, seed_or_generator):
        if isinstance(seed_or_generator, random.Random):
            return seed_or_generator
        return random.Random(seed_or_generator)
    
    def _weighted_choice(self, population, weights, random_generator=None):
        """Like random.choice, but with weights.
        Heavily inspired by python 3.6 `random.choices`.
        """
        if not population:
            raise ValueError("Can't choose from empty population")
        if len(population) != len(weights):
            raise ValueError("The number of weights does not match the population")
        cum_weights = list(accumulate(weights))
        total = cum_weights[-1]
        threshold = random_generator.random()
        return population[bisect(cum_weights, total * threshold)]

        
    def get_model(self):
        return self.model
    
    # Redefine this function defined in NTLK with some changes
    def generate(self,num_words=1, text_seed=None, random_seed=None, position=-1):
        
        #print("Generating for text:", text_seed, "and position:", position)
        text_seed = [] if text_seed is None else list(text_seed)
        pos=-1

        random_generator = self._random_generator(random_seed)
        # This is the base recursion case.
        if num_words == 1:
            context = (
                text_seed[-self.model.order + 1 :]
                if len(text_seed) >= self.model.order
                else text_seed
            )
            samples = self.model.context_counts(self.model.vocab.lookup(context))   
            
            #print("____Context:", context, "Samples:", samples)
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.model.context_counts(self.model.vocab.lookup(context))
            # Sorting samples achieves two things:
            # - reproducible randomness when sampling
            # - turns Mapping into Sequence which `_weighted_choice` expects
            samples = sorted(samples)
            
            #if(len(context) < 1 ): 
                #print("@@@@@@@@@@@@@@@@@@@@@@@", "context:", context, "Must return Positional Prob for pos:", position)
                
            if(self.fill_using_position and self.posNgramModel and len(context)<1 and position!=-1):
                pos = self.posNgramModel.get_text_norm_position_unigrams_char_with_weighted_prob(position)
                print("Returning Positional Prob for pos:", position, ":", pos)
                return pos
            else: 
                #print("Before returning, Context:", context)
                #for w in samples:
                    #print("Before returning, Sample Item:", w, self.model.score(w, context))
                    
                return self._weighted_choice(
                samples,
                tuple(self.model.score(w, context) for w in samples),
                random_generator,
            )
        
        # We build up text one word at a time using the preceding context.
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated
    

    def generate_sent(self, num_words,char_seed, random_seed=42, position=-1):
        """
        :param model: An ngram language model.
        :param num_words: Max no. of words to generate.
        :param random_seed: Seed value for random.
        """
        content = []
        try:
            for token in self.generate(num_words, text_seed=char_seed, random_seed=random_seed, position=position):
                if token == '<s>':
                    continue
                if token == '</s>'  or token == '< /s>' :
                    break
                content.append(token)
        except Exception as e:
            print("Exception:", e.__class__, "in ModelwithFilling:generate_sent")
            traceback.print_exc()
        return detokenize(content)
    
    def get_missing(self, num_missing=1, input_text_ls=None, random_seed=None):
        if input_text_ls is None: return None
    
        # For not num_missing >1 is not supported
        if num_missing !=1: return None
        
        #input text will look like 'word1, word2, word3, _, word4, word5...'
        
        print("Model Order:", self.model.order)
        print("input_text_ls", input_text_ls)
        
        #find _ in list
        index_pos_list = [i for i in range(len(input_text_ls)) if input_text_ls[i] == '_' ]
        print('Indices of all occurrences of a "_" in the list are : ', index_pos_list)
        
        # We will fill only one missing word for now   
        # if _ is beginning of list, find bigram probablity of
        # All posibile bigram matches of word in vocabulary followed by input_text_ls[index_pos_list[0]+1]

                  
        if(len(index_pos_list)>0 and index_pos_list[0]==0):
            print("Missing character in the Beginning, trying that logic")
            # This is the left most word
            guesses_ls = []
            scores_tuple = ()
            match= False
            
            # For every word in the model vocab try the word for the missing word and see 
            # if it generates samples and therefor a positive probability.
            # If so, add the word and it probability in our list
            # Try this from highest order to lowest order. If not matched in higher order
            # check the lower orders
            
            samples = None
            input_text_ls = input_text_ls[1:] # Remove the first word which is _
            
            if len(input_text_ls) >= self.model.order: max_index=self.model.order
            elif len(input_text_ls)<1 :input_text_ls= None
            else: index = len(input_text_ls)
            
            
            print("input_text_ls[:1]", input_text_ls[:1])
            print("input_text_ls[:2]", input_text_ls[:2])
            print("input_text_ls[:3]", input_text_ls[:3])
            
            input_text_ls = input_text_ls[:1] 
            
            print("Lookup:", self.model.context_counts(self.model.vocab.lookup('850')))
                  
            while input_text_ls and index >=0 and not samples:
                print("to_be_matched_word:",input_text_ls)
                i=0
                for vocab_item in self.model.vocab:
                    i=i+1
                    if(len(input_text_ls)>=1):
                        this_guess= [vocab_item] #Try every item in the vocabulary
                        this_guess = (
                        this_guess
                        )
                        samples = self.model.context_counts(self.model.vocab.lookup(this_guess))

                        # See if the sample returned has the next word in the input_text

                        samples = sorted(samples)
                        to_be_matched_word= input_text_ls[0]
                        print("this_guess:", this_guess, "to_be_matched_word:",to_be_matched_word,"samples:", samples)

                        if(input_text_ls[0] in samples):
                            print("Found a match for:", input_text_ls, "guess:", this_guess,"samples:", samples)
                            match = True
                            indexes = [i for i,x in enumerate(samples) if x == to_be_matched_word]
                            this_guess_score= self.model.score(to_be_matched_word, vocab_item.split())
                            # store off context and this_guess_score
                            guesses_ls.append(this_guess)
                            scores_tuple + (this_guess_score,)

                           
                if(match==False): print("Did not have a match for",to_be_matched_word, "with any of the items in vocabulary")
                
                index=index-1
                input_text_ls = input_text_ls[:index] 
            
            if(match==False):
                print("No matches found. At this point use unigram probability and guess")
            
        else:
            print("Missing character is not passed or not in the Beginning")
            
        
    def fill_missing(self,num_missing=1, input_text=None, random_seed=None):
        
        if input_text is None: return None
    
        # For not num_missing >1 is not supported
        if num_missing !=1: return None
        
        input_text = list(input_text)

        random_generator = self._random_generator(random_seed)
        # This is the base recursion case.
        
        if num_missing == 1:
            context = (
                input_text[-self.model.order + 1 :]
                if len(input_text) >= self.model.order
                else input_text
            )
            samples = self.model.context_counts(self.model.vocab.lookup(context))
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.model.context_counts(self.model.vocab.lookup(context))
                print("Samples:", samples, "Context:", context)
            # Sorting samples achieves two things:
            # - reproducible randomness when sampling
            # - turns Mapping into Sequence which `_weighted_choice` expects
            samples = sorted(samples)
            #print(samples)
            print("tuple of samples:", tuple(self.model.score(w, context) for w in samples))
            return self._weighted_choice(
                samples,
                tuple(self.model.score(w, context) for w in samples),
                random_generator,
            )

        # We build up text one word at a time using the preceding context.
        generated = []
        for _ in range(num_missing):
            generated.append(
                self.fill_missing(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated
        

In [24]:
def generate_sent_old(model, num_words,char_seed, random_seed=8):
    """
    :param model: An ngram language model.
    :param num_words: Max no. of words to generate.
    :param random_seed: Seed value for random.
    """
    content = []
    try:
        if(model==None): print("No Model Passed **********")
        for token in model.generate(num_words, text_seed=char_seed, random_seed=random_seed):
            if token == '<s>':
                continue
            if token == '</s>'  or token == '< /s>' :
                break
            content.append(token)
    except Exception as e:
        print("Exception:", e.__class__, "in the generate_sent")
        traceback.print_exc()
    return detokenize(content)

In [87]:
#Tokenize the text

# If we need to generate ngrams from it from r to l text, ngrams would be in opposite direction, so
# use reversed text to generate tokenized_text (l to r) and regular text to generate reverse_tokenized_text (r to l)
    
tokenized_text = list(df_train_x[df_train_x.l_to_r_text!=''].l_to_r_text.apply(word_tokenize))
reverse_tokenized_text = list(df_train_x_rev[df_train_x_rev.reversed_text!=''].reversed_text.apply(word_tokenize))


#print("tokenized_text:",tokenized_text)
#print("Rev tokenized_text:",reverse_tokenized_text)

In [26]:
#%run "ICITTextAnalysis-Textual Analysis.ipynb"
#%run "ICITTextAnalysis-PositionalNgram Analysis.ipynb"


In [86]:
# Preprocess the tokenized text for n-grams language modeling

import array as arr

model_name_list = ["MLE","KneserNeyInterpolated", "Laplace", "Lidstone","StupidBackoff", "WittenBellInterpolated"]

train_data_list_fwd_unigram = [None,None, None, None, None,None]
padded_sents_list_fwd_unigram = [None,None, None, None, None,None]
train_data_list_rev_unigram = [None,None, None, None, None,None]
padded_sents_list_rev_unigram = [None,None, None, None, None,None]

train_data_list_fwd_bigram = [None,None, None, None, None,None]
padded_sents_list_fwd_bigram = [None,None, None, None, None,None]
train_data_list_rev_bigram = [None,None, None, None, None,None]
padded_sents_list_rev_bigram = [None,None, None, None, None,None]

train_data_list_fwd_trigram = [None,None, None, None, None,None]
padded_sents_list_fwd_trigram = [None,None, None, None, None,None]
train_data_list_rev_trigram = [None,None, None, None, None,None]
padded_sents_list_rev_trigram = [None,None, None, None, None,None]

train_data_list_fwd_quadgram = [None,None, None, None, None,None]
padded_sents_list_fwd_quadgram = [None,None, None, None, None,None]
train_data_list_rev_quadgram = [None,None, None, None, None,None]
padded_sents_list_rev_quadgram = [None,None, None, None, None,None]

train_data_list_fwd_pentagram = [None,None, None, None, None,None]
padded_sents_list_fwd_pentagram = [None,None, None, None, None,None]
train_data_list_rev_pentagram = [None,None, None, None, None,None]
padded_sents_list_rev_pentagram = [None,None, None, None, None,None]

train_data_rev_list = [None,None, None, None, None,None]
padded_sents_rev_list = [None,None, None, None, None,None]


for index in range (0,6):

    train_data_list_fwd_unigram[index], padded_sents_list_fwd_unigram[index] = padded_everygram_pipeline(1, tokenized_text)
    train_data_list_rev_unigram[index], padded_sents_list_rev_unigram[index] = padded_everygram_pipeline(1, reverse_tokenized_text)
    
    train_data_list_fwd_bigram[index], padded_sents_list_fwd_bigram[index] = padded_everygram_pipeline(2, tokenized_text)
    train_data_list_rev_bigram[index], padded_sents_list_rev_bigram[index] = padded_everygram_pipeline(2, reverse_tokenized_text)
    
    train_data_list_fwd_trigram[index], padded_sents_list_fwd_trigram[index] = padded_everygram_pipeline(3, tokenized_text)
    train_data_list_rev_trigram[index], padded_sents_list_rev_trigram[index] = padded_everygram_pipeline(3, reverse_tokenized_text)
    
    train_data_list_fwd_quadgram[index], padded_sents_list_fwd_quadgram[index] = padded_everygram_pipeline(4, tokenized_text)
    train_data_list_rev_quadgram[index], padded_sents_list_rev_quadgram[index] = padded_everygram_pipeline(4, reverse_tokenized_text)

    train_data_list_fwd_pentagram[index], padded_sents_list_fwd_pentagram[index] = padded_everygram_pipeline(5, tokenized_text)
    train_data_list_rev_pentagram[index], padded_sents_list_rev_pentagram[index] = padded_everygram_pipeline(5, reverse_tokenized_text)

    
print_train_data_details= False
#If you iterate through this, the iterator is done with and model 
# fitting won't work subsequently
# so set print_train_data_details= False before trying the actual model

if(print_train_data_details):
    for ngramlize_sent in train_data_list_fwd_quadgram[0]:
        print(list(ngramlize_sent))
        print()
    print('#############')
    list(padded_sents_list_fwd_quadgram[0])
    
if(print_train_data_details):
    for ngramlize_sent in train_data_list_rev_quadgram[0]:
        print(list(ngramlize_sent))
        print()
    print('#############')
    list(padded_sents_list_rev_quadgram[0])
    

In [28]:
# Train Unigram, Bigram, Trigram, Quadgram Models for both fwd text and reverse tex with the following
# models. Ignoring AbsoluteDiscountingInterpolated model
from nltk.lm.models import MLE
from nltk.lm.models import AbsoluteDiscountingInterpolated
from nltk.lm.models import KneserNeyInterpolated
from nltk.lm.models import Laplace
from nltk.lm.models import Lidstone
from nltk.lm.models import StupidBackoff
from nltk.lm.models import WittenBellInterpolated

#gamma=1
#order=1

model_MLE_list_fwd = []
model_KneserNeyInterpolated_list_fwd = []
model_Laplace_list_fwd = []
model_Lidstone_list_fwd = []
model_StupidBackoff_list_fwd = []
model_WittenBellInterpolated_list_fwd= []


model_MLE_list_rev = []
model_KneserNeyInterpolated_list_rev = []
model_Laplace_list_rev = []
model_Lidstone_list_rev = []
model_StupidBackoff_list_rev = []
model_WittenBellInterpolated_list_rev= []

for index in range(1, 6):
    model_MLE_list_fwd.append(MLE(index))
    model_KneserNeyInterpolated_list_fwd.append(KneserNeyInterpolated(index))
    model_Laplace_list_fwd.append(Laplace(index))
    model_Lidstone_list_fwd.append(Lidstone(index, index))
    model_StupidBackoff_list_fwd.append(StupidBackoff(index, index))
    model_WittenBellInterpolated_list_fwd.append(WittenBellInterpolated(index))
    
    
for index in range(1, 6):
    model_MLE_list_rev.append(MLE(index))
    model_KneserNeyInterpolated_list_rev.append(KneserNeyInterpolated(index))
    model_Laplace_list_rev.append(Laplace(index))
    model_Lidstone_list_rev.append(Lidstone(index, index))
    model_StupidBackoff_list_rev.append(StupidBackoff(index, index))
    model_WittenBellInterpolated_list_rev.append(WittenBellInterpolated(index))
    
models_list_fwd_unigram = [model_MLE_list_fwd[0] ,model_KneserNeyInterpolated_list_fwd[0] ,model_Laplace_list_fwd[0] , model_Lidstone_list_fwd[0] , model_StupidBackoff_list_fwd[0],model_WittenBellInterpolated_list_fwd[0]]
models_list_rev_unigram = [model_MLE_list_rev[0] ,model_KneserNeyInterpolated_list_rev[0] ,model_Laplace_list_rev[0] , model_Lidstone_list_rev[0] , model_StupidBackoff_list_rev[0], model_WittenBellInterpolated_list_rev[0]]

models_list_fwd_bigram = [model_MLE_list_fwd[1] ,model_KneserNeyInterpolated_list_fwd[1] ,model_Laplace_list_fwd[1] , model_Lidstone_list_fwd[1] , model_StupidBackoff_list_fwd[1],model_WittenBellInterpolated_list_fwd[1]]
models_list_rev_bigram = [model_MLE_list_rev[1] ,model_KneserNeyInterpolated_list_rev[1] ,model_Laplace_list_rev[1] , model_Lidstone_list_rev[1] , model_StupidBackoff_list_rev[1], model_WittenBellInterpolated_list_rev[1]]

models_list_fwd_trigram = [model_MLE_list_fwd[2] ,model_KneserNeyInterpolated_list_fwd[2] ,model_Laplace_list_fwd[2] , model_Lidstone_list_fwd[2] , model_StupidBackoff_list_fwd[2],model_WittenBellInterpolated_list_fwd[2]]
models_list_rev_trigram = [model_MLE_list_rev[2] ,model_KneserNeyInterpolated_list_rev[2] ,model_Laplace_list_rev[2] , model_Lidstone_list_rev[2] , model_StupidBackoff_list_rev[2],model_WittenBellInterpolated_list_rev[2]]

models_list_fwd_quadgram = [model_MLE_list_fwd[3] ,model_KneserNeyInterpolated_list_fwd[3] ,model_Laplace_list_fwd[3] , model_Lidstone_list_fwd[3] , model_StupidBackoff_list_fwd[3],model_WittenBellInterpolated_list_fwd[3]]
models_list_rev_quadgram = [model_MLE_list_rev[3] ,model_KneserNeyInterpolated_list_rev[3] ,model_Laplace_list_rev[3] , model_Lidstone_list_rev[3] , model_StupidBackoff_list_rev[3],model_WittenBellInterpolated_list_rev[3]]

models_list_fwd_pentagram = [model_MLE_list_fwd[4] ,model_KneserNeyInterpolated_list_fwd[4] ,model_Laplace_list_fwd[4] , model_Lidstone_list_fwd[4] , model_StupidBackoff_list_fwd[4],model_WittenBellInterpolated_list_fwd[4]]
models_list_rev_pentagram = [model_MLE_list_rev[4] ,model_KneserNeyInterpolated_list_rev[4] ,model_Laplace_list_rev[4] , model_Lidstone_list_rev[4] , model_StupidBackoff_list_rev[4],model_WittenBellInterpolated_list_rev[4]]



In [29]:
def fit_and_train_models(name, models_list, train_data_list,padded_sents_list):
    for index in range (0,len(models_list)):
        models_list[index].fit(train_data_list[index], padded_sents_list[index])
        #print("Fit:", name, model_name_list[index],"Order:", models_list[index].order, models_list[index].vocab)

In [30]:
fit_and_train_models("Fwd Unigram Model:", models_list_fwd_unigram , train_data_list_fwd_unigram,padded_sents_list_fwd_unigram)
fit_and_train_models("Rev Unigram Model:", models_list_rev_unigram , train_data_list_rev_unigram,padded_sents_list_rev_unigram)

fit_and_train_models("Fwd Bigram Model:", models_list_fwd_bigram , train_data_list_fwd_bigram,padded_sents_list_fwd_bigram)
fit_and_train_models("Rev Bigram Model:", models_list_rev_bigram , train_data_list_rev_bigram,padded_sents_list_rev_bigram)

fit_and_train_models("Fwd Trigram Model:", models_list_fwd_trigram , train_data_list_fwd_trigram,padded_sents_list_fwd_trigram)
fit_and_train_models("Rev Trigram Model:", models_list_rev_trigram , train_data_list_rev_trigram,padded_sents_list_rev_trigram)

fit_and_train_models("Fwd Quadgram Model:", models_list_fwd_quadgram , train_data_list_fwd_quadgram,padded_sents_list_fwd_quadgram)
fit_and_train_models("Rev Quadgram Model:", models_list_rev_quadgram , train_data_list_rev_quadgram,padded_sents_list_rev_quadgram)

fit_and_train_models("Fwd Pentagram Model:", models_list_fwd_pentagram , train_data_list_fwd_pentagram,padded_sents_list_fwd_pentagram)
fit_and_train_models("Rev Pentagram Model:", models_list_rev_pentagram , train_data_list_rev_pentagram,padded_sents_list_rev_pentagram)
 

## Initial Terminal Character Model

In [88]:
# Build Model for relationship between Initial and Terminal characters
# This can be a bigram model. Pick a reasonably good model
# Remove all characters other than initial and terminal and then tokenize
tokenized_text_temp = list(df_train_x[df_train_x.l_to_r_text!=''].l_to_r_text.apply(word_tokenize))

#print(tokenized_text_temp)
      
tokenized_text_it = []
for i in range(len(tokenized_text_temp)):
    l= tokenized_text_temp[i]
    del l[1:len(l)-1]
    l[0],l[1] = l[1], l[0]  #swap
    tokenized_text_it.append(l)
#print(tokenized_text_it)

k=2
model_it_bigram_kn = KneserNeyInterpolated(k) #Bigram model
train_data_it, padded_sents_it = padded_everygram_pipeline(k, tokenized_text_it)


print_train_data_details= False
#If you iterate through this, the iterator is done with and model 
# fitting won't work subsequently
# so set print_train_data_details= False before trying the actual model

if(print_train_data_details):
    for ngramlize_sent in train_data_it:
        print(list(ngramlize_sent))
        print()
    print('#############')
    list(padded_sents_it)
    
model_it_bigram_kn.fit(train_data_it, padded_sents_it)
    
print(model_it_bigram_kn.vocab)
print(model_it_bigram_kn.counts)
print(model_it_bigram_kn.generate(1, ['804'], 8))
print(model_it_bigram_kn.generate(1, ['621'], 8))

model_wf_it = ModelWithFilling(model_it_bigram_kn, posNgramModel, True)

#print(model_wf_it.generate_sent(1, ['621'],8, 10))

<Vocabulary with cutoff=1 unk_label='<UNK>' and 420 items>
<NgramCounter with 2 ngram orders and 9863 ngrams>
231
514


In [32]:
#Check one of the models and play with it
k=4
model_kn = KneserNeyInterpolated(k) #Quadgram model
model_kn_rev = KneserNeyInterpolated(k) # Reverse string, Quadgram model

train_data, padded_sents = padded_everygram_pipeline(k, tokenized_text)
train_data_rev, padded_sents_rev = padded_everygram_pipeline(k, reverse_tokenized_text)

model_kn.fit(train_data, padded_sents)
print(model_kn.vocab)

model_kn_rev.fit(train_data_rev, padded_sents_rev)
print(model_kn_rev.vocab)
      
print(model_kn.vocab.lookup(tokenized_text[0]))
print(model_kn.counts)

text_seed = '390'
num_words=1
context = ['390']


#'l_to_r_text' : "634 368 002 061 717 390"
    
print("count of 390:", model_kn.counts['390'])
print(model_kn.counts[['717']]['390'])
print(model_kn.counts[['368', '002']]['061'])
print(model_kn.counts[['002', '061']]['717'])
print(model_kn.score('390'))
# lm.score("b", ["a"]) what is the chance that “b” is preceded by “a”.
print("---", model_kn.score('390', ['717']))

print(model_kn.score('390', '717'.split()))  # P('390'|'717) Given 717 occurs what is the prob of 390
print(model_kn.score('061', '717'.split()))  # P('740'|'390)
print(model_kn.score('368', '002 061'.split()))  # P('368|'002 061')
print(model_kn.score('002', '0061 717'.split()))

print("Entropy and Perplexity")

test = [('634', '368'), ('002', '061')]
print(model_kn.entropy(test))
print(model_kn.perplexity(test))

<Vocabulary with cutoff=1 unk_label='<UNK>' and 547 items>
<Vocabulary with cutoff=1 unk_label='<UNK>' and 547 items>
('850', '092', '741', '838', '798', '740', '621')
<NgramCounter with 4 ngram orders and 52002 ngrams>
count of 390: 125
1
1
1
0.014970059880239521
--- 0.04406900484744796
0.04406900484744796
0.0002693026645122454
3.712381999286451e-05
0.04557709976871654
Entropy and Perplexity
3.2694987319710305
9.643111515978394


In [33]:
print(models_list_fwd_pentagram[0].vocab)
print(models_list_fwd_pentagram[0].counts)
print(models_list_fwd_pentagram[0].order)
print("count of 390:", models_list_fwd_pentagram[0].counts['390'])
print("count of 850:", models_list_fwd_pentagram[0].counts['850'])

print_train_data_details= False
#If you iterate through this, the iterator is done with and model 
# fitting won't work subsequently
# so set print_train_data_details= False before trying the actual model

if(print_train_data_details):
    for ngramlize_sent in train_data_list_fwd_pentagram[0]:
        print(list(ngramlize_sent))
        print()
    print('#############')
    list(padded_sents_list_fwd_pentagram[0])

<Vocabulary with cutoff=1 unk_label='<UNK>' and 547 items>
<NgramCounter with 5 ngram orders and 75570 ngrams>
5
count of 390: 125
count of 850: 14


## Test the ModelWithFilling

In [34]:
model_wf = ModelWithFilling(model_kn, posNgramModel, True) # Fix this later. Need to create Position Model first

#print(model_wf.generate(1, ['061','002', '368'],42))

print(model_wf.generate(1, ['867'],42, 10))

#model_wf.fill_missing(1, ['061','002', '368'],42)

#input_list =list['061','_','368']

samples = model_kn.context_counts(model_kn.vocab.lookup(['899']))
print(samples)


Returning Positional Prob for pos: 10 : 740
740
<FreqDist with 0 samples and 0 outcomes>


In [35]:
# L to R: 850 092 741 838 798 740 621 
input_list = ['_' ,'092', '741', '838']

#model_wf.get_missing(1,input_list ,42)

In [36]:
def pack_list(first_param, second_param, third_param):
    packed_list = []
    
    if(third_param!=-1):
        packed_list.append(third_param)
        
    if(second_param!=-1):
        packed_list.append(second_param)
        
    if(first_param!=-1):
        packed_list.append(first_param)
  
    return packed_list

In [37]:
def get_list_token(beginning, j, list_tokens, index_unclear_signs, k,l,m):
    one_before=-1
    two_before=-1
    three_before=-1
    param =[]
    last_token_index = len(list_tokens)-1 
    #print(terminal, j, list_tokens, index_unclear_signs, k,l,m)
    
    try:
        if(beginning):     
            # You need to send reverse of the characters
            if(index_unclear_signs[j]+k<=last_token_index):
                one_before = list_tokens[index_unclear_signs[j]+k]

            if(index_unclear_signs[j]+l<=last_token_index):
                two_before = list_tokens[index_unclear_signs[j]+l]
            
            if(index_unclear_signs[j]+m<=last_token_index):
                three_before = list_tokens[index_unclear_signs[j]+m]
                
        else:
            if(index_unclear_signs[j]+k>=0):
                one_before= list_tokens[index_unclear_signs[j]+k]

            if(index_unclear_signs[j]+l>=0):
                two_before = list_tokens[index_unclear_signs[j]+l]
                
            if(index_unclear_signs[j]+m>=0):
                three_before = list_tokens[index_unclear_signs[j]+m]


        param = pack_list(one_before, two_before, three_before)
    except Exception as e:
        print("Exception:", e.__class__, "occurred in get_list_token.")
    return param

In [93]:
def get_char_by_position(this_text, position_type):
    char_list = []
    # first split the string into chars
    chars = this_text.split(' ') 
    if(position_type == "Initial"):
        char_list.append(chars[0])
    elif(position_type == "Terminal"):
        char_list.append(chars[len(chars)-1])
    return char_list

#test
print(get_char_by_position('850 000 741 838 798 740 621', "Terminal"))


['621']


In [91]:
print(model_it_bigram_kn.generate(1,get_char_by_position('850 000 741 838 798 740 621', "Terminal"), seed ))
print(model_it_bigram_kn.generate(1,['621'], seed ))

print(get_char_by_position('231 233 804', "Terminal"))

print(model_it_bigram_kn.generate(1,get_char_by_position('231 233 804', "Terminal"), seed ))

514
514
['804']
231


In [101]:
verbose_debug = True

def find_unclear_characters(model1, model2, a, fill_using_position, seed=8):
    #For each of the text with unclear character go through it
    
    ans=-1
    unclear_chars = []
    first_unclear=0
    
    model_wf_1 = ModelWithFilling(model1,posNgramModel, fill_using_position)
    model_wf_2 = ModelWithFilling(model2,posNgramModel, fill_using_position)
    term = None
    
    for text in a :
        try:
            #print("text:", text)
            #Identify the position of the unclear text and get its immediate neighbors
            list_tokens = nltk.word_tokenize(text)
            last_token_index = len(list_tokens)-1  
            index_unclear_signs = []
            param =[]
            

            for i in range(0, last_token_index+1) :
                    if(list_tokens[i]=='000'):
                        index_unclear_signs.append(i)
                        first_unclear=i

            position_unclear_char = len(list_tokens)- first_unclear
            
            #if(verbose_debug): 
                #print("Text:Index of Unclear signs:", str(text) + ": " + str(index_unclear_signs), "position (RtoL):", position_unclear_char)

            #assuming one unclear sign in a text. TBD: Extend this later to more than one unclear sign
            j=0
            if(index_unclear_signs[j]==0):
                param = get_list_token(True, j, list_tokens, index_unclear_signs, 1,2,3)
                if(verbose_debug): print("L to R: Initial char is unclear", text, "Sending: ", param , " to generate next char from second model")
                
                if(use_initial_terminal_model):
                    # Get the Terminal character as a list
                    # Pass it to the Initial Terminal Model
                    term = get_char_by_position(text, "Terminal")
                    ans = model_it_bigram_kn.generate(1,get_char_by_position(text, "Terminal"), seed ) 
                    #print("Text:", text, "Terminal Char:", get_char_by_position(text, "Terminal"), "Answer:", ans)
                else:
                     #Use reverse model
                    ans = model_wf_2.generate_sent(1, param , random_seed=seed, position=position_unclear_char)
                
                unclear_chars.append(ans)

            elif(index_unclear_signs[j]==last_token_index):

                param = get_list_token(False, j, list_tokens, index_unclear_signs, -1,-2,-3)
                if(verbose_debug): print("L to R: Terminal char is unclear", text,"Sending: ", param , " to generate next char from first model")
                try:
                    ans = model_wf_1.generate_sent(1, param, random_seed=seed, position=position_unclear_char)
                except Exception as e:
                    print("Exception:", e.__class__, "find_unclear_characters:generate_sent")
                
                #print("Answer:", ans)
                unclear_chars.append(ans)

            else:

                #Not proceeding if more than one char is unclear

                if(len(index_unclear_signs)>1):
                    #print("Many chars are unclear in the text, not able to decipher the text. Moving on ...\n")
                    continue
                
                param = get_list_token(False,j, list_tokens, index_unclear_signs, -1,-2,-3)
                if(verbose_debug): print("L to R: One of the middle char is unclear", text, "Sending: ", param , " to generate next char from first model")
                ans = model_wf_1.generate_sent(1, param, random_seed=seed, position=position_unclear_char)
                
                #print("Answer:", ans)
                unclear_chars.append(ans)
        except Exception as e:
            if(verbose_debug): 
                print("Exception:", e.__class__, "find_unclear_characters.")
                traceback.print_exc()
            
    return  unclear_chars


In [40]:
def get_group_for_sign(id_sign):
    for graph in orig_sign_df[orig_sign_df.id_sign==id_sign].graph :
        return(graph)

In [41]:
def add_answers(text,answer_list, answer, type_unclear_char):
    dict_row = {'text':text, 'len_text':len(text), 'answer':answer, 'type': type_unclear_char}
    answer_list.append(dict_row)

In [42]:
def add_wrong_answers(wrong_answer_list, text, predicted_answer,predicted_answer_group, correct_answer,correct_answer_group, type_unclear_char):
    chars = text.split(' ')
    dict_row = {'text':text, 'len_text':len(chars),'pred_answer':predicted_answer, 'pred_answer_group': predicted_answer_group, 'correct_answer': correct_answer,'correct_answer_group': correct_answer_group, 'type': type_unclear_char}
    wrong_answer_list.append(dict_row)
    

In [43]:
def check_answers(ans, test_correct_answers):

    try:
        beg_hit,ter_hit, med_hit,total_hit=0,0,0,0
        beg_hit_category,ter_hit_category, med_hit_category,total_hit_category=0,0,0,0
        
        beg_count,ter_count,med_count,total_count=0,0,0,0
        
        beg_percent,ter_percent,med_percent,total_percent=0,0,0,0
        beg_percent,ter_percent_category,med_percent_category,total_percent_category=0,0,0,0
        
        wrong_answer_list = []
        
        i=0
        for answers in ans:
            correct_ans = test_correct_answers[i].get('answer')
            correct_ans_type = test_correct_answers[i].get('type')
            correct_ans_text = test_correct_answers[i].get('text')
            
            this_ans =answers.replace(" ", "")
            type_unclear_char = correct_ans_type
            
            # Full Match
            if(this_ans==correct_ans):
                if(type_unclear_char==CONST_BEGINNING):
                    beg_hit= beg_hit+1
                    beg_count= beg_count+1
                elif(type_unclear_char==CONST_TERMINAL):
                    ter_hit= ter_hit+1
                    ter_count= ter_count+1
                elif(type_unclear_char==CONST_MEDIAL):
                    med_hit= med_hit+1
                    med_count= med_count+1
                    
                total_hit=total_hit+1
                total_count= total_count+1
                
            # Only a Category match
            elif(get_group_for_sign(this_ans)==get_group_for_sign(correct_ans)):
                if(type_unclear_char==CONST_BEGINNING):
                    beg_hit_category= beg_hit_category+1
                    beg_count= beg_count+1
                elif(type_unclear_char==CONST_TERMINAL):
                    ter_hit_category= ter_hit_category+1
                    ter_count= ter_count+1
                elif(type_unclear_char==CONST_MEDIAL):
                    med_hit_category= med_hit_category+1
                    med_count= med_count+1
                    
                total_hit_category=total_hit_category+1
                total_count= total_count+1
                
                # Category match is still wrong, so add it to wrong answers
                add_wrong_answers(wrong_answer_list, correct_ans_text, this_ans, get_group_for_sign(this_ans), correct_ans, get_group_for_sign(correct_ans),type_unclear_char)
            
            #Not any match
            else: 
                if(type_unclear_char==CONST_BEGINNING):
                    beg_count= beg_count+1
                elif(type_unclear_char==CONST_TERMINAL):
                    ter_count= ter_count+1
                elif(type_unclear_char==CONST_MEDIAL):
                    med_count= med_count+1
                    
                total_count= total_count+1
                
                add_wrong_answers(wrong_answer_list, correct_ans_text, this_ans, get_group_for_sign(this_ans), correct_ans, get_group_for_sign(correct_ans),type_unclear_char)
                
            i=i+1
        
        if(beg_count>0):
            beg_percent = (beg_hit/beg_count)*100
        else:
            beg_percent = -1
            
        if(beg_count>0):
            beg_percent_category = (beg_hit_category/beg_count)*100
        else:
            beg_percent_category  = -1
            
        
        if(ter_count>0):
            ter_percent = (ter_hit/ter_count)*100
        else:
            ter_percent = -1
            
        if(ter_count>0):
            ter_percent_category = (ter_hit_category/ter_count)*100
        else:
            ter_percent_category = -1
            
            
        if(med_count>0):
            med_percent = (med_hit/med_count)*100
        else:
            med_percent = -1
            
        if(med_count>0):
            med_percent_category  = (med_hit_category /med_count)*100
        else:
            med_percent_category  = -1
            
            
        if(total_count>0):
            total_percent = (total_hit/total_count)*100
        else:
            total_percent = -1
            
        if(total_count>0):
            total_percent_category  = (total_hit_category /total_count)*100
        else:
            total_percent_category  = -1
            
        
    except Exception as e:
            print("Exception:", e.__class__, "in check_answers")
            traceback.print_exc()

    return beg_percent,ter_percent,med_percent,total_percent, beg_percent_category,ter_percent_category,med_percent_category,total_percent_category,wrong_answer_list


In [44]:
def check_answers_old(ans, test_correct_answers):

    try:
        beg_hit,ter_hit, med_hit,total_hit=0,0,0,0
        beg_count,ter_count,med_count,total_count=0,0,0,0
        beg_percent,ter_percent,med_percent,total_percent=0,0,0,0
        wrong_answer_list = []
        
        i=0
        for answers in ans:
            correct_ans = test_correct_answers[i].get('answer')
            correct_ans_type = test_correct_answers[i].get('type')
            correct_ans_text = test_correct_answers[i].get('text')
            
            this_ans =answers.replace(" ", "")
            type_unclear_char = correct_ans_type
            
            if(this_ans==correct_ans):
                if(type_unclear_char==CONST_BEGINNING):
                    beg_hit= beg_hit+1
                    beg_count= beg_count+1
                elif(type_unclear_char==CONST_TERMINAL):
                    ter_hit= ter_hit+1
                    ter_count= ter_count+1
                elif(type_unclear_char==CONST_MEDIAL):
                    med_hit= med_hit+1
                    med_count= med_count+1
                    
                total_hit=total_hit+1
                total_count= total_count+1
            else:
                if(type_unclear_char==CONST_BEGINNING):
                    beg_count= beg_count+1
                elif(type_unclear_char==CONST_TERMINAL):
                    ter_count= ter_count+1
                elif(type_unclear_char==CONST_MEDIAL):
                    med_count= med_count+1
                    
                total_count= total_count+1
                
                add_wrong_answers(wrong_answer_list, correct_ans_text, this_ans, get_group_for_sign(this_ans), correct_ans, get_group_for_sign(correct_ans),type_unclear_char)
                
            i=i+1
        
        if(beg_count>0):
            beg_percent = (beg_hit/beg_count)*100
        else:
            beg_percent = -1
            
        if(ter_count>0):
            ter_percent = (ter_hit/ter_count)*100
        else:
            ter_percent = -1
            
        if(med_count>0):
            med_percent = (med_hit/med_count)*100
        else:
            med_percent = -1
            
        if(total_count>0):
            total_percent = (total_hit/total_count)*100
        else:
            total_percent = -1
            
        
    except Exception as e:
            print("Exception:", e.__class__, "in check_answers")
            traceback.print_exc()

    return beg_percent,ter_percent,med_percent,total_percent, wrong_answer_list

In [45]:
def reverse_single_text(text):
    list_reversed_text = []
    # Tokenize to words
    # first split the string into chars
    chars = text.split(' ')

    # then reverse the split string list and join with a space
    reversed_text = ' '.join(reversed(chars))
    return reversed_text


In [46]:
def reverse_text(a):
    list_reversed_text = []
    for text in a :
        # Tokenize to words
        # first split the string into chars
        chars = text.split(' ')

        # then reverse the split string list and join with a space
        reversed_text = ' '.join(reversed(chars))
        list_reversed_text.append(reversed_text)
    return list_reversed_text 
    

## TESTING
1. Test 1: Use random sample of Training data but one character is made unclear: Take a few samples from Training data, make some characters as 000 (unclear). Do it in begining, terminal and medial regions of the text and see if the model is able to figure out the unclear characters
2. Test 2: Use Testing data: Take the Testing data and make some charcters as 000 (unclear). Do it in begining, terminal and medial regions of the text to test and see if the model is able to figure out the unclear characters
3. Test 3: Real Unclear texts: Use the actual texts with unclear data and what the model is able to come up with for the unclear characters

## Test 0 - Manual test


In [47]:
print("-----Testing-------------------")

model_wf_kn = ModelWithFilling(model_kn,posNgramModel, True)
model_wf_kn_rev = ModelWithFilling(model_kn_rev,posNgramModel, True)
    
print("-----RtoL: Terminal char----- Correct Ans: 520")
print("Send three characters")
print(model_wf_kn.generate_sent(1,['060','705', '033'],8))

#ALL OF THIS IS L TO R

#Beginning: 741
#741 031 221 400
print("Train Data: Beginning: 741:", model_wf_kn_rev.generate_sent(1,['400','221','031'],8))
#-> Not able to find

print("Directly looking")
samples = model_kn_rev.context_counts(model_kn_rev.vocab.lookup(['400','221','031']))
print(samples)
tpl = tuple(model_kn.score(w, context) for w in samples)
for w in samples:
    print(w)
print("tpl:", tpl)

samples = model_kn_rev.context_counts(model_kn_rev.vocab.lookup(['031']))
#print(samples)

#medial: 001
#255 435 690 740 900 001 003 424
print("Train Data: medial: 001:",model_wf_kn.generate_sent(1,['435','690', '740', '900'],8))


#medial:798
#850 092 741 838 798 740 621
print("Train Data: medial:798:", model_wf_kn.generate_sent(1,['092', '741', '838'],8))

#Terminal 740
#321 405 002 806 233 320 920 740
print("Train Data: Terminal 740:", model_wf_kn.generate_sent(1,['233','320', '920'],8))

#Beginning: 820
#820 798 740
print("Train Data: Beginning: 820:", model_wf_kn_rev.generate_sent(1,['740', '798'],8))

print("Directly looking")
samples = model_kn_rev.context_counts(model_kn_rev.vocab.lookup(['740','798']))
print(samples)
tpl = tuple(model_kn.score(w, context) for w in samples)
for w in samples:
    print(w)
print("tpl:", tpl)


#{'text': '746 130 400', 'len_text': 3, 'pred_answer': '032', 'pred_answer_group': None, 'correct_answer': '746', 'correct_answer_group': 'U-shape', 'type': 'Beginning

print("Test Data: Beginning: 746:",model_wf_kn_rev.generate_sent(1,['400', '130'],8))
print("Test Data: Beginning: 746:",model_wf_kn.generate_sent(1,['130', '400'],8))


-----Testing-------------------
-----RtoL: Terminal char----- Correct Ans: 520
Send three characters
5 2 0
Train Data: Beginning: 741: 7 4 1
Directly looking
<FreqDist with 1 samples and 1 outcomes>
741
tpl: (0.08613401216889893,)
Train Data: medial: 001: 0 0 1
Train Data: medial:798: 7 9 8
Train Data: Terminal 740: 7 4 0
Train Data: Beginning: 820: 2 3 1
Directly looking
<FreqDist with 14 samples and 22 outcomes>
460
220
233
235
240
231
838
820
700
745
405
003
002
803
tpl: (0.0001253531584174646, 0.00045127137030287256, 0.0002507063168349292, 0.00021310036930968982, 0.0002757769485184221, 0.00016295910594270397, 6.26765792087323e-05, 0.01338193178859672, 0.00026324163267667564, 0.0001253531584174646, 0.00021310036930968982, 0.013369396472754974, 0.1302515741463932, 0.0002757769485184221)
Test Data: Beginning: 746: 0 3 2
Test Data: Beginning: 746: 7 4 0


In [48]:
# Function for Data Preparation
# Data preparation for testing
# Take n rows from given set, convert a known sign to unclear sign and produce a dataframe

def prepare_data(a,max_text_chars,min_text_chars,max_num_of_rows, seed):

    list_changed_texts = []
    list_changed_reversed_text = []
    test_correct_answers= []
    row_count=0
    ls_made_up_row = []
    random.seed(seed)

    for text in a:
        # Tokenize to words, first split the string into chars
        chars = text.split(' ')
        new_text = chars

        if(len(chars)<=max_text_chars):
            if(len(chars)>min_text_chars):
                #randomly pick an index in the tokenized_text and change it to unclear
                r = random.randrange(0, len(chars))
            else: r=0

            if(r==0): type_unclear_char = CONST_BEGINNING 
            elif(r==len(chars)-1): type_unclear_char = CONST_TERMINAL
            else: type_unclear_char = CONST_MEDIAL

            add_answers(text, test_correct_answers, chars[r], type_unclear_char)

            new_text[r]= '000'
            # then join with a space
            changed_text = ' '.join((new_text))

            made_up_row= {'site' : 'fake_site',
               'changed_reversed_text'  : reverse_single_text(changed_text),
               'changed_text' : changed_text}

            ls_made_up_row.append(made_up_row)


            row_count=row_count+1
            if(row_count>=max_num_of_rows): break


    df_made_up = pd.DataFrame(ls_made_up_row)
    
    return df_made_up, test_correct_answers

In [49]:
def run_model(model_type, model_name_list,model_fwd, model_rev,a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose) :
    
    #beg_hit,term_hit, med_hit, total_hit=0,0,0,0
    beg_percent,ter_percent,med_percent,total_percent=0,0,0,0
    beg_percent_category,ter_percent_category,med_percent_category,total_percent_category=0,0,0,0
    wrong_answer_list = []
    ans= None
    
    try:
        print("\n_____Trying unclear texts in Fwd Direction:______")
        for index in range(0, len(model_name_list)):

            print("\n****************Model Name:", model_name_list[index], ", Order:", model_fwd[index].order)
            ans = find_unclear_characters(model_fwd[index], model_rev[index],a,fill_using_position, seed)

            print(model_type,":", model_name_list[index])
            if(check_the_answers==True):
                beg_percent,ter_percent,med_percent,total_percent,beg_percent_category,ter_percent_category,med_percent_category,total_percent_category, wrong_answer_list = check_answers(ans,test_correct_answers)

                print("beg%:",round(beg_percent,2), " ter%:",round(ter_percent,2), " med%:", round(med_percent,2), " tot%:", round(total_percent,2)," and a total of ", round((total_percent/100)*len(test_correct_answers)), " out of", len(test_correct_answers))
                
                print("beg_w_cat%:",round(beg_percent+beg_percent_category,2), " ter_w_cat%:",round(ter_percent+ter_percent_category,2), " med_w_cat%:", round(med_percent+med_percent_category,2), " tot_w_cat%:", round(total_percent+total_percent_category,2))
                
                
                if(wrong_answer_details_verbose): print("\n Wrong Answers:", wrong_answer_list)
            else:
                print("Answers:", ans)
    except Exception as e:
        print("Exception:", e.__class__, "run_model")
        traceback.print_exc()


    if(try_reverse):
        # Try unclear strings in reverse
        print("\n_____Trying unclear texts in Reverse Direction:______")
        try:
            for index in range(0, len(model_name_list)):
                print("\n****************Model Name:", model_name_list[index], ", Order:", model_rev[index].order)
                ans = find_unclear_characters(model_rev[index], model_fwd[index],a_rev,fill_using_position, seed)

                print(model_type,":", model_name_list[index]) 
                if(check_the_answers==True):
                    beg_percent,ter_percent,med_percent,total_percent, beg_percent_category,ter_percent_category,med_percent_category,total_percent_category, wrong_answer_list = check_answers(ans,test_correct_answers)
                    
                    print(" beg%:",round(beg_percent,2), " ter%:",round(ter_percent,2), " med%:", round(med_percent,2), " tot%:", round(total_percent,2)," and a total of ", round((total_percent/100)*len(test_correct_answers)), " out of", len(test_correct_answers))
                    
                    print("beg_w_cat%:",round(beg_percent+beg_percent_category,2), " ter_w_cat%:",round(ter_percent+ter_percent_category,2), " med_w_cat%:", round(med_percent+med_percent_category,2), " tot_w_cat%:", round(total_percent+total_percent_category,2))
               
                    if(wrong_answer_details_verbose): print("\n Wrong Answers:", wrong_answer_list)
                else:
                    print("Answers:", ans)
        except Exception as e:
            print("Exception:", e.__class__, "run_model")
            traceback.print_exc()
    
    return ans



In [50]:
# Function for running a test
def run_test(test_name,a, a_rev,fill_using_position, check_the_answers, test_correct_answers, try_reverse,wrong_answer_details_verbose, seed):
    
    verbose_debug= True
    
    unigram_models= False
    bigram_models= False
    trigram_models= False
    quadgram_models= False
    pentagram_models= True

    print("_____________________________")
    print("_____ Running ", test_name, "_________")
    print("_____________________________")


    beg_hit,term_hit, med_hit, total_hit=0,0,0,0
    beg_percent,ter_percent,med_percent,total_percent=0,0,0,0
    wrong_answer_list = []

    try_reverse = False

    try:
        
        if(unigram_models):
            ans = run_model("Fwd Unigram Model:", model_name_list,models_list_fwd_unigram, models_list_rev_unigram, a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose)
    
        if(bigram_models):
            ans = run_model("Fwd Bigram Model:", model_name_list,models_list_fwd_bigram, models_list_rev_bigram, a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose)
    
        if(trigram_models):
            ans = run_model("Fwd Trigram Model:", model_name_list,models_list_fwd_trigram, models_list_rev_trigram, a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose)
    
        if(quadgram_models):
            ans = run_model("Fwd Quadgram Model:", model_name_list,models_list_fwd_quadgram, models_list_rev_quadgram, a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose)
    
        if(pentagram_models):
            ans = run_model("Fwd Pentagram Model:", model_name_list,models_list_fwd_pentagram, models_list_rev_pentagram, a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose)
    
    
    except Exception as e:
        print("Exception:", e.__class__, "run_test")
        traceback.print_exc()

# Test 1

In [51]:
# Test1
# Data preparation for this test
# Take n rows from train set, convert a known sign to unclear sign and produce a dataframe

verbose_debug = True
min_text_chars = 1
max_text_chars = 40
max_num_of_rows= 200
    
df_made_up_from_train, test1_correct_answers =prepare_data(df_train_x[df_train_x.l_to_r_text!=''].l_to_r_text, max_text_chars,min_text_chars,max_num_of_rows, seed)

if(verbose_debug):
    print(df_made_up_from_train.changed_text)
    print(df_made_up_from_train.changed_reversed_text)
    
    print("test1_correct_answers: \n", test1_correct_answers)


0      850 000 741 838 798 740 621
1                      820 000 740
2                  255 204 540 000
3                      000 233 804
4          140 000 390 741 838 740
                  ...             
195                    000 032 226
196            413 575 335 700 000
197            817 002 503 007 000
198            904 032 597 142 000
199            634 000 298 554 527
Name: changed_text, Length: 200, dtype: object
0      621 740 798 838 741 000 850
1                      740 000 820
2                  000 540 204 255
3                      804 233 000
4          740 838 741 390 000 140
                  ...             
195                    226 032 000
196            000 700 335 575 413
197            000 007 503 002 817
198            000 142 597 032 904
199            527 554 298 000 634
Name: changed_reversed_text, Length: 200, dtype: object
test1_correct_answers: 
 [{'text': '850 092 741 838 798 740 621', 'len_text': 27, 'answer': '092', 'type': 'Medial'}, {'text': 

In [111]:
# Test1
# Run the test
a= df_made_up_from_train[df_made_up_from_train.changed_text!=''].changed_text
a_rev= df_made_up_from_train[df_made_up_from_train.changed_reversed_text!=''].changed_reversed_text
check_the_answers = True
try_reverse = False
wrong_answer_details_verbose = True
verbose_debug = False
fill_using_position =True
use_initial_terminal_model= False

run_test("Test-1",a, a_rev, fill_using_position, check_the_answers, test1_correct_answers, try_reverse, wrong_answer_details_verbose, seed, )
    

_____________________________
_____ Running  Test-1 _________
_____________________________

_____Trying unclear texts in Fwd Direction:______

****************Model Name: MLE , Order: 5
Fwd Pentagram Model: : MLE
beg%: 57.14  ter%: 82.93  med%: 56.31  tot%: 62.0  and a total of  124  out of 200
beg_w_cat%: 69.64  ter_w_cat%: 82.93  med_w_cat%: 64.08  tot_w_cat%: 69.5

 Wrong Answers: [{'text': '850 092 741 838 798 740 621', 'len_text': 7, 'pred_answer': '090', 'pred_answer_group': None, 'correct_answer': '092', 'correct_answer_group': None, 'type': 'Medial'}, {'text': '820 798 740', 'len_text': 3, 'pred_answer': '002', 'pred_answer_group': None, 'correct_answer': '798', 'correct_answer_group': 'circle', 'type': 'Medial'}, {'text': '140 900 390 741 838 740', 'len_text': 6, 'pred_answer': '236', 'pred_answer_group': 'fish', 'correct_answer': '900', 'correct_answer_group': 'bow', 'type': 'Medial'}, {'text': '456 527', 'len_text': 2, 'pred_answer': '550', 'pred_answer_group': 'triangle', 

Fwd Pentagram Model: : KneserNeyInterpolated
beg%: 55.36  ter%: 82.93  med%: 55.34  tot%: 61.0  and a total of  122  out of 200
beg_w_cat%: 66.07  ter_w_cat%: 82.93  med_w_cat%: 64.08  tot_w_cat%: 68.5

 Wrong Answers: [{'text': '850 092 741 838 798 740 621', 'len_text': 7, 'pred_answer': '031', 'pred_answer_group': None, 'correct_answer': '092', 'correct_answer_group': None, 'type': 'Medial'}, {'text': '820 798 740', 'len_text': 3, 'pred_answer': '032', 'pred_answer_group': None, 'correct_answer': '798', 'correct_answer_group': 'circle', 'type': 'Medial'}, {'text': '140 900 390 741 838 740', 'len_text': 6, 'pred_answer': '226', 'pred_answer_group': 'fish', 'correct_answer': '900', 'correct_answer_group': 'bow', 'type': 'Medial'}, {'text': '456 527', 'len_text': 2, 'pred_answer': '388', 'pred_answer_group': 'leaf', 'correct_answer': '456', 'correct_answer_group': 'triangle', 'type': 'Beginning'}, {'text': '235 320 920 740', 'len_text': 4, 'pred_answer': '233', 'pred_answer_group': 'fis

beg%: 57.14  ter%: 82.93  med%: 56.31  tot%: 62.0  and a total of  124  out of 200
beg_w_cat%: 67.86  ter_w_cat%: 82.93  med_w_cat%: 63.11  tot_w_cat%: 68.5

 Wrong Answers: [{'text': '850 092 741 838 798 740 621', 'len_text': 7, 'pred_answer': '090', 'pred_answer_group': None, 'correct_answer': '092', 'correct_answer_group': None, 'type': 'Medial'}, {'text': '820 798 740', 'len_text': 3, 'pred_answer': '002', 'pred_answer_group': None, 'correct_answer': '798', 'correct_answer_group': 'circle', 'type': 'Medial'}, {'text': '140 900 390 741 838 740', 'len_text': 6, 'pred_answer': '236', 'pred_answer_group': 'fish', 'correct_answer': '900', 'correct_answer_group': 'bow', 'type': 'Medial'}, {'text': '235 320 920 740', 'len_text': 4, 'pred_answer': '233', 'pred_answer_group': 'fish', 'correct_answer': '235', 'correct_answer_group': 'fish', 'type': 'Beginning'}, {'text': '752 740', 'len_text': 2, 'pred_answer': '205', 'pred_answer_group': 'insect', 'correct_answer': '752', 'correct_answer_gr

beg%: 55.36  ter%: 82.93  med%: 57.28  tot%: 62.0  and a total of  124  out of 200
beg_w_cat%: 69.64  ter_w_cat%: 82.93  med_w_cat%: 66.02  tot_w_cat%: 70.5

 Wrong Answers: [{'text': '850 092 741 838 798 740 621', 'len_text': 7, 'pred_answer': '031', 'pred_answer_group': None, 'correct_answer': '092', 'correct_answer_group': None, 'type': 'Medial'}, {'text': '820 798 740', 'len_text': 3, 'pred_answer': '002', 'pred_answer_group': None, 'correct_answer': '798', 'correct_answer_group': 'circle', 'type': 'Medial'}, {'text': '140 900 390 741 838 740', 'len_text': 6, 'pred_answer': '390', 'pred_answer_group': 'stick', 'correct_answer': '900', 'correct_answer_group': 'bow', 'type': 'Medial'}, {'text': '456 527', 'len_text': 2, 'pred_answer': '550', 'pred_answer_group': 'triangle', 'correct_answer': '456', 'correct_answer_group': 'triangle', 'type': 'Beginning'}, {'text': '235 320 920 740', 'len_text': 4, 'pred_answer': '233', 'pred_answer_group': 'fish', 'correct_answer': '235', 'correct_an

In [100]:
a= df_made_up_from_train[df_made_up_from_train.changed_text=='000 320 920 740'].changed_text
a_rev= df_made_up_from_train[df_made_up_from_train.changed_reversed_text=='749 920 320 000'].changed_reversed_text
check_the_answers = False
try_reverse = False
wrong_answer_details_verbose = True
verbose_debug = True
fill_using_position =False

model_wf_quad_mle_fwd = ModelWithFilling(models_list_fwd_quadgram[0],posNgramModel, True)
model_wf_quad_mle_rev = ModelWithFilling(models_list_rev_quadgram[0],posNgramModel, True)

#answer 235
print(model_wf_quad_mle_rev.generate_sent(1,['749','920','320'],8))

print(model_wf_quad_mle_rev.generate_sent(1,['920','320'],8))

print(model_wf_quad_mle_fwd.generate_sent(1,['320'],8))

print("Directly looking-1")
samples = models_list_rev_quadgram[0].context_counts(models_list_rev_quadgram[0].vocab.lookup(['740','920']))
print(samples)
tpl = tuple(models_list_rev_quadgram[0].score(w, context) for w in samples)
for w in samples:
    print(w)
print("tpl:", tpl)

print("Directly looking-1")
samples = models_list_fwd_quadgram[0].context_counts(models_list_fwd_quadgram[0].vocab.lookup(['235','320', '920']))
print(samples)
tpl = tuple(models_list_fwd_quadgram[0].score(w, context) for w in samples)
for w in samples:
    print(w)
print("tpl:", tpl)

run_test("Test-1.1",a, a_rev, fill_using_position, check_the_answers, test1_correct_answers, try_reverse, wrong_answer_details_verbose, seed, )


2 3 3
2 3 3
9 0 0
Directly looking-1
<FreqDist with 5 samples and 8 outcomes>
320
323
318
140
322
tpl: (0.0, 0.0, 0.0, 0.016, 0.0)
Directly looking-1
<FreqDist with 1 samples and 1 outcomes>
740
tpl: (0.152,)
_____________________________
_____ Running  Test-1.1 _________
_____________________________

_____Trying unclear texts in Fwd Direction:______

****************Model Name: MLE , Order: 5
L to R: Initial char is unclear 000 320 920 740 Sending:  ['740', '920', '320']  to generate next char from second model
Text: 000 320 920 740 Terminal Char: ['740'] Answer: 235
Fwd Pentagram Model: : MLE
Answers: ['235']

****************Model Name: KneserNeyInterpolated , Order: 5
L to R: Initial char is unclear 000 320 920 740 Sending:  ['740', '920', '320']  to generate next char from second model
Text: 000 320 920 740 Terminal Char: ['740'] Answer: 235
Fwd Pentagram Model: : KneserNeyInterpolated
Answers: ['235']

****************Model Name: Laplace , Order: 5
L to R: Initial char is unclea

## Test 2

In [54]:
# Test 2
# Data preparation for this test
# Take n rows from test set, convert a known sign to unclear sign and produce a dataframe

verbose_debug = True
min_text_chars = 1
max_text_chars = 40
max_num_of_rows=200
    
df_made_up_from_test, test2_correct_answers =prepare_data(df_test_x[df_test_x.l_to_r_text!=''].l_to_r_text, max_text_chars,min_text_chars,max_num_of_rows,seed)

if(verbose_debug):
    print(df_made_up_from_test.changed_text)
    print("test2_correct_answers: \n", test2_correct_answers)


0                      000 130 400
1          861 002 000 255 740 090
2                  244 065 880 000
3      747 000 095 595 001 142 617
4                          000 161
                  ...             
152                    000 142 617
153                240 233 000 679
154        920 060 741 000 100 740
155            820 060 415 000 520
156        000 703 575 240 740 090
Name: changed_text, Length: 157, dtype: object
test2_correct_answers: 
 [{'text': '746 130 400', 'len_text': 11, 'answer': '746', 'type': 'Beginning'}, {'text': '861 002 705 255 740 090', 'len_text': 23, 'answer': '705', 'type': 'Medial'}, {'text': '244 065 880 820', 'len_text': 15, 'answer': '820', 'type': 'Terminal'}, {'text': '747 717 095 595 001 142 617', 'len_text': 27, 'answer': '717', 'type': 'Medial'}, {'text': '097 161', 'len_text': 7, 'answer': '097', 'type': 'Beginning'}, {'text': '400 520 400 353', 'len_text': 15, 'answer': '400', 'type': 'Beginning'}, {'text': '346 390 002 384 740', 'len_text': 

In [109]:
# Test2
# Run the test
verbose_debug = False
a= df_made_up_from_test[df_made_up_from_test.changed_text!=''].changed_text
a_rev= df_made_up_from_test[df_made_up_from_test.changed_reversed_text!=''].changed_reversed_text
check_the_answers = True
try_reverse = False
wrong_answer_details_verbose = True
fill_using_position = True
use_initial_terminal_model= False

ans = run_test("Test-2",a, a_rev, fill_using_position, check_the_answers, test2_correct_answers, try_reverse, wrong_answer_details_verbose, seed)

_____________________________
_____ Running  Test-2 _________
_____________________________

_____Trying unclear texts in Fwd Direction:______

****************Model Name: MLE , Order: 5
Returning Positional Prob for pos: 1 : 920
Fwd Pentagram Model: : MLE
beg%: 13.95  ter%: 26.47  med%: 25.0  tot%: 22.29  and a total of  35  out of 157
beg_w_cat%: 27.91  ter_w_cat%: 29.41  med_w_cat%: 41.25  tot_w_cat%: 35.03

 Wrong Answers: [{'text': '746 130 400', 'len_text': 3, 'pred_answer': '032', 'pred_answer_group': None, 'correct_answer': '746', 'correct_answer_group': 'U-shape', 'type': 'Beginning'}, {'text': '861 002 705 255 740 090', 'len_text': 6, 'pred_answer': '032', 'pred_answer_group': None, 'correct_answer': '705', 'correct_answer_group': 'U-shape', 'type': 'Medial'}, {'text': '244 065 880 820', 'len_text': 4, 'pred_answer': '002', 'pred_answer_group': None, 'correct_answer': '820', 'correct_answer_group': 'circle', 'type': 'Terminal'}, {'text': '747 717 095 595 001 142 617', 'len_te

Returning Positional Prob for pos: 1 : 920
Fwd Pentagram Model: : KneserNeyInterpolated
beg%: 13.95  ter%: 26.47  med%: 21.25  tot%: 20.38  and a total of  32  out of 157
beg_w_cat%: 30.23  ter_w_cat%: 32.35  med_w_cat%: 40.0  tot_w_cat%: 35.67

 Wrong Answers: [{'text': '746 130 400', 'len_text': 3, 'pred_answer': '032', 'pred_answer_group': None, 'correct_answer': '746', 'correct_answer_group': 'U-shape', 'type': 'Beginning'}, {'text': '861 002 705 255 740 090', 'len_text': 6, 'pred_answer': '033', 'pred_answer_group': None, 'correct_answer': '705', 'correct_answer_group': 'U-shape', 'type': 'Medial'}, {'text': '244 065 880 820', 'len_text': 4, 'pred_answer': '002', 'pred_answer_group': None, 'correct_answer': '820', 'correct_answer_group': 'circle', 'type': 'Terminal'}, {'text': '747 717 095 595 001 142 617', 'len_text': 7, 'pred_answer': '</s>', 'pred_answer_group': None, 'correct_answer': '717', 'correct_answer_group': 'U-shape', 'type': 'Medial'}, {'text': '400 520 400 353', 'len

beg%: 11.63  ter%: 26.47  med%: 23.75  tot%: 21.02  and a total of  33  out of 157
beg_w_cat%: 32.56  ter_w_cat%: 29.41  med_w_cat%: 41.25  tot_w_cat%: 36.31

 Wrong Answers: [{'text': '746 130 400', 'len_text': 3, 'pred_answer': '032', 'pred_answer_group': None, 'correct_answer': '746', 'correct_answer_group': 'U-shape', 'type': 'Beginning'}, {'text': '861 002 705 255 740 090', 'len_text': 6, 'pred_answer': '033', 'pred_answer_group': None, 'correct_answer': '705', 'correct_answer_group': 'U-shape', 'type': 'Medial'}, {'text': '244 065 880 820', 'len_text': 4, 'pred_answer': '002', 'pred_answer_group': None, 'correct_answer': '820', 'correct_answer_group': 'circle', 'type': 'Terminal'}, {'text': '747 717 095 595 001 142 617', 'len_text': 7, 'pred_answer': '</s>', 'pred_answer_group': None, 'correct_answer': '717', 'correct_answer_group': 'U-shape', 'type': 'Medial'}, {'text': '097 161', 'len_text': 2, 'pred_answer': '055', 'pred_answer_group': None, 'correct_answer': '097', 'correct_a

beg%: 13.95  ter%: 26.47  med%: 25.0  tot%: 22.29  and a total of  35  out of 157
beg_w_cat%: 27.91  ter_w_cat%: 29.41  med_w_cat%: 41.25  tot_w_cat%: 35.03

 Wrong Answers: [{'text': '746 130 400', 'len_text': 3, 'pred_answer': '032', 'pred_answer_group': None, 'correct_answer': '746', 'correct_answer_group': 'U-shape', 'type': 'Beginning'}, {'text': '861 002 705 255 740 090', 'len_text': 6, 'pred_answer': '032', 'pred_answer_group': None, 'correct_answer': '705', 'correct_answer_group': 'U-shape', 'type': 'Medial'}, {'text': '244 065 880 820', 'len_text': 4, 'pred_answer': '002', 'pred_answer_group': None, 'correct_answer': '820', 'correct_answer_group': 'circle', 'type': 'Terminal'}, {'text': '747 717 095 595 001 142 617', 'len_text': 7, 'pred_answer': '</s>', 'pred_answer_group': None, 'correct_answer': '717', 'correct_answer_group': 'U-shape', 'type': 'Medial'}, {'text': '097 161', 'len_text': 2, 'pred_answer': '055', 'pred_answer_group': None, 'correct_answer': '097', 'correct_an

## Test 3

In [56]:
# Test3
# Run the test
a = df_unclear[df_unclear.l_to_r_text!=''].l_to_r_text
a= df_unclear[df_unclear.reversed_text!=''].reversed_text
try_reverse = False
check_the_answers = False
wrong_answer_details_verbose = False
fill_using_position = True

#ans = run_test("Test-3",a, a_rev, fill_using_position, check_the_answers, None, try_reverse, wrong_answer_details_verbose, seed)

# Text Positional Analysis - Tests

In [57]:
# Test1 - PositionalNgramModel
# Run the test
#a= df_made_up_from_train[df_made_up_from_train.changed_text!=''].changed_text #l_to_r


a_rev= df_made_up_from_train[df_made_up_from_train.changed_reversed_text!=''].changed_reversed_text

check_the_answers = True
try_reverse = False
wrong_answer_details_verbose = True

#posNgramModel.get_text_norm_position_unigrams_char_with_max_prob(10)

ans =posNgramModel.find_characters(a_rev, seed)
print("ans:", ans)

if(check_the_answers==True):
    beg_percent,ter_percent,med_percent,total_percent, wrong_answer_list = check_answers(ans,test1_correct_answers)
    print("beg%:",round(beg_percent,2), " ter%:",round(ter_percent,2), " med%:", round(med_percent,2), " tot%:", round(total_percent,2)," and a total of ", round((total_percent/100)*len(test1_correct_answers)), " out of", len(test1_correct_answers))
    if(wrong_answer_details_verbose): print("\n Wrong Answers:", wrong_answer_list)
else:
    print("Answers:", ans)

Finding Unclear character for 621 740 798 838 741 000 850
2
Max Prob for Position: 2 is for character: 002
Index, out_char: 2 002
Finding Unclear character for 740 000 820
2
Max Prob for Position: 2 is for character: 002
Index, out_char: 2 002
Finding Unclear character for 000 540 204 255
4
Max Prob for Position: 4 is for character: 002
Index, out_char: 4 002
Finding Unclear character for 804 233 000
1
Max Prob for Position: 1 is for character: 820
Index, out_char: 1 820
Finding Unclear character for 740 838 741 390 000 140
2
Max Prob for Position: 2 is for character: 002
Index, out_char: 2 002
Finding Unclear character for 000 220 240 798 060 692
6
Max Prob for Position: 6 is for character: 240
Index, out_char: 6 240
Finding Unclear character for 527 000
1
Max Prob for Position: 1 is for character: 820
Index, out_char: 1 820
Finding Unclear character for 740 920 320 000
1
Max Prob for Position: 1 is for character: 820
Index, out_char: 1 820
Finding Unclear character for 740 000
1
Max 

ValueError: too many values to unpack (expected 5)

## Other Tests

In [None]:
def flatten(t):
    return [item for sublist in t for item in sublist]

i=0
for text in df[df.l_to_r_text!=''].l_to_r_text:
    #print(flatten(nltk.ngrams(text,2)))
    i=i+1

In [None]:
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
bi_dict = dict()
bg_measures = BigramAssocMeasures()
for text in df[df.l_to_r_text!=''].l_to_r_text:
    words = nltk.word_tokenize(text)
    print(words)
    bi_finder = BigramCollocationFinder.from_words(words)
    bi_finder
    bi_collocs = bi_finder.nbest(bg_measures.likelihood_ratio, 10)
    #print(bi_collocs)
    for colloc in bi_collocs:
        print(colloc)
        bi_dict[colloc] += 1

In [None]:

unique_frequencies = dict()
total_frequencies = dict()
for text in df[df.l_to_r_text!=''].l_to_r_text:
    words = nltk.word_tokenize(text)
    fdist = nltk.FreqDist(words)
    for word, freq in fdist.most_common(50):
        total_frequencies[word] += freq # total count
        unique_frequencies[word] += 1 # unique count

In [None]:
bigrams_series = (pd.Series(flatten(nltk.ngrams(tokenized_text, 2))).value_counts())[:10]
trigrams_series = (pd.Series(flatten(nltk.ngrams(tokenized_text, 3))).value_counts())[:10]
quadgrams_series = (pd.Series(flatten(nltk.ngrams(tokenized_text, 4))).value_counts())[:10]

In [None]:
bigrams_series.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
bigrams_series.sort_values().plot.barh(color='blue', width=.9, figsize=(12, 8))
plt.title('10 Most Frequently Occuring Bigrams')
plt.ylabel('Bigram')
plt.xlabel('# of Occurances')