!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install scikit-learn
!pip install nltk
!pip install ipywidgets
!pip install -U dill
!pip3 install requests
!pip3 install -U spacy

In [1]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import collections
import random
import traceback
import pickle

plt.style.use(style='seaborn')
%matplotlib inline

In [2]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten
from collections import defaultdict

In [3]:
drop_duplicate_texts = True

# Set the filters on data here
filter_by_site = False
filter_by_keywords = False
filter_by_text_length= False

site = 'Mohenjo-daro'
#site = 'Harappa'
#site = 'Dholavira'
#site = 'Rakhigarhi'
#keyword = "Bull"
#keyword = "Gaur"

min_text_length=1
max_text_length=50

num_rows_text_corpus= 4999

In [4]:
seed =10
random_seed=10

In [5]:
# Read the signs
orig_sign_df=pd.read_csv('../../IndusCorpusUtils/data/icit_corpus/icit_sign_corpus.csv',dtype=str)
# set the max columns to none
pd.set_option('display.max_columns', None)

orig_sign_df

Unnamed: 0,id_sign,sign_class,set,graph,type,image,variants,function,ligatur,value,frequency,comment
0,1,SIM,01,stroke,stroke,sign001.jpg,1,"NUM, ITM, SHN",-,-,227,-
1,2,MKR,01,stroke,stroke,sign002.jpg,1,"ITM, SHN, EMS",-,-,865,-
2,3,SIM,01,stroke,stroke,sign003.jpg,1,"NUM, SHN",-,-,260,-
3,4,SIM,01,stroke,stroke,sign004.jpg,1,"NUM, SHN",-,-,99,-
4,5,SIM,01,stroke,stroke,sign005.jpg,1,"NUM, SHN",-,-,49,-
...,...,...,...,...,...,...,...,...,...,...,...,...
704,952,CMX,71,animal,uncertain,sign952.jpg,1,LFS,-,-,1,-
705,953,CMX,71,animal,Pict,sign953.jpg,1,LFS,-,-,1,-
706,956,SIM,71,-,att.d.e,sign956.jpg,1,LOG,-,-,2,-
707,957,CMX,71,-,uncertain,sign957.jpg,1,LOG,-,-,2,-


In [6]:
# Read the Text Corpus
orig_df=pd.read_csv('../../IndusCorpusUtils/data/icit_corpus/icit_text_text_corpus.csv',dtype=str, nrows=num_rows_text_corpus)
# set the max columns to none
pd.set_option('display.max_columns', None)


In [7]:
#Reverse text and add that as a new column
# Add text length as a column
list_reversed_text = []
for text in orig_df[orig_df.l_to_r_text!=''].l_to_r_text:
    # Tokenize to words
    # first split the string into chars
    chars = text.split(' ')
    length = len(chars)
    # then reverse the split string list and join with a space
    reversed_text = ' '.join(reversed(chars))
    list_reversed_text.append(reversed_text)
    
orig_df['reversed_text']= list_reversed_text #same as r_to_l text
orig_df['text_length']= orig_df['l_to_r_text'].str.len().div(3).round()

print("Dataframe has ", len(orig_df.index), " rows")

orig_df.head()

Dataframe has  4999  rows


Unnamed: 0,icit_id,site,keywords,text_class,lines,direction,text,signs,complete,alignment,sign height,text_images,linearized_text,l_to_r_text,r_to_l_text,reversed_text,text_length
0,1,Alamgirpur,,SS,1,L/R,+410-017+,2,Y,Unordered,Unequal,,410 017,410 017,017 410,017 410,2.0
1,2,Alamgirpur,,SS,1,L/R,+410-017+,2,Y,,,,410 017,410 017,017 410,017 410,2.0
2,3,Alamgirpur,,SC,1,L/R,+405-017+,2,Y,,,,405 017,405 017,017 405,017 405,2.0
3,4,Allahdino,,??,1,,+220-000+,1,N,,,,220 000,000 220,000 220,220 000,2.0
4,5,Allahdino,Bull,UC,1,R/L,+740-235+,2,Y,,,,740 235,235 740,740 235,740 235,2.0


In [8]:
# Retain texts that are only wanted

#remove the values where the text is unclear
df = orig_df[orig_df['l_to_r_text'].str.contains('000') == False] 

print("After removing unclear texts, we have ", len(df.index), " rows")

if(drop_duplicate_texts):
    #Remove out duplicate inplace
    
    #df = df.drop_duplicates(subset ="text",
    #                     keep = 'first', inplace = False)
    
    # We will consider a text duplicate only of the keywords(pics) are different
    # In that case we will retain the first occurance of it
    df = df.drop_duplicates(subset =["text", "keywords"], inplace = False, keep = "first")
    print("After removing duplicate texts, we have ", len(df.index), " rows")


#keep only the values that does not have multi-line text
df = df[df['text'].str.contains('/') == False] 

print("After removing multi-line text, we have ", len(df.index), " rows")


#Single sign ones don't have direction and won't have /, wo we need to explicitly include it
#Btw standardized_text is Left to right as in English
#df = df[df['direction'].str.contains('/') == True] 
df = df[(df['direction'].str.contains('/') == True) | (df['text_length'] ==1)] 

print("After keeping only text with known direction, we have ", len(df.index), " rows")

#Remove Multipart texts that have [ or ]
df = df[df['text'].str.contains("\[") == False] 
df = df[df['text'].str.contains("\]") == False] 

print("After keeping only text without multipart, we have ", len(df.index), " rows")

After removing unclear texts, we have  3945  rows
After removing duplicate texts, we have  2793  rows
After removing multi-line text, we have  2715  rows
After keeping only text with known direction, we have  2584  rows
After keeping only text without multipart, we have  2180  rows


In [9]:
# of those whose direction is know print out L/R and L/R text count
df_l_r = df[df['direction'].str.contains('L/R') == True] 

print("L/R texts: ", len(df_l_r.index))

df_r_l = df[df['direction'].str.contains('R/L') == True]

print("R/L texts: ", len(df_r_l.index))

L/R texts:  96
R/L texts:  1959


In [10]:
if(filter_by_site==True):
    #keep only the values that matches the provided site
    df = df[df['site'].str.contains(site) == True] 
    print("After filtering by site ", site, " it has ", len(df.index), " rows")

if(filter_by_keywords==True):
     #keep only the values that matches the provided keyword
    df = df[df['keywords'].str.contains(keyword) == True] 
    print("After filtering by keywords ", keyword, " it has ", len(df.index), " rows")

if(filter_by_text_length==True):
    df = df[(df['text_length'] > min_text_length) & (df['text_length']< max_text_length)]
    print("After filtering by text_length ",  " it has ", len(df.index), " rows")
    
#print(df)


# Unclear Texts

In [11]:
#Keep the items with unclear text in another dataframe
df_unclear = orig_df[orig_df['l_to_r_text'].str.contains('000') == True]

print("We have", len(df_unclear.index), " rows of unclear texts")

#Note: Lot of the text with unclear text have direction empty

We have 1054  rows of unclear texts


# Multi Line Texts

In [12]:
#Keep the text that are multiline (has ''/'') in another dataframe

print("Dataframe has ", len(orig_df.index), " rows")

#remove the values where the text is unclear
df_multi_line = orig_df[orig_df['l_to_r_text'].str.contains('000') == False] 

print("After removing unclear texts, we have ", len(df_multi_line.index), " rows")

if(drop_duplicate_texts):
    #Remove out duplicate inplace
    df_multi_line = df_multi_line.drop_duplicates(subset ="text",
                         keep = False, inplace = False)

    print("After removing duplicate texts, we have ", len(df_multi_line.index), " rows")


#keep only the values that has multi-line text
df_multi_line = df_multi_line[df_multi_line['text'].str.contains('/') == True] 

print("We have", len(df_multi_line.index), " rows of multi line texts")
print(df_multi_line.text)

df_multi_line.to_csv('multi_line_texts.csv')


Dataframe has  4999  rows
After removing unclear texts, we have  3945  rows
After removing duplicate texts, we have  2130  rows
We have 77  rows of multi line texts
69                      +032-031/151-740-240-235+
71              +032-031/850-032-530-740-741-456+
72                          +032-031/740-791-713+
74                              +032/226-032-817+
80                          +740-636-240/002-817+
                          ...                    
4386                    +621/090-740-231-560-534+
4402                +790/740-100-415-740-257-840+
4705                        +740-900-003/741-002+
4729                                    +840/790+
4752    +605-740-142-067/002-374-310-350-495-834+
Name: text, Length: 77, dtype: object


### Feature Extraction

In [13]:
y=df['site'].values
y.shape

# y axis is still the same
y_rev=df['site'].values
y_rev.shape

(2180,)

In [14]:
#x=df['l_to_r_text'].values
x = np.asarray(df[['l_to_r_text', 'direction']])
x.shape

x_rev=df['reversed_text'].values
x_rev.shape

(2180,)

### Train-test split

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
(x_train,x_test,y_train,y_test)=train_test_split(x,y,test_size=0.1, random_state=43)

(x_rev_train,x_rev_test,y_rev_train,y_rev_test)=train_test_split(x_rev,y_rev,test_size=0.1, random_state=43)

In [17]:
#Train data, fwd and reverse
df_train_x=pd.DataFrame(x_train)
df_train_x=df_train_x.rename(columns={0:'l_to_r_text'})
df_train_x=df_train_x.rename(columns={1:'direction'})


df_train_y=pd.DataFrame(y_train)
df_train_y=df_train_y.rename(columns={0:'site'})

#rev
df_train_x_rev=pd.DataFrame(x_rev_train)
df_train_x_rev=df_train_x_rev.rename(columns={0:'reversed_text'})


df_train_y_rev=pd.DataFrame(y_rev_train)
df_train_y_rev=df_train_y_rev.rename(columns={0:'site'})

#Test data, fwd and reverse
df_test_x=pd.DataFrame(x_test)
df_test_x=df_test_x.rename(columns={0:'l_to_r_text'})
df_test_x=df_test_x.rename(columns={1:'direction'})

df_test_y=pd.DataFrame(y_test)
df_test_y=df_test_y.rename(columns={0:'site'})

#rev
df_test_x_rev=pd.DataFrame(x_rev_test)
df_test_x_rev=df_test_x_rev.rename(columns={0:'reversed_text'})

df_test_y_rev=pd.DataFrame(y_rev_test)
df_test_y_rev=df_test_y_rev.rename(columns={0:'site'})

In [18]:
df_train=pd.concat([df_train_x,df_train_y],axis=1)
print(df_train.head())

df_test=pd.concat([df_test_x,df_test_y],axis=1)
print(df_test.head())


df_train_rev=pd.concat([df_train_x_rev,df_train_y_rev],axis=1)
print(df_train_rev.head())

df_test_rev=pd.concat([df_test_x_rev,df_test_y_rev],axis=1)
print(df_test_rev.head())

df_train.to_csv('train_texts.csv')
df_train_rev.to_csv('train_texts_rev.csv')

df_test.to_csv('test_texts.csv')
df_test_rev.to_csv('test_texts_rev.csv')

                   l_to_r_text direction          site
0                  806 740 045       R/L  Mohenjo-daro
1          455 590 002 005 368       R/L  Mohenjo-daro
2                      365 527       L/R    Wattoowala
3                  327 740 090       R/L       Harappa
4  140 920 944 002 240 482 740       R/L  Mohenjo-daro
                       l_to_r_text direction          site
0                      002 861 096       L/R  Mohenjo-daro
1                      491 817 740       R/L       Harappa
2                  933 002 004 405       R/L       Harappa
3  924 001 319 031 055 002 150 416       R/L          Susa
4          322 920 740 001 003 137       R/L  Mohenjo-daro
                 reversed_text          site
0                  045 740 806  Mohenjo-daro
1          368 005 002 590 455  Mohenjo-daro
2                      527 365    Wattoowala
3                  090 740 327       Harappa
4  740 482 240 002 944 920 140  Mohenjo-daro
                     reversed_text          si

### n-gram Models

In [19]:
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.lm.models import MLE
from nltk.lm.models import KneserNeyInterpolated

In [20]:
random_seed = 8
seed = 8
CONST_INITIAL = "Initial"
CONST_TERMINAL = "Terminal"
CONST_MEDIAL = "Medial"
CONST_NL = 10

In [21]:
def reverse_text(text):
    # first split the string into chars
    chars = text.split(' ')

    # then reverse the split string list and join with a space
    reversed_text = ' '.join(reversed(chars))
    return reversed_text

In [22]:
# Get ngram as list given a text (pass direction_of_string as "R/L" or "L/R")
def get_ngrams_as_list(data,direction_of_string,num):
    
    if(direction_of_string=="R/L"):
        # We need to convert R/L text to L/R to be able to get ngrams using nltk
        data_string = reverse_text(data)
    
    else: data_string= data
    
    n_grams =  ngrams(nltk.word_tokenize(data_string), num)
    return  [ ' '.join(grams) for grams in n_grams]

# Text Positional Ngram Analysis Model

In [23]:
from nltk.tokenize import word_tokenize,sent_tokenize
from typing import Dict
from bisect import bisect
from itertools import accumulate

class PositionalNgramModel:
    text_norm_position_unigrams_freq_dict=None
    df_text_norm_position_unigrams_freq=None
    df_text_norm_position_unigrams_prob=None
    text_norm_position_unigrams_maxValueIndex_dict = None
    text_norm_position_unigrams_unigrams_tuple = ()
    
    text_norm_position_bigrams_freq_dict = None
    df_text_norm_position_bigrams_freq=None
    df_text_norm_position_bigrams_prob=None
    text_norm_position_bigrams_maxValueIndex_dict = None
    
    text_norm_position_trigrams_freq_dict = None
    df_text_norm_position_trigrams_freq=None
    df_text_norm_position_trigrams_prob=None
    text_norm_position_trigrams_maxValueIndex_dict = None
    
    text_norm_position_quadgrams_freq_dict= None
    df_text_norm_position_quadgrams_freq=None
    df_text_norm_position_quadgrams_prob=None
    text_norm_position_bigrams_maxValueIndex_dict = None
    

    def _random_generator(self, seed_or_generator):
        if isinstance(seed_or_generator, random.Random):
            return seed_or_generator
        return random.Random(seed_or_generator)
    
    def _weighted_choice(self, population, weights, random_generator=None):
        """Like random.choice, but with weights.
        Heavily inspired by python 3.6 `random.choices`.
        """
        if not population:
            raise ValueError("Can't choose from empty population")
        if len(population) != len(weights):
            raise ValueError("The number of weights does not match the population")
        cum_weights = list(accumulate(weights))
        total = cum_weights[-1]
        threshold = random_generator.random()
        return population[bisect(cum_weights, total * threshold)]
    
    # To get a Positional Number for a unigram, bigram, trigram and Quadgram
    # l = Length of the text
    # s = starting position of the first character of the ngram
    # wp = weighted position = int(s/l)
    # Get the normalized position
    # Since the text passed is L/R,start index i from length of the text
    def get_norm_position(self, text,direction_of_string, search_string):
        try:
            num_chars, minp, maxp,w = 0,-1,-1,0
            sp=-1
            index=-1
            pos=-1

            #ngrams which are used as search_strings are generated from left to right, so text need to be changed to L/R

            if(direction_of_string=="R/L"):
                new_text = reverse_text(text)
            else: new_text = text


            l_pos= len(new_text )

            chars = new_text .split(' ')
            num_chars = len(chars)
            w= num_chars/CONST_NL


            pos = new_text.find(search_string)
            if(pos<0):  #not found
                return sp, minp, maxp, w
            else: 
                #sp = round((l_pos-pos)/4)
                sp = round(pos/4 +1)

            minp= int((sp-1)* (CONST_NL/num_chars) +1)
            maxp = int(sp*CONST_NL/num_chars)

            #print("text, search_string", new_text ,",", search_string)
            #print("l_pos, pos, sp, minp, maxp", l_pos, pos,sp, minp, maxp)

            if(minp<1): minp=1
            elif(minp>CONST_NL): minp = CONST_NL

            if(maxp<1): maxp=1
            elif(maxp>CONST_NL): maxp = CONST_NL

        except Exception as e:
            print("Exception:", e.__class__, "get_norm_position")
            traceback.print_exc()

        return sp,minp, maxp, w


    def get_text_norm_position_ngrams_freq(self,a,direction_of_string, n):

        average_allocation= True # Average out minp and maxp and allocate to the histogram. Don't care about weight

        text_norm_position_ngrams_freq=defaultdict(dict[int, int])
        norm_pos =-1
        i=0
        sp=-1

        try:
            #Fill the values with zeros in decreasing order so that we can keep same order in graph
            for text in a:
              for word in get_ngrams_as_list(text,direction_of_string, n):
                for j in range (10,0, -1):
                    text_norm_position_ngrams_freq[word][j]=0

            for text in a:
              for word in get_ngrams_as_list(text,direction_of_string, n):
                initial_char = word #Look for position of Initial char of the ngram
                sp,minp, maxp, w = self.get_norm_position(text,direction_of_string, initial_char)

                if(minp<0 or maxp<0): break #No match


                if(average_allocation):
                    norm_pos = round((minp+ maxp)/2)
                    #print("word, norm_pos", word, norm_pos)
                    val = text_norm_position_ngrams_freq[word].get(norm_pos,0)
                    text_norm_position_ngrams_freq[word][norm_pos]= val+ w
                else:

                    if(minp!=maxp):
                        val = text_norm_position_ngrams_freq[word].get(minp,0)
                        text_norm_position_ngrams_freq[word][minp]= val+ w

                        val = text_norm_position_ngrams_freq[word].get(maxp,0)
                        text_norm_position_ngrams_freq[word][maxp]= val +w
                    else:
                        val = text_norm_position_ngrams_freq[word].get(minp,0)
                        text_norm_position_ngrams_freq[word][minp]= val+ w

                #if(initial_char=="740"): print(i,":Text:",text, "Char", initial_char, "sp, minp, maxp, w, norm_pos", sp, minp, maxp, w, norm_pos)

                #if(initial_char=="817 002"): print(i,":Text:",text, "Char:", initial_char, sp, minp, maxp, w, norm_pos)

                i=i+1

        except Exception as e:
                print("Exception:", e.__class__, "get_text_norm_position_ngrams_freq")
                traceback.print_exc()

        return  text_norm_position_ngrams_freq

    # Can be generatilzed to any N later
    def fit(self, a, direction_of_string):
        try:
            print(a)

            self.text_norm_position_unigrams_freq_dict=defaultdict(dict[int, int])
            self.text_norm_position_unigrams_freq_dict=self.get_text_norm_position_ngrams_freq(a,direction_of_string, 1)
            self.df_text_norm_position_unigrams_freq=pd.DataFrame(self.text_norm_position_unigrams_freq_dict)

            self.df_text_norm_position_unigrams_prob = self.df_text_norm_position_unigrams_freq.div(self.df_text_norm_position_unigrams_freq.sum(axis=1), axis=0)
            self.df_text_norm_position_unigrams_prob = self.df_text_norm_position_unigrams_prob.assign(max_value=self.df_text_norm_position_unigrams_prob.values.max(1))
            df_text_norm_position_unigrams_maxValueIndex = self.df_text_norm_position_unigrams_prob.idxmax(axis = 1)
            self.text_norm_position_unigrams_maxValueIndex_dict = df_text_norm_position_unigrams_maxValueIndex.to_dict()

            # Build the unigrams tuple
            for col in self.df_text_norm_position_unigrams_prob:
                self.text_norm_position_unigrams_unigrams_tuple =  self.text_norm_position_unigrams_unigrams_tuple + (col,)

            self.text_norm_position_unigrams_unigrams_tuple= sorted(self.text_norm_position_unigrams_unigrams_tuple)

            #print(self.df_text_norm_position_unigrams_freq)
            #print(self.df_text_norm_position_unigrams_prob)
            #print(self.text_norm_position_unigrams_maxValueIndex_dict)

            #Bigram
            self.text_norm_position_bigrams_freq_dict=defaultdict(dict[int, int])
            self.text_norm_position_bigrams_freq_dict=self.get_text_norm_position_ngrams_freq(a,direction_of_string,2)
            self.df_text_norm_position_bigrams_freq=pd.DataFrame(self.text_norm_position_bigrams_freq_dict)
            self.df_text_norm_position_bigrams_prob = self.df_text_norm_position_bigrams_freq.div(self.df_text_norm_position_bigrams_freq.sum(axis=1), axis=0)
            #print(self.df_text_norm_position_bigrams_freq)test

            #Trigram
            self.text_norm_position_trigrams_freq_dict=defaultdict(dict[int, int])
            self.text_norm_position_trigrams_freq_dict=self.get_text_norm_position_ngrams_freq(a,direction_of_string,3)
            self.df_text_norm_position_trigrams_freq=pd.DataFrame(self.text_norm_position_trigrams_freq_dict)
            self.df_text_norm_position_trigrams_prob = self.df_text_norm_position_trigrams_freq.div(self.df_text_norm_position_trigrams_freq.sum(axis=1), axis=0)
            #print(self.df_text_norm_position_trigrams_freq)

            #Quadgram
            self.text_norm_position_quadgrams_freq_dict=defaultdict(dict[int, int])
            self.text_norm_position_quadgrams_freq_dict=self.get_text_norm_position_ngrams_freq(a,direction_of_string,4)
            self.df_text_norm_position_quadgrams_freq=pd.DataFrame(self.text_norm_position_quadgrams_freq_dict)
            self.df_text_norm_position_quadgrams_prob = self.df_text_norm_position_quadgrams_freq.div(self.df_text_norm_position_quadgrams_freq.sum(axis=1), axis=0)
            #print(df_text_norm_position_quadgrams_freq)
        
        except Exception as e:
            print("Exception:", e.__class__, "fit")
            traceback.print_exc()
        
        return self.df_text_norm_position_unigrams_prob, self.df_text_norm_position_bigrams_prob, self.df_text_norm_position_trigrams_prob,self.df_text_norm_position_quadgrams_prob

    def get_text_norm_position_unigrams_char_with_max_prob(self, position):
        try:
            char= self.text_norm_position_unigrams_maxValueIndex_dict[position]
            print("Max Prob for Position:", position, "is for character:",char )
        
        except Exception as e:
            print("Exception:", e.__class__, "get_text_norm_position_unigrams_char_with_max_prob")
            traceback.print_exc()
        return char
    
    def get_text_norm_position_unigrams_char_prob(self, position, word):
       
        prob=0
        try:
            prob = self.df_text_norm_position_unigrams_prob.loc[position,word]
        except Exception as e:
            print("Exception:", e.__class__, "get_text_norm_position_unigrams_char_prob")
            traceback.print_exc()
        return prob
    
    def get_text_norm_position_unigrams_char_with_weighted_prob(self, position):

        try:
            random_generator = self._random_generator(random_seed)
            #print(tuple(self.get_text_norm_position_unigrams_char_prob(position, w) for w in  self.text_norm_position_unigrams_unigrams_tuple)) 

            return self._weighted_choice(
                    self.text_norm_position_unigrams_unigrams_tuple,
                    tuple(self.get_text_norm_position_unigrams_char_prob(position, w) for w in  self.text_norm_position_unigrams_unigrams_tuple),
                    random_generator,
                )
        except Exception as e:
            print("Exception:", e.__class__, "get_text_norm_position_unigrams_char_with_weighted_prob")
            traceback.print_exc()
    
    def find_characters(self, a, seed):
        #For each of the text with unclear character go through it
        ans=-1
        out_char = None
        out_char_list = []
        try:
            for text in a :
                print("Finding Unclear character for", text)
                chars = text.split(' ')
                index = len(chars) - chars.index("000")
                if(index<0):  #not found
                    return 
                else: 
                    # Find the char with highest probablity for this position
                    print(index)
                    out_char = self.get_text_norm_position_unigrams_char_with_max_prob(index)
                    print("Index, out_char:", index, out_char)
                    out_char_list.append(out_char)
    
        except Exception as e:
            print("Exception:", e.__class__, "find_characters")
            
        return  out_char_list

## Text Positional Analysis - Basic check

In [24]:
#a= df_train_x_rev[df_train_x_rev.reversed_text!=''].reversed_text

a= df_train_x[df_train_x.l_to_r_text!=''].l_to_r_text
posNgramModel = PositionalNgramModel()

#Train model with xtrain data
df_text_norm_position_unigrams_prob, df_text_norm_position_bigrams_prob, df_text_norm_position_trigrams_prob,df_text_norm_position_quadgrams_prob = posNgramModel.fit(a, "L/R")

#print(posNgramModel.get_text_norm_position_unigrams_char_prob(10, '720'))

print(posNgramModel.get_text_norm_position_unigrams_char_with_weighted_prob(1))
print(posNgramModel.get_text_norm_position_unigrams_char_with_weighted_prob(10))

0                       806 740 045
1               455 590 002 005 368
2                           365 527
3                       327 740 090
4       140 920 944 002 240 482 740
                   ...             
1957                745 717 420 595
1958            740 690 435 255 055
1959    255 435 690 740 900 003 422
1960            740 803 390 060 740
1961    032 590 390 002 032 205 740
Name: l_to_r_text, Length: 1962, dtype: object
235
422


## ModelWithFilling for ngram

In [25]:
from bisect import bisect
from itertools import accumulate
from nltk.tokenize.treebank import TreebankWordDetokenizer
import traceback

detokenize = TreebankWordDetokenizer().detokenize 

class ModelWithFilling:

    def __init__(self, this_model, posNgramModel=None, fill_using_position=True):
        self.model = this_model
        self.posNgramModel = posNgramModel
        self.fill_using_position = fill_using_position
        if(not this_model or not posNgramModel): print("Need non Empty Model for creating instance of ModelWithFilling")
        
    def _random_generator(self, seed_or_generator):
        if isinstance(seed_or_generator, random.Random):
            return seed_or_generator
        return random.Random(seed_or_generator)
    
    def _weighted_choice(self, population, weights, random_generator=None):
        """Like random.choice, but with weights.
        Heavily inspired by python 3.6 `random.choices`.
        """
        if not population:
            raise ValueError("Can't choose from empty population")
        if len(population) != len(weights):
            raise ValueError("The number of weights does not match the population")
        cum_weights = list(accumulate(weights))
        total = cum_weights[-1]
        threshold = random_generator.random()
        #TBD Remove this
        #threshold =0.001
        
        return population[bisect(cum_weights, total * threshold)]

        
    def get_model(self):
        return self.model

    
      # Redefine this function defined in NTLK with some changes
    def generate(self,num_words=1, text_seed=None, random_seed=None, position=-1):

        #print("Generating for text:", text_seed, "and position:", position)
        text_seed = [] if text_seed is None else list(text_seed)
        pos=-1

        random_generator = self._random_generator(random_seed)
        # This is the base recursion case.
        if num_words == 1:
            context = (
                text_seed[-self.model.order + 1 :]
                if len(text_seed) >= self.model.order
                else text_seed
            )
            samples = self.model.context_counts(self.model.vocab.lookup(context))   
            
            #print("____Context:", context, "Samples:", samples)
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.model.context_counts(self.model.vocab.lookup(context))
            # Sorting samples achieves two things:
            # - reproducible randomness when sampling
            # - turns Mapping into Sequence which `_weighted_choice` expects
            samples = sorted(samples)
            
            #if(len(context) < 1 ): 
                #print("@@@@@@@@@@@@@@@@@@@@@@@", "context:", context, "Must return Positional Prob for pos:", position)
                
            if(self.fill_using_position and self.posNgramModel and len(context)<1 and position!=-1):
                pos = self.posNgramModel.get_text_norm_position_unigrams_char_with_weighted_prob(position)
                print("Returning Positional Prob for pos:", position, ":", pos, "for text_seed:", text_seed)
                return pos
            else: 
                #print("Before returning, Context:", context)
                #for w in samples:
                    #print("Before returning, Sample Item:", w, self.model.score(w, context))
                    
                return self._weighted_choice(
                samples,
                tuple(self.model.score(w, context) for w in samples),
                random_generator,
            )
        
        # We build up text one word at a time using the preceding context.
        generated = []
        for _ in range(num_words):
            generated.append(
                self.generate(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated
    

    def generate_sent(self, num_words,char_seed, random_seed, position=-1):
        """
        :param model: An ngram language model.
        :param num_words: Max no. of words to generate.
        :param random_seed: Seed value for random.
        """
        content = []
        try:
            for token in self.generate(num_words, text_seed=char_seed, random_seed=random_seed, position=position):
                if token == '<s>':
                    continue
                if token == '</s>'  or token == '< /s>' :
                    break
                content.append(token)
        except Exception as e:
            print("Exception:", e.__class__, "in ModelwithFilling:generate_sent")
            traceback.print_exc()
        return detokenize(content)
    
     # Redefine this function defined in NTLK with some changes
    def generate0(self,scores_dict, num_words=1, text_seed=None, random_seed=None, position=-1):
    
        try:
            #print("Generating for text:", text_seed, "and position:", position)
            text_seed = [] if text_seed is None else list(text_seed)
            pos=-1

            random_generator = self._random_generator(random_seed)
            # This is the base recursion case.
            if num_words == 1:
                context = (
                    text_seed[-self.model.order + 1 :]
                    if len(text_seed) >= self.model.order
                    else text_seed
                )
                samples = self.model.context_counts(self.model.vocab.lookup(context))
                
                #Added: If we get a padding character, ignore it and go further
                if(len(samples)==1):
                    #print(samples)
                    for w in samples:
                        if(w=="</s>" or w=="<s>" ): 
                            #print("settings samples to None")
                            samples = None
                    

                #print("____Context:", context, "Samples:", samples)
                while context and not samples:
                    context = context[1:] if len(context) > 1 else []
                    samples = self.model.context_counts(self.model.vocab.lookup(context))
                # Sorting samples achieves two things:
                # - reproducible randomness when sampling
                # - turns Mapping into Sequence which `_weighted_choice` expects
                samples = sorted(samples)

                #if(len(context) < 1 ): 
                    #print("@@@@@@@@@@@@@@@@@@@@@@@", "context:", context, "Must return Positional Prob for pos:", position)

                if(self.fill_using_position and self.posNgramModel and len(context)<1 and position!=-1):
                    pos = self.posNgramModel.get_text_norm_position_unigrams_char_with_weighted_prob(position)
                    print("Returning Positional Prob for pos:", position, ":", pos, "for text_seed:", text_seed)
                    return pos
                else: 
                    #print("Before returning, Context:", context)
                    for w in samples:
                        scr = self.model.score(w, context)
                        scores_dict[w]=scr
                        #print("Context:", context, "sample",w, "score", scr)


                    return self._weighted_choice(
                    samples,
                    tuple(self.model.score(w, context) for w in samples),
                    random_generator,
                ), scores_dict, context #sending the scores dictionary

            # We build up text one word at a time using the preceding context.
            generated = []
            for _ in range(num_words):
                generated.append(
                    self.generate0(
                        scores_dict,
                        num_words=1,
                        text_seed=text_seed + generated,
                        random_seed=random_generator,
                    )
                )
        except Exception as e:
            print("Exception:", e.__class__, "in generate0")
            traceback.print_exc()
            
        return generated,scores_dict, context #sending the dictionary and the context
    
    def generate_sent0(self, num_words,char_seed, random_seed, position=-1):
        """
        :param model: An ngram language model.
        :param num_words: Max no. of words to generate.
        :param random_seed: Seed value for random.
        """
        content = []
        token_list = []
        #dict to store score of various options we had
        scores_dict = defaultdict() # Pass it in
        try:
            token_list, scores_dict, context = self.generate0(scores_dict, num_words, text_seed=char_seed, random_seed=random_seed, position=position)
            # clean dictionary of padding
            scores_dict.pop('<s>', None)
            scores_dict.pop('</s>', None)
            scores_dict.pop('< /s>', None)
            
            for token in token_list:
                if token == '<s>':
                    continue
                if token == '</s>'  or token == '< /s>' :
                    break
                content.append(token)
        except Exception as e:
            print("Exception:", e.__class__, "in ModelwithFilling:generate_sent0")
            traceback.print_exc()
        return detokenize(content), scores_dict, context
    
    def get_the_perplexity(self, text, sign):
        
        perp_dict = defaultdict()
        perp=100000
        #this_tokenized_text= word_tokenize(text)
        this_tokenized_text= text.split(' ')
        this_tokenized_text_list = []
        this_tokenized_text_list.append(this_tokenized_text)

        #print("this_tokenized_text_list", this_tokenized_text_list)
        this_data_list = []
        this_data_list, _= padded_everygram_pipeline(self.model.order, this_tokenized_text_list)
        for i, test in enumerate(this_data_list):
            perp =self.model.perplexity(test)
            if(perp>100000): perp = 100000

            #if(perp<100000):print("sign:",sign, "text:", new_text,"Perp:", perp)
            perp_dict[str(sign)] = perp 
        
        min_value = min(perp_dict.values())
        res = [key for key in perp_dict if perp_dict[key] == min_value]
        #print("res[0], min_value", res[0], min_value)
                
        return res[0], min_value
    
    def get_model_ngrams_list(self, text):
        try:
            context = (
                    text[-self.model.order + 1 :]
                    if len(text) >= self.model.order
                    else text
                )
            #print(context)

            list_ngrams = sorted(model_wf_kn.get_model().counts[[context]].items(),reverse=True, key = lambda x: x[1])
            #print(list_ngrams)

            #samples = self.model.context_counts(self.model.vocab.lookup(context)) 
        
        except Exception as e:
            print("Exception:", e.__class__, "get_model_ngrams_list")
            traceback.print_exc()    
        return list_ngrams
    
    
    def get_model_ngrams_list_for_missing_ends(self, context_text_token_list, position_missing):
        try:
            list_ngrams = []
            list_dict_ngrams= []
            k= len(context_text_token_list)
            init_char_missing,final_char_missing= False, False
            
            """If k is greater than order of the model return none"""
            if(k>self.model.order): return list_ngrams
        
            """ Get the ngrams of order k+1: This is assuming padding is sent in the text"""
            """ If model order is < k then an empty list is returned"""
            list_dict_ngrams = list(self.model.counts[k+1])
            #print("Input context:", context_text_token_list)
            #print("List of ngrams of order ", k, ":", list_dict_ngrams)
            
            if(position_missing=="initial"): init_char_missing = True
            elif(position_missing=="terminal"): final_char_missing = True
            
            for tpl in list_dict_ngrams:
                """ There won't be any match if number of items in tpl is not same as k """
                """ which will happen if context is longer than order of the model"""
                if(len(tpl)==k):
                    #print("text_list, tpl:", context_text_token_list, tpl)
                    
                    """Do a match and append the matched tpls to list_ngrams"""
                    
                    if(init_char_missing):
                        match= True
                        if(context_text_token_list[0]==tpl[0]):
                            
                            """Skip the second char, start from third char till the end"""
                            for m in range(2, len(context_text_token_list)):        
                                if(context_text_token_list[m]!=tpl[m]):
                                    match = False
                                    break
                            if(match): 
                                if(init_char_missing):
                                    """Append the second element of the tuple"""
                                    """ Don't match <s> and </s>"""
                                    if(tpl[1]!="<s>" and tpl[1]!="</s>"): 
                                        #print("######## Matched! Adding",tpl[1])
                                        list_ngrams.append(tpl[1])
                                        
                    elif(final_char_missing):
                        match= True
                        last_char_context = len(context_text_token_list)-1
                        last_char_tpl = len(tpl)-1
                        
                        if(context_text_token_list[last_char_context]==tpl[last_char_tpl]):
                            
                            """Start from beginning, to the char before missing char"""
                            for m in range(0, last_char_context-1):    
                                #print("--Comparing",context_text_token_list[m],tpl[m])
                                if(context_text_token_list[m]!=tpl[m]):
                                    match = False
                                    break

                            if(match): 
                                if(final_char_missing):
                                    """Append the last but one element of the tuple"""
                                    """ Don't match <s> and </s>"""
                                    if(tpl[last_char_tpl-1]!="<s>" and tpl[last_char_tpl-1]!="</s>"): 
                                        #print("######## Matched! Adding",tpl[last_char_tpl-1])
                                        list_ngrams.append(tpl[last_char_tpl-1])
                            
                            
        except Exception as e:
            print("Exception:", e.__class__, "get_model_ngrams_list_for_missing_ends")
            traceback.print_exc()    
        return list_ngrams
            
        
    """ From the language model get the k th order ngram
        and using it get the characters that form the ngram"""
    def get_model_ngrams_list0(self, text, k, type_ngram):
        try:
            list_ngrams = []
            list_dict_ngrams = list(self.model.counts[k+1])
            
            #print("get_model_ngrams_list0", text, ":", list_dict_ngrams)

            if(k>=3):
                # cannot use word_tokenize as we could be dealing with <s>
                text_list = text.split(' ')

            for tpl in list_dict_ngrams:

                if(k==2 and len(tpl)>1):
                    #bigram
                    if(type_ngram=="left_bigram" and tpl[0]==text):
                        list_ngrams.append(tpl[1])
                    elif(type_ngram=="right_bigram" and tpl[1]==text):

                        list_ngrams.append(tpl[0])

                elif(k==3 and len(tpl)>2):
                    #trigram
                    if(type_ngram=="trigram" and tpl[0]==text_list[0] and tpl[2]==text_list[1]):
                        list_ngrams.append(tpl[1])
                elif(k==4 and len(tpl)>3):
                    #quadragram
                    #print("text_list, tpl:", text_list, tpl)
                    if(type_ngram=="quadragram" and tpl[0]==text_list[0] and tpl[2]==text_list[1] and tpl[3]==text_list[2]):
                        list_ngrams.append(tpl[1])

        except Exception as e:
            print("Exception:", e.__class__, "get_model_ngrams_list0")
            traceback.print_exc()    
        return list_ngrams
  
    
    def get_the_score(self, text):
        
        try:
            score =0
            #list_tokens = nltk.word_tokenize(text)
            
            # cannot use word_tokenize as we could be dealing with <s>
            list_tokens = text.split(' ')
            #print("list_tokens:", list_tokens)
            if(len(list_tokens)==1):
                score = self.model.score(list_tokens[0]) 
            elif(len(list_tokens)==2):
                score = self.model.score(list_tokens[1], list_tokens[0].split())
            elif(len(list_tokens)==3):
                score = self.model.score(list_tokens[2], (list_tokens[0] + " " + list_tokens[1]).split())
            elif(len(list_tokens)==4):
                score = self.model.score(list_tokens[3], (list_tokens[0] + " " + list_tokens[1] + " " + list_tokens[2]).split())
            elif(len(list_tokens)==5):
                score = self.model.score(list_tokens[4], (list_tokens[0] + " " + list_tokens[1] + " " + list_tokens[2] + " " + list_tokens[3]).split())
            elif(len(list_tokens)==6):
                score = self.model.score(list_tokens[5], (list_tokens[0] + " " + list_tokens[1] + " " + list_tokens[2] + " " + list_tokens[3] + " " + list_tokens[4]).split())
            elif(len(list_tokens)==7):
                score = self.model.score(list_tokens[6], (list_tokens[0] + " " + list_tokens[1] + " " + list_tokens[2] + " " + list_tokens[3] + " " + list_tokens[4] + " " + list_tokens[5]).split())
            else:
                print("More than 7 tokens in input text. Not getting score",list_tokens )
        except Exception as e:
            print("Exception:", e.__class__, "get_the_score")
            traceback.print_exc()
        return score
    
    def compute_score(self, list_ngram, char1, char2,char3, type_ngram):
        
        try:
            score =0
            this_dict = defaultdict()
            
            #print("compute_score:", type_ngram, list_ngram)
            
            #print("Getting compute_score for:", type_ngram)

            for sign in list_ngram:

                if(sign!='<s>'): replaced_sign = f"{sign:03}"
                else: replaced_sign = sign

                if(type_ngram=="left_bigram"):
                    left_bigram = char1 + " " + replaced_sign
                    score = self.get_the_score(left_bigram)
                    this_dict[replaced_sign]=score

                elif(type_ngram=="right_bigram"):
                    right_bigram = replaced_sign + " " + char1
                    score = self.get_the_score(right_bigram)
                    this_dict[replaced_sign]=score

                elif(type_ngram=="trigram"):
                    trigram = char1 + " " + replaced_sign +  " " + char2
                    score = self.get_the_score(trigram)
                    this_dict[replaced_sign]=score
                    
                elif(type_ngram=="quadragram"):
                    quadragram = char1 + " " + replaced_sign +  " " + char2 + " " + char3
                    #print("Getting quadragram score for:", quadragram)
                    score = self.get_the_score(quadragram)
                    this_dict[replaced_sign]=score
                else:
                    print("Wrong type ngram. Can't compute score")

        except Exception as e:
            print("Exception:", e.__class__, "compute_score")
            traceback.print_exc()

        return this_dict

    """ Dictionary passed in has the char and score
         match the character in dictionaries and get score and calculate
         ranks.Return a rank dictionary with charactor and score"""
    def get_char_and_rank(self, dict1, dict2):

        alpha1=0.5
        alpha2=0.5
        score1, score2 = 0,0
        rank=0
        rank_dict = defaultdict()
        full_list = []
        
        try:
            """ For Initial character dict2 will be empty"""
            if(dict1==None and dict2):
                alpha1=0
                alpha2=1
            elif(dict2 and dict2==None):
                alpha1=1
                alpha2=0
        
            #create a list of keys with no duplicates in a list
            if(dict1): full_list = list(dict1.keys())
            if(dict2): full_list = list(set(full_list + list(dict2.keys())))

            for sign in full_list:
                if(sign!='<s>' or sign!='</s>' ): replaced_sign = f"{sign:03}"
                else: replaced_sign = sign

                if dict1 and replaced_sign in dict1:
                    score1 = dict1[replaced_sign]
                else: score1=0

                if dict2 and replaced_sign in dict2:
                    score2 = dict2[replaced_sign]
                else: score2=0

                rank = alpha1*score1  + alpha2*score2
                rank_dict[replaced_sign]=rank

            sorted_rank_list = sorted(rank_dict.items(),reverse=True, key=lambda x:x[1])
            
            #if(len(sorted_rank_list)==0): 
                #print("dict1:", dict1, "dict2:", dict2, "full_list:", full_list)
                #print("alpha1:", alpha2, "alpha2:, alpha2")
                      
            #print("Sorted Rank --------:", sorted_rank_list)
            sign_rank_tpl = sorted_rank_list[0]
        
        except Exception as e:
            print("Exception:", e.__class__, "get_char_and_rank")
            traceback.print_exc()
            
        return sign_rank_tpl[0], sign_rank_tpl[1]

   
    """ This is to find the missing end characters
    First the end characters are selected with padding taken into account
    and then the end characters are selected without padding and the best of the two
    is the answer"""
    def get_end_chars(self,text, random_seed=seed, position=-1):
        
        missing_char=""
        sign_score =-1
        order_of_match=-1
        position_missing = ""
        i=-1
        
        try:
            score_dict = defaultdict()
            context_text_token_list = text.split(' ')
            missing_char=""
            sign_score=-1
            order_of_match=-1

            i = context_text_token_list.index("000")

            """ Replace missing char with padding character"""
            if(i==1):
                """Initial char is missing. Take it in to account the initial padding"""
                #context_text_token_list[i]= "<s>"
                position_missing = "initial"
            elif(i==len(context_text_token_list)-2):
                """Terminal char is missing. Take it in to account the terminal padding"""
                #context_text_token_list[i]= "</s>"
                position_missing = "terminal"
            elif(i==-1):
                print("No Missing characters in text")
                return missing_char, sign_score, order_of_match
            else:
                print("Initial or Terminal padding is missing or Missing characters in middle in text. Ignoring ...")
                return missing_char, sign_score, order_of_match
           
            missing_char, sign_score, order_of_match = self.get_end_chars_selection(context_text_token_list,position_missing, random_seed=seed, position=-1)
            
            #print("char, score, order of match with padding",missing_char_1, sign_score_1, order_of_match_1)
           
                
        except Exception as e:
            print("Exception:", e.__class__, "get_end_chars")
            traceback.print_exc()
                
        return missing_char, sign_score, order_of_match
    
                
    def get_end_chars_selection(self,context_text_token_list, position_missing, random_seed=seed, position=-1):
  
        try:
            score_dict = defaultdict()
            order_of_match=0
            sign_score_tpl= ()
            order_of_match=-1
            
            #print("context_text_token_list:", context_text_token_list)
            ngrams_list = self.get_model_ngrams_list_for_missing_ends(context_text_token_list, position_missing)

            while context_text_token_list and len(ngrams_list)==0:
                
                if(position_missing=="initial"):
                    """Remove the right most item in list"""
                    context_text_token_list = context_text_token_list[:-1] if len(context_text_token_list) > 1 else []
                elif(position_missing=="terminal"):
                    """Remove the left most item in list"""
                    context_text_token_list = context_text_token_list[1:] if len(context_text_token_list) > 1 else []
                else:
                    print("Neither initial nor terminal characer is missing. Exiting ...")
                    break;
                
                #print("context_text_token_list:", context_text_token_list)
                ngrams_list = self.get_model_ngrams_list_for_missing_ends(context_text_token_list, position_missing)
         
            if(len(ngrams_list)>0):
                #print("***** For context:",context_text_token_list, "ngrams_list:",ngrams_list)
                for sign in ngrams_list:

                    if(sign!='<s>'and sign!='</s>'): replaced_sign = f"{sign:03}"
                    else: replaced_sign = sign

                    new_context_text_token_list = context_text_token_list.copy()
                    
                    if(position_missing=="initial"):
                        new_context_text_token_list.insert(1, replaced_sign)
                        new_context_text_token_list.remove("000")
                    elif(position_missing=="terminal"):
                        new_context_text_token_list.insert(len(new_context_text_token_list)-2, replaced_sign)
                        new_context_text_token_list.remove("000")
                    
                    #print("new_context_text_token_list", new_context_text_token_list)

                    """Convert to a string"""
                    context_text=""
                    context_text = ' '.join(str(x) for x in new_context_text_token_list)

                    score = self.get_the_score(context_text)
                    #print("Score for:", context_text, ":", score)
                    score_dict[replaced_sign]=score


                sorted_score_list = sorted(score_dict.items(),reverse=True, key=lambda x:x[1])

                #print("Sorted Score --------:", sorted_score_list)
                sign_score_tpl = sorted_score_list[0]
                
                order_of_match = len(new_context_text_token_list)
            
        except Exception as e:
            print("Exception:", e.__class__, "get_end_char_selection")
            traceback.print_exc()
        
        if(len(sign_score_tpl)>=1):
            return sign_score_tpl[0] , sign_score_tpl[1], order_of_match
        else: 
            return "",-1, -1
 

        
    def find_char_by_rank(self,text, random_seed=seed, position=-1):
        
        try:
            alpha1 = 0.2
            alpha2 = 0.2
            alpha3 = 0.6
            rank_dict = defaultdict()

            sign_list = orig_sign_df['id_sign'].tolist()

            orig_text = text
            left_bigram_init_char, right_bigram_term_char,trigram_init_char, trigram_term_char ="","","",""
            quadragram_init_char, quadragram_third_char, quadragram_term_char = "","",""

            #list_tokens = nltk.word_tokenize(text)
            list_tokens = text.split(' ')
            
            last_token_index = len(list_tokens)-1 

            i = list_tokens.index("000")
            if(i>0 and i < last_token_index):
                """Medial character"""
                left_bigram_init_char = list_tokens[i-1]
                right_bigram_term_char = list_tokens[i+1]

                trigram_init_char = list_tokens[i-1]
                trigram_term_char = list_tokens[i+1]
            elif(i==0):
                      
                """Initial character"""
                """ We are using <s> as sentence start pading character"""
                #print("index is zero")
                if(i+1<=last_token_index):
                    left_bigram_init_char = '<s>'
                    right_bigram_term_char = list_tokens[i+1]

                    trigram_init_char = '<s>'
                    trigram_term_char = list_tokens[i+1]
                if(i+2<=last_token_index):
                    quadragram_init_char = '<s>'
                    quadragram_third_char = list_tokens[i+1]
                    quadragram_term_char = list_tokens[i+2]
                    
            else:
                print("No missing characters in text")

        
            # Get the list of bigrams 
            left_bigrams_list = self.get_model_ngrams_list0(left_bigram_init_char, 2,"left_bigram")
            right_bigrams_list = self.get_model_ngrams_list0(right_bigram_term_char,2, "right_bigram")

            trigrams_inp = trigram_init_char + " " + trigram_term_char
            trigrams_list = self.get_model_ngrams_list0(trigrams_inp, 3,"trigram")
            
            if(i+2<=last_token_index):
                quadragram_inp = quadragram_init_char + " " + quadragram_third_char + " " + quadragram_term_char
                quadragrams_list = self.get_model_ngrams_list0(quadragram_inp, 4,"quadragram")

            
            left_bigram_score_dict = self.compute_score(left_bigrams_list, left_bigram_init_char, "", "", "left_bigram")
            right_bigram_score_dict = self.compute_score(right_bigrams_list, right_bigram_term_char, "", "","right_bigram")
            trigram_score_dict = self.compute_score(trigrams_list, trigram_init_char, trigram_term_char, "", "trigram")
            if(i+2<=last_token_index): quadragram_score_dict = self.compute_score(quadragrams_list, quadragram_init_char, quadragram_third_char, quadragram_term_char, "quadragram")

            #print("left_bigram_init_char, left_bigrams_list:",left_bigram_init_char, left_bigrams_list, "\n")
            #print("left_bigram_score_dict:",dict(sorted(left_bigram_score_dict.items(),reverse=True, key=lambda item: item[1])))
            
            #print("right_bigram_term_char, right_bigrams_list:",right_bigram_term_char, right_bigrams_list, "\n")
            #print("right_bigram_score_dict:",dict(sorted(right_bigram_score_dict.items(),reverse=True, key=lambda item: item[1])))
            
            #print("trigram_init_char,trigram_term_char, trigrams_list:",trigram_init_char,trigram_term_char, trigrams_list, "\n")
            #print("trigram_score_dict :",dict(sorted(trigram_score_dict.items(),reverse=True, key=lambda item: item[1])))
            
            #if(i+2<=last_token_index): print("quadragram_init_char, quadragram_third_char, quadragram_term_char, quadragrams_list:",quadragram_init_char,quadragram_third_char, quadragram_term_char, quadragrams_list, "\n")
            #if(i+2<=last_token_index): print("quadragram_score_dict1 :",dict(sorted(quadragram_score_dict.items(),reverse=True, key=lambda item: item[1])))

            # Get the rank for each of the sign in sign list

            # Add the three list and use it
            full_list = left_bigrams_list + right_bigrams_list + trigrams_list
            if(i+2<=last_token_index):full_list = full_list + quadragrams_list
            
            for sign in full_list:
                replaced_sign = f"{sign:03}"

                if replaced_sign in left_bigram_score_dict:
                    score_left_bigram = left_bigram_score_dict[replaced_sign]
                else: score_left_bigram=0

                if replaced_sign in right_bigram_score_dict:
                    score_right_bigram = right_bigram_score_dict[replaced_sign]
                else: score_right_bigram=0

                if replaced_sign in trigram_score_dict:
                    score_trigram = trigram_score_dict[replaced_sign]
                else: score_trigram=0
                    
                if(i+2<=last_token_index):
                    if replaced_sign in quadragram_score_dict:
                        score_quadragram = quadragram_score_dict[replaced_sign]
                    else: score_quadragram=0

                if(i+2<=last_token_index):
                    alpha1 = 0.1
                    alpha2 = 0.1
                    alpha3 = 0.3
                    alpha4 = 0.5
                    rank = alpha1*score_left_bigram  + alpha2*score_right_bigram + alpha3*score_trigram + alpha4*score_quadragram
                else:
                    rank = alpha1*score_left_bigram  + alpha2*score_right_bigram + alpha3*score_trigram
                    
                rank_dict[replaced_sign]=rank

            sorted_rank_list = sorted(rank_dict.items(),reverse=True, key=lambda x:x[1])

            #print("Sorted Rank --------:", sorted_rank_list)
            sign_rank_tpl = sorted_rank_list[0]
        
        except Exception as e:
            print("Exception:", e.__class__, "find_char_by_rank")
            traceback.print_exc()
        
        return sign_rank_tpl[0], sign_rank_tpl[1]
    
        
    
    def get_missing(self, num_missing=1, input_text_ls=None, random_seed=None):
        if input_text_ls is None: return None
    
        # For not num_missing >1 is not supported
        if num_missing !=1: return None
        
        #input text will look like 'word1, word2, word3, _, word4, word5...'
        
        print("Model Order:", self.model.order)
        print("input_text_ls", input_text_ls)
        
        #find _ in list
        index_pos_list = [i for i in range(len(input_text_ls)) if input_text_ls[i] == '_' ]
        print('Indices of all occurrences of a "_" in the list are : ', index_pos_list)
        
        # We will fill only one missing word for now   
        # if _ is Initial char of the list, find bigram probablity of
        # All posibile bigram matches of word in vocabulary followed by input_text_ls[index_pos_list[0]+1]

                  
        if(len(index_pos_list)>0 and index_pos_list[0]==0):
            print("Missing character in the Initial, trying that logic")
            # This is the left most word
            guesses_ls = []
            scores_tuple = ()
            match= False
            
            # For every word in the model vocab try the word for the missing word and see 
            # if it generates samples and therefor a positive probability.
            # If so, add the word and it probability in our list
            # Try this from highest order to lowest order. If not matched in higher order
            # check the lower orders
            
            samples = None
            input_text_ls = input_text_ls[1:] # Remove the first word which is _
            
            if len(input_text_ls) >= self.model.order: max_index=self.model.order
            elif len(input_text_ls)<1 :input_text_ls= None
            else: index = len(input_text_ls)
            
            
            print("input_text_ls[:1]", input_text_ls[:1])
            print("input_text_ls[:2]", input_text_ls[:2])
            print("input_text_ls[:3]", input_text_ls[:3])
            
            input_text_ls = input_text_ls[:1] 
            
            print("Lookup:", self.model.context_counts(self.model.vocab.lookup('850')))
                  
            while input_text_ls and index >=0 and not samples:
                print("to_be_matched_word:",input_text_ls)
                i=0
                for vocab_item in self.model.vocab:
                    i=i+1
                    if(len(input_text_ls)>=1):
                        this_guess= [vocab_item] #Try every item in the vocabulary
                        this_guess = (
                        this_guess
                        )
                        samples = self.model.context_counts(self.model.vocab.lookup(this_guess))

                        # See if the sample returned has the next word in the input_text

                        samples = sorted(samples)
                        to_be_matched_word= input_text_ls[0]
                        print("this_guess:", this_guess, "to_be_matched_word:",to_be_matched_word,"samples:", samples)

                        if(input_text_ls[0] in samples):
                            print("Found a match for:", input_text_ls, "guess:", this_guess,"samples:", samples)
                            match = True
                            indexes = [i for i,x in enumerate(samples) if x == to_be_matched_word]
                            this_guess_score= self.model.score(to_be_matched_word, vocab_item.split())
                            # store off context and this_guess_score
                            guesses_ls.append(this_guess)
                            scores_tuple + (this_guess_score,)

                           
                if(match==False): print("Did not have a match for",to_be_matched_word, "with any of the items in vocabulary")
                
                index=index-1
                input_text_ls = input_text_ls[:index] 
            
            if(match==False):
                print("No matches found. At this point use unigram probability and guess")
            
        else:
            print("Missing character is not passed or not Initial")
            
        
    def fill_missing(self,num_missing=1, input_text=None, random_seed=None):
        
        if input_text is None: return None
    
        # For not num_missing >1 is not supported
        if num_missing !=1: return None
        
        input_text = list(input_text)

        random_generator = self._random_generator(random_seed)
        # This is the base recursion case.
        
        if num_missing == 1:
            context = (
                input_text[-self.model.order + 1 :]
                if len(input_text) >= self.model.order
                else input_text
            )
            samples = self.model.context_counts(self.model.vocab.lookup(context))
            while context and not samples:
                context = context[1:] if len(context) > 1 else []
                samples = self.model.context_counts(self.model.vocab.lookup(context))
                print("Samples:", samples, "Context:", context)
            # Sorting samples achieves two things:
            # - reproducible randomness when sampling
            # - turns Mapping into Sequence which `_weighted_choice` expects
            samples = sorted(samples)
            #print(samples)
            print("tuple of samples:", tuple(self.model.score(w, context) for w in samples))
            return self._weighted_choice(
                samples,
                tuple(self.model.score(w, context) for w in samples),
                random_generator,
            )

        # We build up text one word at a time using the preceding context.
        generated = []
        for _ in range(num_missing):
            generated.append(
                self.fill_missing(
                    num_words=1,
                    text_seed=text_seed + generated,
                    random_seed=random_generator,
                )
            )
        return generated
        

In [26]:
#TBD This is just a test. Remove this later
def _random_generator(seed_or_generator):
        if isinstance(seed_or_generator, random.Random):
            return seed_or_generator
        return random.Random(seed_or_generator)
    
def _weighted_choice(population, weights, random_generator=None):
    """Like random.choice, but with weights.
    Heavily inspired by python 3.6 `random.choices`.
    """
    if not population:
        raise ValueError("Can't choose from empty population")
    if len(population) != len(weights):
        raise ValueError("The number of weights does not match the population")
    cum_weights = list(accumulate(weights))
    
    total = cum_weights[-1]
    threshold = random_generator.random()
    threshold =0.99
    
    print("cum_weights:", cum_weights, "total:",total, "threshold:", threshold)
    
    return population[bisect(cum_weights, total * threshold)]

random_generator = _random_generator(80)
random_generator.random()
tuple1= ('A', 'B', 'C', 'D')
tuple2= (0.2, 0.1, 0.6, 0.6)

_weighted_choice(tuple1, tuple2, random_generator,)


cum_weights: [0.2, 0.30000000000000004, 0.9, 1.5] total: 1.5 threshold: 0.99


'D'

In [27]:
#Tokenize the text

# If we need to generate ngrams from it from r to l text, ngrams would be in opposite direction, so
# use reversed text to generate tokenized_text (l to r) and regular text to generate reverse_tokenized_text (r to l)
    
tokenized_text = list(df_train_x[df_train_x.l_to_r_text!=''].l_to_r_text.apply(word_tokenize))
reverse_tokenized_text = list(df_train_x_rev[df_train_x_rev.reversed_text!=''].reversed_text.apply(word_tokenize))


#print("tokenized_text:",tokenized_text)
#print("Rev tokenized_text:",reverse_tokenized_text)

In [28]:
# Preprocess the tokenized text for n-grams language modeling

import array as arr

model_name_list = ["MLE","KneserNeyInterpolated", "Laplace", "Lidstone","StupidBackoff", "WittenBellInterpolated"]

train_data_list_fwd_unigram = [None,None, None, None, None,None]
padded_sents_list_fwd_unigram = [None,None, None, None, None,None]
train_data_list_rev_unigram = [None,None, None, None, None,None]
padded_sents_list_rev_unigram = [None,None, None, None, None,None]

train_data_list_fwd_bigram = [None,None, None, None, None,None]
padded_sents_list_fwd_bigram = [None,None, None, None, None,None]
train_data_list_rev_bigram = [None,None, None, None, None,None]
padded_sents_list_rev_bigram = [None,None, None, None, None,None]

train_data_list_fwd_trigram = [None,None, None, None, None,None]
padded_sents_list_fwd_trigram = [None,None, None, None, None,None]
train_data_list_rev_trigram = [None,None, None, None, None,None]
padded_sents_list_rev_trigram = [None,None, None, None, None,None]

train_data_list_fwd_quadgram = [None,None, None, None, None,None]
padded_sents_list_fwd_quadgram = [None,None, None, None, None,None]
train_data_list_rev_quadgram = [None,None, None, None, None,None]
padded_sents_list_rev_quadgram = [None,None, None, None, None,None]

train_data_list_fwd_pentagram = [None,None, None, None, None,None]
padded_sents_list_fwd_pentagram = [None,None, None, None, None,None]
train_data_list_rev_pentagram = [None,None, None, None, None,None]
padded_sents_list_rev_pentagram = [None,None, None, None, None,None]

train_data_list_fwd_hexagram = [None,None, None, None, None,None]
padded_sents_list_fwd_hexagram = [None,None, None, None, None,None]
train_data_list_rev_hexagram = [None,None, None, None, None,None]
padded_sents_list_rev_hexagram = [None,None, None, None, None,None]

train_data_list_fwd_septagram = [None,None, None, None, None,None]
padded_sents_list_fwd_septagram = [None,None, None, None, None,None]
train_data_list_rev_septagram = [None,None, None, None, None,None]
padded_sents_list_rev_septagram = [None,None, None, None, None,None]

train_data_rev_list = [None,None, None, None, None,None]
padded_sents_rev_list = [None,None, None, None, None,None]


for index in range (0,6):

    train_data_list_fwd_unigram[index], padded_sents_list_fwd_unigram[index] = padded_everygram_pipeline(1, tokenized_text)
    train_data_list_rev_unigram[index], padded_sents_list_rev_unigram[index] = padded_everygram_pipeline(1, reverse_tokenized_text)
    
    train_data_list_fwd_bigram[index], padded_sents_list_fwd_bigram[index] = padded_everygram_pipeline(2, tokenized_text)
    train_data_list_rev_bigram[index], padded_sents_list_rev_bigram[index] = padded_everygram_pipeline(2, reverse_tokenized_text)
    
    train_data_list_fwd_trigram[index], padded_sents_list_fwd_trigram[index] = padded_everygram_pipeline(3, tokenized_text)
    train_data_list_rev_trigram[index], padded_sents_list_rev_trigram[index] = padded_everygram_pipeline(3, reverse_tokenized_text)
    
    train_data_list_fwd_quadgram[index], padded_sents_list_fwd_quadgram[index] = padded_everygram_pipeline(4, tokenized_text)
    train_data_list_rev_quadgram[index], padded_sents_list_rev_quadgram[index] = padded_everygram_pipeline(4, reverse_tokenized_text)

    train_data_list_fwd_pentagram[index], padded_sents_list_fwd_pentagram[index] = padded_everygram_pipeline(5, tokenized_text)
    train_data_list_rev_pentagram[index], padded_sents_list_rev_pentagram[index] = padded_everygram_pipeline(5, reverse_tokenized_text)

    train_data_list_fwd_hexagram[index], padded_sents_list_fwd_hexagram[index] = padded_everygram_pipeline(6, tokenized_text)
    train_data_list_rev_hexagram[index], padded_sents_list_rev_hexagram[index] = padded_everygram_pipeline(6, reverse_tokenized_text)
    
    train_data_list_fwd_septagram[index], padded_sents_list_fwd_septagram[index] = padded_everygram_pipeline(7, tokenized_text)
    train_data_list_rev_septagram[index], padded_sents_list_rev_septagram[index] = padded_everygram_pipeline(7, reverse_tokenized_text)

    
print_train_data_details= False
#If you iterate through this, the iterator is done with and model 
# fitting won't work subsequently
# so set print_train_data_details= False before trying the actual model

# Example
if(print_train_data_details):
    for ngramlize_sent in train_data_list_fwd_quadgram[0]:
        print(list(ngramlize_sent))
        print()
    print('#############')
    list(padded_sents_list_fwd_quadgram[0])
    
if(print_train_data_details):
    for ngramlize_sent in train_data_list_rev_quadgram[0]:
        print(list(ngramlize_sent))
        print()
    print('#############')
    list(padded_sents_list_rev_quadgram[0])
    

In [29]:
# Train Unigram, Bigram, Trigram, Quadgram, Pentagram, Hexagram Models for both fwd text and reverse tex with the following
# models. Ignoring AbsoluteDiscountingInterpolated model
from nltk.lm.models import MLE
from nltk.lm.models import AbsoluteDiscountingInterpolated
from nltk.lm.models import KneserNeyInterpolated
from nltk.lm.models import Laplace
from nltk.lm.models import Lidstone
from nltk.lm.models import StupidBackoff
from nltk.lm.models import WittenBellInterpolated


model_MLE_list_fwd = []
model_KneserNeyInterpolated_list_fwd = []
model_Laplace_list_fwd = []
model_Lidstone_list_fwd = []
model_StupidBackoff_list_fwd = []
model_WittenBellInterpolated_list_fwd= []


model_MLE_list_rev = []
model_KneserNeyInterpolated_list_rev = []
model_Laplace_list_rev = []
model_Lidstone_list_rev = []
model_StupidBackoff_list_rev = []
model_WittenBellInterpolated_list_rev= []

for index in range(1, 8):
    model_MLE_list_fwd.append(MLE(index))
    model_KneserNeyInterpolated_list_fwd.append(KneserNeyInterpolated(index))
    model_Laplace_list_fwd.append(Laplace(index))
    model_Lidstone_list_fwd.append(Lidstone(index, index))
    model_StupidBackoff_list_fwd.append(StupidBackoff(index, index))
    model_WittenBellInterpolated_list_fwd.append(WittenBellInterpolated(index))
    
    
for index in range(1, 8):
    model_MLE_list_rev.append(MLE(index))
    model_KneserNeyInterpolated_list_rev.append(KneserNeyInterpolated(index))
    model_Laplace_list_rev.append(Laplace(index))
    model_Lidstone_list_rev.append(Lidstone(index, index))
    model_StupidBackoff_list_rev.append(StupidBackoff(index, index))
    model_WittenBellInterpolated_list_rev.append(WittenBellInterpolated(index))
    
models_list_fwd_unigram = [model_MLE_list_fwd[0] ,model_KneserNeyInterpolated_list_fwd[0] ,model_Laplace_list_fwd[0] , model_Lidstone_list_fwd[0] , model_StupidBackoff_list_fwd[0],model_WittenBellInterpolated_list_fwd[0]]
models_list_rev_unigram = [model_MLE_list_rev[0] ,model_KneserNeyInterpolated_list_rev[0] ,model_Laplace_list_rev[0] , model_Lidstone_list_rev[0] , model_StupidBackoff_list_rev[0], model_WittenBellInterpolated_list_rev[0]]

models_list_fwd_bigram = [model_MLE_list_fwd[1] ,model_KneserNeyInterpolated_list_fwd[1] ,model_Laplace_list_fwd[1] , model_Lidstone_list_fwd[1] , model_StupidBackoff_list_fwd[1],model_WittenBellInterpolated_list_fwd[1]]
models_list_rev_bigram = [model_MLE_list_rev[1] ,model_KneserNeyInterpolated_list_rev[1] ,model_Laplace_list_rev[1] , model_Lidstone_list_rev[1] , model_StupidBackoff_list_rev[1], model_WittenBellInterpolated_list_rev[1]]

models_list_fwd_trigram = [model_MLE_list_fwd[2] ,model_KneserNeyInterpolated_list_fwd[2] ,model_Laplace_list_fwd[2] , model_Lidstone_list_fwd[2] , model_StupidBackoff_list_fwd[2],model_WittenBellInterpolated_list_fwd[2]]
models_list_rev_trigram = [model_MLE_list_rev[2] ,model_KneserNeyInterpolated_list_rev[2] ,model_Laplace_list_rev[2] , model_Lidstone_list_rev[2] , model_StupidBackoff_list_rev[2],model_WittenBellInterpolated_list_rev[2]]

models_list_fwd_quadgram = [model_MLE_list_fwd[3] ,model_KneserNeyInterpolated_list_fwd[3] ,model_Laplace_list_fwd[3] , model_Lidstone_list_fwd[3] , model_StupidBackoff_list_fwd[3],model_WittenBellInterpolated_list_fwd[3]]
models_list_rev_quadgram = [model_MLE_list_rev[3] ,model_KneserNeyInterpolated_list_rev[3] ,model_Laplace_list_rev[3] , model_Lidstone_list_rev[3] , model_StupidBackoff_list_rev[3],model_WittenBellInterpolated_list_rev[3]]

models_list_fwd_pentagram = [model_MLE_list_fwd[4] ,model_KneserNeyInterpolated_list_fwd[4] ,model_Laplace_list_fwd[4] , model_Lidstone_list_fwd[4] , model_StupidBackoff_list_fwd[4],model_WittenBellInterpolated_list_fwd[4]]
models_list_rev_pentagram = [model_MLE_list_rev[4] ,model_KneserNeyInterpolated_list_rev[4] ,model_Laplace_list_rev[4] , model_Lidstone_list_rev[4] , model_StupidBackoff_list_rev[4],model_WittenBellInterpolated_list_rev[4]]

models_list_fwd_hexagram = [model_MLE_list_fwd[5] ,model_KneserNeyInterpolated_list_fwd[5] ,model_Laplace_list_fwd[4] , model_Lidstone_list_fwd[5] , model_StupidBackoff_list_fwd[5],model_WittenBellInterpolated_list_fwd[5]]
models_list_rev_hexagram = [model_MLE_list_rev[5] ,model_KneserNeyInterpolated_list_rev[5] ,model_Laplace_list_rev[4] , model_Lidstone_list_rev[5] , model_StupidBackoff_list_rev[5],model_WittenBellInterpolated_list_rev[5]]

models_list_fwd_septagram = [model_MLE_list_fwd[6] ,model_KneserNeyInterpolated_list_fwd[6] ,model_Laplace_list_fwd[6] , model_Lidstone_list_fwd[6] , model_StupidBackoff_list_fwd[6],model_WittenBellInterpolated_list_fwd[6]]
models_list_rev_septagram = [model_MLE_list_rev[6] ,model_KneserNeyInterpolated_list_rev[6] ,model_Laplace_list_rev[6] , model_Lidstone_list_rev[6] , model_StupidBackoff_list_rev[6],model_WittenBellInterpolated_list_rev[6]]



In [30]:
def fit_and_train_models(name, models_list, train_data_list,padded_sents_list):
    for index in range (0,len(models_list)):
        models_list[index].fit(train_data_list[index], padded_sents_list[index])
        print("Fit:", name, model_name_list[index],"Order:", models_list[index].order, models_list[index].vocab)

In [31]:
fit_and_train_models("Fwd Unigram Model:", models_list_fwd_unigram , train_data_list_fwd_unigram,padded_sents_list_fwd_unigram)
fit_and_train_models("Rev Unigram Model:", models_list_rev_unigram , train_data_list_rev_unigram,padded_sents_list_rev_unigram)

fit_and_train_models("Fwd Bigram Model:", models_list_fwd_bigram , train_data_list_fwd_bigram,padded_sents_list_fwd_bigram)
fit_and_train_models("Rev Bigram Model:", models_list_rev_bigram , train_data_list_rev_bigram,padded_sents_list_rev_bigram)

fit_and_train_models("Fwd Trigram Model:", models_list_fwd_trigram , train_data_list_fwd_trigram,padded_sents_list_fwd_trigram)
fit_and_train_models("Rev Trigram Model:", models_list_rev_trigram , train_data_list_rev_trigram,padded_sents_list_rev_trigram)

fit_and_train_models("Fwd Quadgram Model:", models_list_fwd_quadgram , train_data_list_fwd_quadgram,padded_sents_list_fwd_quadgram)
fit_and_train_models("Rev Quadgram Model:", models_list_rev_quadgram , train_data_list_rev_quadgram,padded_sents_list_rev_quadgram)

fit_and_train_models("Fwd Pentagram Model:", models_list_fwd_pentagram , train_data_list_fwd_pentagram,padded_sents_list_fwd_pentagram)
fit_and_train_models("Rev Pentagram Model:", models_list_rev_pentagram , train_data_list_rev_pentagram,padded_sents_list_rev_pentagram)

fit_and_train_models("Fwd Hexagram Model:", models_list_fwd_hexagram , train_data_list_fwd_hexagram,padded_sents_list_fwd_hexagram)
fit_and_train_models("Rev Hexagram Model:", models_list_rev_hexagram , train_data_list_rev_hexagram,padded_sents_list_rev_hexagram)

fit_and_train_models("Fwd Septagram Model:", models_list_fwd_septagram , train_data_list_fwd_septagram,padded_sents_list_fwd_septagram)
fit_and_train_models("Rev Septagram Model:", models_list_rev_septagram , train_data_list_rev_septagram,padded_sents_list_rev_septagram)



Fit: Fwd Unigram Model: MLE Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 580 items>
Fit: Fwd Unigram Model: KneserNeyInterpolated Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 580 items>
Fit: Fwd Unigram Model: Laplace Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 580 items>
Fit: Fwd Unigram Model: Lidstone Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 580 items>
Fit: Fwd Unigram Model: StupidBackoff Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 580 items>
Fit: Fwd Unigram Model: WittenBellInterpolated Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 580 items>
Fit: Rev Unigram Model: MLE Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 580 items>
Fit: Rev Unigram Model: KneserNeyInterpolated Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 580 items>
Fit: Rev Unigram Model: Laplace Order: 1 <Vocabulary with cutoff=1 unk_label='<UNK>' and 580 items>
Fit: Rev Unigram Model: Lidstone Order: 1 <Vocabulary with

Fit: Rev Septagram Model: MLE Order: 7 <Vocabulary with cutoff=1 unk_label='<UNK>' and 582 items>
Fit: Rev Septagram Model: KneserNeyInterpolated Order: 7 <Vocabulary with cutoff=1 unk_label='<UNK>' and 582 items>
Fit: Rev Septagram Model: Laplace Order: 7 <Vocabulary with cutoff=1 unk_label='<UNK>' and 582 items>
Fit: Rev Septagram Model: Lidstone Order: 7 <Vocabulary with cutoff=1 unk_label='<UNK>' and 582 items>
Fit: Rev Septagram Model: StupidBackoff Order: 7 <Vocabulary with cutoff=1 unk_label='<UNK>' and 582 items>
Fit: Rev Septagram Model: WittenBellInterpolated Order: 7 <Vocabulary with cutoff=1 unk_label='<UNK>' and 582 items>


## Initial Terminal Character Model

In [32]:
# Build Model for relationship between Initial and Terminal characters
# This can be a bigram model. Pick a reasonably good model
# Remove all characters other than initial and terminal and then tokenize
tokenized_text_temp = list(df_train_x[df_train_x.l_to_r_text!=''].l_to_r_text.apply(word_tokenize))

#print(tokenized_text_temp)
      
tokenized_text_it = []
for i in range(len(tokenized_text_temp)):
    
    l= tokenized_text_temp[i]
    #single character text, ignore it
    if(len(l)>1):
        del l[1:len(l)-1]
        l[0],l[1] = l[1], l[0]  #swap
        tokenized_text_it.append(l)
        
#print(tokenized_text_it)

k=2
model_it_bigram_kn = KneserNeyInterpolated(k) #Bigram model
train_data_it, padded_sents_it = padded_everygram_pipeline(k, tokenized_text_it)


print_train_data_details= False
#If you iterate through this, the iterator is done with and model 
# fitting won't work subsequently
# so set print_train_data_details= False before trying the actual model

if(print_train_data_details):
    for ngramlize_sent in train_data_it:
        print(list(ngramlize_sent))
        print()
    print('#############')
    list(padded_sents_it)
    
model_it_bigram_kn.fit(train_data_it, padded_sents_it)
    
print(model_it_bigram_kn.vocab)
print(model_it_bigram_kn.counts)
print(model_it_bigram_kn.generate(1, ['804'], 8))
print(model_it_bigram_kn.generate(1, ['621'], 8))

model_wf_it = ModelWithFilling(model_it_bigram_kn, posNgramModel, True)

print(model_wf_it.generate_sent(1, ['621'],8, 10))

<Vocabulary with cutoff=1 unk_label='<UNK>' and 440 items>
<NgramCounter with 2 ngram orders and 12992 ngrams>
219
456
4 5 6


In [33]:
#Check one of the models and play with it
k=4
model_kn = KneserNeyInterpolated(k) #Quadgram model
model_kn_rev = KneserNeyInterpolated(k) # Reverse string, Quadgram model

train_data, padded_sents = padded_everygram_pipeline(k, tokenized_text)
train_data_rev, padded_sents_rev = padded_everygram_pipeline(k, reverse_tokenized_text)

model_kn.fit(train_data, padded_sents)
print(model_kn.vocab)

model_kn_rev.fit(train_data_rev, padded_sents_rev)
print(model_kn_rev.vocab)
      
print(model_kn.vocab.lookup(tokenized_text[0]))
print(model_kn.counts)

text_seed = '390'
num_words=1
context = ['390']

#pickle the model
pickle.dump(model_kn, open('kn_quad_model.pkl', 'wb'))
pickle.dump(model_kn_rev, open('kn_rev_quad_model.pkl', 'wb'))

#'l_to_r_text' : "634 368 002 061 717 390"
    
print("count of 390:", model_kn.counts['390'])
print(model_kn.counts[['717']]['390'])
print(model_kn.counts[['368', '002']]['061'])
print(model_kn.counts[['002', '061']]['717'])
print(model_kn.score('390'))
# lm.score("b", ["a"]) what is the chance that “b” is preceded by “a”.
print("---", model_kn.score('390', ['717']))

print(model_kn.score('390', '717'.split()))  # P('390'|'717) Given 717 occurs what is the prob of 390
print(model_kn.score('061', '717'.split()))  # P('740'|'390)
print(model_kn.score('368', '002 061'.split()))  # P('368|'002 061')
print(model_kn.score('002', '0061 717'.split()))

print("Entropy and Perplexity")

test = [('634', '368'), ('002', '061')]
print(model_kn.entropy(test))
print(model_kn.perplexity(test))

<Vocabulary with cutoff=1 unk_label='<UNK>' and 582 items>
<Vocabulary with cutoff=1 unk_label='<UNK>' and 582 items>
('806', '740', '045')
<NgramCounter with 4 ngram orders and 68244 ngrams>
count of 390: 158
1
1
1
0.014576374126935925
--- 0.034467051320983905
0.034467051320983905
0.00035428687114080374
3.2700663363947176e-05
0.0358605796808044
Entropy and Perplexity
2.7058475203720582
6.524410339963913


In [34]:
print(models_list_fwd_pentagram[0].vocab)
print(models_list_fwd_pentagram[0].counts)
print(models_list_fwd_pentagram[0].order)
print("count of 390:", models_list_fwd_pentagram[0].counts['390'])
print("count of 850:", models_list_fwd_pentagram[0].counts['850'])

print_train_data_details= False
#If you iterate through this, the iterator is done with and model 
# fitting won't work subsequently
# so set print_train_data_details= False before trying the actual model

if(print_train_data_details):
    for ngramlize_sent in train_data_list_fwd_pentagram[0]:
        print(list(ngramlize_sent))
        print()
    print('#############')
    list(padded_sents_list_fwd_pentagram[0])

<Vocabulary with cutoff=1 unk_label='<UNK>' and 582 items>
<NgramCounter with 5 ngram orders and 100020 ngrams>
5
count of 390: 158
count of 850: 20


## Test the ModelWithFilling

In [35]:
model_wf = ModelWithFilling(model_kn, posNgramModel, True) # Fix this later. Need to create Position Model first

#print(model_wf.generate(1, ['061','002', '368'],42))

print(model_wf.generate(1, ['867'],42, 10))

#model_wf.fill_missing(1, ['061','002', '368'],42)

#input_list =list['061','_','368']

samples = model_kn.context_counts(model_kn.vocab.lookup(['899']))
print(samples)


Returning Positional Prob for pos: 10 : 422 for text_seed: ['867']
422
<FreqDist with 0 samples and 0 outcomes>


In [36]:
# L to R: 850 092 741 838 798 740 621 
input_list = ['_' ,'092', '741', '838']

#model_wf.get_missing(1,input_list ,42)

In [37]:
def pack_list(first_param, second_param, third_param):
    packed_list = []
    
    if(third_param!=-1):
        packed_list.append(third_param)
        
    if(second_param!=-1):
        packed_list.append(second_param)
        
    if(first_param!=-1):
        packed_list.append(first_param)
  
    return packed_list

In [38]:
def get_list_token(initial, j, list_tokens, index_unclear_signs, k,l,m):
    one_before=-1
    two_before=-1
    three_before=-1
    param =[]
    last_token_index = len(list_tokens)-1 
    #print(terminal, j, list_tokens, index_unclear_signs, k,l,m)
    
    try:
        if(initial):     
            # You need to send reverse of the characters
            if(index_unclear_signs[j]+k<=last_token_index):
                one_before = list_tokens[index_unclear_signs[j]+k]

            if(index_unclear_signs[j]+l<=last_token_index):
                two_before = list_tokens[index_unclear_signs[j]+l]
            
            if(index_unclear_signs[j]+m<=last_token_index):
                three_before = list_tokens[index_unclear_signs[j]+m]
                
        else:
            if(index_unclear_signs[j]+k>=0):
                one_before= list_tokens[index_unclear_signs[j]+k]

            if(index_unclear_signs[j]+l>=0):
                two_before = list_tokens[index_unclear_signs[j]+l]
                
            if(index_unclear_signs[j]+m>=0):
                three_before = list_tokens[index_unclear_signs[j]+m]


        param = pack_list(one_before, two_before, three_before)
        
    except Exception as e:
        print("Exception:", e.__class__, "occurred in get_list_token.")
    return param

In [39]:
def get_char_by_position(this_text, position_type):
    char_list = []
    # first split the string into chars
    chars = this_text.split(' ') 
    if(position_type == "Initial"):
        char_list.append(chars[0])
    elif(position_type == "Terminal"):
        char_list.append(chars[len(chars)-1])
    return char_list

#test
print(get_char_by_position('850 000 741 838 798 740 621', "Terminal"))


['621']


In [40]:
print(model_it_bigram_kn.generate(1,get_char_by_position('850 000 741 838 798 740 621', "Terminal"), seed ))
print(model_it_bigram_kn.generate(1,['621'], seed ))

print(get_char_by_position('231 233 804', "Terminal"))

print(model_it_bigram_kn.generate(1,get_char_by_position('231 233 804', "Terminal"), seed ))

456
456
['804']
219


In [41]:
def get_perplexity(model1, k, this_texts, this_tokenized_text,seed):
    try:
        model_wf_1 = ModelWithFilling(model1,posNgramModel, None)
        this_data_list = []
        this_data_list, _= padded_everygram_pipeline(k, this_tokenized_text)
        for i, test in enumerate(this_data_list):
            print("Perplexity( {0}):{1}".format(this_texts[i], model_wf_1.get_model().perplexity(test)))
    except Exception as e:
                print("Exception:", e.__class__, "get_perplexity")
                traceback.print_exc()
                

In [42]:
def find_unclear_characters(model1, model2, a, fill_using_position, seed):
    #For each of the text with unclear character go through it
    
    ans=-1
    unclear_chars = []
    first_unclear=0
    
    model_wf_1 = ModelWithFilling(model1,posNgramModel, fill_using_position)
    model_wf_2 = ModelWithFilling(model2,posNgramModel, fill_using_position)
    term = None
    rank_algo_1=-1
    rank_algo_2=-1

    
    for text in a :
        try:
            #print("text:", text)
            #Identify the position of the unclear text and get its immediate neighbors
            
            #list_tokens = nltk.word_tokenize(text)
            list_tokens = text.split(' ')
            
            last_token_index = len(list_tokens)-1  
            index_unclear_signs = []
            param =[]
            param1, param2 = [],[]
            
            for i in range(0, last_token_index+1) :
                    if(list_tokens[i]=='000'):
                        index_unclear_signs.append(i)
                        first_unclear=i

            position_unclear_char = len(list_tokens)- first_unclear
            
            #if(verbose_debug): 
                #print("Text:Index of Unclear signs:", str(text) + ": " + str(index_unclear_signs), "position (RtoL):", position_unclear_char)

            """assuming one unclear sign in a text. TBD: Extend this later to more than one unclear sign"""
            j=0
            rank_algo_1,rank_algo_2=0,0
            
            if(index_unclear_signs[j]==1):
                """Initial char is missing"""
                
                if(verbose_debug): print("L to R: Initial char is unclear", text)
                param = get_list_token(True, j, list_tokens, index_unclear_signs, 1,2,3)
                 
                if(use_initial_terminal_model):
                    """ Get the Terminal character as a list
                        Pass it to the Initial Terminal Model """
                    term = get_char_by_position(text, "Terminal")
                    ans = model_it_bigram_kn.generate(1,get_char_by_position(text, "Terminal"), seed ) 
                    #print("Text:", text, "Terminal Char:", get_char_by_position(text, "Terminal"), "Answer:", ans)
                else:
                    """This is the algo_1"""
                    #ans_algo_1, rank_algo_1 = model_wf_1.find_char_by_rank(text,random_seed=seed, position=position_unclear_char)
                    #print("Char and rank using algo-1", ans_algo_1, rank_algo_1)
                    
                    #Use fwd model
                    ans_algo_1, rank_algo_1, order_of_match_algo_1 = model_wf_1.get_end_chars(text, random_seed=seed)
                    if(verbose_debug): print("Char, score, order of match using algo-1", ans_algo_1, rank_algo_1, order_of_match_algo_1)

                    """ This is the algo_2"""
                    if(verbose_debug): print( "Sending: ", param , " to generate next char from second model")

                    #Use reverse model
                    ans1, dict1, context = model_wf_2.generate_sent0(1, param , random_seed=seed, position=position_unclear_char)
                    order_of_match_algo_2 = len(context)
                    ans_algo_2, rank_algo_2 = model_wf_2.get_char_and_rank(dict1, None)

                    if(verbose_debug): print("Char, score, order of match using algo-2", ans, rank_algo_2, order_of_match_algo_2)

                    
                if(order_of_match_algo_1>=order_of_match_algo_2):
                    ans = ans_algo_1
                    rank = rank_algo_1
                else:
                    print("Initial: !!!!!! algo_2 is better than algo_1")
                    ans= ans_algo_2
                    rank= rank_algo_2      
                    
                if(verbose_debug): print("Answer:", ans)
                unclear_chars.append(ans)

            elif(index_unclear_signs[j]==last_token_index-1):
                """Terminal char is missing"""
                ans_algo_1, rank_algo_1, order_of_match_algo_1= "",-1,-1
                ans_algo_2, rank_algo_2, order_of_match_algo_2 = "", -1, -1
                
                param = get_list_token(False, j, list_tokens, index_unclear_signs, -1,-2,-3)
                if(verbose_debug): print("L to R: Terminal char is unclear", text)
                
                """This is the algo_1"""
                ans_algo_1, rank_algo_1, order_of_match_algo_1 = model_wf_1.get_end_chars(text, random_seed=seed)
                if(verbose_debug): print("Char, score, order of match using algo-1", ans_algo_1, rank_algo_1, order_of_match_algo_1)

                try:
                    #ans = model_wf_1.generate_sent(1, param, random_seed=seed, position=position_unclear_char)
                    if(verbose_debug): print("Sending: ", param , " to generate next char from first model")
                    
                    ans1, dict1, context = model_wf_1.generate_sent0(1, param , random_seed=seed, position=position_unclear_char)
                    order_of_match_algo_2 = len(context)
                    ans_algo_2, rank_algo_2 = model_wf_1.get_char_and_rank(dict1, None)

                    if(verbose_debug): print("Char, score, order of match using algo-2", ans, rank_algo_2, order_of_match_algo_2)

                except Exception as e:
                    print("Exception:", e.__class__, "find_unclear_characters:generate_sent0")
                
                if(order_of_match_algo_1>=order_of_match_algo_2):
                    ans = ans_algo_1
                    rank = rank_algo_1
                else:
                    print("Terminal: !!!!!! algo_2 is better than algo_1")
                    ans= ans_algo_2
                    rank= rank_algo_2    

                if(verbose_debug): print("Answer:", ans)
                unclear_chars.append(ans)

            else:

                #Not proceeding if more than one char is unclear
                if(len(index_unclear_signs)>1):
                    #print("Many chars are unclear in the text, not able to decipher the text. Moving on ...\n")
                    continue
                    
                """Medial char is missing"""
                
                if(verbose_debug): print("L to R: One of the middle char is unclear", text)
                
                #print("Finding char by algo-1")
                
                ans_algo_1, rank_algo_1 = model_wf_1.find_char_by_rank(text,random_seed=seed, position=position_unclear_char)
              
                """Max of Trigram match in algo_1"""
                context_length_algo1=3
                #print("Finding char by algo-2")
                
                param1 = get_list_token(False,j, list_tokens, index_unclear_signs, -1,-2,-3)
                if(verbose_debug): print("L to R - part-1:", text, "Sending: ", param1 , " to generate next char from first model")
                ans1, dict1, context1 = model_wf_1.generate_sent0(1, param1, random_seed=seed, position=position_unclear_char)           
                
                param2 = get_list_token(True,j, list_tokens, index_unclear_signs, 1,2,3)
                if(verbose_debug): print("L to R - part-2:", text, "Sending: ", param2 , " to generate next char from second model")
                ans2, dict2, context2 = model_wf_2.generate_sent0(1, param2, random_seed=seed, position=position_unclear_char)

                ans_algo_2, rank_algo_2 = model_wf_1.get_char_and_rank(dict1, dict2)
               
                if(len(context1)>len(context2)):
                    ans_algo2=ans1
                    context_algo2 = context1
                else:
                    ans_algo2=ans2
                    context_algo2 = context2
                
                if(context_length_algo1>len(context_algo2)):
                    ans = ans_algo_1
                    rank = rank_algo_1
                else:
                    if(verbose_debug):print("!!!!!! algo_2 is better than algo_1",rank_algo_1,rank_algo_2,context1,context2 )
                    ans= ans_algo_2
                    rank= rank_algo_2
                
                #TBD remove this
                #ans = ans_algo_1
                
                if(verbose_debug): print("Answer:", ans, rank)
                unclear_chars.append(ans)
        except Exception as e:
            if(verbose_debug): 
                print("Exception:", e.__class__, "find_unclear_characters.")
                traceback.print_exc()
            
    return  unclear_chars


In [43]:
def get_group_for_sign(id_sign):
    for graph in orig_sign_df[orig_sign_df.id_sign==id_sign].graph :
        return(graph)

In [44]:
def add_answers(text,answer_list, answer, type_unclear_char):
    dict_row = {'text':text, 'len_text':len(text), 'answer':answer, 'type': type_unclear_char}
    answer_list.append(dict_row)

In [45]:
def add_wrong_answers(wrong_answer_list, text, predicted_answer,predicted_answer_group, correct_answer,correct_answer_group, type_unclear_char):
    chars = text.split(' ')
    dict_row = {'text':text, 'len_text':len(chars),'pred_answer':predicted_answer, 'pred_answer_group': predicted_answer_group, 'correct_answer': correct_answer,'correct_answer_group': correct_answer_group, 'type': type_unclear_char}
    wrong_answer_list.append(dict_row)
    

In [46]:
def check_answers(ans, test_correct_answers):

    try:
        beg_hit,ter_hit, med_hit,total_hit=0,0,0,0
        beg_hit_category,ter_hit_category, med_hit_category,total_hit_category=0,0,0,0
        
        beg_count,ter_count,med_count,total_count=0,0,0,0
        
        beg_percent,ter_percent,med_percent,total_percent=0,0,0,0
        beg_percent,ter_percent_category,med_percent_category,total_percent_category=0,0,0,0
        
        wrong_answer_list = []
        
        i=0
        for answers in ans:
            correct_ans = test_correct_answers[i].get('answer')
            correct_ans_type = test_correct_answers[i].get('type')
            correct_ans_text = test_correct_answers[i].get('text')
            
            this_ans =answers.replace(" ", "")
            type_unclear_char = correct_ans_type
            
            # Full Match
            if(this_ans==correct_ans):
                if(type_unclear_char==CONST_INITIAL):
                    beg_hit= beg_hit+1
                    beg_count= beg_count+1
                elif(type_unclear_char==CONST_TERMINAL):
                    ter_hit= ter_hit+1
                    ter_count= ter_count+1
                elif(type_unclear_char==CONST_MEDIAL):
                    med_hit= med_hit+1
                    med_count= med_count+1
                    #TBD Remove these
                    #print("Medial match: this_ans, correct_ans", this_ans,correct_ans)
                    
                total_hit=total_hit+1
                total_count= total_count+1
                
            # Only a Category match
            elif(get_group_for_sign(this_ans)==get_group_for_sign(correct_ans)):
                if(type_unclear_char==CONST_INITIAL):
                    beg_hit_category= beg_hit_category+1
                    beg_count= beg_count+1
                elif(type_unclear_char==CONST_TERMINAL):
                    ter_hit_category= ter_hit_category+1
                    ter_count= ter_count+1
                elif(type_unclear_char==CONST_MEDIAL):
                    med_hit_category= med_hit_category+1
                    med_count= med_count+1
                    #TBD Remove these
                    #print("Medial Category match: this_ans, correct_ans", this_ans,correct_ans)
                    
                total_hit_category=total_hit_category+1
                total_count= total_count+1
                
                # Category match is still wrong, so add it to wrong answers
                add_wrong_answers(wrong_answer_list, correct_ans_text, this_ans, get_group_for_sign(this_ans), correct_ans, get_group_for_sign(correct_ans),type_unclear_char)
            
            #Not any match
            else: 
                if(type_unclear_char==CONST_INITIAL):
                    beg_count= beg_count+1
                elif(type_unclear_char==CONST_TERMINAL):
                    ter_count= ter_count+1
                elif(type_unclear_char==CONST_MEDIAL):
                    med_count= med_count+1
                    
                total_count= total_count+1
                
                add_wrong_answers(wrong_answer_list, correct_ans_text, this_ans, get_group_for_sign(this_ans), correct_ans, get_group_for_sign(correct_ans),type_unclear_char)
                
            i=i+1
        
        if(beg_count>0):
            beg_percent = (beg_hit/beg_count)*100
        else:
            beg_percent = -1
            
        if(beg_count>0):
            beg_percent_category = (beg_hit_category/beg_count)*100
        else:
            beg_percent_category  = -1
            
        
        if(ter_count>0):
            ter_percent = (ter_hit/ter_count)*100
        else:
            ter_percent = -1
            
        if(ter_count>0):
            ter_percent_category = (ter_hit_category/ter_count)*100
        else:
            ter_percent_category = -1
            
            
        if(med_count>0):
            med_percent = (med_hit/med_count)*100
        else:
            med_percent = -1
            
        if(med_count>0):
            med_percent_category  = (med_hit_category /med_count)*100
        else:
            med_percent_category  = -1
            
            
        if(total_count>0):
            total_percent = (total_hit/total_count)*100
        else:
            total_percent = -1
            
        if(total_count>0):
            total_percent_category  = (total_hit_category /total_count)*100
        else:
            total_percent_category  = -1
            
        
    except Exception as e:
            print("Exception:", e.__class__, "in check_answers")
            traceback.print_exc()

    return beg_percent,ter_percent,med_percent,total_percent, beg_percent_category,ter_percent_category,med_percent_category,total_percent_category,wrong_answer_list


In [47]:
def reverse_single_text(text):
    list_reversed_text = []
    # Tokenize to words
    # first split the string into chars
    chars = text.split(' ')

    # then reverse the split string list and join with a space
    reversed_text = ' '.join(reversed(chars))
    return reversed_text


In [48]:
def reverse_text(a):
    list_reversed_text = []
    for text in a :
        # Tokenize to words
        # first split the string into chars
        chars = text.split(' ')

        # then reverse the split string list and join with a space
        reversed_text = ' '.join(reversed(chars))
        list_reversed_text.append(reversed_text)
    return list_reversed_text 
    

## TESTING
1. Test 1: Use random sample of Training data but one character is made unclear: Take a few samples from Training data, make some characters as 000 (unclear). Do it in begining, terminal and medial regions of the text and see if the model is able to figure out the unclear characters
2. Test 2: Use Testing data: Take the Testing data and make some charcters as 000 (unclear). Do it in begining, terminal and medial regions of the text to test and see if the model is able to figure out the unclear characters
3. Test 3: Real Unclear texts: Use the actual texts with unclear data and what the model is able to come up with for the unclear characters

## Test 0 - Manual test


In [49]:
model_wf_kn = ModelWithFilling(model_kn,posNgramModel, True)
model_wf_kn_rev = ModelWithFilling(model_kn_rev,posNgramModel, True)
#Initial '327 740 090'
print("Train Data: Initial: 327:", model_wf_kn_rev.generate_sent0(1,['740', '772', '240'],8))

#model_wf_kn.get_model_ngrams_list0("<s>", 2,"left_bigram")

#print("Model Counts:", model_wf_kn.get_model().counts[['<s>']]['803'] )
print("Model Prob:", model_wf_kn.get_model().score('240', '<s> 803'.split()))
print("Model Prob:", model_wf_kn.get_model().score('740', '<s> 803 240 772'.split()))

print("Bigram score:", model_wf_kn.get_the_score("<s> 803"))

print("Trigram score:", model_wf_kn.get_the_score("<s> 803"))

#print("Quadragram score:", model_wf_kn.get_the_score("<s> 803 240 772"))

#print("Compute score:", model_wf_kn.compute_score(["803"], "<s>", "240", "172", "quadragram"))

print("-----Train Data: Initial: Ans:803:")
character, rank = model_wf_kn.find_char_by_rank("000 240 772 740", seed)
print(character, rank)

Train Data: Initial: 327: ('0 0 2', defaultdict(None, {'002': 0.33174186750591245, '060': 0.33052558457333736, '803': 0.33029744223283547}), ['740', '772', '240'])
Model Prob: 0.058166600530281956
Model Prob: 0.9999206144346594
Bigram score: 0.00286865545461898
Trigram score: 0.00286865545461898
-----Train Data: Initial: Ans:803:
803 0.2570503620444673


In [50]:
print("-----Train Data: Initial: Ans:?:")
character, rank = model_wf_kn.find_char_by_rank("000 590 390 741 032 220 440 740", seed)
print(character, rank)

ans1, dict1, context = model_wf_kn_rev.generate_sent0(1,['032', '741', '390', '590'] ,8)
print(ans1, dict1, context)
print(len(context))               

-----Train Data: Initial: Ans:?:
111 0.29375185188011116
0 3 2 defaultdict(None, {'002': 0.16346600686488327, '031': 0.03936164522540927, '032': 0.0867091235771011, '033': 0.040304588935206914, '060': 0.03839857157975135, '070': 0.038362337695202864, '220': 0.039358625735030225, '231': 0.03837441565671903, '233': 0.03840159107013039, '235': 0.04028848498651869, '240': 0.038410649541267515, '741': 0.0810262988984899, '742': 0.03836837667596094, '900': 0.039321385353688725}) ['390', '590']
2


In [51]:
character, score, order_of_match = model_wf_kn.get_end_chars("<s> 000 590 390 741 032 220 440 740 </s>", random_seed=seed)
print(character, score, order_of_match)

111 0.9180850288490738 3


In [52]:
#Check one of the models and play with it
k=7
model_kn_sept = KneserNeyInterpolated(k) #sept model
model_kn_sept_rev = KneserNeyInterpolated(k) # Reverse string, Quadgram model

train_data, padded_sents = padded_everygram_pipeline(k, tokenized_text)
train_data_rev, padded_sents_rev = padded_everygram_pipeline(k, reverse_tokenized_text)

model_kn_sept.fit(train_data, padded_sents)
print(model_kn_sept.vocab)

model_kn_sept_rev.fit(train_data_rev, padded_sents_rev)
print(model_kn_sept_rev.vocab)

print(model_kn_sept.counts)

model_wf_kn_sept = ModelWithFilling(model_kn_sept,posNgramModel, True)
model_wf_kn_sept_rev = ModelWithFilling(model_kn_sept_rev,posNgramModel, True)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 582 items>
<Vocabulary with cutoff=1 unk_label='<UNK>' and 582 items>
<NgramCounter with 7 ngram orders and 181230 ngrams>


In [53]:
# 000 740 045  ->(806)

ans1, dict1, context = model_wf_kn_sept.generate_sent0(1,['806', '740'] ,8)
print(ans1, dict1,context)

ans1, dict1, context = model_wf_kn_sept_rev.generate_sent0(1,['045', '740'] ,8)
print(ans1, dict1, context)

print("Score:", model_wf_kn_sept.get_the_score("806 740 045"))

print("Score:", model_wf_kn_sept.get_the_score("222 740 045"))
#ans1, dict1, context = print(model_wf_kn_sept.generate_sent0(1,['740', '045'],8))
#print(ans1, dict1, context)

print("Model Counts of 6:", model_wf_kn_sept.get_model().counts[6])
print("Model Counts of 5:", model_wf_kn_sept.get_model().counts[5])
print("Model Counts of 4:", model_wf_kn_sept.get_model().counts[4])
print("Model Counts of 3:", model_wf_kn_sept.get_model().counts[3])
print("Model Counts of 2:", model_wf_kn_sept.get_model().counts[2])


#print("Values in Model Counts of 5:",list(model_wf_kn_sept.get_model().counts[5]))

context_text_token_list = ['<s>', '740', '045']
#context_text_token_list = ["806", "740", "045"]
                           
#ngrams_list =  model_wf_kn_sept.get_model_ngrams_list_for_missing_ends(context_text_token_list)
#print(ngrams_list)


# 000 740 045  ->(806)
character, score, order_of_match = model_wf_kn_sept.get_end_chars("<s> 000 740 045 </s>", random_seed=seed)
print(character, score, order_of_match)

#176 740 000 ->(436)
character, score, order_of_match = model_wf_kn_sept.get_end_chars("<s> 176 740 000 </s>", random_seed=seed)
print(character, score, order_of_match)

print("Score1:", model_wf_kn_sept.get_the_score("<s> 176 740 436 </s>"))
print("Score2:", model_wf_kn_sept.get_model().score('</s>', "176 740 436".split()))

0 4 5 defaultdict(None, {'045': 0.45029780141694703}) ['806', '740']
8 0 6 defaultdict(None, {'806': 0.9006630225731349}) ['045', '740']
Score: 0.45029780141694703
Score: 0.00017868085016822365
Model Counts of 6: <ConditionalFreqDist with 9586 conditions>
Model Counts of 5: <ConditionalFreqDist with 7784 conditions>
Model Counts of 4: <ConditionalFreqDist with 5815 conditions>
Model Counts of 3: <ConditionalFreqDist with 3293 conditions>
Model Counts of 2: <ConditionalFreqDist with 581 conditions>
806 0.9999076829638628 5
400 0.9999553360788188 5
Score1: 0.9994576829638628
Score2: 0.9945768296386275


In [54]:
# Medial various tests

#<s> 220 000 740 090 </s> -> (240)
character, rank = model_wf_kn_sept.find_char_by_rank("<s> 220 000 740 090 </s>", seed)
print("ans", character, "rank", rank)
print("Done finding char by Rank")

#Try forward
print("media:240 fwd generate_sent", model_wf_kn_sept.generate_sent(1,['<s>'],8))

context = ['<s>', '220', '240']

samples= model_wf_kn_sept.get_model().context_counts(model_wf_kn_sept.get_model().vocab.lookup(context))
print("Context samples-1",samples)
for w in samples:
    print(w, model_wf_kn_sept.get_model().score(w, context) )


context = ['<s>','090', '740', '240']

samples= model_wf_kn_sept_rev.get_model().context_counts(model_wf_kn_sept_rev.get_model().vocab.lookup(context))
print("Context samples-2", samples)

for w in samples:
    print(w, model_wf_kn_sept.get_model().score(w, context) )

    
#Try forward
ans1, dict1, context = model_wf_kn_sept.generate_sent0(1,['<s>', '220'],8)
print("medial:240 fwd","ans", ans1, "dict1", dict1, "context", context)


#Try reverse

ans1, dict1, context = model_wf_kn_sept_rev.generate_sent0(1,['<s>','090', '740'],8)
print("medial:240 rev","ans", ans1, "dict1", dict1, "context", context)


ans 222 rank 0.3981770972654006
Done finding char by Rank
media:240 fwd generate_sent 1 7 5
Context samples-1 <FreqDist with 1 samples and 1 outcomes>
740 0.9094210066808381
Context samples-2 <FreqDist with 4 samples and 8 outcomes>
235 0.0022249905676871968
220 0.005283314468707728
575 1.7346253301309482e-05
031 5.604174143499987e-05
medial:240 fwd ans 1 4 2 dict1 defaultdict(None, {'017': 0.05139397501130587, '032': 0.052158513699213725, '081': 0.050651348948172165, '100': 0.051396552967161654, '142': 0.050669394639162646, '151': 0.051408153768512675, '176': 0.05356384919395186, '233': 0.05283153495424128, '236': 0.05137850727617117, '240': 0.05284055779973652, '440': 0.0513888190995943, '520': 0.057871373111046544, '550': 0.05067712850673, '705': 0.052126289251016436, '740': 0.05876223163588777, '798': 0.0535574043043124, '840': 0.051396552967161654}) context ['<s>', '220']
medial:240 rev ans 1 0 3 dict1 defaultdict(None, {'032': 0.050549759999816736, '035': 0.05048383181331451, '10

In [55]:
#list(model_wf_kn.get_model().counts[3])

In [56]:
print("-----Testing-------------------")

model_wf_kn = ModelWithFilling(model_kn,posNgramModel, True)
model_wf_kn_rev = ModelWithFilling(model_kn_rev,posNgramModel, True)
    
print("-----RtoL: Terminal char----- Correct Ans: 520")
print("Send three characters")
print(model_wf_kn.generate_sent(1,['060','705', '033'],8))

#ALL OF THIS IS L TO R

#Initial: 741
#741 031 221 400
print("Train Data: Initial: 741:", model_wf_kn_rev.generate_sent(1,['400','221','031'],8))
#-> Not able to find

print("Directly looking")
samples = model_kn_rev.context_counts(model_kn_rev.vocab.lookup(['400','221','031']))
print(samples)
tpl = tuple(model_kn.score(w, context) for w in samples)
for w in samples:
    print(w)
print("tpl:", tpl)

samples = model_kn_rev.context_counts(model_kn_rev.vocab.lookup(['031']))
#print(samples)


#medial: 001
#255 435 690 740 900 001 003 424
print("Train Data: medial: 001:",model_wf_kn.generate_sent(1,['435','690', '740', '900'],8))


#Terminal 740
#321 405 002 806 233 320 920 740
print("Train Data: Terminal 740:", model_wf_kn.generate_sent(1,['233','320', '920'],8))

#Initial: 820
#820 798 740
print("Train Data: Initial: 820:", model_wf_kn_rev.generate_sent(1,['740', '798'],8))

print("Directly looking")
samples = model_kn_rev.context_counts(model_kn_rev.vocab.lookup(['740','798']))
print(samples)
tpl = tuple(model_kn.score(w, context) for w in samples)
for w in samples:
    print(w)
print("tpl:", tpl)


#{'text': '746 130 400', 'len_text': 3, 'pred_answer': '032', 'pred_answer_group': None, 'correct_answer': '746', 'correct_answer_group': 'U-shape', 'type': 'Intial

print("Test Data: Initial: 746:",model_wf_kn_rev.generate_sent(1,['400', '130'],8))
print("Test Data: Initial: 746:",model_wf_kn.generate_sent(1,['130', '400'],8))

print(model_wf_kn.get_the_score("838 798"))
print(model_wf_kn.get_the_score("798 740"))
print(model_wf_kn.get_the_score("838 798 740"))

#medial:798
#850 092 741 838 798 740 621

#Try forward
print("Train Data: medial:798:", model_wf_kn.generate_sent0(1,['092', '741', '838'],8))

#Try reverse
print("Train Data: medial:798:", model_wf_kn_rev.generate_sent0(1,['621', '740'],8))

character =""
print("Finding char by Rank. This will take few min ...")

character, rank = model_wf_kn.find_char_by_rank("850 092 741 838 000 740 621", seed)
print(character, rank)
print("Done finding char by Rank")

#model_wf_kn.get_model_ngrams_list("741")
#print("List of trigram:", model_wf_kn.get_model_ngrams_list("838 740"))


-----Testing-------------------
-----RtoL: Terminal char----- Correct Ans: 520
Send three characters
5 2 0
Train Data: Initial: 741: 0 3 3
Directly looking
<FreqDist with 0 samples and 0 outcomes>
tpl: ()
Train Data: medial: 001: 0 0 1
Train Data: Terminal 740: 7 4 0
Train Data: Initial: 820: 2 3 1
Directly looking
<FreqDist with 12 samples and 25 outcomes>
460
235
231
220
240
233
820
745
838
803
700
003
tpl: (1.0032281697650481e-05, 1.620599351158924e-05, 8.488853744165792e-06, 0.00032944168999347553, 0.0006478989836426704, 0.00031246398250514394, 0.00031863769431908273, 0.0003039751287609782, 4.630283860454069e-06, 0.00031246398250514394, 1.929284941855862e-05, 2.392313327901269e-05)
Test Data: Initial: 746: 7 4 6
Test Data: Initial: 746: </ s>
0.29060734892195567
0.2264947372642942
0.32264947372642944
Train Data: medial:798: ('7 9 8', defaultdict(None, {'798': 0.9329060734892196}), ['092', '741', '838'])
Train Data: medial:798: ('3 9 0', defaultdict(None, {'390': 0.3010024364112694,

In [57]:
#Testing TBD remove these
#model_wf_kn.get_model_ngrams_list("838")
#print("Unigrams:", sorted(model_wf_kn.get_model().counts.unigrams.items()))
#print("Bigrams:", model_wf_kn.get_model().counts[2]['741'].items())

#list_ngrams = sorted(model_wf_kn.get_model().counts[['838 740']].items(),reverse=True, key = lambda x: x[1])
#print(list_ngrams)

#print(self.counts[2][context])

dict_trigrams = model_wf_kn.get_model().counts[4]
#print(dict_trigrams)
list_dict_trigrams = list(dict_trigrams)
print((list_dict_trigrams))

for tpl in list_dict_trigrams:
    #print(tpl)
    if(tpl[0]=='838'and tpl[2]=='740'):
        print(tpl)

[('<s>', '<s>', '<s>'), ('<s>', '<s>', '806'), ('<s>', '806', '740'), ('806', '740', '045'), ('740', '045', '</s>'), ('045', '</s>', '</s>'), ('<s>', '<s>', '455'), ('<s>', '455', '590'), ('455', '590', '002'), ('590', '002', '005'), ('002', '005', '368'), ('005', '368', '</s>'), ('368', '</s>', '</s>'), ('<s>', '<s>', '365'), ('<s>', '365', '527'), ('365', '527', '</s>'), ('527', '</s>', '</s>'), ('<s>', '<s>', '327'), ('<s>', '327', '740'), ('327', '740', '090'), ('740', '090', '</s>'), ('090', '</s>', '</s>'), ('<s>', '<s>', '140'), ('<s>', '140', '920'), ('140', '920', '944'), ('920', '944', '002'), ('944', '002', '240'), ('002', '240', '482'), ('240', '482', '740'), ('482', '740', '</s>'), ('740', '</s>', '</s>'), ('<s>', '<s>', '176'), ('<s>', '176', '125'), ('176', '125', '350'), ('125', '350', '299'), ('350', '299', '061'), ('299', '061', '400'), ('061', '400', '740'), ('400', '740', '</s>'), ('<s>', '<s>', '415'), ('<s>', '415', '220'), ('415', '220', '845'), ('220', '845', '4

In [58]:
#Testing TBD remove these
samples = model_kn.context_counts(model_kn.vocab.lookup(['740', '838']))
for w in samples:
    print(w)


In [59]:
""" Add initial and terminal padding to each text"""
def pad(text):
    return "<s> " + text + " </s>"

In [60]:
# Function for Data Preparation
# Data preparation for testing
# Take n rows from given set, convert a known sign to unclear sign and produce a dataframe

def prepare_data(a,max_text_chars,min_text_chars,max_num_of_rows, seed):

    list_changed_texts = []
    list_changed_reversed_text = []
    test_correct_answers= []
    row_count=0
    ls_made_up_row = []
    random.seed(seed)

    for text in a:
        # Tokenize to words, first split the string into chars
        
        chars = text.split(' ')
        new_text = chars
        #print("text, len(chars),max_text_chars,min_text_chars,row_count",text, len(chars),max_text_chars,min_text_chars,row_count)

        if(len(chars)<=max_text_chars and len(chars)>=min_text_chars) :
            if(len(chars)>=min_text_chars):
                #randomly pick an index in the tokenized_text and change it to unclear
                r = random.randrange(0, len(chars))
            else: r=0

            if(r==0): type_unclear_char = CONST_INITIAL
            elif(r==len(chars)-1): type_unclear_char = CONST_TERMINAL
            else: type_unclear_char = CONST_MEDIAL

            """Pad it in answer and in changed text"""
            add_answers(pad(text), test_correct_answers, chars[r], type_unclear_char)

            new_text[r]= '000'
            # then join with a space
            changed_text = ' '.join((new_text))

            made_up_row= {'site' : 'fake_site',
               'changed_reversed_text'  : pad(reverse_single_text(changed_text)),
               'changed_text' : pad(changed_text)}

            ls_made_up_row.append(made_up_row)


            row_count=row_count+1
            if(row_count>=max_num_of_rows): break


    df_made_up = pd.DataFrame(ls_made_up_row)
    
    return df_made_up, test_correct_answers

In [61]:
def run_model(model_type, model_name_list,model_fwd, model_rev,a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose) :
    
    #beg_hit,term_hit, med_hit, total_hit=0,0,0,0
    beg_percent,ter_percent,med_percent,total_percent=0,0,0,0
    beg_percent_category,ter_percent_category,med_percent_category,total_percent_category=0,0,0,0
    wrong_answer_list = []
    ans= None
    
    try:
        print("\n_____Trying unclear texts in Fwd Direction:______")
        for index in range(0, len(model_name_list)):

            print("\n****************Model Name:", model_name_list[index], ", Order:", model_fwd[index].order)
            ans = find_unclear_characters(model_fwd[index], model_rev[index],a,fill_using_position, seed)

            print(model_type,":", model_name_list[index])
            if(check_the_answers==True):
                beg_percent,ter_percent,med_percent,total_percent,beg_percent_category,ter_percent_category,med_percent_category,total_percent_category, wrong_answer_list = check_answers(ans,test_correct_answers)

                print("beg%:",round(beg_percent,2), " ter%:",round(ter_percent,2), " med%:", round(med_percent,2), " tot%:", round(total_percent,2)," and a total of ", round((total_percent/100)*len(test_correct_answers)), " out of", len(test_correct_answers))
                
                print("beg_w_cat%:",round(beg_percent+beg_percent_category,2), " ter_w_cat%:",round(ter_percent+ter_percent_category,2), " med_w_cat%:", round(med_percent+med_percent_category,2), " tot_w_cat%:", round(total_percent+total_percent_category,2))
                
                
                if(wrong_answer_details_verbose): print("\n Wrong Answers:", wrong_answer_list)
            else:
                print("Answers:", ans)
    except Exception as e:
        print("Exception:", e.__class__, "run_model")
        traceback.print_exc()


    if(try_reverse):
        # Try unclear strings in reverse
        print("\n_____Trying unclear texts in Reverse Direction:______")
        try:
            for index in range(0, len(model_name_list)):
                print("\n****************Model Name:", model_name_list[index], ", Order:", model_rev[index].order)
                ans = find_unclear_characters(model_rev[index], model_fwd[index],a_rev,fill_using_position, seed)

                print(model_type,":", model_name_list[index]) 
                if(check_the_answers==True):
                    beg_percent,ter_percent,med_percent,total_percent, beg_percent_category,ter_percent_category,med_percent_category,total_percent_category, wrong_answer_list = check_answers(ans,test_correct_answers)
                    
                    print(" beg%:",round(beg_percent,2), " ter%:",round(ter_percent,2), " med%:", round(med_percent,2), " tot%:", round(total_percent,2)," and a total of ", round((total_percent/100)*len(test_correct_answers)), " out of", len(test_correct_answers))
                    
                    print("beg_w_cat%:",round(beg_percent+beg_percent_category,2), " ter_w_cat%:",round(ter_percent+ter_percent_category,2), " med_w_cat%:", round(med_percent+med_percent_category,2), " tot_w_cat%:", round(total_percent+total_percent_category,2))
               
                    if(wrong_answer_details_verbose): print("\n Wrong Answers:", wrong_answer_list)
                else:
                    print("Answers:", ans)
        except Exception as e:
            print("Exception:", e.__class__, "run_model")
            traceback.print_exc()
    
    return ans



In [62]:
# Function for running a test
def run_test(test_name,a, a_rev,fill_using_position, check_the_answers, test_correct_answers, try_reverse,wrong_answer_details_verbose, seed):
    
    verbose_debug= True
    
    unigram_models= False
    bigram_models= False
    trigram_models= False
    quadgram_models= False
    pentagram_models= False
    hexagram_models= False
    septagram_models= True

    print("_____________________________")
    print("_____ Running ", test_name, "_________")
    print("_____________________________")


    beg_hit,term_hit, med_hit, total_hit=0,0,0,0
    beg_percent,ter_percent,med_percent,total_percent=0,0,0,0
    wrong_answer_list = []

    try_reverse = False

    try:
        
        if(unigram_models):
            ans = run_model("Fwd Unigram Model:", model_name_list,models_list_fwd_unigram, models_list_rev_unigram, a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose)
    
        if(bigram_models):
            ans = run_model("Fwd Bigram Model:", model_name_list,models_list_fwd_bigram, models_list_rev_bigram, a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose)
    
        if(trigram_models):
            ans = run_model("Fwd Trigram Model:", model_name_list,models_list_fwd_trigram, models_list_rev_trigram, a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose)
    
        if(quadgram_models):
            ans = run_model("Fwd Quadgram Model:", model_name_list,models_list_fwd_quadgram, models_list_rev_quadgram, a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose)
    
        if(pentagram_models):
            ans = run_model("Fwd Pentagram Model:", model_name_list,models_list_fwd_pentagram, models_list_rev_pentagram, a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose)
    
        if(hexagram_models):
            ans = run_model("Fwd hexagram Model:", model_name_list,models_list_fwd_hexagram, models_list_rev_hexagram, a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose)
    
        if(septagram_models):
            ans = run_model("Fwd septagram Model:", model_name_list,models_list_fwd_septagram, models_list_rev_septagram, a, a_rev, seed, fill_using_position, try_reverse, check_the_answers,test_correct_answers, wrong_answer_details_verbose)
    
    
    except Exception as e:
        print("Exception:", e.__class__, "run_test")
        traceback.print_exc()

# Test 1

In [63]:
# Test1
# Data preparation for this test
# Take n rows from train set, convert a known sign to unclear sign and produce a dataframe

verbose_debug = True
min_text_chars = 2
max_text_chars = 40
max_num_of_rows= 200
    
df_made_up_from_train, test1_correct_answers =prepare_data(df_train_x[df_train_x.l_to_r_text!=''].l_to_r_text, max_text_chars,min_text_chars,max_num_of_rows, seed)

if(verbose_debug):
    print(df_made_up_from_train.changed_text)
    print(df_made_up_from_train.changed_reversed_text)
    
    print("test1_correct_answers: \n", test1_correct_answers)


0                      <s> 000 740 045 </s>
1              <s> 455 590 000 005 368 </s>
2                          <s> 365 000 </s>
3                      <s> 000 740 090 </s>
4      <s> 140 000 944 002 240 482 740 </s>
                       ...                 
195                <s> 000 060 032 226 </s>
196    <s> 140 136 400 255 435 000 617 </s>
197        <s> 803 233 000 706 033 520 </s>
198                        <s> 158 000 </s>
199                        <s> 000 740 </s>
Name: changed_text, Length: 200, dtype: object
0                      <s> 045 740 000 </s>
1              <s> 368 005 000 590 455 </s>
2                          <s> 000 365 </s>
3                      <s> 090 740 000 </s>
4      <s> 740 482 240 002 944 000 140 </s>
                       ...                 
195                <s> 226 032 060 000 </s>
196    <s> 617 000 435 255 400 136 140 </s>
197        <s> 520 033 706 000 233 803 </s>
198                        <s> 000 158 </s>
199                        <s

In [64]:
# Test1
# Run the test
a= df_made_up_from_train[df_made_up_from_train.changed_text!=''].changed_text
a_rev= df_made_up_from_train[df_made_up_from_train.changed_reversed_text!=''].changed_reversed_text
check_the_answers = True
try_reverse = False
wrong_answer_details_verbose = True
verbose_debug = False
fill_using_position =False
use_initial_terminal_model= False
run_test("Test-1",a, a_rev, fill_using_position, check_the_answers, test1_correct_answers, try_reverse, wrong_answer_details_verbose, seed, )
    

_____________________________
_____ Running  Test-1 _________
_____________________________

_____Trying unclear texts in Fwd Direction:______

****************Model Name: MLE , Order: 7
Fwd septagram Model: : MLE
beg%: 91.07  ter%: 94.34  med%: 81.32  tot%: 87.5  and a total of  175  out of 200
beg_w_cat%: 94.64  ter_w_cat%: 94.34  med_w_cat%: 85.71  tot_w_cat%: 90.5

 Wrong Answers: [{'text': '<s> 220 240 740 090 </s>', 'len_text': 6, 'pred_answer': '879', 'pred_answer_group': 'rhomb', 'correct_answer': '240', 'correct_answer_group': 'fish', 'type': 'Medial'}, {'text': '<s> 400 740 178 </s>', 'len_text': 5, 'pred_answer': '</s>', 'pred_answer_group': None, 'correct_answer': '740', 'correct_answer_group': 'U-shape', 'type': 'Medial'}, {'text': '<s> 590 892 350 298 </s>', 'len_text': 6, 'pred_answer': '741', 'pred_answer_group': 'U-shape', 'correct_answer': '892', 'correct_answer_group': 'half-moon', 'type': 'Medial'}, {'text': '<s> 416 740 </s>', 'len_text': 4, 'pred_answer': '519', '

Fwd septagram Model: : Laplace
beg%: 85.71  ter%: 92.45  med%: 52.75  tot%: 72.5  and a total of  145  out of 200
beg_w_cat%: 91.07  ter_w_cat%: 94.34  med_w_cat%: 58.24  tot_w_cat%: 77.0

 Wrong Answers: [{'text': '<s> 176 125 350 299 061 400 740 </s>', 'len_text': 9, 'pred_answer': '100', 'pred_answer_group': 'human', 'correct_answer': '400', 'correct_answer_group': 'comb', 'type': 'Medial'}, {'text': '<s> 220 240 740 090 </s>', 'len_text': 6, 'pred_answer': '176', 'pred_answer_group': '-', 'correct_answer': '240', 'correct_answer_group': 'fish', 'type': 'Medial'}, {'text': '<s> 240 235 083 740 </s>', 'len_text': 6, 'pred_answer': '233', 'pred_answer_group': 'fish', 'correct_answer': '235', 'correct_answer_group': 'fish', 'type': 'Medial'}, {'text': '<s> 014 220 </s>', 'len_text': 4, 'pred_answer': '016', 'pred_answer_group': None, 'correct_answer': '014', 'correct_answer_group': None, 'type': 'Initial'}, {'text': '<s> 025 032 740 </s>', 'len_text': 5, 'pred_answer': '100', 'pred_ans

Fwd septagram Model: : Lidstone
beg%: 85.71  ter%: 92.45  med%: 68.13  tot%: 79.5  and a total of  159  out of 200
beg_w_cat%: 91.07  ter_w_cat%: 94.34  med_w_cat%: 73.63  tot_w_cat%: 84.0

 Wrong Answers: [{'text': '<s> 176 125 350 299 061 400 740 </s>', 'len_text': 9, 'pred_answer': '100', 'pred_answer_group': 'human', 'correct_answer': '400', 'correct_answer_group': 'comb', 'type': 'Medial'}, {'text': '<s> 220 240 740 090 </s>', 'len_text': 6, 'pred_answer': '176', 'pred_answer_group': '-', 'correct_answer': '240', 'correct_answer_group': 'fish', 'type': 'Medial'}, {'text': '<s> 014 220 </s>', 'len_text': 4, 'pred_answer': '016', 'pred_answer_group': None, 'correct_answer': '014', 'correct_answer_group': None, 'type': 'Initial'}, {'text': '<s> 176 740 436 </s>', 'len_text': 5, 'pred_answer': '400', 'pred_answer_group': 'comb', 'correct_answer': '436', 'correct_answer_group': 'Z-shape', 'type': 'Terminal'}, {'text': '<s> 033 061 142 400 740 </s>', 'len_text': 7, 'pred_answer': '100',

Fwd septagram Model: : WittenBellInterpolated
beg%: 80.36  ter%: 90.57  med%: 82.42  tot%: 84.0  and a total of  168  out of 200
beg_w_cat%: 85.71  ter_w_cat%: 92.45  med_w_cat%: 87.91  tot_w_cat%: 88.5

 Wrong Answers: [{'text': '<s> 327 740 090 </s>', 'len_text': 5, 'pred_answer': '362', 'pred_answer_group': 'A-shape', 'correct_answer': '327', 'correct_answer_group': 'house', 'type': 'Initial'}, {'text': '<s> 220 240 740 090 </s>', 'len_text': 6, 'pred_answer': '222', 'pred_answer_group': 'fish', 'correct_answer': '240', 'correct_answer_group': 'fish', 'type': 'Medial'}, {'text': '<s> 014 220 </s>', 'len_text': 4, 'pred_answer': '017', 'pred_answer_group': None, 'correct_answer': '014', 'correct_answer_group': None, 'type': 'Initial'}, {'text': '<s> 176 740 436 </s>', 'len_text': 5, 'pred_answer': '400', 'pred_answer_group': 'comb', 'correct_answer': '436', 'correct_answer_group': 'Z-shape', 'type': 'Terminal'}, {'text': '<s> 400 740 178 </s>', 'len_text': 5, 'pred_answer': '</s>', '

In [65]:
a= df_made_up_from_train[df_made_up_from_train.changed_text=='000 320 920 740'].changed_text
a_rev= df_made_up_from_train[df_made_up_from_train.changed_reversed_text=='749 920 320 000'].changed_reversed_text
check_the_answers = False
try_reverse = False
wrong_answer_details_verbose = True
verbose_debug = True
fill_using_position =False

model_wf_quad_mle_fwd = ModelWithFilling(models_list_fwd_quadgram[0],posNgramModel, True)
model_wf_quad_mle_rev = ModelWithFilling(models_list_rev_quadgram[0],posNgramModel, True)

#answer 235
print(model_wf_quad_mle_rev.generate_sent(1,['749','920','320'],8))

print(model_wf_quad_mle_rev.generate_sent(1,['920','320'],8))

print(model_wf_quad_mle_fwd.generate_sent(1,['320'],8))

print("Directly looking-1")
samples = models_list_rev_quadgram[0].context_counts(models_list_rev_quadgram[0].vocab.lookup(['740','920']))
print(samples)
tpl = tuple(models_list_rev_quadgram[0].score(w, context) for w in samples)
for w in samples:
    print(w)
print("tpl:", tpl)

print("Directly looking-1")
samples = models_list_fwd_quadgram[0].context_counts(models_list_fwd_quadgram[0].vocab.lookup(['235','320', '920']))
print(samples)
tpl = tuple(models_list_fwd_quadgram[0].score(w, context) for w in samples)
for w in samples:
    print(w)
print("tpl:", tpl)

run_test("Test-1.1",a, a_rev, fill_using_position, check_the_answers, test1_correct_answers, try_reverse, wrong_answer_details_verbose, seed, )


2 3 3
2 3 3
9 0 0
Directly looking-1
<FreqDist with 4 samples and 10 outcomes>
320
323
318
140
tpl: (0.0, 0.0, 0.0, 0.0)
Directly looking-1
<FreqDist with 1 samples and 1 outcomes>
740
tpl: (0,)
_____________________________
_____ Running  Test-1.1 _________
_____________________________

_____Trying unclear texts in Fwd Direction:______

****************Model Name: MLE , Order: 7
Fwd septagram Model: : MLE
Answers: []

****************Model Name: KneserNeyInterpolated , Order: 7
Fwd septagram Model: : KneserNeyInterpolated
Answers: []

****************Model Name: Laplace , Order: 7
Fwd septagram Model: : Laplace
Answers: []

****************Model Name: Lidstone , Order: 7
Fwd septagram Model: : Lidstone
Answers: []

****************Model Name: StupidBackoff , Order: 7
Fwd septagram Model: : StupidBackoff
Answers: []

****************Model Name: WittenBellInterpolated , Order: 7
Fwd septagram Model: : WittenBellInterpolated
Answers: []


## Test 2

In [66]:
# Test 2
# Data preparation for this test
# Take n rows from test set, convert a known sign to unclear sign and produce a dataframe

verbose_debug = True
min_text_chars = 2
max_text_chars = 40
max_num_of_rows=200
    
df_made_up_from_test, test2_correct_answers =prepare_data(df_test_x[df_test_x.l_to_r_text!=''].l_to_r_text, max_text_chars,min_text_chars,max_num_of_rows,seed)

if(verbose_debug):
    print(df_made_up_from_test.changed_text)
    print("test2_correct_answers: \n", test2_correct_answers)


0                          <s> 000 861 096 </s>
1                          <s> 491 000 740 </s>
2                      <s> 933 002 004 000 </s>
3      <s> 924 001 000 031 055 002 150 416 </s>
4              <s> 322 000 740 001 003 137 </s>
                         ...                   
195                    <s> 741 031 000 400 </s>
196                <s> 692 060 231 927 000 </s>
197        <s> 165 031 002 000 100 740 090 </s>
198    <s> 416 840 060 000 220 590 390 740 </s>
199                    <s> 000 578 032 824 </s>
Name: changed_text, Length: 200, dtype: object
test2_correct_answers: 
 [{'text': '<s> 002 861 096 </s>', 'len_text': 20, 'answer': '002', 'type': 'Initial'}, {'text': '<s> 491 817 740 </s>', 'len_text': 20, 'answer': '817', 'type': 'Medial'}, {'text': '<s> 933 002 004 405 </s>', 'len_text': 24, 'answer': '405', 'type': 'Terminal'}, {'text': '<s> 924 001 319 031 055 002 150 416 </s>', 'len_text': 40, 'answer': '319', 'type': 'Medial'}, {'text': '<s> 322 920 740 001 00

In [67]:
# Test2
# Run the test
verbose_debug = False
a= df_made_up_from_test[df_made_up_from_test.changed_text!=''].changed_text
a_rev= df_made_up_from_test[df_made_up_from_test.changed_reversed_text!=''].changed_reversed_text
check_the_answers = True
try_reverse = False
wrong_answer_details_verbose = True
fill_using_position = False
use_initial_terminal_model= False

ans = run_test("Test-2",a, a_rev, fill_using_position, check_the_answers, test2_correct_answers, try_reverse, wrong_answer_details_verbose, seed)

_____________________________
_____ Running  Test-2 _________
_____________________________

_____Trying unclear texts in Fwd Direction:______

****************Model Name: MLE , Order: 7
Fwd septagram Model: : MLE
beg%: 22.41  ter%: 46.0  med%: 27.17  tot%: 30.5  and a total of  61  out of 200
beg_w_cat%: 39.66  ter_w_cat%: 58.0  med_w_cat%: 41.3  tot_w_cat%: 45.0

 Wrong Answers: [{'text': '<s> 002 861 096 </s>', 'len_text': 5, 'pred_answer': '392', 'pred_answer_group': 'stick', 'correct_answer': '002', 'correct_answer_group': None, 'type': 'Initial'}, {'text': '<s> 933 002 004 405 </s>', 'len_text': 6, 'pred_answer': '390', 'pred_answer_group': 'stick', 'correct_answer': '405', 'correct_answer_group': 'stick', 'type': 'Terminal'}, {'text': '<s> 924 001 319 031 055 002 150 416 </s>', 'len_text': 10, 'pred_answer': '904', 'pred_answer_group': 'bow', 'correct_answer': '319', 'correct_answer_group': 'net', 'type': 'Medial'}, {'text': '<s> 322 920 740 001 003 137 </s>', 'len_text': 8, 'pr

Fwd septagram Model: : KneserNeyInterpolated
beg%: 25.86  ter%: 42.0  med%: 23.91  tot%: 29.0  and a total of  58  out of 200
beg_w_cat%: 39.66  ter_w_cat%: 56.0  med_w_cat%: 34.78  tot_w_cat%: 41.5

 Wrong Answers: [{'text': '<s> 002 861 096 </s>', 'len_text': 5, 'pred_answer': '216', 'pred_answer_group': 'circle', 'correct_answer': '002', 'correct_answer_group': None, 'type': 'Initial'}, {'text': '<s> 933 002 004 405 </s>', 'len_text': 6, 'pred_answer': '390', 'pred_answer_group': 'stick', 'correct_answer': '405', 'correct_answer_group': 'stick', 'type': 'Terminal'}, {'text': '<s> 924 001 319 031 055 002 150 416 </s>', 'len_text': 10, 'pred_answer': '904', 'pred_answer_group': 'bow', 'correct_answer': '319', 'correct_answer_group': 'net', 'type': 'Medial'}, {'text': '<s> 322 920 740 001 003 137 </s>', 'len_text': 8, 'pred_answer': '222', 'pred_answer_group': 'fish', 'correct_answer': '920', 'correct_answer_group': 'bow', 'type': 'Medial'}, {'text': '<s> 540 002 806 240 017 575 740 67

Fwd septagram Model: : Laplace
beg%: 31.03  ter%: 52.0  med%: 30.43  tot%: 36.0  and a total of  72  out of 200
beg_w_cat%: 44.83  ter_w_cat%: 62.0  med_w_cat%: 50.0  tot_w_cat%: 51.5

 Wrong Answers: [{'text': '<s> 002 861 096 </s>', 'len_text': 5, 'pred_answer': '603', 'pred_answer_group': 'square', 'correct_answer': '002', 'correct_answer_group': None, 'type': 'Initial'}, {'text': '<s> 491 817 740 </s>', 'len_text': 5, 'pred_answer': '100', 'pred_answer_group': 'human', 'correct_answer': '817', 'correct_answer_group': 'circle', 'type': 'Medial'}, {'text': '<s> 933 002 004 405 </s>', 'len_text': 6, 'pred_answer': '390', 'pred_answer_group': 'stick', 'correct_answer': '405', 'correct_answer_group': 'stick', 'type': 'Terminal'}, {'text': '<s> 924 001 319 031 055 002 150 416 </s>', 'len_text': 10, 'pred_answer': '904', 'pred_answer_group': 'bow', 'correct_answer': '319', 'correct_answer_group': 'net', 'type': 'Medial'}, {'text': '<s> 322 920 740 001 003 137 </s>', 'len_text': 8, 'pred_a

Fwd septagram Model: : Lidstone
beg%: 31.03  ter%: 52.0  med%: 35.87  tot%: 38.5  and a total of  77  out of 200
beg_w_cat%: 44.83  ter_w_cat%: 62.0  med_w_cat%: 54.35  tot_w_cat%: 53.5

 Wrong Answers: [{'text': '<s> 002 861 096 </s>', 'len_text': 5, 'pred_answer': '603', 'pred_answer_group': 'square', 'correct_answer': '002', 'correct_answer_group': None, 'type': 'Initial'}, {'text': '<s> 491 817 740 </s>', 'len_text': 5, 'pred_answer': '100', 'pred_answer_group': 'human', 'correct_answer': '817', 'correct_answer_group': 'circle', 'type': 'Medial'}, {'text': '<s> 933 002 004 405 </s>', 'len_text': 6, 'pred_answer': '390', 'pred_answer_group': 'stick', 'correct_answer': '405', 'correct_answer_group': 'stick', 'type': 'Terminal'}, {'text': '<s> 924 001 319 031 055 002 150 416 </s>', 'len_text': 10, 'pred_answer': '904', 'pred_answer_group': 'bow', 'correct_answer': '319', 'correct_answer_group': 'net', 'type': 'Medial'}, {'text': '<s> 322 920 740 001 003 137 </s>', 'len_text': 8, 'pred

Fwd septagram Model: : StupidBackoff
beg%: 22.41  ter%: 46.0  med%: 27.17  tot%: 30.5  and a total of  61  out of 200
beg_w_cat%: 39.66  ter_w_cat%: 58.0  med_w_cat%: 41.3  tot_w_cat%: 45.0

 Wrong Answers: [{'text': '<s> 002 861 096 </s>', 'len_text': 5, 'pred_answer': '392', 'pred_answer_group': 'stick', 'correct_answer': '002', 'correct_answer_group': None, 'type': 'Initial'}, {'text': '<s> 933 002 004 405 </s>', 'len_text': 6, 'pred_answer': '390', 'pred_answer_group': 'stick', 'correct_answer': '405', 'correct_answer_group': 'stick', 'type': 'Terminal'}, {'text': '<s> 924 001 319 031 055 002 150 416 </s>', 'len_text': 10, 'pred_answer': '904', 'pred_answer_group': 'bow', 'correct_answer': '319', 'correct_answer_group': 'net', 'type': 'Medial'}, {'text': '<s> 322 920 740 001 003 137 </s>', 'len_text': 8, 'pred_answer': '327', 'pred_answer_group': 'house', 'correct_answer': '920', 'correct_answer_group': 'bow', 'type': 'Medial'}, {'text': '<s> 540 002 806 240 017 575 740 679 </s>', 

Fwd septagram Model: : WittenBellInterpolated
beg%: 25.86  ter%: 48.0  med%: 28.26  tot%: 32.5  and a total of  65  out of 200
beg_w_cat%: 43.1  ter_w_cat%: 58.0  med_w_cat%: 46.74  tot_w_cat%: 48.5

 Wrong Answers: [{'text': '<s> 002 861 096 </s>', 'len_text': 5, 'pred_answer': '216', 'pred_answer_group': 'circle', 'correct_answer': '002', 'correct_answer_group': None, 'type': 'Initial'}, {'text': '<s> 933 002 004 405 </s>', 'len_text': 6, 'pred_answer': '390', 'pred_answer_group': 'stick', 'correct_answer': '405', 'correct_answer_group': 'stick', 'type': 'Terminal'}, {'text': '<s> 924 001 319 031 055 002 150 416 </s>', 'len_text': 10, 'pred_answer': '904', 'pred_answer_group': 'bow', 'correct_answer': '319', 'correct_answer_group': 'net', 'type': 'Medial'}, {'text': '<s> 322 920 740 001 003 137 </s>', 'len_text': 8, 'pred_answer': '772', 'pred_answer_group': 'U-shape', 'correct_answer': '920', 'correct_answer_group': 'bow', 'type': 'Medial'}, {'text': '<s> 540 002 806 240 017 575 740

In [68]:
#Perplexity calculations for Test texts
#use a model to check the perplexity of test texts

test_texts_list = list(df_test_x[df_test_x.l_to_r_text!=''].l_to_r_text)
tokenized_text_test = list(df_test_x[df_test_x.l_to_r_text!=''].l_to_r_text.apply(word_tokenize))

test_texts_list_rev = list(df_test_x_rev[df_test_x_rev.reversed_text!=''].reversed_text)
tokenized_text_test_rev = list(df_test_x_rev[df_test_x_rev.reversed_text!=''].reversed_text.apply(word_tokenize))
#print(tokenized_text_test)

#get_perplexity(model_kn, 4,test_texts_list,tokenized_text_test,seed)

print("Fwd Model")
get_perplexity(models_list_fwd_quadgram[3], 4,test_texts_list,tokenized_text_test,seed)

print("Reverse Model")
print()
get_perplexity(models_list_rev_quadgram[3], 4,test_texts_list,tokenized_text_test,seed)



Fwd Model
Perplexity( 002 861 096):63.96243763588799
Perplexity( 491 817 740):32.37154342633229
Perplexity( 933 002 004 405):70.3266849457204
Perplexity( 924 001 319 031 055 002 150 416):164.98812300717668
Perplexity( 322 920 740 001 003 137):112.00035572253769
Perplexity( 540 002 806 240 017 575 740 679):109.3982397800818
Perplexity( 156 176 741 060 920):96.54314273798006
Perplexity( 720 405 065 060 001 003 422):144.8161181609683
Perplexity( 700 034):34.10414188210058
Perplexity( 031 101 740 090):41.18281642869956
Perplexity( 034 700):29.152680808999932
Perplexity( 820 060 773 740):33.122860969975086
Perplexity( 920 060 803 032 220 904 590 407 740):78.53227318723326
Perplexity( 806 706 033 590 405 798 740):65.39586117127432
Perplexity( 415):23.933738335902433
Perplexity( 861 002 415 220 705 033 520):43.21112099729522
Perplexity( 140 156):33.01704496657195
Perplexity( 221 740 151):50.22063007112599
Perplexity( 390 155 004 390):54.91171844419296
Perplexity( 001 220 140 416 031 565):110.

Perplexity( 031 125 455 220):66.56249276183685
Perplexity( 142 171 035):71.22143821650677
Perplexity( 350 125 002 003 390):87.58831996276986
Perplexity( 234):32.06979397562378
Perplexity( 141 740 400):51.613940337412956
Perplexity( 803 002 003 390):65.30542731399869
Perplexity( 323 690):47.835192972954744
Perplexity( 861 002 035 405):71.77458796877231
Perplexity( 820 060 235 240 455 740):85.33820643735841
Perplexity( 817 002 004 390 621):93.40495258923615
Perplexity( 820 002 240 220 520):71.9829528401469
Perplexity( 016 840 002 803 140 850):107.40889850961388
Perplexity( 861 002 125):49.255529309458
Perplexity( 235 220 176 740):58.712288721037375
Perplexity( 920 060 741 005 368 705 167):138.72250595941586
Perplexity( 117):34.920443004009286
Perplexity( 016 617):43.061831889437244
Perplexity( 798 240 845 061 407):104.60837809882337
Perplexity( 415 220 520):55.28535109258812
Perplexity( 503 615 740):55.154364797527386
Perplexity( 840 032 740 400):55.22052124567655
Perplexity( 005 154):52

## Test 3

In [69]:
# Test3
# Run the test
a = df_unclear[df_unclear.l_to_r_text!=''].l_to_r_text
a= df_unclear[df_unclear.reversed_text!=''].reversed_text
try_reverse = False
check_the_answers = False
wrong_answer_details_verbose = False
fill_using_position = True

#ans = run_test("Test-3",a, a_rev, fill_using_position, check_the_answers, None, try_reverse, wrong_answer_details_verbose, seed)

# Text Positional Analysis - Tests

In [70]:
# Test1 - PositionalNgramModel
# Run the test
#a= df_made_up_from_train[df_made_up_from_train.changed_text!=''].changed_text #l_to_r


a_rev= df_made_up_from_train[df_made_up_from_train.changed_reversed_text!=''].changed_reversed_text

check_the_answers = True
try_reverse = False
wrong_answer_details_verbose = True

#posNgramModel.get_text_norm_position_unigrams_char_with_max_prob(10)

ans =posNgramModel.find_characters(a_rev, seed)
print("ans:", ans)

if(check_the_answers==True):
    beg_percent,ter_percent,med_percent,total_percent, wrong_answer_list = check_answers(ans,test1_correct_answers)
    print("beg%:",round(beg_percent,2), " ter%:",round(ter_percent,2), " med%:", round(med_percent,2), " tot%:", round(total_percent,2)," and a total of ", round((total_percent/100)*len(test1_correct_answers)), " out of", len(test1_correct_answers))
    if(wrong_answer_details_verbose): print("\n Wrong Answers:", wrong_answer_list)
else:
    print("Answers:", ans)

Finding Unclear character for <s> 045 740 000 </s>
2
Max Prob for Position: 2 is for character: 002
Index, out_char: 2 002
Finding Unclear character for <s> 368 005 000 590 455 </s>
4
Max Prob for Position: 4 is for character: 002
Index, out_char: 4 002
Finding Unclear character for <s> 000 365 </s>
3
Max Prob for Position: 3 is for character: 002
Index, out_char: 3 002
Finding Unclear character for <s> 090 740 000 </s>
2
Max Prob for Position: 2 is for character: 002
Index, out_char: 2 002
Finding Unclear character for <s> 740 482 240 002 944 000 140 </s>
3
Max Prob for Position: 3 is for character: 002
Index, out_char: 3 002
Finding Unclear character for <s> 740 000 061 299 350 125 176 </s>
7
Max Prob for Position: 7 is for character: 033
Index, out_char: 7 033
Finding Unclear character for <s> 407 845 220 000 </s>
2
Max Prob for Position: 2 is for character: 002
Index, out_char: 2 002
Finding Unclear character for <s> 692 000 </s>
2
Max Prob for Position: 2 is for character: 002
Ind

Traceback (most recent call last):
  File "/var/folders/0n/5lj8g_xs6xngc99x77p4k50h0000gp/T/ipykernel_19500/1390874887.py", line 206, in get_text_norm_position_unigrams_char_with_max_prob
    char= self.text_norm_position_unigrams_maxValueIndex_dict[position]
KeyError: 11


ValueError: too many values to unpack (expected 5)

## Other Tests