In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import re
from collections import Counter
from catboost import CatBoostClassifier

In [2]:
class MultiLabelCatBoostClassifier:
    def __init__(self, num_classes=26, catboost_params=None):
        # Initialize 26 CatBoost classifiers, one for each label
        classifiers = [CatBoostClassifier(iterations=100) for _ in range(num_classes)]
        
    def fit(self, X, y):
        # X is the feature matrix, y is a binary matrix indicating label presence
        for i in range(len(classifiers)):
            # Train each classifier on the corresponding label
            self.classifiers[i].fit(X, y[alpha[i]], verbose=100)
    
    def predict(self, X):
        # Predict probabilities for each label
        predictions = np.zeros((len(X), len(self.classifiers)))
        for i, clf in enumerate(self.classifiers):
            predictions[:, i] = clf.predict_proba(X)[:, 1]  # Probability of class '1'
        return predictions
    
    def save(self, filename):
        # Save the model to a pickle file
        with open(filename, 'wb') as file:
            pickle.dump(self, file)
    
    @classmethod
    def load(cls, filename):
        # Load the model from a pickle file
        with open(filename, 'rb') as file:
            model = pickle.load(file)
        return model

In [3]:
def build_dictionary(dictionary_file_location):
    text_file = open(dictionary_file_location,"r")
    full_dictionary = text_file.read().splitlines()
    text_file.close()
    return full_dictionary

words = build_dictionary("words_250000_train.txt")

In [4]:
model = MultiLabelCatBoostClassifier.load('models_trained/multilabel_catboost_model.pkl')

In [5]:
#calculate accuracy

from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score

In [6]:
alpha = "abcdefghijklmnopqrstuvwxyz_"

value =  {"a":1,"b":2,"c":3,"d":4,"e":5,"f":6,"g":7,
         "h":8,"i":9,"j":10,"k":11,"l":12,"m":13,"n":14,
         "o":15,"p":16,"q":17,"r":18,"s":19,"t":20,"u":21,
         "v":22,"w":23,"x":24,"y":25,"z":26,"_":0}

#create a reverse value dictionary

value_rev = {v: k for k, v in value.items()}

In [7]:
#array of size 80 with -1s
def prediction(tc):
    inp = np.array([-1 for x in range(80)])
    for i in range(len(tc)):
        inp[i] = value[tc[i]]
        inp[80-len(tc)+i] = value[tc[i]]

    inp = model.predict(np.array([inp]))[0]

    perm = ""
    prob = []
    for i in range(26):
        perm += alpha[np.argmax(inp)]
        prob.append(inp[np.argmax(inp)])
        inp[np.argmax(inp)] = -1
    return perm, prob

In [10]:
#create a subset of 6 character x which contain only one '_' and the rest are alphabets and replace '_' with '*'
def create_substrings(trial,guessed=[],n=6,threshold=0.1,multiple=False):
    substring = []
    for i in range(len(trial)):
        if trial[i] == '_' and i >= 0 and i <= len(trial)-1 :
            if multiple:
                #replace all '_' with '.'
                substring.append((trial[max(i-n+1,0):i] + '.' + trial[i+1:min(i+n,len(trial))]).replace("_","*"))
            
            if not multiple and (i==0 or trial[i-1] != '_') and (i==len(trial)-1 or trial[i+1] != '_'): 
                substring.append(trial[max(i-n+1,0):i] + '.' + trial[i+1:min(i+n,len(trial))])
    
    #if any string in substring has more than two '_' then remove it
    substrings = []
    for trial in substring:
        for i in range(0,len(trial)-n+1):
            substrings.append(trial[i:i+n])
    
    if multiple:
        substrings  = [x for x in substrings if x.count('*') < 2]
    else:
        substrings  = [x for x in substrings if x.count('_') == 0]
    
    substrings  = [x for x in substrings if len(x) == n]

    if(substrings == []):
        # print("No substring found")
        return None,None
    
    ans = []
    
    for x in substrings:
        ind = x.index('.')
        letters = []

        if multiple:
            x = x.replace("*",".")
        
        for word in words:
            if re.search(x,word):
                l = word[re.search(x,word).start()+ind]
                if l not in guessed:
                    letters.append(l)

        if letters == []:
            continue
        
        b = [[a[0],a[1]/len(letters),a[1]] for a in Counter(letters).most_common()]
        ans.append(b)
    
    if ans == []:
        # print("No substring found")
        return None,None
    
    #create an array with 26 0s representing all alphabets and then add each letter's probability to it
    # print(ans)
    bb = np.array([0.0 for aaaaa in range(26)])

    for x in ans:
        for y in x:
            bb[value[y[0]]-1] += y[1]

    for i in range(len(bb)):
        bb[i] = bb[i]/bb.sum()
    # print(bb)
    if bb[np.argmax(bb)]<threshold:
        return None,None
    #return the most probable letter
    return value_rev[np.argmax(bb)+1],bb[np.argmax(bb)]

trial ="sur_ers"
guess = ['d','a','r','l','t','m','p','g']

print(create_substrings(trial,guess,8,multiple=True))
# print("not8")
print(create_substrings(trial,guess,7,multiple=True))
# print("not7")
print(create_substrings(trial,guess,6,multiple=True))
# print("not6")
print(create_substrings(trial,guess,5,multiple=True))
# print("not5")
print(create_substrings(trial,guess,4,multiple=True))
# print("not4")
print(create_substrings(trial,guess,3))

(None, None)
(None, None)
('f', 0.5263157894736842)
('v', 0.38181589433447655)
('v', 0.3298100553991867)
('i', 0.23065461637287904)


In [11]:
def guessing(word, guessed_letters,verbose = False):

    aa,probs = prediction(word)
    # print(aa,probs)

    pred_prob = 0

    for i in range(len(aa)):
        if(aa[i] not in guessed_letters):
            pred_letter = aa[i]
            pred_prob = probs[i]
            break
    
    a,prob = create_substrings(word,guessed_letters,8,multiple=True)
    if a != None and prob > pred_prob:
        if verbose:
            print("used8",a)
        return a
    
    [a,prob] = create_substrings(word,guessed_letters,7,multiple=True)
    if a != None and prob > pred_prob:
        if verbose:
            print("used7",a)
        return a
    
    [a,prob] = create_substrings(word,guessed_letters,6,multiple=True)
    if a != None and prob > pred_prob:
        if verbose:
            print("used6",a)
        return a

    [a,prob] = create_substrings(word,guessed_letters,5,multiple=True)
    if a != None and prob > pred_prob:
        if verbose:
            print("used5",a)
        return a

    [a,prob] = create_substrings(word,guessed_letters,4,multiple=True)
    if a != None and prob > pred_prob:
        if verbose:
            print("used4",a)
        return a
    
    [a,prob] = create_substrings(word,guessed_letters,3)
    if a != None and prob > pred_prob:
        if verbose:
            print("used3",a)
        return a
    
    return pred_letter


In [25]:
#create a string contianing the permutation of "abcdefghijklmnopqrstuvwxyz" in descending order of values in the np array of length 26

def tries_taken(word,verbose=False,limit=6):
    tc = ['_' for x in word]
    tc = "".join(tc)
    guesslist = []
    count=0
    # print(tc)
    while(1):
        if(count>=limit):
            return count
        i = guessing(tc, guesslist,verbose=verbose)
        guesslist.append(i)
        if(i in word):
            #replace all occurences of i in tc with i at positions in word
            if(verbose):
                print("correct guess : ",i)
            for j in range(len(word)):
                if(word[j]==i):
                    tc = list(tc)
                    tc[j]=i
                    tc = "".join(tc)
            if(tc == word):
                print("word guessed completely : ",tc)
                return count
        else:
            # print("wrong guess : ",i)
            count+=1

        if(verbose):
            print(tc , i)

In [28]:
#randomly select 2 words from words
import random
random_words = random.sample(words,2)

In [29]:
aa = []
for i in random_words:
    # print(i)
    aa.append(tries_taken(i,verbose=True,limit=6))

correct guess :  i
____i_i____i__ i
correct guess :  n
____i_in___i_n n
correct guess :  o
____i_in___ion o
correct guess :  t
___ti_in__tion t
correct guess :  a
___ti_in_ation a
correct guess :  e
_e_ti_ineation e
used8 l
correct guess :  l
_e_tilineation l
used8 c
correct guess :  c
_ectilineation c
used8 r
correct guess :  r
word guessed completely :  rectilineation
correct guess :  e
___e_______ e
correct guess :  i
_i_e_i_____ i
_i_e_i_____ r
correct guess :  a
_i_e_i_a___ a
_i_e_i_a___ l
correct guess :  t
_i_e_i_at__ t
_i_e_i_at__ d
_i_e_i_at__ c
correct guess :  s
_i_esi_at__ s
used8 p
correct guess :  p
_i_esipat__ p
used8 n
correct guess :  n
_inesipat__ n
used8 y
correct guess :  y
_inesipat_y y
used8 h
correct guess :  h
_inesipathy h
used8 k
correct guess :  k
word guessed completely :  kinesipathy


In [17]:
#percentage of values less than 6 in aa

count = 0
for i in aa:
    if(i<7):
        count+=1
print(count/10)

1.0
