In [14]:
from spylls.hunspell import Dictionary
from textdistance import jaro_winkler, mlipns, levenshtein, gotoh
import os

# Load Dataset

In [12]:
def load_book(path):
    """Load a book from its file"""
    input_file = os.path.join(path)
    with open(input_file) as f:
        book = f.read()
    return book

In [17]:
# Collect all of the book file names
path = './books/'
book_files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
book_files = book_files[1:]

In [18]:
# Load the books using the file names
books = []
for book in book_files:
    books.append(load_book(path+book))

In [21]:
import re

def clean_text(text):
    '''Remove unwanted characters and extra spaces from the text'''
    text = re.sub(r'\n', ' ', text) 
    text = re.sub(r'[{}@_*>()\\#%+=\[\]]','', text)
    text = re.sub('a0','', text)
    text = re.sub('\'92t','\'t', text)
    text = re.sub('\'92s','\'s', text)
    text = re.sub('\'92m','\'m', text)
    text = re.sub('\'92ll','\'ll', text)
    text = re.sub('\'91','', text)
    text = re.sub('\'92','', text)
    text = re.sub('\'93','', text)
    text = re.sub('\'94','', text)
    text = re.sub('\.','. ', text)
    text = re.sub('\!','! ', text)
    text = re.sub('\?','? ', text)
    text = re.sub(' +',' ', text)
    return text

In [22]:
clean_books = []
for book in books:
    clean_books.append(clean_text(book))

In [24]:
# Compare the number of words in each book 
X = []
d = Dictionary.from_files('en_US')
for i in range(len(books)):
    for word in books[i].split():
        if d.lookup(word):
            X.append(word)

## Add misspells to dataset

In [32]:
import numpy as np

letters = ['a','b','c','d','e','f','g','h','i','j','k','l','m',
           'n','o','p','q','r','s','t','u','v','w','x','y','z',]


def noise_maker(sentence, threshold):
    
    noisy_sentence = []
    i = 0
    while i < len(sentence):
        random = np.random.uniform(0,1,1)
        if random < threshold:
            noisy_sentence.append(sentence[i])
        else:
            new_random = np.random.uniform(0,1,1)
            if new_random > 0.67:
                if i == (len(sentence) - 1):
                    continue
                else:
                    noisy_sentence.append(sentence[i+1])
                    noisy_sentence.append(sentence[i])
                    i += 1
            elif new_random < 0.33:
                random_letter = np.random.choice(letters, 1)[0]
                noisy_sentence.append(random_letter)
                noisy_sentence.append(sentence[i])
            else:
                pass     
        i += 1
    return noisy_sentence

In [47]:
X_err = []
threshold = 0.85
for word in X[:100000]:
    X_err.append(noise_maker(word, threshold))

In [121]:
import random

X_train = []
y_train = []
qs = []

for i, (pred, target) in enumerate(zip(X_err, X)):
    if len(pred) < 2 or len(target) < 2:
        continue
    X_train.append(np.array([f(pred, target) for f in [jaro_winkler, mlipns, levenshtein, gotoh]]))
    y_train.append(1.)
    qs.append(i)
    
    negs = list(d.suggest(target))
    if len(negs) == 0:
        continue
    qs.append(i)
    target_neg = random.sample(negs, 1)[0]
    X_train.append(np.array([f(pred, target_neg) for f in [jaro_winkler, mlipns, levenshtein, gotoh]]))
    y_train.append(0.)

# Train CatBoost model to rank on features

In [63]:
from catboost import CatBoostRanker, Pool

In [88]:
from copy import deepcopy

default_parameters = {
    'iterations': 2000,
    'custom_metric': ['NDCG', 'PFound', 'AverageGain:top=10'],
    'verbose': True,
    'random_seed': 0,
}

def fit_model(loss_function, train_pool, test_pool, additional_params=None):
    parameters = deepcopy(default_parameters)
    parameters['loss_function'] = loss_function
    parameters['train_dir'] = loss_function
    
    if additional_params is not None:
        parameters.update(additional_params)
        
    model = CatBoostRanker(**parameters)
    model.fit(train_pool, eval_set=test_pool)
    
    return model

In [122]:
n = 160000

X_tr = X_train[:n]
y_tr = y_train[:n]
queries_train = qs[:n]

X_test = X_train[n:]
y_test = y_train[n:]
queries_test =  qs[n:]

In [123]:
train_model = Pool(
    data=X_tr,
    label=y_tr,
    group_id=queries_train
)

test_model = Pool(
    data=X_test,
    label=y_test,
    group_id=queries_test
)

model = fit_model('QuerySoftMax',
                  additional_params={'custom_metric': 'AverageGain:top=1'},
                  train_pool=train_model,
                  test_pool=test_model)

0:	learn: 0.6383217	test: 0.6368300	best: 0.6368300 (0)	total: 26.5ms	remaining: 52.9s
1:	learn: 0.5920323	test: 0.5864245	best: 0.5864245 (1)	total: 51.3ms	remaining: 51.2s
2:	learn: 0.5474971	test: 0.5391612	best: 0.5391612 (2)	total: 72.6ms	remaining: 48.3s
3:	learn: 0.5081949	test: 0.4992294	best: 0.4992294 (3)	total: 93.2ms	remaining: 46.5s
4:	learn: 0.4741333	test: 0.4639315	best: 0.4639315 (4)	total: 115ms	remaining: 45.7s
5:	learn: 0.4417270	test: 0.4308108	best: 0.4308108 (5)	total: 137ms	remaining: 45.4s
6:	learn: 0.4141101	test: 0.4029521	best: 0.4029521 (6)	total: 155ms	remaining: 44.3s
7:	learn: 0.3902969	test: 0.3773377	best: 0.3773377 (7)	total: 178ms	remaining: 44.2s
8:	learn: 0.3664974	test: 0.3541934	best: 0.3541934 (8)	total: 200ms	remaining: 44.2s
9:	learn: 0.3441048	test: 0.3326310	best: 0.3326310 (9)	total: 228ms	remaining: 45.5s
10:	learn: 0.3256649	test: 0.3140360	best: 0.3140360 (10)	total: 253ms	remaining: 45.7s
11:	learn: 0.3095904	test: 0.2976709	best: 0.297

98:	learn: 0.1042108	test: 0.1154767	best: 0.1154767 (98)	total: 2.26s	remaining: 43.5s
99:	learn: 0.1039336	test: 0.1152245	best: 0.1152245 (99)	total: 2.29s	remaining: 43.6s
100:	learn: 0.1036155	test: 0.1150351	best: 0.1150351 (100)	total: 2.31s	remaining: 43.5s
101:	learn: 0.1033084	test: 0.1146458	best: 0.1146458 (101)	total: 2.34s	remaining: 43.5s
102:	learn: 0.1030562	test: 0.1146083	best: 0.1146083 (102)	total: 2.36s	remaining: 43.4s
103:	learn: 0.1026900	test: 0.1144408	best: 0.1144408 (103)	total: 2.38s	remaining: 43.4s
104:	learn: 0.1025518	test: 0.1143971	best: 0.1143971 (104)	total: 2.4s	remaining: 43.2s
105:	learn: 0.1022915	test: 0.1144184	best: 0.1143971 (104)	total: 2.41s	remaining: 43.1s
106:	learn: 0.1018816	test: 0.1140789	best: 0.1140789 (106)	total: 2.43s	remaining: 43s
107:	learn: 0.1017553	test: 0.1140404	best: 0.1140404 (107)	total: 2.44s	remaining: 42.9s
108:	learn: 0.1015061	test: 0.1139854	best: 0.1139854 (108)	total: 2.47s	remaining: 42.9s
109:	learn: 0.101

194:	learn: 0.0905971	test: 0.1048536	best: 0.1048536 (194)	total: 4.49s	remaining: 41.6s
195:	learn: 0.0905455	test: 0.1047350	best: 0.1047350 (195)	total: 4.52s	remaining: 41.6s
196:	learn: 0.0904275	test: 0.1047058	best: 0.1047058 (196)	total: 4.53s	remaining: 41.5s
197:	learn: 0.0903323	test: 0.1046313	best: 0.1046313 (197)	total: 4.56s	remaining: 41.5s
198:	learn: 0.0902777	test: 0.1045704	best: 0.1045704 (198)	total: 4.58s	remaining: 41.4s
199:	learn: 0.0901820	test: 0.1044974	best: 0.1044974 (199)	total: 4.6s	remaining: 41.4s
200:	learn: 0.0900725	test: 0.1041351	best: 0.1041351 (200)	total: 4.62s	remaining: 41.4s
201:	learn: 0.0898988	test: 0.1041424	best: 0.1041351 (200)	total: 4.65s	remaining: 41.4s
202:	learn: 0.0898604	test: 0.1040469	best: 0.1040469 (202)	total: 4.66s	remaining: 41.3s
203:	learn: 0.0898257	test: 0.1040431	best: 0.1040431 (203)	total: 4.68s	remaining: 41.2s
204:	learn: 0.0896871	test: 0.1039970	best: 0.1039970 (204)	total: 4.71s	remaining: 41.2s
205:	learn:

290:	learn: 0.0843985	test: 0.0996818	best: 0.0996818 (290)	total: 6.71s	remaining: 39.4s
291:	learn: 0.0843306	test: 0.0996812	best: 0.0996812 (291)	total: 6.74s	remaining: 39.4s
292:	learn: 0.0842591	test: 0.0994192	best: 0.0994192 (292)	total: 6.77s	remaining: 39.4s
293:	learn: 0.0841901	test: 0.0994146	best: 0.0994146 (293)	total: 6.79s	remaining: 39.4s
294:	learn: 0.0841124	test: 0.0993666	best: 0.0993666 (294)	total: 6.82s	remaining: 39.4s
295:	learn: 0.0840436	test: 0.0993770	best: 0.0993666 (294)	total: 6.84s	remaining: 39.4s
296:	learn: 0.0839681	test: 0.0993070	best: 0.0993070 (296)	total: 6.86s	remaining: 39.3s
297:	learn: 0.0838784	test: 0.0993714	best: 0.0993070 (296)	total: 6.88s	remaining: 39.3s
298:	learn: 0.0838119	test: 0.0994917	best: 0.0993070 (296)	total: 6.9s	remaining: 39.3s
299:	learn: 0.0837528	test: 0.0992681	best: 0.0992681 (299)	total: 6.93s	remaining: 39.3s
300:	learn: 0.0836942	test: 0.0992542	best: 0.0992542 (300)	total: 6.94s	remaining: 39.2s
301:	learn:

387:	learn: 0.0792335	test: 0.0959994	best: 0.0958297 (357)	total: 9.11s	remaining: 37.8s
388:	learn: 0.0791887	test: 0.0960526	best: 0.0958297 (357)	total: 9.13s	remaining: 37.8s
389:	learn: 0.0791446	test: 0.0961059	best: 0.0958297 (357)	total: 9.16s	remaining: 37.8s
390:	learn: 0.0791253	test: 0.0961023	best: 0.0958297 (357)	total: 9.18s	remaining: 37.8s
391:	learn: 0.0790907	test: 0.0961678	best: 0.0958297 (357)	total: 9.2s	remaining: 37.7s
392:	learn: 0.0790560	test: 0.0960975	best: 0.0958297 (357)	total: 9.22s	remaining: 37.7s
393:	learn: 0.0790238	test: 0.0960877	best: 0.0958297 (357)	total: 9.24s	remaining: 37.7s
394:	learn: 0.0789902	test: 0.0960769	best: 0.0958297 (357)	total: 9.27s	remaining: 37.7s
395:	learn: 0.0789560	test: 0.0960770	best: 0.0958297 (357)	total: 9.3s	remaining: 37.7s
396:	learn: 0.0789153	test: 0.0960269	best: 0.0958297 (357)	total: 9.32s	remaining: 37.6s
397:	learn: 0.0788822	test: 0.0961259	best: 0.0958297 (357)	total: 9.35s	remaining: 37.6s
398:	learn: 

481:	learn: 0.0767231	test: 0.0964044	best: 0.0958297 (357)	total: 11.5s	remaining: 36.2s
482:	learn: 0.0767105	test: 0.0963878	best: 0.0958297 (357)	total: 11.5s	remaining: 36.2s
483:	learn: 0.0766987	test: 0.0963865	best: 0.0958297 (357)	total: 11.6s	remaining: 36.3s
484:	learn: 0.0766762	test: 0.0963797	best: 0.0958297 (357)	total: 11.6s	remaining: 36.3s
485:	learn: 0.0766577	test: 0.0964197	best: 0.0958297 (357)	total: 11.6s	remaining: 36.3s
486:	learn: 0.0766376	test: 0.0964627	best: 0.0958297 (357)	total: 11.7s	remaining: 36.3s
487:	learn: 0.0766140	test: 0.0964350	best: 0.0958297 (357)	total: 11.7s	remaining: 36.2s
488:	learn: 0.0765944	test: 0.0965151	best: 0.0958297 (357)	total: 11.7s	remaining: 36.3s
489:	learn: 0.0765735	test: 0.0965082	best: 0.0958297 (357)	total: 11.8s	remaining: 36.2s
490:	learn: 0.0765604	test: 0.0965484	best: 0.0958297 (357)	total: 11.8s	remaining: 36.2s
491:	learn: 0.0765398	test: 0.0965200	best: 0.0958297 (357)	total: 11.8s	remaining: 36.2s
492:	learn

580:	learn: 0.0752671	test: 0.0976089	best: 0.0958297 (357)	total: 14.1s	remaining: 34.4s
581:	learn: 0.0752545	test: 0.0975831	best: 0.0958297 (357)	total: 14.1s	remaining: 34.4s
582:	learn: 0.0752396	test: 0.0976388	best: 0.0958297 (357)	total: 14.1s	remaining: 34.4s
583:	learn: 0.0752271	test: 0.0976141	best: 0.0958297 (357)	total: 14.2s	remaining: 34.4s
584:	learn: 0.0752191	test: 0.0976452	best: 0.0958297 (357)	total: 14.2s	remaining: 34.3s
585:	learn: 0.0752063	test: 0.0976237	best: 0.0958297 (357)	total: 14.2s	remaining: 34.3s
586:	learn: 0.0751981	test: 0.0976320	best: 0.0958297 (357)	total: 14.2s	remaining: 34.3s
587:	learn: 0.0751859	test: 0.0976084	best: 0.0958297 (357)	total: 14.3s	remaining: 34.2s
588:	learn: 0.0751715	test: 0.0976641	best: 0.0958297 (357)	total: 14.3s	remaining: 34.3s
589:	learn: 0.0751595	test: 0.0976414	best: 0.0958297 (357)	total: 14.3s	remaining: 34.2s
590:	learn: 0.0751521	test: 0.0976674	best: 0.0958297 (357)	total: 14.3s	remaining: 34.2s
591:	learn

673:	learn: 0.0743443	test: 0.0980092	best: 0.0958297 (357)	total: 16.3s	remaining: 32.1s
674:	learn: 0.0743377	test: 0.0980102	best: 0.0958297 (357)	total: 16.3s	remaining: 32.1s
675:	learn: 0.0743313	test: 0.0980501	best: 0.0958297 (357)	total: 16.4s	remaining: 32.1s
676:	learn: 0.0743231	test: 0.0980366	best: 0.0958297 (357)	total: 16.4s	remaining: 32s
677:	learn: 0.0743157	test: 0.0980237	best: 0.0958297 (357)	total: 16.4s	remaining: 32s
678:	learn: 0.0743072	test: 0.0980258	best: 0.0958297 (357)	total: 16.4s	remaining: 32s
679:	learn: 0.0742999	test: 0.0980130	best: 0.0958297 (357)	total: 16.5s	remaining: 32s
680:	learn: 0.0742902	test: 0.0980081	best: 0.0958297 (357)	total: 16.5s	remaining: 32s
681:	learn: 0.0742831	test: 0.0979956	best: 0.0958297 (357)	total: 16.5s	remaining: 32s
682:	learn: 0.0742761	test: 0.0979834	best: 0.0958297 (357)	total: 16.6s	remaining: 32s
683:	learn: 0.0742705	test: 0.0980097	best: 0.0958297 (357)	total: 16.6s	remaining: 31.9s
684:	learn: 0.0742650	te

769:	learn: 0.0736749	test: 0.0988885	best: 0.0958297 (357)	total: 18.9s	remaining: 30.2s
770:	learn: 0.0736700	test: 0.0988683	best: 0.0958297 (357)	total: 19s	remaining: 30.2s
771:	learn: 0.0736655	test: 0.0989053	best: 0.0958297 (357)	total: 19s	remaining: 30.2s
772:	learn: 0.0736583	test: 0.0989024	best: 0.0958297 (357)	total: 19s	remaining: 30.2s
773:	learn: 0.0736534	test: 0.0988930	best: 0.0958297 (357)	total: 19.1s	remaining: 30.2s
774:	learn: 0.0736480	test: 0.0988926	best: 0.0958297 (357)	total: 19.1s	remaining: 30.2s
775:	learn: 0.0736427	test: 0.0988923	best: 0.0958297 (357)	total: 19.1s	remaining: 30.1s
776:	learn: 0.0736379	test: 0.0988759	best: 0.0958297 (357)	total: 19.1s	remaining: 30.1s
777:	learn: 0.0736331	test: 0.0988714	best: 0.0958297 (357)	total: 19.2s	remaining: 30.1s
778:	learn: 0.0736269	test: 0.0988670	best: 0.0958297 (357)	total: 19.2s	remaining: 30.1s
779:	learn: 0.0736219	test: 0.0988681	best: 0.0958297 (357)	total: 19.2s	remaining: 30s
780:	learn: 0.0736

861:	learn: 0.0732101	test: 0.0994360	best: 0.0958297 (357)	total: 21.6s	remaining: 28.5s
862:	learn: 0.0732060	test: 0.0994584	best: 0.0958297 (357)	total: 21.7s	remaining: 28.5s
863:	learn: 0.0732026	test: 0.0994805	best: 0.0958297 (357)	total: 21.7s	remaining: 28.5s
864:	learn: 0.0731986	test: 0.0995029	best: 0.0958297 (357)	total: 21.7s	remaining: 28.5s
865:	learn: 0.0731947	test: 0.0995252	best: 0.0958297 (357)	total: 21.7s	remaining: 28.5s
866:	learn: 0.0731898	test: 0.0995343	best: 0.0958297 (357)	total: 21.8s	remaining: 28.4s
867:	learn: 0.0731861	test: 0.0995534	best: 0.0958297 (357)	total: 21.8s	remaining: 28.4s
868:	learn: 0.0731820	test: 0.0995559	best: 0.0958297 (357)	total: 21.8s	remaining: 28.4s
869:	learn: 0.0731774	test: 0.0995552	best: 0.0958297 (357)	total: 21.8s	remaining: 28.4s
870:	learn: 0.0731734	test: 0.0995609	best: 0.0958297 (357)	total: 21.9s	remaining: 28.4s
871:	learn: 0.0731686	test: 0.0995601	best: 0.0958297 (357)	total: 21.9s	remaining: 28.3s
872:	learn

953:	learn: 0.0728665	test: 0.1002130	best: 0.0958297 (357)	total: 24.1s	remaining: 26.4s
954:	learn: 0.0728635	test: 0.1002146	best: 0.0958297 (357)	total: 24.1s	remaining: 26.4s
955:	learn: 0.0728602	test: 0.1002222	best: 0.0958297 (357)	total: 24.2s	remaining: 26.4s
956:	learn: 0.0728565	test: 0.1002210	best: 0.0958297 (357)	total: 24.2s	remaining: 26.4s
957:	learn: 0.0728539	test: 0.1002499	best: 0.0958297 (357)	total: 24.2s	remaining: 26.3s
958:	learn: 0.0728500	test: 0.1002491	best: 0.0958297 (357)	total: 24.2s	remaining: 26.3s
959:	learn: 0.0728470	test: 0.1002535	best: 0.0958297 (357)	total: 24.3s	remaining: 26.3s
960:	learn: 0.0728439	test: 0.1002384	best: 0.0958297 (357)	total: 24.3s	remaining: 26.3s
961:	learn: 0.0728402	test: 0.1002617	best: 0.0958297 (357)	total: 24.3s	remaining: 26.2s
962:	learn: 0.0728369	test: 0.1002533	best: 0.0958297 (357)	total: 24.3s	remaining: 26.2s
963:	learn: 0.0728338	test: 0.1002581	best: 0.0958297 (357)	total: 24.4s	remaining: 26.2s
964:	learn

1045:	learn: 0.0725970	test: 0.1007677	best: 0.0958297 (357)	total: 26.8s	remaining: 24.4s
1046:	learn: 0.0725939	test: 0.1007664	best: 0.0958297 (357)	total: 26.8s	remaining: 24.4s
1047:	learn: 0.0725912	test: 0.1007880	best: 0.0958297 (357)	total: 26.9s	remaining: 24.4s
1048:	learn: 0.0725883	test: 0.1007831	best: 0.0958297 (357)	total: 26.9s	remaining: 24.4s
1049:	learn: 0.0725857	test: 0.1007857	best: 0.0958297 (357)	total: 27s	remaining: 24.4s
1050:	learn: 0.0725829	test: 0.1007808	best: 0.0958297 (357)	total: 27s	remaining: 24.4s
1051:	learn: 0.0725802	test: 0.1007807	best: 0.0958297 (357)	total: 27s	remaining: 24.4s
1052:	learn: 0.0725780	test: 0.1007982	best: 0.0958297 (357)	total: 27.1s	remaining: 24.3s
1053:	learn: 0.0725753	test: 0.1007982	best: 0.0958297 (357)	total: 27.1s	remaining: 24.3s
1054:	learn: 0.0725729	test: 0.1007958	best: 0.0958297 (357)	total: 27.1s	remaining: 24.3s
1055:	learn: 0.0725700	test: 0.1008173	best: 0.0958297 (357)	total: 27.2s	remaining: 24.3s
1056:

1141:	learn: 0.0723695	test: 0.1013559	best: 0.0958297 (357)	total: 29.9s	remaining: 22.5s
1142:	learn: 0.0723673	test: 0.1013598	best: 0.0958297 (357)	total: 29.9s	remaining: 22.5s
1143:	learn: 0.0723648	test: 0.1013584	best: 0.0958297 (357)	total: 30s	remaining: 22.4s
1144:	learn: 0.0723634	test: 0.1013626	best: 0.0958297 (357)	total: 30s	remaining: 22.4s
1145:	learn: 0.0723619	test: 0.1013598	best: 0.0958297 (357)	total: 30s	remaining: 22.4s
1146:	learn: 0.0723600	test: 0.1013692	best: 0.0958297 (357)	total: 30.1s	remaining: 22.4s
1147:	learn: 0.0723575	test: 0.1013681	best: 0.0958297 (357)	total: 30.1s	remaining: 22.4s
1148:	learn: 0.0723553	test: 0.1013687	best: 0.0958297 (357)	total: 30.2s	remaining: 22.3s
1149:	learn: 0.0723536	test: 0.1013563	best: 0.0958297 (357)	total: 30.2s	remaining: 22.3s
1150:	learn: 0.0723520	test: 0.1013736	best: 0.0958297 (357)	total: 30.2s	remaining: 22.3s
1151:	learn: 0.0723504	test: 0.1013913	best: 0.0958297 (357)	total: 30.2s	remaining: 22.3s
1152:

1234:	learn: 0.0721919	test: 0.1020066	best: 0.0958297 (357)	total: 32.9s	remaining: 20.4s
1235:	learn: 0.0721903	test: 0.1020108	best: 0.0958297 (357)	total: 32.9s	remaining: 20.3s
1236:	learn: 0.0721882	test: 0.1020086	best: 0.0958297 (357)	total: 32.9s	remaining: 20.3s
1237:	learn: 0.0721868	test: 0.1020226	best: 0.0958297 (357)	total: 33s	remaining: 20.3s
1238:	learn: 0.0721852	test: 0.1020293	best: 0.0958297 (357)	total: 33s	remaining: 20.3s
1239:	learn: 0.0721837	test: 0.1020599	best: 0.0958297 (357)	total: 33s	remaining: 20.2s
1240:	learn: 0.0721822	test: 0.1020489	best: 0.0958297 (357)	total: 33.1s	remaining: 20.2s
1241:	learn: 0.0721807	test: 0.1020668	best: 0.0958297 (357)	total: 33.1s	remaining: 20.2s
1242:	learn: 0.0721793	test: 0.1020846	best: 0.0958297 (357)	total: 33.1s	remaining: 20.2s
1243:	learn: 0.0721777	test: 0.1021019	best: 0.0958297 (357)	total: 33.2s	remaining: 20.2s
1244:	learn: 0.0721766	test: 0.1021079	best: 0.0958297 (357)	total: 33.2s	remaining: 20.1s
1245:

1326:	learn: 0.0720452	test: 0.1026447	best: 0.0958297 (357)	total: 35.9s	remaining: 18.2s
1327:	learn: 0.0720440	test: 0.1026357	best: 0.0958297 (357)	total: 36s	remaining: 18.2s
1328:	learn: 0.0720429	test: 0.1026434	best: 0.0958297 (357)	total: 36s	remaining: 18.2s
1329:	learn: 0.0720417	test: 0.1026479	best: 0.0958297 (357)	total: 36s	remaining: 18.1s
1330:	learn: 0.0720399	test: 0.1026546	best: 0.0958297 (357)	total: 36.1s	remaining: 18.1s
1331:	learn: 0.0720383	test: 0.1026726	best: 0.0958297 (357)	total: 36.1s	remaining: 18.1s
1332:	learn: 0.0720367	test: 0.1026721	best: 0.0958297 (357)	total: 36.1s	remaining: 18.1s
1333:	learn: 0.0720357	test: 0.1026761	best: 0.0958297 (357)	total: 36.2s	remaining: 18.1s
1334:	learn: 0.0720342	test: 0.1026753	best: 0.0958297 (357)	total: 36.2s	remaining: 18s
1335:	learn: 0.0720329	test: 0.1026917	best: 0.0958297 (357)	total: 36.2s	remaining: 18s
1336:	learn: 0.0720308	test: 0.1026867	best: 0.0958297 (357)	total: 36.3s	remaining: 18s
1337:	learn

1418:	learn: 0.0719226	test: 0.1031169	best: 0.0958297 (357)	total: 38.7s	remaining: 15.9s
1419:	learn: 0.0719213	test: 0.1031230	best: 0.0958297 (357)	total: 38.8s	remaining: 15.8s
1420:	learn: 0.0719200	test: 0.1031225	best: 0.0958297 (357)	total: 38.8s	remaining: 15.8s
1421:	learn: 0.0719183	test: 0.1031195	best: 0.0958297 (357)	total: 38.8s	remaining: 15.8s
1422:	learn: 0.0719170	test: 0.1031202	best: 0.0958297 (357)	total: 38.9s	remaining: 15.8s
1423:	learn: 0.0719158	test: 0.1031219	best: 0.0958297 (357)	total: 38.9s	remaining: 15.7s
1424:	learn: 0.0719149	test: 0.1031248	best: 0.0958297 (357)	total: 38.9s	remaining: 15.7s
1425:	learn: 0.0719135	test: 0.1031194	best: 0.0958297 (357)	total: 39s	remaining: 15.7s
1426:	learn: 0.0719122	test: 0.1031189	best: 0.0958297 (357)	total: 39s	remaining: 15.7s
1427:	learn: 0.0719109	test: 0.1031260	best: 0.0958297 (357)	total: 39s	remaining: 15.6s
1428:	learn: 0.0719098	test: 0.1031303	best: 0.0958297 (357)	total: 39.1s	remaining: 15.6s
1429:

1512:	learn: 0.0718145	test: 0.1036037	best: 0.0958297 (357)	total: 41.6s	remaining: 13.4s
1513:	learn: 0.0718136	test: 0.1036164	best: 0.0958297 (357)	total: 41.6s	remaining: 13.3s
1514:	learn: 0.0718126	test: 0.1036209	best: 0.0958297 (357)	total: 41.6s	remaining: 13.3s
1515:	learn: 0.0718115	test: 0.1036209	best: 0.0958297 (357)	total: 41.6s	remaining: 13.3s
1516:	learn: 0.0718104	test: 0.1036218	best: 0.0958297 (357)	total: 41.7s	remaining: 13.3s
1517:	learn: 0.0718088	test: 0.1036177	best: 0.0958297 (357)	total: 41.7s	remaining: 13.2s
1518:	learn: 0.0718079	test: 0.1036278	best: 0.0958297 (357)	total: 41.7s	remaining: 13.2s
1519:	learn: 0.0718068	test: 0.1036288	best: 0.0958297 (357)	total: 41.8s	remaining: 13.2s
1520:	learn: 0.0718060	test: 0.1036370	best: 0.0958297 (357)	total: 41.8s	remaining: 13.2s
1521:	learn: 0.0718047	test: 0.1036353	best: 0.0958297 (357)	total: 41.8s	remaining: 13.1s
1522:	learn: 0.0718037	test: 0.1036558	best: 0.0958297 (357)	total: 41.9s	remaining: 13.1s

1607:	learn: 0.0717199	test: 0.1040438	best: 0.0958297 (357)	total: 44.8s	remaining: 10.9s
1608:	learn: 0.0717189	test: 0.1040591	best: 0.0958297 (357)	total: 44.9s	remaining: 10.9s
1609:	learn: 0.0717177	test: 0.1040612	best: 0.0958297 (357)	total: 44.9s	remaining: 10.9s
1610:	learn: 0.0717168	test: 0.1040613	best: 0.0958297 (357)	total: 44.9s	remaining: 10.9s
1611:	learn: 0.0717159	test: 0.1040760	best: 0.0958297 (357)	total: 45s	remaining: 10.8s
1612:	learn: 0.0717151	test: 0.1040805	best: 0.0958297 (357)	total: 45s	remaining: 10.8s
1613:	learn: 0.0717143	test: 0.1040734	best: 0.0958297 (357)	total: 45s	remaining: 10.8s
1614:	learn: 0.0717133	test: 0.1040887	best: 0.0958297 (357)	total: 45.1s	remaining: 10.7s
1615:	learn: 0.0717124	test: 0.1040840	best: 0.0958297 (357)	total: 45.1s	remaining: 10.7s
1616:	learn: 0.0717116	test: 0.1040993	best: 0.0958297 (357)	total: 45.1s	remaining: 10.7s
1617:	learn: 0.0717108	test: 0.1041146	best: 0.0958297 (357)	total: 45.2s	remaining: 10.7s
1618:

1699:	learn: 0.0716400	test: 0.1044360	best: 0.0958297 (357)	total: 47.8s	remaining: 8.44s
1700:	learn: 0.0716391	test: 0.1044336	best: 0.0958297 (357)	total: 47.9s	remaining: 8.41s
1701:	learn: 0.0716383	test: 0.1044331	best: 0.0958297 (357)	total: 47.9s	remaining: 8.38s
1702:	learn: 0.0716375	test: 0.1044342	best: 0.0958297 (357)	total: 47.9s	remaining: 8.36s
1703:	learn: 0.0716367	test: 0.1044480	best: 0.0958297 (357)	total: 47.9s	remaining: 8.33s
1704:	learn: 0.0716359	test: 0.1044610	best: 0.0958297 (357)	total: 48s	remaining: 8.3s
1705:	learn: 0.0716350	test: 0.1044598	best: 0.0958297 (357)	total: 48s	remaining: 8.27s
1706:	learn: 0.0716344	test: 0.1044581	best: 0.0958297 (357)	total: 48s	remaining: 8.25s
1707:	learn: 0.0716338	test: 0.1044618	best: 0.0958297 (357)	total: 48.1s	remaining: 8.22s
1708:	learn: 0.0716329	test: 0.1044584	best: 0.0958297 (357)	total: 48.1s	remaining: 8.19s
1709:	learn: 0.0716322	test: 0.1044729	best: 0.0958297 (357)	total: 48.1s	remaining: 8.16s
1710:	

1790:	learn: 0.0715735	test: 0.1048180	best: 0.0958297 (357)	total: 50.5s	remaining: 5.89s
1791:	learn: 0.0715728	test: 0.1048418	best: 0.0958297 (357)	total: 50.5s	remaining: 5.86s
1792:	learn: 0.0715722	test: 0.1048431	best: 0.0958297 (357)	total: 50.5s	remaining: 5.83s
1793:	learn: 0.0715715	test: 0.1048437	best: 0.0958297 (357)	total: 50.6s	remaining: 5.8s
1794:	learn: 0.0715709	test: 0.1048505	best: 0.0958297 (357)	total: 50.6s	remaining: 5.78s
1795:	learn: 0.0715701	test: 0.1048503	best: 0.0958297 (357)	total: 50.6s	remaining: 5.75s
1796:	learn: 0.0715691	test: 0.1048472	best: 0.0958297 (357)	total: 50.6s	remaining: 5.72s
1797:	learn: 0.0715685	test: 0.1048605	best: 0.0958297 (357)	total: 50.7s	remaining: 5.69s
1798:	learn: 0.0715678	test: 0.1048644	best: 0.0958297 (357)	total: 50.7s	remaining: 5.67s
1799:	learn: 0.0715671	test: 0.1048646	best: 0.0958297 (357)	total: 50.7s	remaining: 5.64s
1800:	learn: 0.0715663	test: 0.1048640	best: 0.0958297 (357)	total: 50.8s	remaining: 5.61s


1882:	learn: 0.0715136	test: 0.1051280	best: 0.0958297 (357)	total: 53.4s	remaining: 3.31s
1883:	learn: 0.0715129	test: 0.1051278	best: 0.0958297 (357)	total: 53.4s	remaining: 3.29s
1884:	learn: 0.0715125	test: 0.1051312	best: 0.0958297 (357)	total: 53.4s	remaining: 3.26s
1885:	learn: 0.0715119	test: 0.1051297	best: 0.0958297 (357)	total: 53.5s	remaining: 3.23s
1886:	learn: 0.0715112	test: 0.1051426	best: 0.0958297 (357)	total: 53.5s	remaining: 3.2s
1887:	learn: 0.0715106	test: 0.1051434	best: 0.0958297 (357)	total: 53.5s	remaining: 3.17s
1888:	learn: 0.0715100	test: 0.1051468	best: 0.0958297 (357)	total: 53.6s	remaining: 3.15s
1889:	learn: 0.0715095	test: 0.1051597	best: 0.0958297 (357)	total: 53.6s	remaining: 3.12s
1890:	learn: 0.0715088	test: 0.1051607	best: 0.0958297 (357)	total: 53.6s	remaining: 3.09s
1891:	learn: 0.0715083	test: 0.1051642	best: 0.0958297 (357)	total: 53.7s	remaining: 3.06s
1892:	learn: 0.0715078	test: 0.1051677	best: 0.0958297 (357)	total: 53.7s	remaining: 3.03s


1976:	learn: 0.0714586	test: 0.1053990	best: 0.0958297 (357)	total: 56.2s	remaining: 654ms
1977:	learn: 0.0714582	test: 0.1054140	best: 0.0958297 (357)	total: 56.2s	remaining: 626ms
1978:	learn: 0.0714576	test: 0.1054306	best: 0.0958297 (357)	total: 56.3s	remaining: 597ms
1979:	learn: 0.0714570	test: 0.1054312	best: 0.0958297 (357)	total: 56.3s	remaining: 569ms
1980:	learn: 0.0714565	test: 0.1054385	best: 0.0958297 (357)	total: 56.3s	remaining: 540ms
1981:	learn: 0.0714559	test: 0.1054508	best: 0.0958297 (357)	total: 56.4s	remaining: 512ms
1982:	learn: 0.0714552	test: 0.1054508	best: 0.0958297 (357)	total: 56.4s	remaining: 483ms
1983:	learn: 0.0714548	test: 0.1054645	best: 0.0958297 (357)	total: 56.4s	remaining: 455ms
1984:	learn: 0.0714541	test: 0.1054646	best: 0.0958297 (357)	total: 56.5s	remaining: 427ms
1985:	learn: 0.0714536	test: 0.1054629	best: 0.0958297 (357)	total: 56.5s	remaining: 398ms
1986:	learn: 0.0714530	test: 0.1054630	best: 0.0958297 (357)	total: 56.5s	remaining: 370ms

# Spell Checker

In [130]:
class SpellChecker:
    def __init__(self, ranker, max_suggestions=10):
        self.max_suggestions = max_suggestions
        self.checker = Dictionary.from_files('en_US')
        self.ranker = ranker

    def check(self, word):
        return self.checker.lookup(word)

    def get_features(self, pred, target):
        return np.array([f(pred, target) for f in [jaro_winkler, mlipns, levenshtein, gotoh]])

    def suggest(self, word):
        suggestions = np.array(list((self.checker.suggest(word))))
        features = np.array([self.get_features(suggestion, word) for suggestion in suggestions])
        idx = np.argsort(-self.ranker.predict(features))
        return suggestions[idx]

In [131]:
sc = SpellChecker(model)

In [132]:
print(sc.suggest('botle'))

['bottle' 'bole' 'bootleg' 'bootless' 'bootlace' 'Boole' 'boodle' 'tootle']
