In [20]:
import re
import nltk
import dill as pickle
import numpy as np
from icecream import ic
from nltk.corpus import words
from nltk.metrics.distance import edit_distance
from nltk.lm.preprocessing import pad_both_ends
from tqdm import tqdm

In [21]:
with open('text.txt', 'r') as file:
    text = file.read()

text = text.lower().strip()
text = re.sub(r'[^\w\s]', ' ', text)

tokenize = nltk.tokenize.word_tokenize
stem = nltk.stem.porter.PorterStemmer().stem
lemmatize = nltk.stem.WordNetLemmatizer().lemmatize

In [22]:
with open('word-lists/words.txt', 'r') as file:
    words_list = file.read().split()
with open('word-lists/english3.txt', 'r') as file:
    english_list = file.read().split()

correct_words = words.words()
in_corr_words = {}

for word in tqdm(correct_words):
    in_corr_words[word] = 1

print('Adding words_list to correct words...')
for word in tqdm(words_list):
    try:
        in_corr_words[word]
    except KeyError:
        in_corr_words[word] = 1
        correct_words.append(word)

print('Adding english_list to correct words...')
for word in tqdm(english_list):
    try:
        in_corr_words[word]
    except KeyError:
        in_corr_words[word] = 1
        correct_words.append(word)

print('Tokenizing correct words...')
correct_tokens = [lemmatize(stem(word)) for word in tqdm(correct_words)]
tokens_dict = {}
print('Creating tokens dictionary...')
for t in tqdm(correct_tokens):
    tokens_dict[t] = 1


100%|██████████| 236736/236736 [00:00<00:00, 1849878.72it/s]


Adding words_list to correct words...


100%|██████████| 69903/69903 [00:00<00:00, 1426799.39it/s]


Adding english_list to correct words...


100%|██████████| 194432/194432 [00:00<00:00, 1157272.68it/s]


Tokenizing correct words...


100%|██████████| 355932/355932 [00:08<00:00, 44440.55it/s]


Creating tokens dictionary...


100%|██████████| 355932/355932 [00:00<00:00, 1858116.54it/s]


In [23]:
original_tokens = list(pad_both_ends(tokenize(text), 3))
tokens = [lemmatize(stem(token)) for token in original_tokens]

possible_words = {}
tokens = list(pad_both_ends(tokens, 1))
print(tokens[:3], tokens[-3:])

['<s>', '<s>', '1'] ['pigment', '</s>', '</s>']


In [24]:
with open('model.pk', 'rb') as fin:
    model = pickle.load(fin)

In [25]:
for token_index, token in tqdm(enumerate(tokens)):
    if '<s>' == token or token == '</s>':
        continue
    try:
        int(token)
        continue
    except ValueError:
        try:
            tokens_dict[token]
            continue
        except KeyError:
            pass
        t = []
        error = 0
        min = 0
        word = original_tokens[token_index]
        for i, w in enumerate(correct_tokens):
            error = edit_distance(token, w)
            t.append(error)
            if error < t[min]:
                ic(error, token, w)
                min = i
                possible_words[word] = [correct_words[i]]
            elif error == t[min]:
                try:
                    possible_words[word].append(correct_words[i])
                except KeyError:
                    possible_words[word] = [correct_words[i]]

        prev_words = original_tokens[token_index - 2:token_index]
        for possible_word in possible_words[word]:
            ic(prev_words, possible_word)
            ic(model.score(possible_word, prev_words))
            ic(model.score(possible_word, prev_words[-1]))
            ic(model.score(possible_word))

        ic(possible_words[word])
        guess = possible_words[word][np.argmax([model.score(possible_word) for possible_word in possible_words[word]])]
        print('Guess:', *prev_words, guess)

    print('Number of misspelled words:', len(possible_words.keys()))
    print('Spelling Accuracy:', '%.4f' % (1 - len(possible_words.keys()) / token_index))


0it [00:00, ?it/s]ic| error: 8, token: 'extreemli', w: 'aal'
ic| error: 7, token: 'extreemli', w: 'aardwolf'
ic| error: 6, token: 'extreemli', w: 'abasedli'
ic| error: 5, token: 'extreemli', w: 'alarmedli'
ic| error: 4, token: 'extreemli', w: 'beseemli'
ic| error: 3, token: 'extreemli', w: 'expressli'
ic| error: 2, token: 'extreemli', w: 'extremi'
ic| prev_words: ['and', 'talks'], possible_word: 'extremis'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0
ic| possible_words[word]: ['extremis']
18it [00:20,  1.14s/it]ic| error: 4, token: 'nergi', w: 'aalii'
ic| error: 3, token: 'nergi', w: 'adagi'
ic| error: 2,

Guess: and talks extremis
Number of misspelled words: 1
Spelling Accuracy: 0.9412


 token: 'nergi', w: 'aegi'
ic| error: 1, token: 'nergi', w: 'anergi'
ic| prev_words: ['3', 'e'], possible_word: 'anergy'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0
ic| prev_words: ['3', 'e'], possible_word: 'bergy'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0
ic| prev_words: ['3', 'e'], possible_word: 'energy'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 1.783687464660692e-05
ic| prev_words: ['3', 'e'], possible_word: 'Nereis'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0
ic| prev_words: ['3', 'e'], possible_word: 'Neri'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 

Guess: 3 e energy
Number of misspelled words: 2
Spelling Accuracy: 0.9806


ic| error: 2, token: 'howi', w: 'blowi'
ic| error: 1, token: 'howi', w: 'bowi'
ic| prev_words: ['our', 'topic'], possible_word: 'bowie'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0
ic| prev_words: ['our', 'topic'], possible_word: 'bowieful'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0
ic| prev_words: ['our', 'topic'], possible_word: 'cowy'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0
ic| prev_words: ['our', 'topic'], possible_word: 'dowie'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0
ic| prev_words: ['our', 'topic'], possible_word: 'dowiness'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| mod

Guess: our topic how
Number of misspelled words: 3
Spelling Accuracy: 0.9737


ic| error: 1, token: 'dj', w: 'd'
ic| prev_words: ['on', 'this'], possible_word: 'D'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0
ic| prev_words: ['on', 'this'], possible_word: 'd'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0017468245903910378
ic| prev_words: ['on', 'this'], possible_word: 'da'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 1.0107562299743922e-05
ic| prev_words: ['on', 'this'], possible_word: 'das'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 5.945624882202307e-07
ic| prev_words: ['on', 'this'], possible_word: 'de'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_wor

Guess: on this do
Number of misspelled words: 4
Spelling Accuracy: 0.9688


ic| error: 1, token: 'thier', w: 'shier'
ic| prev_words: ['been', 'losing'], possible_word: 'shier'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0
ic| prev_words: ['been', 'losing'], possible_word: 'theer'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0
ic| prev_words: ['been', 'losing'], possible_word: 'thief'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 1.605318718194623e-05
ic| prev_words: ['been', 'losing'], possible_word: 'thieve'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(possible_word): 0.0
ic| prev_words: ['been', 'losing'], possible_word: 'thieving'
ic| model.score(possible_word, prev_words): 0
ic| model.score(possible_word, prev_words[-1]): 0
ic| model.score(po

Guess: been losing thieves
Number of misspelled words: 5
Spelling Accuracy: 0.9624





True


KeyError: 'children'

In [None]:
lemmatize(stem('children'))