In [17]:
# code courtesy of https://www.youtube.com/watch?v=4yOKlWZk52M
import re
import string
import numpy as np
from collections import Counter

In [15]:
def read_corpus(path):
    """Return every word token of a UTF-8 text file, lowercased, in file order.

    Args:
        path: Location of the text file to read.

    Returns:
        A list of strings, one per ``\\w+`` match, after lowercasing each line.
        Repeated words are kept, so the list length is the total token count.
    """
    token_pattern = re.compile(r'\w+')
    tokens = []
    with open(path, mode='r', encoding="utf8") as corpus:
        # Iterating the handle yields the same lines as readlines().
        for raw_line in corpus:
            tokens.extend(token_pattern.findall(raw_line.lower()))
    return tokens

In [31]:
# Build the unigram model used by the function-based checker below.
words = read_corpus('./big.txt')        # every token, duplicates included
vocabulary = set(words)                 # distinct tokens, for membership tests
word_counts = Counter(words)            # token -> number of occurrences
total_word_counts = float(len(words))   # float so the division below is true division
# Relative frequency of each distinct word. The divisor is the total defined on
# the previous line, so this cell runs on a fresh kernel without hidden state.
word_probs = {word: count / total_word_counts for word, count in word_counts.items()}

In [34]:
def split(word):
    """List every way to cut ``word`` into a (prefix, suffix) pair.

    The cut position runs from 0 to ``len(word)`` inclusive, so the first pair
    has an empty prefix and the last pair has an empty suffix.
    """
    pairs = []
    for cut in range(len(word) + 1):
        pairs.append((word[:cut], word[cut:]))
    return pairs

In [35]:
# Quick look at split(): every (prefix, suffix) pair for a sample word.
print(split('trash'))

[('', 'trash'), ('t', 'rash'), ('tr', 'ash'), ('tra', 'sh'), ('tras', 'h'), ('trash', '')]


In [36]:
def insert(word):
    """Return all strings formed by inserting one lowercase ASCII letter into ``word``.

    A letter is inserted at every split position, including both ends. The
    result is a list and may contain duplicates.
    """
    candidates = []
    for head, tail in split(word):
        for ch in string.ascii_lowercase:
            candidates.append(head + ch + tail)
    return candidates

In [39]:
def delete(word):
    """Return all strings formed by removing exactly one character from ``word``."""
    shortened = []
    for head, tail in split(word):
        if not tail:
            # Nothing left to drop once the suffix is empty.
            continue
        shortened.append(head + tail[1:])
    return shortened

In [53]:
def replace(word):
    """Return all strings formed by overwriting one character of ``word``.

    Each position is replaced with every lowercase ASCII letter in turn, so the
    unchanged ``word`` itself appears in the result once per position.
    """
    substituted = []
    for head, tail in split(word):
        if not tail:
            continue
        rest = tail[1:]
        for ch in string.ascii_lowercase:
            substituted.append(head + ch + rest)
    return substituted

In [60]:
def swap(word):
    """Return all strings formed by transposing two adjacent characters of ``word``."""
    transposed = []
    for head, tail in split(word):
        if len(tail) < 2:
            # Need at least two characters after the cut to exchange them.
            continue
        first, second = tail[0], tail[1]
        transposed.append(head + second + first + tail[2:])
    return transposed

In [62]:
def edit_level_one(word):
    """Return the set of strings one edit away from ``word``.

    An edit is a single insertion, deletion, replacement, or adjacent swap.
    """
    edits = set()
    for generate in (insert, delete, replace, swap):
        edits.update(generate(word))
    return edits

In [68]:
def edit_level_two(word):
    """Return a list of strings reachable from ``word`` by two successive edits.

    The result is a list, not a set: a string reachable through several
    intermediate edits appears once per route.
    """
    twice_edited = []
    for once_edited in edit_level_one(word):
        twice_edited.extend(edit_level_one(once_edited))
    return twice_edited

In [93]:
def check(word):
    """Suggest corrections for ``word``, ranked by corpus probability.

    If ``word`` is already in ``vocabulary`` a confirmation is printed and
    ``None`` is returned. Otherwise known words one edit away are suggested;
    only when there are none are known words two edits away considered.

    Args:
        word: The string to check (compared against ``vocabulary`` as given).

    Returns:
        ``None`` when ``word`` is in the vocabulary; otherwise a list of
        ``(candidate, probability)`` tuples sorted by probability, highest
        first, with ties broken alphabetically. The list is empty when no
        vocabulary word lies within two edits.
    """
    if word in vocabulary:
        print(f"{word} is correct")
        return
    # Filter against the vocabulary *before* deciding whether to fall back:
    # the raw level-one edit set is never empty, so testing it unfiltered
    # would make the level-two fallback unreachable.
    known = vocabulary.intersection(edit_level_one(word))
    if not known:
        # edit_level_two is a function and must be called; intersecting also
        # removes the duplicates its list result contains.
        known = vocabulary.intersection(edit_level_two(word))
    # No "[word]" fallback: word is known not to be in the vocabulary here, so
    # it has no probability entry and could not be ranked.
    # Sort on (-probability, word) so equal-probability candidates come out in
    # a reproducible order instead of set-iteration order.
    return sorted(((w, word_probs[w]) for w in known),
                  key=lambda pair: (-pair[1], pair[0]))

In [179]:
class Spell_checker(object):
    """Corpus-based spelling corrector using edits of distance one and two.

    Attributes (all built once in ``__init__`` from the corpus file):
        vocabulary:  set of distinct lowercase tokens.
        total_words: total token count, stored as a float.
        word_count:  ``Counter`` mapping token -> number of occurrences.
        word_probs:  dict mapping token -> relative frequency in the corpus.
    """

    def __init__(self, corpus_file_path):
        """Read and tokenize the UTF-8 text file at ``corpus_file_path``.

        Each line is lowercased and split into ``\\w+`` tokens.
        """
        words = []
        with open(corpus_file_path, mode='r', encoding='utf8') as f:
            for line in f:
                words += re.findall(r'\w+', line.lower())
        self.vocabulary = set(words)
        self.total_words = float(len(words))
        self.word_count = Counter(words)
        # Iterate this instance's own vocabulary, not a notebook-level global,
        # so every word that check() can look up has a probability entry.
        self.word_probs = {w: self.word_count[w] / self.total_words
                           for w in self.vocabulary}

    def __level_one_edit(self, word):
        """Return the set of strings one edit away from ``word``.

        An edit is a single deletion, adjacent swap, insertion, or replacement;
        inserted and replacement characters are lowercase ASCII letters.
        """
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        letters = string.ascii_lowercase
        inserts = [l + c + r for l, r in splits for c in letters]
        deletes = [l + r[1:] for l, r in splits if r]
        swaps = [l + r[1] + r[0] + r[2:] for l, r in splits if len(r) > 1]
        replaces = [l + c + r[1:] for l, r in splits if r for c in letters]
        return set(deletes + swaps + inserts + replaces)

    def __level_two_edit(self, word):
        """Return the set of strings reachable from ``word`` by two successive edits."""
        return set(e2
                   for e1 in self.__level_one_edit(word)
                   for e2 in self.__level_one_edit(e1))

    def check(self, word):
        """Suggest corrections for ``word``, ranked by corpus probability.

        The word is lowercased first. If it is in the vocabulary a confirmation
        is printed and ``None`` is returned.

        Returns:
            ``None`` for a known word; otherwise a list of
            ``(candidate, probability)`` tuples covering every vocabulary word
            within one or two edits, sorted by probability (highest first) with
            ties broken alphabetically. Empty when nothing is close enough.
        """
        word = word.lower()
        if word in self.vocabulary:
            print(f"{word} is correct.")
            return
        candidates = self.__level_one_edit(word) | self.__level_two_edit(word)
        valid_candidates = candidates & self.vocabulary
        # Sort on (-probability, word): candidates come from a set, so without a
        # tie-breaker equal-probability words would appear in arbitrary order.
        best_candidates = sorted(
            ((w, self.word_probs[w]) for w in valid_candidates),
            key=lambda t: (-t[1], t[0]),
        )
        return best_candidates

In [180]:
# Build a checker from the corpus file; the constructor reads and tokenizes it.
fn = Spell_checker('./big.txt')

In [181]:
# Example lookup: print the ranked (candidate, probability) suggestions for a misspelling.
print(fn.check('teme'))

[('the', 0.05326408544237128), ('me', 0.005816456449854818), ('there', 0.0047356031253148675), ('were', 0.0031967611039359547), ('some', 0.002244144614510914), ('see', 0.0021250675533327837), ('come', 0.0014838833777582369), ('time', 0.0013831258644536654), ('them', 0.0013464867687065483), ('came', 0.0013373269947697692), ('here', 0.0012823683511490936), ('tell', 0.0009068176197411448), ('these', 0.000760261236752677), ('take', 0.00072362214100556), ('name', 0.0005770657580170921), ('same', 0.0005221071143964167), ('tm', 0.0005221071143964167), ('home', 0.00043966914896540355), ('ten', 0.0002747932181033772), ('true', 0.00026563344416659796), ('terms', 0.000228994348419481), ('eye', 0.00021983457448270178), ('seem', 0.00015571615692524707), ('times', 0.00014655638298846784), ('gems', 0.00011907706117813012), ('fee', 0.00011907706117813012), ('mere', 9.15977393677924e-05), ('game', 8.243796543101317e-05), ('tide', 8.243796543101317e-05), ('lee', 6.411841755745468e-05), ('test', 5.495864