Improving Spell Checker
My program starts with a spelling corrector written by Peter Norvig which can correct a word to what the user might have meant to type.


In [1]:
# Spelling Corrector
# this code is by Peter Norvig 
# norvig.com/spell-correct.html

# this code can take a word and correct it. Sometimes the word is already correct and it ought to
# stay the same, or if it is wrong, it will change it to what has the highest probability of being
# correct. One issue with his solution is that he doesn't account for words that typically come
# after other words.

import re, collections

def words(text): return re.findall('[a-z]+', text.lower()) 

def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

NWORDS = train(words(file('big.txt').read()))

alphabet = 'abcdeéfghijklmnopqrstuvwxyz'

def edits1(word):
   splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
   deletes    = [a + b[1:] for a, b in splits if b]
   transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
   replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
   inserts    = [a + c + b     for a, b in splits for c in alphabet]
   return set(deletes + transposes + replaces + inserts)

def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

def known(words): return set(w for w in words if w in NWORDS)

def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    return max(candidates, key=NWORDS.get)

In [2]:
correct("wil")

'will'

First Improvement:
Instead of working on just a single word, I add the ability to correct a whole sentence. The issue with this is that each word is still being corrected on an individual level with no respect for the words that come before or after it. Maybe the user typed the word 'cat' but meant the word 'bat' which based on the context of the surrounding words would be obvious. Otherwise, the word 'cat' is spelt correctly so the spell checker would just leave it as is.

In [172]:
def sentenceCorrect(string):
    word = [] # holds a word
    sentence = [] # holds the entire sentence in correct format
    
    # convert input into a list of words
    for i in range(len(string)):
        if string[i] == ' ': # store word once a space is found
            w = ''.join(word)
            sentence.append(w)
            word = []
        else: # if not a space yet, keep adding on characters
            word.append(string[i])
    
    # get last word added into sentence
    if word != []:
        w = ''.join(word)
        sentence.append(w)
        word = []
    
    # correct each word in sentence
    for i in range(len(sentence)):
        print correct(sentence[i]), # corrects sentence and then prints

In [173]:
sentenceCorrect("fsh fod thouht")

fish for thought


Now, to make the spell checker make corrections based off of the surrounding words, I've got to keep track of what words tend to come after other words.

In [174]:
# this will be a word that will be stored in a dictionary that keeps track of the past 10 words used
# right after it. By only keeping a limited number of words, the spell checker can change as the user 
# changes how he/she writes over time.
class Word(object):
    name = []
    wordsAfter = []
    
    def __init__(self, word):
        self.name = word
        
    def addWordsAfter(self, word):
        self.wordsAfter.append(word)
        
        # for now, a limit of the past 10 words used right after.
        if len(self.wordsAfter) > 40:
            self.wordsAfter.pop(0)
            
    def printWordsAfter(self):
        for i in range(len(self.wordsAfter)):
            print self.wordsAfter[i]

In [180]:
class myDictionary(object):
    filename = ""
    fDict = dict()
    
    def __init__(self, fn):
        filename = fn
        
        f = open(filename, 'r')
        words = map(lambda l: l.split(" "), f.readlines()) # reads in file and stores each word in words
        
        iMax = len(words)
        for i in range(iMax):
            jMax = len(words[i])
            for j in range(jMax):
                    
                if words[i][j] not in self.fDict: # if word from file is not in dictionary yet   
                    self.fDict[words[i][j]] = Word(words[i][j]) # add it to the dictionary using Word class as def above
                    
                # add the word that comes after current word to list
                if j < jMax - 1: # if not at last word on line
                    if words[i][j + 1] != '':
                        self.fDict[words[i][j]].addWordsAfter(words[i][j + 1])
                elif i < iMax - 1: # if last word in line, look at first word on next line
                    if words[i + 1][0] != '':
                        self.fDict[words[i][j]].addWordsAfter(words[i + 1][0])
    
        #print self.fDict['Emperor'].wordsAfter

In [181]:
fish = myDictionary("bigEdit.txt")
fish.fDict['The'].wordsAfter

['apprehensive',
 'terminate',
 'wilful',
 'conveniently',
 "'n'",
 'cleanliness',
 'collective',
 'angela',
 'filth',
 'philippines',
 'timely',
 'herein',
 'ignoble',
 'canton',
 'lamentations',
 'moslem',
 'ware',
 'adjective',
 'glen',
 'invade',
 'livid',
 'buggy',
 'prolong',
 'weaken',
 'folio',
 'dismissal',
 'quay',
 'enchanting',
 'heave',
 'purified',
 'syrian',
 'significantly',
 'experimental',
 'film',
 'repressed',
 'cooperation',
 'sequel',
 'wench',
 'calves',
 '\n']