In [1]:
import nltk
from nltk.tokenize import word_tokenize

#### Get a list of valid words in the English language

In [2]:
# 1. Get a list of valid words in the English language.
# The corpus has to be downloaded before it can be read, so the order of the
# two statements below matters.
nltk.download("words")
valid_words = nltk.corpus.words.words()

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


In [3]:
# Number of entries in the word list as loaded (before any normalization)
len(valid_words)

236736

#### We see that case has not been normalized. Normalize it.

In [None]:
# 2. Look at the first 20 words in the list. Is the case normalized?
valid_words[:20]

['A',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'Aani',
 'aardvark',
 'aardwolf',
 'Aaron',
 'Aaronic',
 'Aaronical',
 'Aaronite',
 'Aaronitic',
 'Aaru',
 'Ab',
 'aba',
 'Ababdeh',
 'Ababua',
 'abac']

#### Get unique list of words after normalizing case

In [None]:
# Lower-case every term in the word list, then preview the first 20 entries
valid_words = list(map(str.lower, valid_words))
valid_words[:20]

In [7]:
# Lower-casing introduces duplicates (e.g. 'A' and 'a' both become 'a').
# Keep one copy of each word and store the result as a sorted list.
valid_words = sorted(set(valid_words))

In [8]:
# Number of unique words left after lower-casing and de-duplication
len(valid_words)

234377

In [9]:
# Preview the first 10 entries of the normalized, sorted word list
valid_words[:10]

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic']

#### We needn't apply the spell checker to stop words or punctuation. Skipping them makes the code much more efficient.

The final stop word list combines the English stop words from NLTK with the punctuation characters.

In [10]:
# 5. Create a list of stop words that combines NLTK's English stop words with punctuation characters
from nltk.corpus import stopwords
from string import punctuation

In [11]:
# nltk is already imported in the first cell; repeating the import is harmless.
import nltk
# Download the stop word corpus, then load the English stop word list from it.
nltk.download('stopwords')
stop_nltk = stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
# Every punctuation character from string.punctuation, one list entry per character
stop_punct = [symbol for symbol in punctuation]

In [13]:
# c. The final list is the NLTK stop words followed by the punctuation characters
stop_final = [*stop_nltk, *stop_punct]

In [14]:
# Total number of stop words plus punctuation characters
len(stop_final)

211

### Function to get correction for a single term
1. For a given term, find its edit distance with each term in the valid word list
2. Store the result in a dictionary with the key as the term, and the edit distance as the value
3. Sort the dictionary in ascending order of the values
4. Return the first entry in the sorted result (value with minimum edit distance)

In [15]:
# Sanity check: the two strings differ only in their last letter, so the
# edit distance is 1.
nltk.edit_distance('salsa', 'salso')

1

In [17]:
def get_correct_term(inp_word, max_candidates=20000):
    """Return the dictionary word closest to ``inp_word`` by edit distance.

    Each candidate word is scored with ``nltk.edit_distance`` and the one with
    the smallest distance is returned. When several candidates share the
    smallest distance, the one that comes first in ``valid_words`` wins. The
    result is the same as scoring every candidate, sorting by distance and
    taking the first entry, but a single ``min`` pass avoids building and
    sorting an intermediate dictionary.

    Parameters
    ----------
    inp_word : str
        The (possibly misspelled) term to correct.
    max_candidates : int or None, optional
        How many entries from the start of ``valid_words`` to compare against.
        The default of 20000 matches the original hard-coded limit. Because
        only this leading slice is searched, words further down the list can
        never be suggested; in this notebook ``valid_words`` is sorted, so
        that means only alphabetically early words. Pass ``None`` to search
        the whole list, at the cost of one edit-distance computation per
        dictionary word.

    Returns
    -------
    str
        The closest candidate, or ``inp_word`` unchanged when there are no
        candidates to compare against.
    """
    if max_candidates is None:
        candidates = valid_words
    else:
        candidates = valid_words[:max_candidates]

    # min() returns the first minimal element, which reproduces the
    # tie-breaking of a stable sort followed by taking index 0.
    return min(
        candidates,
        key=lambda valid_term: nltk.edit_distance(inp_word, valid_term),
        default=inp_word,
    )

In [18]:
# A correctly spelled word should come back unchanged
get_correct_term("abacus")

'abacus'

In [30]:
# A one-letter misspelling of "abacus"
get_correct_term("abocus")

'abacus'

#### Write the commands to tokenize after lowering case for any given input sentence
- Use NLTK's word_tokenize for this

In [23]:
# Sample sentence containing the misspelled word "abacos"
inp_sent = "The new abacos is great"

In [24]:
# Download the Punkt tokenizer data before calling word_tokenize, then
# tokenize the lower-cased sentence.
nltk.download('punkt')
res = word_tokenize(inp_sent.lower())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [25]:
# Tokens produced from the lower-cased sample sentence
res

['the', 'new', 'abacos', 'is', 'great']

#### Make a set from valid_words, for faster lookup to check if word is in valid list or not.
Only those words which are not in the valid list need to be corrected.

In [26]:
# Membership tests on a set are hash lookups, whereas `in` on a list scans
# the elements one by one.
valid_words_set = set(valid_words)

#### Define a function for spell correcting any given input sentence
1. tokenize after doing lower case
2. For each term in the tokenized sentence:
    - check if the term is in the set of valid words (valid_words_set) or in the stop word list (stop_final)
    - if yes, return the word as is
    - if no, get the correct word using get_correct_term function
3. Return the joined string as output


In [27]:
# The sample sentence that will be corrected below
inp_sent

'The new abacos is great'

In [28]:
def correct_set(inp_sent):
    """Spell-correct a sentence token by token.

    The sentence is lower-cased and split with ``word_tokenize``. A token is
    kept as is when it is already in ``valid_words_set``, is in ``stop_final``
    (stop words and single punctuation characters), or contains no letters at
    all. Every other token is replaced by the suggestion from
    ``get_correct_term``.

    Parameters
    ----------
    inp_sent : str
        The sentence to correct.

    Returns
    -------
    str
        The corrected tokens joined by single spaces. The result is always
        lower case, and punctuation tokens end up separated from neighbouring
        words by a space.
    """

    def needs_correction(term):
        # Already a known word, a stop word or a punctuation character.
        if term in valid_words_set or term in stop_final:
            return False
        # Tokens without any letter are not misspellings: numbers, or
        # multi-character punctuation such as "..." and the `` / '' quote
        # tokens the tokenizer can produce. stop_final only holds single
        # punctuation characters, so without this check such tokens would be
        # replaced by an unrelated dictionary word.
        return any(ch.isalpha() for ch in term)

    inp_tokens = word_tokenize(inp_sent.lower())
    corrected_tokens = [
        get_correct_term(term) if needs_correction(term) else term
        for term in inp_tokens
    ]
    return " ".join(corrected_tokens)

In [29]:
# Test the function on the input sentence "The new abacos is great"
correct_set('The new abacos is great')

'the new abacus is great'