# Create Your Own Spell Checker

In [1]:
import nltk

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import nltk
from nltk.corpus import words, stopwords
from string import punctuation


In [4]:
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def get_valid_words():
    return set(words.words())

In [6]:
def normalize_casing(term):
    return term.lower()

In [7]:
def create_unique_list(terms):
    return list(set(terms))


In [8]:
def get_stop_words():
    nltk_stopwords = set(stopwords.words('english'))
    punctuation_set = set(punctuation)
    stop_words = nltk_stopwords.union(punctuation_set)
    return stop_words

In [9]:
def get_correct_term(target_term, valid_words, max_entries=20000):
    edit_distances = {}
    for word in list(valid_words)[:max_entries]:
        edit_distances[word] = nltk.edit_distance(target_term, word)
        sorted_distances = sorted(edit_distances.items(), key=lambda x: x[1])
    return sorted_distances[0][0]

In [10]:
def spelling_correction(sentence, valid_words_set, stop_words_set):
    tokenized_sentence = nltk.word_tokenize(sentence.lower())
    corrected_sentence = []

    for term in tokenized_sentence:
        if term in valid_words_set:
            corrected_sentence.append(term)
        elif term not in stop_words_set:
            corrected_term = get_correct_term(term, valid_words_set)
            corrected_sentence.append(corrected_term)

    return ' '.join(corrected_sentence)

In [11]:
# Task 1
valid_words_set = get_valid_words()

In [12]:
# Task 2
print(list(valid_words_set)[:20])

['filament', 'Acis', 'phenological', 'bluegrass', 'sclerencephalia', 'pulvillus', 'ropeman', 'tench', 'extracloacal', 'Spaniard', 'enterocoelous', 'insinuate', 'pelvis', 'earthpea', 'spiderish', 'gonydeal', 'noy', 'unflustered', 'avo', 'predecide']


In [13]:
# Task 3
normalized_valid_words = create_unique_list([normalize_casing(word) for word in valid_words_set])


In [14]:
# Task 4
print(normalized_valid_words[:20])

['filament', 'phenological', 'bluegrass', 'sclerencephalia', 'pulvillus', 'ropeman', 'tench', 'extracloacal', 'enterocoelous', 'pamlico', 'insinuate', 'pelvis', 'earthpea', 'spiderish', 'gonydeal', 'noy', 'unflustered', 'avo', 'predecide', 'bowdichia']


In [15]:
# Task 5
stop_words_set = get_stop_words()


In [16]:
# Task 6
example_correction = get_correct_term('committee', valid_words_set)
print(f"Corrected term for 'committee': {example_correction}")


Corrected term for 'committee': committee


In [17]:
# Task 7
valid_words_set = set(valid_words_set)

In [18]:
# Task 8
def execute_spelling_correction(input_sentence):
    return spelling_correction(input_sentence, valid_words_set, stop_words_set)

In [19]:
# Task 9
input_sentence = "The new abacos is great"
output_sentence = execute_spelling_correction(input_sentence)
print("Input Sentence:", input_sentence)
print("Output Sentence:", output_sentence)


Input Sentence: The new abacos is great
Output Sentence: the new abacist is great
