# Create a dictionary of misspellings

This notebook scans the whole dataset and collects, in a dictionary, every token that starts with an alphabetic character and is not in the custom vocabulary. Then, for each of these misspelled words, a suggested correction is looked up with pyspellchecker; a few corrections are finally overridden by hand.

In [None]:
import pandas as pd
import re
import spacy
from spellchecker import SpellChecker
import pickle
import time
from nlp_utils import preprocess, find_pattern
from custom_tokenizer import combined_rule_tokenizer

**Load RMH data**

In [None]:
# Load the RMH dataset from its CSV export
df = pd.read_csv(
    filepath_or_buffer="../../data/spelling_correction/rmh_nospellcorr.csv"
)
# Preview the first rows
df.head(n=5)

**Create a dictionary of misspellings**

In [None]:
def starts_with_alpha(token):
    """Tell whether ``token`` is empty or begins with an alphabetic character.

    The empty string is reported as ``True``; any other token is judged by
    ``str.isalpha`` applied to its first character.
    """
    if token == "":
        return True
    first_char = token[0]
    return first_char.isalpha()

def add_misspelling(text):
    """Record the out-of-vocabulary tokens of one text in ``misspelled``.

    The text is split on whitespace and handed to ``spell.unknown``; every
    returned token for which ``starts_with_alpha`` holds has its counter in
    the module-level ``misspelled`` dict incremented by one.

    Parameters
    ----------
    text : str
        One document. Non-string values (e.g. the NaN pandas uses for missing
        cells) are ignored instead of raising ``AttributeError`` on ``split``.

    Notes
    -----
    pyspellchecker documents ``unknown`` as returning a set, so a token is
    presumably counted at most once per text (the counter is then the number
    of texts containing it, not the number of occurrences) -- TODO confirm
    this is the intended statistic.
    """
    # Missing / non-text cells carry no tokens; skip them rather than crash
    if not isinstance(text, str):
        return
    for token in spell.unknown(text.split()):
        if starts_with_alpha(token):
            misspelled[token] = misspelled.get(token, 0) + 1

In [None]:
# Read the pickled custom word list.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('../../data/spelling_correction/rmh_custom_vocab.txt', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# Build the spellchecker without a bundled language and feed it the custom vocab
spell = SpellChecker(language=None)
spell.word_frequency.load_words(vocab)

vocab_sizes = (len(set(vocab)), len(vocab))
print("Domain-specific vocabulary contains %d unique words (%d words in total)." % vocab_sizes)

# Filled in by add_misspelling: token -> count
misspelled = {}

In [None]:
%%time
df.text_clean.apply(add_misspelling)

print("Found %d misspelled words." % len(misspelled))

with open('../../data/spelling_correction/rmh_misspelled_dict_nocorr.txt', 'wb') as f:
    pickle.dump(misspelled, f)

**Find a correct spelling for every misspelled word**

In [None]:
%%time
for token in list(misspelled.keys()):
    misspelled.update({token : (misspelled[token], spell.correction(token))})

### Manually correct misspellings

In [None]:
# Drop the empty-string entry if one was collected; an unconditional `del`
# raises KeyError when it is absent.
if "" in misspelled:
    del misspelled[""]

In [None]:
# Most frequent misspellings first. Sort on the count only: comparing the whole
# (count, correction) tuple falls back to comparing corrections on ties, which
# raises TypeError if one of them is None.
sorted(misspelled.items(), key=lambda item: item[1][0], reverse=True)

In [None]:
# Hand-picked corrections that override the automatic suggestion
manual_corrections = {
    "spont": "spontaneous",
    "ecat": "ecatt",
    "spontanoues": "spontaneous",
    "sapu": "saapu",
    "ethol": "ethanol",
    "sucidial": "suicidal",
    "incont": "incontinent",
}

for token, correction in manual_corrections.items():
    if token not in misspelled:
        # Token absent from the collected data: report it and carry on, so one
        # missing word does not abort the remaining overrides.
        print("Skipping manual correction for %r: not in misspelled." % token)
        continue
    # Keep the existing count, replace only the correction
    misspelled[token] = (misspelled[token][0], correction)

In [None]:
# Persist the final token -> (count, correction) mapping
with open('../../data/spelling_correction/rmh_misspelled_dict.txt', 'wb') as out_file:
    pickle.dump(misspelled, out_file)