# Create a dictionary of misspellings

This notebook parses the whole dataset and collects, in an initially empty dict, every token that starts with an alphabetic character and is not in the vocabulary. Then, for each misspelled word, a corrected version is found using pyspellchecker.

In [1]:
import pandas as pd
import re
import spacy
from spellchecker import SpellChecker
import pickle
import time
from nlp_utils import preprocess, find_pattern
from custom_tokenizer import combined_rule_tokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vrozova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Load RMH data**

In [2]:
# Read the RMH notes into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer="../data/spelling_correction/rmh_nospellcorr.csv")
df.head(n=5)

Unnamed: 0,SH,SI,length,text,text_clean
0,0.0,,140,"SOB for 5/7, been to GP given prednisolone, co...","sob for 5/7 , been to gp given prednisolone , ..."
1,0.0,,107,"pt has lac down right forehead, to eyebrow, wi...","pt has lac down right forehead , to eyebrow , ..."
2,0.0,,74,"pt expect MBA, trapped for 45mins, #right femu...","pt expect mba , trapped for 45 mins , fracture..."
3,0.0,,167,L) sided flank pain same as previous renal col...,left sided flank pain same as previous renal...
4,0.0,,193,generalised abdo pain and associated headache ...,generalised abdo pain and associated headache ...


**Create a dictionary of misspellings**

In [3]:
def starts_with_alpha(token):
    """Return True if `token` is non-empty and its first character is alphabetic.

    An empty string has no first character, so it yields False instead of
    being reported as a word (and without raising IndexError).
    """
    return bool(token) and token[0].isalpha()

def add_misspelling(text):
    """Tally the unknown tokens of `text` in the global `misspelled` dict.

    The text is split on whitespace and handed to the global `spell` checker.
    Every token it reports as unknown, and that passes `starts_with_alpha`,
    has its counter in `misspelled` incremented by one.
    """
    unknown_tokens = spell.unknown(text.split())
    for word in filter(starts_with_alpha, unknown_tokens):
        misspelled[word] = misspelled.get(word, 0) + 1

In [4]:
# Read the custom word list. The file is a pickle despite its .txt suffix;
# NOTE: pickle.load can execute arbitrary code, so only load trusted files.
with open('../data/spelling_correction/rmh_custom_vocab.txt', mode='rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# Create the spellchecker with language=None and feed it the custom vocab
spell = SpellChecker(language=None)
spell.word_frequency.load_words(vocab)

vocab_stats = (len(set(vocab)), len(vocab))
print("Domain-specific vocabulary contains %d unique words (%d words in total)." % vocab_stats)

# Token -> count mapping, filled in by add_misspelling
misspelled = {}

Domain-specific vocabulary contains 36506 unique words (9127336 words in total).


In [5]:
%%time
# Start from a clean slate so that re-running this cell neither double-counts
# tokens nor trips over (count, correction) tuples written by later cells.
misspelled.clear()
for text in df.text_clean:
    add_misspelling(text)

print("Found %d misspelled words." % len(misspelled))

# Save the raw counts (a pickle, despite the .txt suffix) before corrections are added
with open('../data/spelling_correction/rmh_misspelled_dict_nocorr.txt', 'wb') as f:
    pickle.dump(misspelled, f)

Found 60561 misspelled words.
CPU times: user 25.5 s, sys: 9.51 ms, total: 25.6 s
Wall time: 25.6 s


**Find a correct spelling for every misspelled word**

In [None]:
%%time
# Attach a suggested correction to every misspelled token: count -> (count, correction)
for token, count in list(misspelled.items()):
    # Entries that already hold a (count, correction) tuple are left alone so
    # that re-running this cell does not nest tuples or repeat finished work.
    if isinstance(count, tuple):
        continue
    misspelled[token] = (count, spell.correction(token))

In [None]:
# start = [0, 5000, 10000, 15000]
# end = [5000, 10000, 15000, len(misspelled)]

# for i,j in zip(start, end):
#     print(i, j)
# print("Correcting spelling of {} tokens...".format(len(misspelled)))
# i = 0
# for token in misspelled:
#     misspelled.update({token : (misspelled[token], spell.correction(token))})
#     if i % 5000 == 0:
#         with open('data/spelling correction/' + filename + 'misspelled', 'a') as f:
#             pickle.dump(misspelled, f)
#         print(i)
#     i += 1

### Manually correct misspellings

In [None]:
# Drop the empty-string entry only if one was recorded; an unconditional
# `del` raises KeyError when it is absent (e.g. when this cell is re-run).
if "" in misspelled:
    del misspelled[""]

In [None]:
# Most frequent misspellings first. Values are (count, correction) tuples, so
# sort on the count only: comparing whole tuples falls through to the
# correction on ties, which fails if a correction is None.
sorted(misspelled.items(), key=lambda item: item[1][0], reverse=True)

In [None]:
# Hand-picked corrections that replace the stored suggestion; counts are kept
manual_corrections = {
    "spont": "spontaneous",
    "ecat": "ecatt",
    "spontanoues": "spontaneous",
    "sapu": "saapu",
    "ethol": "ethanol",
    "sucidial": "suicidal",
    "incont": "incontinent",
}
for wrong, right in manual_corrections.items():
    seen = misspelled[wrong][0]
    misspelled[wrong] = (seen, right)

In [None]:
# Persist the final misspellings dict as a pickle (despite the .txt suffix)
with open('../data/spelling_correction/rmh_misspelled_dict.txt', mode='wb') as out_file:
    pickle.dump(misspelled, out_file)