# Create a dictionary of misspellings

This notebook parses the whole dataset and collects, in an initially empty dict, every token that starts with an alphabetic character and is not known to the vocabulary. After that, a corrected version of each misspelled word is looked up in the existing dictionary of misspellings or, failing that, found using pyspellchecker.

In [None]:
import pandas as pd
from spellchecker import SpellChecker
import json

# Project imports
from self_harm_triage_notes.config import interim_data_dir, spell_corr_dir
from self_harm_triage_notes.text_utils import *

In [None]:
# Base name of the ED vocabulary resources loaded from spell_corr_dir
# (vocabulary, word frequency list and dictionary of misspellings)
vocab_filename = "rmh_2012_2017_dev_amt6"

# Base name of the tokenised dataset used for learning; it is also the prefix
# of the misspellings dictionary written at the end of this notebook
tokenized_data_filename = "rmh_2012_2017_dev_amt6"

### Load pre-processed and tokenised training data

In [None]:
# Read the tokenised notes that have not been spell-corrected yet
df = pd.read_parquet(
    interim_data_dir / f"{tokenized_data_filename}_nospellcorr.parquet",
    engine="pyarrow",
)
print(df.shape)
df.head()

### Load ED vocabulary

In [None]:
# Load the three resources stored under the same vocabulary name, in order:
# the ED vocabulary, the ED word frequency list, and the dictionary of
# corrected misspellings.
vocab, word_list, misspelled_dict = (
    load_vocab(spell_corr_dir, vocab_filename),
    load_word_list(spell_corr_dir, vocab_filename),
    load_misspelled_dict(spell_corr_dir, vocab_filename),
)

### Find tokens unknown to the vocabulary

In [None]:
# Frequency of every valid token in the corpus
counts = count_tokens(df.tokenized_triage_note, valid=True)

# Keep only the tokens that the vocabulary does not contain
unknown_tokens = {token: freq for token, freq in counts.items() if token not in vocab}

# Summary: number of distinct unknown tokens, their total occurrences,
# and how many of them are rare
print("Detected %d unique tokens unknown to the vocabulary (%d in total)." %
      (len(unknown_tokens), sum(unknown_tokens.values())))
print("- %d tokens occur only once." %
      sum(freq == 1 for freq in unknown_tokens.values()))
print("- %d tokens occur less than 10 times." %
      sum(freq < 10 for freq in unknown_tokens.values()))

### Attempt to correct spelling in OOV tokens

In [None]:
def find_correct_spelling(unknown_tokens, misspelled_dict=None, word_list=None):
    """
    Find a correct spelling for every unknown token either based on an
    existing dictionary of misspellings or by running spellchecker.

    Parameters
    ----------
    unknown_tokens : dict
        Maps each unknown token to its count in the corpus. This dictionary
        is left unchanged, so the function can safely be called again on
        the same input.
    misspelled_dict : dict, optional
        Known misspellings mapped to their corrections. A token found here
        takes its correction from this dictionary and is not passed to the
        spellchecker.
    word_list : optional
        Words loaded into a SpellChecker created with ``language=None``.
        When empty or None, no spellchecker is used.

    Returns
    -------
    dict
        A new dictionary mapping each token to a ``(count, correction)``
        tuple. ``correction`` is None when neither source is available
        for the token.
    """
    if word_list:
        # Initialise spellchecker with a custom vocab
        spell = SpellChecker(language=None)
        spell.word_frequency.load_words(word_list)
    else:
        spell = None

    known_misspellings = 0
    # Build a fresh mapping instead of rewriting the caller's dictionary
    corrected = {}

    for token, count in unknown_tokens.items():
        if misspelled_dict and token in misspelled_dict:
            correction = misspelled_dict[token]
            known_misspellings += 1
        elif spell is not None:
            correction = spell.correction(token)
        else:
            correction = None
        corrected[token] = (count, correction)

    print("Found a spelling correction for %d words." %
          sum(1 for _, correction in corrected.values() if correction is not None))
    print("Out of those, %d were in the existing dictionary of misspellings and the rest are new." %
          known_misspellings)

    return corrected

In [None]:
# Look up a correction for every unknown token: the existing dictionary of
# misspellings is consulted first, then the spellchecker built from word_list.
unknown_tokens = find_correct_spelling(unknown_tokens, misspelled_dict=misspelled_dict, word_list=word_list)
# Alternative run: only tokens occurring at least 100 times, without the existing dictionary
# unknown_tokens = find_correct_spelling({k:v for k,v in unknown_tokens.items() if v>=100}, word_list=word_list)

In [None]:
# Show the unknown tokens ordered by corpus count (first tuple element), most frequent first
sorted(unknown_tokens.items(),  key=lambda item: item[1][0], reverse=True)

In [None]:
# Export the unknown tokens to CSV. Each value in unknown_tokens is a
# (count, correction) tuple, so the two data columns are labelled accordingly
# and the token itself (the index) is written under the 'phrase' header.
# The header then matches the column names assigned when the reviewed file
# is loaded: phrase, count, correction.
pd.DataFrame.from_dict(unknown_tokens,
                       orient='index',
                       columns=['count', 'correction']
                      ).to_csv(spell_corr_dir / "unknown_tokens.csv", index_label='phrase')

### Overwrite spelling corrections 

In [None]:
# Read the reviewed corrections and give the three columns fixed names
corrections = pd.read_csv(spell_corr_dir / "unknown_tokens_reviewed.csv")
corrections.columns = ['phrase', 'count', 'correction']

# Map every phrase to its correction; a blank correction cell becomes None
corrections = {
    phrase: (None if correction == "" else correction)
    for phrase, correction in corrections.set_index('phrase').correction.fillna("").items()
}
corrections

In [None]:
# Update dictionary of misspellings
for k in unknown_tokens:
    if k in corrections:
        unknown_tokens[k] = (unknown_tokens[k][0], corrections[k])

In [None]:
# Show the unknown tokens again after the reviewed corrections were applied,
# ordered by corpus count (first tuple element), most frequent first
sorted(unknown_tokens.items(),  key=lambda item: item[1][0], reverse=True)

In [None]:
# def overwirte_correction(token, correction):
#     count = unknown_tokens[token][0]
#     unknown_tokens.update({token : (count, correction)})
    
# overwirte_correction("incont", "incontinent")
# overwirte_correction("hyperchol", "hypercholesterolemia")
# overwirte_correction("edmo", "ecmo")
# overwirte_correction("ivabs", "abs")
# overwirte_correction("intermient", "intermittent")
# overwirte_correction("intermit", "intermittent")
# overwirte_correction("ivab", "abs")
# overwirte_correction("bilate", "bilateral")
# overwirte_correction("symp", "symptoms")
# overwirte_correction("palpn", "palpitation")
# overwirte_correction("sympt", "symptoms")
# overwirte_correction("excas", "exacerbation")
# overwirte_correction("monc", "medical oncology")
# overwirte_correction("polypharm", "polypharmacy")
# overwirte_correction("pmac", "pmcc")
# overwirte_correction("ethol", "etoh")
# overwirte_correction("cholesectomy", "cholecystectomy")
# overwirte_correction("autoinfusion", "auto infusion")
# overwirte_correction("autoinfused", "auto infused")
# overwirte_correction("boxhill", "box hill")
# overwirte_correction("swollening", "swelling")
# overwirte_correction("vascuarly", "vascular")
# overwirte_correction("neckstiffness", "neck stiffness")
# overwirte_correction("suddenonset", "sudden onset")
# overwirte_correction("motorsensory", "motor sensory")
# overwirte_correction("neurovascally", "neurovascularly")
# overwirte_correction("interhospital", "inter hospital")
# overwirte_correction("antiinflammatories", "antiinflammatory")
# overwirte_correction("hardcollar", "hard collar")
# overwirte_correction("dirroreah", "diarrhoea")
# overwirte_correction("petermac", "pmcc")
# overwirte_correction("weighbare", "weightbear")
# overwirte_correction("bodyache", "body ache")
# overwirte_correction("spinabifida", "spina bifida")
# overwirte_correction("painrelief", "pain relief")
# overwirte_correction("hypercholest", "hypercholesterol")
# overwirte_correction("triagephx", "triage phx")
# overwirte_correction("aspirin300", "aspirin 300")
# overwirte_correction("sorethroat", "sore throat")
# overwirte_correction("haemoserrous", "haemoserous")
# overwirte_correction("interminant", "intermittent")
# overwirte_correction("apperiants", "aperients")
# overwirte_correction("bodyaches", "body aches")
# overwirte_correction("facestrike", "face strike")

### Corrected misspellings and OOV tokens

In [None]:
def split_misspelled_oov(unknown_tokens):
    """
    Separate unknown tokens into corrected and out-of-vocabulary tokens.

    Parameters
    ----------
    unknown_tokens : dict
        Maps each token to a ``(count, correction)`` tuple, where
        ``correction`` is None if no correction is available.

    Returns
    -------
    misspelled : dict
        Maps each token that has a correction to that correction.
    oov : dict
        Maps each token without a correction to its original
        ``(count, None)`` tuple.
    """
    misspelled = {}
    oov = {}
    # Partition in a single pass; None is tested by identity
    for token, entry in unknown_tokens.items():
        if entry[1] is not None:
            misspelled[token] = entry[1]
        else:
            oov[token] = entry

    print("Corrected the spelling of %d words." % len(misspelled))
    print("Failed to correct spelling of %d words." % len(oov))
    print("- %d words appear in the corpus only once." %
          sum(1 for entry in oov.values() if entry[0] == 1))
    print("- %d words appear in the corpus less than 10 times." %
          sum(1 for entry in oov.values() if entry[0] < 10))

    return misspelled, oov

In [None]:
# Separate the tokens that received a correction from those left uncorrected
misspelled, oov = split_misspelled_oov(unknown_tokens)

In [None]:
# Persist the token -> correction mapping as JSON next to the other
# spelling-correction resources
with open(spell_corr_dir / f"{tokenized_data_filename}_misspelled_dict.json", 'w') as f:
    f.write(json.dumps(misspelled))