# Create a dictionary of misspellings

This notebook parses the whole dataset and collects, in a dictionary, every token that starts with an alphabetic character and is not in the vocabulary. A corrected spelling for each of these tokens is then looked up with pyspellchecker.

In [1]:
import pandas as pd
from spellchecker import SpellChecker
import json

# Project imports
from self_harm_triage_notes.config import interim_data_dir, spell_corr_dir
from self_harm_triage_notes.text_utils import load_vocab, load_word_list, count_tokens

In [2]:
# Name passed to load_vocab / load_word_list to select the ED vocabulary
vocab_filename = "rmh_2012_2017_dev_amt6"

# Base name of the tokenised dataset: "_nospellcorr.parquet" is appended to read
# the input and "_misspelled_dict.json" to write the resulting dictionary
tokenized_data_filename = "rmh_2012_2017_dev_amt6"

### Load pre-processed and tokenised training data

In [3]:
# Tokenised notes that have not been spell-corrected yet
nospellcorr_path = interim_data_dir / (tokenized_data_filename + "_nospellcorr.parquet")
df = pd.read_parquet(nospellcorr_path, engine="pyarrow")
print(df.shape)
# NOTE(review): this preview shows raw triage-note text; clear the output before sharing
df.head()

(319288, 17)


Unnamed: 0,uid,sex,age,arrival_method,arrival_date,year,triage_note,SH,SI,AOD_OD,audit_case,source_system,quarter,length,val_fold,preprocessed_triage_note,tokenized_triage_note
0,RMH-1,female,64.0,other,2012-01-08 00:35:00,2012,"SOB for 5/7, been to GP given prednisolone, co...",Negative,Negative,Negative,,Symphony,2012Q1,140,4,sob for 5/7 been to gp given prednisolone coug...,sob for 5/7 been to gp given prednisolone coug...
1,RMH-2,male,31.0,other,2012-01-08 00:41:00,2012,"pt has lac down right forehead, to eyebrow, wi...",Negative,Negative,Negative,,Symphony,2012Q1,107,1,pt has lac down right forehead to eyebrow will...,pt has lac down right forehead to eyebrow will...
2,RMH-3,male,19.0,road ambulance,2012-01-08 00:52:00,2012,"pt expect MBA, trapped for 45mins, #right femu...",Negative,Negative,Negative,,Symphony,2012Q1,74,1,pt expect mba trapped for 45mins fracture righ...,pt expect mba trapped for 45 mins fracture rig...
3,RMH-5,female,25.0,other,2012-01-08 01:23:00,2012,generalised abdo pain and associated headache ...,Negative,Negative,Negative,,Symphony,2012Q1,196,5,generalised abdo pain and associated headache ...,generalised abdo pain and associated headache ...
4,RMH-6,female,18.0,other,2012-01-08 01:37:00,2012,abdo pain associated with constipation. Pt se...,Negative,Negative,Negative,,Symphony,2012Q1,134,5,abdo pain associated with constipation. pt see...,abdo pain associated with constipation . pt se...


### Load ED vocabulary

In [4]:
# Load the ED vocabulary (used below for `not in vocab` membership tests)
vocab = load_vocab(vocab_filename)

# Load ED word frequency list (later handed to the spellchecker as its dictionary)
word_list = load_word_list(vocab_filename)

Domain-specific vocabulary contains 20043 words.
Word frequency list contains 20043 unique words (6109147 in total).


### Find tokens unknown to the vocabulary

In [5]:
# Tally every valid token in the corpus
counts = count_tokens(df["tokenized_triage_note"], valid=True)
# Keep only the tokens that the vocabulary does not contain
unknown_tokens = {token: n for token, n in counts.items() if token not in vocab}

# Summarise how rare the unknown tokens are
unknown_counts = list(unknown_tokens.values())
print("Detected %d unique tokens unknown to the vocabulary (%d in total)." %
      (len(unknown_counts), sum(unknown_counts)))
print("- %d tokens occur only once." %
      sum(1 for n in unknown_counts if n == 1))
print("- %d tokens occur less than 10 times." %
      sum(1 for n in unknown_counts if n < 10))

Detected 50446 unique tokens unknown to the vocabulary (239600 in total).
- 34400 tokens occur only once.
- 47422 tokens occur less than 10 times.


### Attempt to correct spelling in OOV tokens

In [6]:
def find_correct_spelling(unknown_tokens, misspelled_dict=None, word_list=None):
    """
    Find a correct spelling for every unknown token, either from an
    existing dictionary of misspellings or by running the spellchecker.

    Parameters
    ----------
    unknown_tokens : dict
        Maps each unknown token to its corpus count. Values that are already
        ``(count, correction)`` tuples (e.g. the result of a previous call,
        when the cell is re-run) are accepted too; only the count is reused.
    misspelled_dict : dict, optional
        Known corrections, token -> correction. Takes precedence over the
        spellchecker.
    word_list : optional
        Words loaded into a ``SpellChecker(language=None)``. When empty or
        None, no spellchecker is used.

    Returns
    -------
    dict
        A new dictionary token -> ``(count, correction)``. The correction comes
        from ``misspelled_dict`` if the token is listed there, otherwise from
        ``spell.correction(token)`` if a spellchecker was built, otherwise it
        is None. The input dictionary is left unmodified.
    """
    if word_list:
        # Initialise spellchecker with a custom vocab
        spell = SpellChecker(language=None)
        spell.word_frequency.load_words(word_list)
    else:
        spell = None

    corrected = {}
    known_misspellings = 0

    for token, value in unknown_tokens.items():
        # Tolerate already-annotated entries so re-running does not nest tuples
        count = value[0] if isinstance(value, tuple) else value

        if misspelled_dict and token in misspelled_dict:
            correction = misspelled_dict[token]
            known_misspellings += 1
        elif spell is not None:
            correction = spell.correction(token)
        else:
            correction = None

        corrected[token] = (count, correction)

    n_corrected = sum(1 for _, correction in corrected.values() if correction is not None)
    print("Found a spelling correction for %d words." % n_corrected)
    print("Out of those, %d were in the existing dictionary of misspellings and the rest are new." %
          known_misspellings)

    return corrected

In [None]:
# Look up a correction for every unknown token with a spellchecker built from the ED word list
unknown_tokens = find_correct_spelling(unknown_tokens, word_list=word_list)
# Alternative: restrict the run to tokens that occur at least 100 times
# unknown_tokens = find_correct_spelling({k:v for k,v in unknown_tokens.items() if v>=100}, word_list=word_list)

In [8]:
# Preview only the 50 most frequent unknown tokens; displaying every entry
# floods the notebook with tens of thousands of rows.
sorted(unknown_tokens.items(), key=lambda item: item[1][0], reverse=True)[:50]

[("'s", (8865, 'as')),
 ('dizzyness', (3775, 'dizziness')),
 ('sentances', (3100, 'sentences')),
 ('refered', (2825, 'refereed')),
 ('insitu', (2563, 'inset')),
 ('intermittant', (2228, 'intermittent')),
 ('vomitting', (2069, 'omitting')),
 ('xray', (1573, 'bray')),
 ('tachycardic', (1568, 'tachycardia')),
 ('abcess', (1538, 'abscess')),
 ('lacs', (1079, 'acs')),
 ('diahorrea', (1066, None)),
 ('parkinsons', (960, 'parkinson')),
 ('traige', (900, 'triage')),
 ('intubated', (838, 'intubate')),
 ('palpatations', (773, 'palpitations')),
 ('maxalon', (743, 'maxolon')),
 ('aggitated', (664, 'agitated')),
 ('nill', (650, 'bill')),
 ('weightbear', (632, 'weight-bear')),
 ('asprin', (632, 'aspirin')),
 ('painfull', (615, 'painful')),
 ('xrays', (609, 'rays')),
 ('alledged', (588, 'alleged')),
 ('tonsilitis', (587, 'tonsillitis')),
 ('panadiene', (549, 'panadeine')),
 ('radiaiting', (549, 'radiating')),
 ('hlcnh', (545, 'hlc')),
 ('persistant', (502, 'persistent')),
 ('concious', (484, 'conscio

In [10]:
# Export the tokens for manual review. The index holds the token itself and
# each (count, correction) tuple is spread over the two named columns, so the
# header matches the ['phrase', 'count', 'correction'] layout used when the
# reviewed file is read back.
pd.DataFrame.from_dict(unknown_tokens,
                       orient='index',
                       columns=['count', 'correction']
                      ).to_csv(spell_corr_dir / "unknown_tokens.csv", index_label='phrase')

### Overwrite spelling corrections 

In [21]:
# Load reviewed corrections
corrections = pd.read_csv(spell_corr_dir / "unknown_tokens_reviewed.csv")
corrections.columns = ['phrase', 'count', 'correction']
corrections.fillna({'correction': ""}, inplace=True)
corrections = corrections.set_index('phrase').correction.to_dict()

for k,v in corrections.items():
    if v=="":
        corrections[k] = None
corrections

{"'s": None,
 'dizzyness': 'dizziness',
 'sentances': 'sentences',
 'refered': 'referred',
 'insitu': 'in situ',
 'intermittant': 'intermittent',
 'vomitting': 'vomiting',
 'xray': 'x-ray',
 'tachycardic': 'tachycardia',
 'abcess': 'abscess',
 'lacs': 'lacerations',
 'diahorrea': 'diarrhoea',
 'parkinsons': 'parkinson',
 'traige': 'triage',
 'intubated': 'intubate',
 'palpatations': 'palpitations',
 'maxalon': 'maxolon',
 'aggitated': 'agitated',
 'nill': 'nil',
 'weightbear': 'weight-bear',
 'asprin': 'aspirin',
 'painfull': 'painful',
 'xrays': 'x-ray',
 'alledged': 'alleged',
 'tonsilitis': 'tonsillitis',
 'panadiene': 'panadeine',
 'radiaiting': 'radiating',
 'hlcnh': None,
 'persistant': 'persistent',
 'concious': 'conscious',
 'vomting': 'vomiting',
 'haemodynamically': 'hemodynamically',
 'assos': 'assoc',
 'odema': 'edema',
 'neurofen': 'nurofen',
 'nasuea': 'nausea',
 'brusing': 'bruising',
 'assult': 'assault',
 'lethergy': 'lethargy',
 'diarrheoa': 'diarrhoea',
 'unconcious'

In [22]:
# Update dictionary of misspellings
for k in unknown_tokens:
    if k in corrections:
        unknown_tokens[k] = (unknown_tokens[k][0], corrections[k])

In [None]:
# NOTE(review): disabled helper for overriding individual corrections by hand. It looks
# superseded by the reviewed-CSV step above — confirm, then delete this cell.
# def overwirte_correction(token, correction):
#     count = unknown_tokens[token][0]
#     unknown_tokens.update({token : (count, correction)})
    
# overwirte_correction("incont", "incontinent")
# overwirte_correction("hyperchol", "hypercholesterolemia")
# overwirte_correction("edmo", "ecmo")
# overwirte_correction("ivabs", "abs")
# overwirte_correction("intermient", "intermittent")
# overwirte_correction("intermit", "intermittent")
# overwirte_correction("ivab", "abs")
# overwirte_correction("bilate", "bilateral")
# overwirte_correction("symp", "symptoms")
# overwirte_correction("palpn", "palpitation")
# overwirte_correction("sympt", "symptoms")
# overwirte_correction("excas", "exacerbation")
# overwirte_correction("monc", "medical oncology")
# overwirte_correction("polypharm", "polypharmacy")
# overwirte_correction("pmac", "pmcc")
# overwirte_correction("ethol", "etoh")
# overwirte_correction("cholesectomy", "cholecystectomy")
# overwirte_correction("autoinfusion", "auto infusion")
# overwirte_correction("autoinfused", "auto infused")
# overwirte_correction("boxhill", "box hill")
# overwirte_correction("swollening", "swelling")
# overwirte_correction("vascuarly", "vascular")
# overwirte_correction("neckstiffness", "neck stiffness")
# overwirte_correction("suddenonset", "sudden onset")
# overwirte_correction("motorsensory", "motor sensory")
# overwirte_correction("neurovascally", "neurovascularly")
# overwirte_correction("interhospital", "inter hospital")
# overwirte_correction("antiinflammatories", "antiinflammatory")
# overwirte_correction("hardcollar", "hard collar")
# overwirte_correction("dirroreah", "diarrhoea")
# overwirte_correction("petermac", "pmcc")
# overwirte_correction("weighbare", "weightbear")
# overwirte_correction("bodyache", "body ache")
# overwirte_correction("spinabifida", "spina bifida")
# overwirte_correction("painrelief", "pain relief")
# overwirte_correction("hypercholest", "hypercholesterol")
# overwirte_correction("triagephx", "triage phx")
# overwirte_correction("aspirin300", "aspirin 300")
# overwirte_correction("sorethroat", "sore throat")
# overwirte_correction("haemoserrous", "haemoserous")
# overwirte_correction("interminant", "intermittent")
# overwirte_correction("apperiants", "aperients")
# overwirte_correction("bodyaches", "body aches")
# overwirte_correction("facestrike", "face strike")

### Corrected misspellings and OOV tokens

In [None]:
def split_misspelled_oov(unknown_tokens):
    """
    Partition unknown tokens into corrected and out-of-vocabulary ones.

    Each value of `unknown_tokens` is a (count, correction) pair. Returns a
    tuple (misspelled, oov): `misspelled` maps token -> correction for every
    token whose correction is not None, and `oov` maps token -> the original
    (count, None) pair for the rest. A short summary is printed.
    """
    misspelled, oov = {}, {}
    for token, entry in unknown_tokens.items():
        if entry[1] is None:
            oov[token] = entry
        else:
            misspelled[token] = entry[1]

    oov_counts = [entry[0] for entry in oov.values()]
    n_once = sum(1 for c in oov_counts if c == 1)
    n_rare = sum(1 for c in oov_counts if c < 10)

    print("Corrected the spelling of %d words." % len(misspelled))
    print("Failed to correct spelling of %d words." % len(oov))
    print("- %d words appear in the corpus only once." % n_once)
    print("- %d words appear in the corpus less than 10 times." % n_rare)

    return misspelled, oov

In [25]:
# misspelled: token -> correction; oov: tokens left without a correction
misspelled, oov = split_misspelled_oov(unknown_tokens)

Corrected the spelling of 43695 words.
Failed to correct spelling of 6590 words.
- 5359 words appear in the corpus only once.
- 6373 words appear in the corpus less than 10 times.


In [26]:
# Persist the token -> correction mapping as JSON
misspelled_dict_path = spell_corr_dir / (tokenized_data_filename + "_misspelled_dict.json")
with open(misspelled_dict_path, 'w') as out_file:
    out_file.write(json.dumps(misspelled))