# Six preprocessing steps:
This notebook contains the main preprocessing steps to clean triage notes:
1. Pre-processing
2. Tokenization
3. Re-tokenization
4. Post-processing
5. Spelling correction
6. Slang replacement

The logic of the notebook is non-linear, i.e. it provides the input for and uses the output of other notebooks, and should be executed according to the flowchart.

In [None]:
import pandas as pd
import re
import spacy
from spellchecker import SpellChecker
import pickle
import time
from nlp_utils import preprocess, find_pattern
from custom_tokenizer import combined_rule_tokenizer

### Load data

In [None]:
# Load the raw triage notes; the path is relative to the working directory.
# NOTE(review): an earlier path, "../data/rmh_raw.csv", was commented out here -
# adjust the location if the notebook is run from a different folder.
df = pd.read_csv("../../data/rmh_raw.csv")
# Quick sanity check: dimensions and the first rows
print(df.shape)
df.head()

### Preprocess comments
Preprocess to handle errors in data extraction and some abbreviations specific to triage notes.
* `\x7f`
* `'/c`
* `l)` as "left", `r)` as "right"
* `@` as "at"
* `#` as "fractured"
* `++ve` as "positive", `--ve` as "negative"
* etc.

In [None]:
%%time
# Run the project's `preprocess` helper over every raw note and keep the
# result in a separate column next to the original text
df['text_clean'] = df['text'].apply(preprocess)

### Tokenize

In [None]:
# The scispacy model is loaded for tokenization; the listed pipeline
# components are disabled at load time
nlp = spacy.load(
    "en_core_sci_sm",
    disable=['tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner'],
)
# Swap the model's tokenizer for the project's combined rule tokenizer
nlp.tokenizer = combined_rule_tokenizer(nlp)

In [None]:
%%time
# Stream every cleaned note through the pipeline; afterwards the column holds
# the objects yielded by nlp.pipe (iterated token by token in `retokenize`)
# instead of raw strings
df['text_clean'] = list(nlp.pipe(df.text_clean))

### Retokenize

In [None]:
# A separator (- / + _ , ? .) with a non-newline character on each side, i.e.
# a token that looks like several words glued together. Compiled once, and
# written as a raw string so the backslashes are not read as string escapes.
_SEPARATOR_INSIDE_TOKEN = re.compile(r".[-/\+_,\?\.].")


def is_multiple_tokens(string):
    """Tell whether a token should be split into several tokens.

    Args:
        string: Text of a single token.

    Returns:
        True when ``string`` contains a separator character with a character
        on both sides and is not itself an entry of the global ``vocab``;
        False otherwise.
    """
    # The regex test runs first so the vocabulary is only consulted for
    # tokens that actually contain a separator.
    if _SEPARATOR_INSIDE_TOKEN.search(string) is None:
        return False
    return string not in vocab

# Separators on which compound tokens are split. The capturing group makes
# re.split keep the separators themselves in its result.
_SEPARATOR_SPLIT = re.compile(r"([-/\+_,\?\.])")


def retokenize(text):
    """Split compound tokens on separators and return a space-joined string.

    Args:
        text: Iterable of tokens exposing ``text`` and ``like_num``
            attributes (e.g. a spaCy ``Doc``).

    Returns:
        One string with all resulting tokens separated by a single space.
        Number-like tokens and tokens rejected by ``is_multiple_tokens`` are
        kept unchanged; every other token is split on ``- / + _ , ? .`` with
        the separators kept as tokens of their own.
    """
    new_text = []
    for token in text:
        # Number-like tokens may contain separators but must stay intact.
        if not token.like_num and is_multiple_tokens(token.text):
            # Adjacent or trailing separators make re.split emit empty
            # strings; dropping them avoids doubled or trailing spaces.
            new_text.extend(
                piece for piece in _SEPARATOR_SPLIT.split(token.text) if piece
            )
        else:
            new_text.append(token.text)

    return ' '.join(new_text)

In [None]:
# Read the pickled custom word frequency list from disk.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open("../../data/spelling_correction/rmh_custom_vocab.txt", 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)

# Spellchecker created without a language (language=None) and then fed
# the custom vocab
spell = SpellChecker(language=None)
spell.word_frequency.load_words(vocab)

# Report the size of the vocabulary with and without duplicates
print(
    "Custom vocabulary contains a total of %d words and %d unique words."
    % (len(vocab), len(set(vocab)))
)

In [None]:
%%time
# Replace each tokenized note with its retokenized, space-joined string
df['text_clean'] = df['text_clean'].apply(retokenize)

### Postprocessing

In [None]:
# A period glued to the front of a lowercase word of two or more letters and
# preceded by whitespace (e.g. " .abc") is detached into its own token
# (" . abc"). The pattern is a raw string so `\s` and `\.` reach the regex
# engine verbatim instead of being treated as (invalid) string escapes.
pattern = re.compile(r"\s\.([a-z]{2,})")

df.text_clean = df.text_clean.apply(lambda x: pattern.sub(r" . \1", x))

In [None]:
# Checkpoint the retokenized notes (no spelling correction applied yet);
# the row index is not written to the file
df.to_csv("../../data/spelling_correction/rmh_nospellcorr.csv", index=False)

### Correct spelling in triage notes

In [None]:
# Reload the checkpoint written before spelling correction
df = pd.read_csv("../../data/spelling_correction/rmh_nospellcorr.csv")
# Quick sanity check: dimensions and the first rows
print(df.shape)
df.head()

In [None]:
# Read the pickled mapping from misspelled words to their corrections.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open("../../data/spelling_correction/rmh_misspelled_dict.txt", 'rb') as dict_file:
    misspelled = pickle.load(dict_file)

In [None]:
def spelling_correction(doc):
    """Replace known misspellings in a note with their corrections.

    Args:
        doc: The note as a plain string, or an object exposing the note
            through a ``text`` attribute (e.g. a spaCy ``Doc``).

    Returns:
        The whitespace-split note re-joined with single spaces, where every
        token found in the global ``misspelled`` mapping is replaced by
        element 1 of its entry and all other tokens are kept as they are.
    """
    # Notes read back from CSV are plain strings, which have no `.text`;
    # fall back to the value itself in that case.
    text = getattr(doc, "text", doc)
    return ' '.join(
        misspelled[token][1] if token in misspelled else token
        for token in text.split()
    )

In [None]:
%%time
# Correct known misspellings in every note
df['text_clean'] = df['text_clean'].apply(spelling_correction)

### Replace slang drug names

In [None]:
# Table of medication names with a `slang` and a `generic_name` column
df_drugs = pd.read_csv("../../data/spelling_correction/medication_names.csv")

# Normalise both name columns: trim surrounding whitespace, lower-case
df_drugs['slang'] = df_drugs['slang'].str.strip().str.lower()
df_drugs['generic_name'] = df_drugs['generic_name'].str.strip().str.lower()
# Rows without a slang name cannot be used as lookup keys
df_drugs.dropna(subset=["slang"], inplace=True)

# Lookup table: slang name -> generic name
slang_names = dict(zip(df_drugs['slang'], df_drugs['generic_name']))

In [None]:
def slang_to_generic(doc):
    """Swap slang drug names in a note for their generic names.

    The note is split on whitespace; each word that is a key of the global
    ``slang_names`` mapping is replaced by its value, every other word is
    kept, and the words are joined back with single spaces.
    """
    return ' '.join(slang_names.get(word, word) for word in doc.split())

In [None]:
%%time
# Replace slang drug names with generic names in every note
df['text_clean'] = df['text_clean'].apply(slang_to_generic)

### Save the dataset

In [None]:
# Write the fully cleaned notes to disk; the row index is not written
df.to_csv("../../data/rmh_cleaned.csv", index=False)