# Six preprocessing steps:
This notebook contains the main preprocessing steps to clean triage notes:
1. Pre-processing
2. Tokenization
3. Re-tokenization
4. Post-processing
5. Spelling correction
6. Slang replacement

The logic of the notebook is non-linear, i.e. it provides the input for and uses the output of other notebooks, and should be executed according to the flowchart.

In [14]:
import pandas as pd
import re
import spacy
from spellchecker import SpellChecker
import pickle
import time
from nlp_utils import preprocess, find_pattern
from custom_tokenizer import combined_rule_tokenizer

### Load data

In [16]:
# Load the triage notes to clean. The commented-out line below is the alternative
# input (the raw RMH extract); only one of the two reads should be active.
# df = pd.read_csv("../data/rmh_raw.csv")
df = pd.read_csv("../data/epic2020.csv")
print(df.shape)
df.head()

(57681, 3)


Unnamed: 0,timestamp,text,length
0,2020-01-01 00:15:00,"argument with friend, threatened to jump off b...",112
1,2020-01-01 00:20:00,mech fall with swelling to L) hand and dec ROM...,60
2,2020-01-01 00:33:00,"Left lower dental pain since last year, seekin...",142
3,2020-01-01 00:34:00,"ETOH, scuffle with HS ? LOC, lac approx 2cm ab...",145
4,2020-01-01 00:36:00,mech fall landed L) hip. headstrike onto wall....,84


### Preprocess comments
Preprocess to handle errors in data extraction and some abbreviations specific to triage notes.
* `\x7f`
* `'/c`
* `l)` as "left", `r)` as "right"
* `@` as "at"
* `#` as "fractured"
* `++ve` as "positive", `--ve` as "negative"
* etc.

In [17]:
%%time
# Preprocess comments: run `preprocess` (imported from nlp_utils) on every raw note and
# store the result in a new `text_clean` column, leaving the original `text` untouched.
df['text_clean'] = df.text.apply(preprocess)

CPU times: user 6.2 s, sys: 9.54 ms, total: 6.21 s
Wall time: 6.23 s


### Tokenize

In [18]:
# Load scispacy model for tokenization; every pipeline component named in `disable`
# is switched off at load time.
nlp = spacy.load("en_core_sci_sm", disable=['tagger', 'attribute_ruler', 'lemmatizer', 'parser', 'ner'])
# Replace the model's tokenizer with the project's own (from the custom_tokenizer module).
nlp.tokenizer = combined_rule_tokenizer(nlp)

  target (Model): The target node.


In [19]:
%%time
# Tokenize every preprocessed note. After this cell `text_clean` holds the objects
# yielded by `nlp.pipe` (spaCy Doc objects) instead of plain strings.
df['text_clean'] = list(nlp.pipe(df.text_clean))

CPU times: user 55.1 s, sys: 1.59 s, total: 56.7 s
Wall time: 56.8 s


### Retokenize

In [None]:
# Compiled once: one of the separators - / + _ , ? . with a character on each side.
# Raw string, because "\+", "\?" and "\." are invalid escapes in a normal literal.
_MULTI_TOKEN_PATTERN = re.compile(r".[-/\+_,\?\.].")


def is_multiple_tokens(string, known_words=None):
    """Tell whether a token looks like several words glued together by a separator.

    Args:
        string: Token text to inspect.
        known_words: Optional container of tokens that must never be split.
            Defaults to the notebook-level ``vocab`` loaded from the custom
            vocabulary file.

    Returns:
        bool: True when ``string`` contains a separator surrounded by other
        characters and is not a known vocabulary entry, otherwise False.
    """
    if _MULTI_TOKEN_PATTERN.search(string) is None:
        return False
    if known_words is None:
        # Looked up at call time, so ``vocab`` only has to exist before the
        # first token that actually contains a separator is checked.
        known_words = vocab
    return string not in known_words

def retokenize(text):
    """Split glued-together tokens and return the note as a space-joined string.

    Args:
        text: Iterable of tokens exposing ``like_num`` and ``text`` attributes
            (e.g. a spaCy Doc).

    Returns:
        str: The token texts joined by single spaces. Tokens flagged by
        ``is_multiple_tokens`` are broken up at the separators - / + _ , ? .
        with each separator kept as a token of its own.
    """
    new_text = []
    for token in text:
        # Number-like tokens are kept whole even if they contain a separator.
        if not token.like_num and is_multiple_tokens(token.text):
            # The capture group makes re.split return the separators as well.
            pieces = re.split(r'([-/\+_,\?\.])', token.text)
            # A leading/trailing or doubled separator yields empty strings;
            # dropping them avoids doubled or trailing spaces in the output.
            new_text.extend(piece for piece in pieces if piece)
        else:
            new_text.append(token.text)

    return ' '.join(new_text)

In [None]:
# Load a custom word frequency list
# NOTE(review): despite the .txt extension the file is unpickled. pickle.load can run
# arbitrary code, so this file must come from a trusted source.
# `vocab` is also the global that `is_multiple_tokens` checks tokens against.
with open ('../data/spelling_correction/rmh_custom_vocab.txt', 'rb') as f:
    vocab = pickle.load(f)
    
# Initialise spellchecker with a custom vocab
# (language=None: no built-in dictionary is requested, only the words loaded below)
spell = SpellChecker(language=None)
spell.word_frequency.load_words(vocab)

# Total vs. unique counts differ when the loaded list contains repeated words.
print("Custom vocabulary contains a total of %d words and %d unique words." % 
      (len(vocab), len(set(vocab))))

In [None]:
%%time
# Re-split glued tokens in every note. `retokenize` returns a space-joined string,
# so `text_clean` holds plain strings again after this cell.
df.text_clean = df.text_clean.apply(retokenize)

### Postprocessing

In [None]:
# Detach a full stop glued to the front of a word: "<space>.word" -> " . word".
# Only applies when at least two lowercase letters follow the full stop.
# Raw string: "\s" and "\." are invalid escape sequences in a normal string literal.
pattern = re.compile(r"\s\.([a-z]{2,})")

df.text_clean = df.text_clean.apply(lambda x: pattern.sub(r" . \1", x))

In [None]:
# Checkpoint the notes before spelling correction; the next section reads this file back.
df.to_csv("../data/spelling_correction/rmh_nospellcorr.csv", index=False)

### Correct spelling in triage notes

In [4]:
# Reload the checkpoint written before spelling correction. This rebinds `df`.
# NOTE(review): after a CSV round-trip `text_clean` is presumably plain strings
# (empty notes may come back as NaN) — confirm before applying string functions.
df = pd.read_csv("../data/spelling_correction/rmh_nospellcorr.csv")
print(df.shape)
df.head()

(486458, 5)


Unnamed: 0,SH,SI,length,text,text_clean
0,0.0,,140,"SOB for 5/7, been to GP given prednisolone, co...","sob for 5/7 , been to gp given prednisolone , ..."
1,0.0,,107,"pt has lac down right forehead, to eyebrow, wi...","pt has lac down right forehead , to eyebrow , ..."
2,0.0,,74,"pt expect MBA, trapped for 45mins, #right femu...","pt expect mba , trapped for 45 mins , fracture..."
3,0.0,,167,L) sided flank pain same as previous renal col...,left sided flank pain same as previous renal...
4,0.0,,193,generalised abdo pain and associated headache ...,generalised abdo pain and associated headache ...


In [21]:
# Load dictionary of misspelled words and their corrections.
# `spelling_correction` reads the replacement from index 1 of each value.
# NOTE(review): the file is unpickled despite its .txt extension; pickle.load can run
# arbitrary code, so only load it from a trusted source.
with open ('../data/spelling_correction/rmh_misspelled_dict.txt', 'rb') as f:
    misspelled = pickle.load(f)

In [27]:
def spelling_correction(doc, corrections=None):
    """Replace known misspellings in a note with their corrections.

    Args:
        doc: The note, either a plain string (as produced by ``retokenize`` or
            read back from CSV) or an object exposing the string as ``.text``
            (e.g. a spaCy Doc).
        corrections: Optional mapping from a misspelled token to an indexable
            entry whose element at index 1 is the replacement. Defaults to the
            notebook-level ``misspelled`` dictionary.

    Returns:
        str: The whitespace-split tokens, corrected where a mapping entry
        exists, joined by single spaces.
    """
    if corrections is None:
        corrections = misspelled

    # The previous version read ``doc.text`` unconditionally, which raises
    # AttributeError for the plain strings this column holds on a linear run.
    text = doc if isinstance(doc, str) else doc.text

    return ' '.join(
        corrections[token][1] if token in corrections else token
        for token in text.split()
    )

In [28]:
%%time
# Apply the dictionary-based correction to every note (plain token lookup, no model).
df.text_clean = df.text_clean.apply(spelling_correction)

CPU times: user 1.18 s, sys: 9.91 ms, total: 1.19 s
Wall time: 1.19 s


### Replace slang drug names

In [29]:
# Build the slang -> generic drug name lookup used by `slang_to_generic`.
# Both columns are stripped and lower-cased so lookups line up with the
# whitespace-split tokens of the notes.
df_drugs = (
    pd.read_csv("../data/spelling_correction/medication_names.csv")
    .assign(
        slang=lambda d: d["slang"].str.strip().str.lower(),
        generic_name=lambda d: d["generic_name"].str.strip().str.lower(),
    )
    # Keep complete pairs only: a slang term without a generic name would put a
    # float NaN into the mapping and later break ' '.join() in `slang_to_generic`.
    .dropna(subset=["slang", "generic_name"])
)

slang_names = dict(zip(df_drugs["slang"], df_drugs["generic_name"]))

In [30]:
def slang_to_generic(doc):
    """Swap slang drug names in a note for their generic equivalents.

    ``doc`` is split on whitespace and every token is looked up in the
    notebook-level ``slang_names`` mapping; tokens without an entry pass
    through unchanged. The result is re-joined with single spaces.
    """
    return ' '.join(slang_names.get(word, word) for word in doc.split())

In [31]:
%%time
# Replace slang drug names with generic names in every note.
df.text_clean = df.text_clean.apply(slang_to_generic)

CPU times: user 301 ms, sys: 9.04 ms, total: 310 ms
Wall time: 310 ms


### Save the dataset

In [32]:
# Write the fully cleaned notes; index=False keeps the row index out of the file.
df.to_csv("../data/epic2020_cleaned.csv", index=False)

### How much did this reduce the dimensionality?

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load three versions of the RMH data for comparison — judging by the file names:
# raw, tokenized without spelling correction, and fully cleaned.
df1 = pd.read_csv("../data/rmh_raw.csv")
df2 = pd.read_csv("../data/spelling_correction/rmh_nospellcorr.csv")
# NOTE(review): 466605 is an unexplained hard-coded row count, presumably chosen to
# align df2 with the other frames — confirm and derive it from the data instead.
df2 = df2[:466605]
df3 = pd.read_csv("../data/rmh_cleaned.csv")

In [35]:
# Vocabulary size of the original notes (`text` column).
vectorizer = CountVectorizer()
vectorizer.fit(df.text)
# `get_feature_names()` was removed in scikit-learn 1.2. The fitted `vocabulary_`
# mapping has exactly one entry per feature and is available in every version.
len(vectorizer.vocabulary_)

29778

In [36]:
# Vocabulary size of the cleaned notes (`text_clean` column).
vectorizer = CountVectorizer()
vectorizer.fit(df.text_clean)
# `get_feature_names()` was removed in scikit-learn 1.2. The fitted `vocabulary_`
# mapping has exactly one entry per feature and is available in every version.
len(vectorizer.vocabulary_)

26341

In [None]:
# Vocabulary size of the cleaned notes in `df3`.
vectorizer = CountVectorizer()
vectorizer.fit(df3.text_clean)
# `get_feature_names()` was removed in scikit-learn 1.2. The fitted `vocabulary_`
# mapping has exactly one entry per feature and is available in every version.
len(vectorizer.vocabulary_)

In [None]:
# NOTE(review): hard-coded figures, presumably vocabulary sizes copied from earlier
# outputs — confirm their source and compute the difference from variables instead.
100328 - 43887

In [None]:
# NOTE(review): hard-coded figures, presumably vocabulary sizes copied from earlier
# outputs — confirm their source and compute the difference from variables instead.
94326 - 60561

In [None]:
# Load the raw RMH data and its train / test / holdout files.
# Note: this rebinds `df`, replacing the cleaned frame used in the cells above.
df = pd.read_csv("../data/rmh_raw.csv")
df_train = pd.read_csv("../data/rmh_raw_train.csv")
df_test = pd.read_csv("../data/rmh_raw_test.csv")
df_ho = pd.read_csv("../data/rmh_raw_holdout.csv")

In [None]:
# (rows, columns) of the full frame and of each split, in load order.
df.shape, df_train.shape, df_test.shape, df_ho.shape

In [None]:
# Preview the first rows of the raw frame loaded above.
df.head()

In [None]:
def stats_print(df, column="text"):
    """Print basic corpus statistics for one text column of a DataFrame.

    Args:
        df: DataFrame holding one note per row.
        column: Name of the text column to analyse (default ``"text"``).

    Prints the vocabulary size, the total token count, the lexical diversity
    (vocabulary size divided by token count) and the number of rows. Tokens
    are whatever a default ``CountVectorizer`` extracts.
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df[column])

    vocabulary_size = X.shape[1]
    # One reduction over the document-term matrix; the earlier version computed
    # the same total twice with a redundant `.sum().sum()`.
    total_tokens = int(X.sum())
    lexical_diversity = vocabulary_size / total_tokens

    print("Vocabulary size: ", vocabulary_size)
    print("Total number of tokens: ", total_tokens)
    print("Lexical diversity: ", lexical_diversity)
    # Rows are triage notes; the previous label called them "reviews".
    print("Total number of notes:", df.shape[0])

In [None]:
# Corpus statistics for the full raw frame.
stats_print(df)

In [None]:
# Corpus statistics for the training file.
stats_print(df_train)

In [None]:
# Corpus statistics for the test file.
stats_print(df_test)

In [None]:
# Corpus statistics for the holdout file.
stats_print(df_ho)

In [None]:
# NOTE(review): hard-coded figures, presumably row counts of two of the splits above —
# confirm and compute from the frames' shapes instead.
312177 + 78045