In [2]:
import pandas as pd
import numpy as np
import re
import spacy
import en_core_med7_lg
import pickle
from spellchecker import SpellChecker
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from nltk.corpus import stopwords


pd.options.display.max_colwidth = 100

## Load data

In [17]:
# Read the prepared triage comments from CSV and preview the first rows.
df_raw = pd.read_csv("./data/wh_data_prepared.csv")
df_raw.head()

Unnamed: 0,description,text,SH,SI,SISH,comment,length
0,"MENTAL STATE - ALTERATION IN. ETOH AFFECTED, WANTS TO KILL SELF. HX MULTIPLE PRESENTATIONS WITH ...","ALTERNATES B/W AGGRESSION AND SETTLED, STATES NEEDS ADMISSION TO PSYCH. NO APPARENT INJURIES",0,1,1,"mental state - alteration in. etoh affected, wants to kill self. hx multiple presentations with ...",194
1,POISONING / OVERDOSE- 30MG DIAZEPAM+ETOH PHX RECENT DEATH IN FAMILY,"GCS12, PATENT AIRWAY, SPEAKS CLEARLY, INTOXICATED, APPEARS NEAT PRESENTATION",1,0,2,"poisoning / overdose- 30mg diazepam+etoh phx recent death in family. gcs12, patent airway, speak...",145
2,"MENTAL STATE - ALTERATION IN - FOUND LYING ON ROAD BY POLICE. ETOH EFFECTED CRYING, STATING WANT...",CRYING STATNG DOESN'T WANT TO BE HERE ANY MOR. STATES EX PARTNER ABUSED HER OVERNIGHT. FAMILY HA...,0,1,1,"mental state - alteration in - found lying on road by police. etoh effected crying, stating want...",269
3,"MENTAL STATE - SUICIDE ATTEMPT / RISK, PT WALKED IN FRONT OF CAR","RECEIVING CONGNITIVE THERAPY POST ENDING A TRAUMATIC R/SHIP 8/12 AGO, T 36.4, P 97 REG, STRONG R...",0,1,1,"mental state - suicide attempt / risk, pt walked in front of car. receiving congnitive therapy p...",396
4,"PT ACTING ABNORMALLY SINCE HAVING MISCARRIAGE 3/12 AGO, TODAY BEEN BITING RELATIVES, SCREAMING A...","PT CHATTING FREELY, HAS NO RECOLLECTION OF TODAYS EVENTS, STATES LAST MEMORY IS FIREWORKS AND DA...",0,0,0,"pt acting abnormally since having miscarriage 3/12 ago, today been biting relatives, screaming a...",297


In [18]:
# Keep only the first 100 rows (as an independent copy) for quick experiments.
# NOTE(review): this rebinds df_raw, so the full frame is gone until the load cell is re-run.
df_raw = df_raw[:100].copy()

## Examine the dataset for patterns

**Find and extract a pattern from the dataframe**

In [None]:
def find_pattern(df, pattern, context=True):
    """Print every occurrence of a regex ``pattern`` found in ``df.comment``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``comment``.
    pattern : str
        Regular expression to search for.
    context : bool, default True
        If True, each match is printed together with up to 10 characters of
        surrounding text on either side; otherwise only the matched text is
        printed.

    Returns
    -------
    None
        The row count and one line per matching row are written to stdout.
    """
    # na=False treats a missing comment as "no match"; without it the mask
    # contains NaN and cannot be used for boolean indexing.
    hits = df[df.comment.str.contains(pattern, na=False)]
    print("Found {} rows.".format(hits.shape[0]))

    if context:
        # ".{0,10}" instead of ".{10}": a match closer than 10 characters to
        # the start or end of a comment must still be reported. The
        # non-capturing group keeps alternations ("a|b") from binding to the
        # context wildcards.
        regex = re.compile(".{0,10}(?:" + pattern + ").{0,10}")
    else:
        regex = re.compile(pattern)

    for ind, row in hits.iterrows():
        print(ind, ": ", regex.findall(row.comment))

In [None]:
%%time
# NOTE(review): df_new is first assigned in the later "Create a preprocessing pipeline"
# section, so this cell fails on a fresh top-to-bottom run.
find_pattern(df_new, "aggi")

In [None]:
%%time
# Look for "/7" followed by whitespace. Raw string so that "\s" reaches the
# regex engine without triggering Python's invalid-escape-sequence warning.
find_pattern(df_raw, r"/7\s", context=False)

In [None]:
# Pull a single comment to experiment on.
# NOTE(review): row label 34992 is presumably outside the 100-row slice taken above,
# so this lookup only works against the full dataset — confirm.
tmp = df_raw.loc[34992,'comment']
tmp

In [None]:
# Expand a multiplier "x" that follows a digit into the word "times".
# The digit is captured and re-emitted (same rule as in preprocess()); matching
# it with a non-capturing group would delete it from the text.
patter = re.compile(r"(\d)x")
tmp = patter.sub(r" \1 times ", tmp)
tmp

## Create a preprocessing pipeline

In [5]:
# Ordered (compiled pattern, replacement) rules applied by preprocess().
# Order matters: abbreviations that rely on punctuation are expanded before
# punctuation is stripped, and floats are abstracted before integers.
# The patterns are compiled once here rather than once per row.
_PREPROCESS_RULES = [
    ### Handling errors in WH data extraction
    # Remove "\x7f"
    (re.compile(r"\x7f"), r" "),

    # Disabled: remove ";" in the middle of a word
    # (re.compile(r"(\S);(\S)"), r"\1\2"),

    # Disabled: remove "[" in the middle of a word
    # (re.compile(r"(\w)\[(\w)"), r"\1\2"),

    # Remove what might mean "with"
    (re.compile(r"`/c"), r" "),

    ### General rules for punctuation
    # "l)" to "left"
    (re.compile(r"(\W)l\)"), r"\1 left "),

    # "r)" to "right"
    (re.compile(r"(\W)r\)"), r"\1 right "),

    # "can't" to "can not" (must run before the generic "n't" rule)
    (re.compile(r"can't"), r" can not "),

    # "n't" to "not"
    (re.compile(r"n't"), r" not "),

    # Drop 's, 'll, 've, 'm, 're
    (re.compile(r"'(s|ll|ve|m|re)(?!\w)"), r" "),

    # Remove ' in the middle of a word. Lookarounds make sure only the
    # apostrophe itself is deleted, never the letters on either side.
    (re.compile(r"(?<=\w)'(?=\w)"), r""),

    # Abstract float numbers
    (re.compile(r"\d+\.\d+"), r" float "),

    # "?" to "question"
    (re.compile(r"\?+"), r" question "),

    # "@" to "at"
    (re.compile(r"@"), r" at "),

    # "#" not followed by number to "fracture"
    (re.compile(r"#(?!\W?\d)"), r" fracture "),

    ### Unmix letters and digits
    # "x" before a digit to "times"
    (re.compile(r"x(?=\d)"), r" times "),
    # "x" after a digit to "times"; the digit is captured and re-emitted
    (re.compile(r"(\d)x"), r" \1 times "),

    # Abstract integers
    (re.compile(r"\d+"), r" integer "),

    # Remove the rest of non-alphanumeric and non-whitespace characters
    (re.compile(r"[^a-zA-Z\d\s]"), r" "),

    # Disabled: oxygen saturation
    # (re.compile(r"sat\so2|sat\s02|sao2|sa02|o2\s?sat[a-z]*|\D02\s?sat[a-z]*"), " oxygen saturation "),

    # Disabled: abstract dates
    # (re.compile(r"\d{1,2}\.\d{1,2}\.\d{2,4}"), r" NUM_DATE "),

    # Disabled: Glasgow Coma Scale response
    # (re.compile(r"e\dv\dm\d"), r" GCS_RESPONSE "),

    # Disabled: type 1 diabetes
    # (re.compile(r"t1dm"), r" type 1 diabetes "),

    # Disabled: type 2 diabetes
    # (re.compile(r"t2dm"), r" type 2 diabetes "),

    # Remove duplicated whitespace
    (re.compile(r"\s+"), r" "),
]


def preprocess(df):
    """Normalise the free-text triage comments in ``df.comment``.

    Each comment is lower-cased and then passed through the ordered regex
    rules in ``_PREPROCESS_RULES`` (artefact removal, abbreviation and
    punctuation expansion, number abstraction, whitespace collapsing).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``comment``.

    Returns
    -------
    list of str
        One cleaned comment per row, in row order. Whitespace is collapsed
        to single spaces but not stripped, so a cleaned comment can start or
        end with one space.
    """
    cleaned = []
    # Iterating the column directly avoids building a Series per row, which
    # is what DataFrame.iterrows() does.
    for ind, raw_comment in df.comment.items():

        # convert to lower case
        comment = raw_comment.lower()

        for pattern, replacement in _PREPROCESS_RULES:
            comment = pattern.sub(replacement, comment)

        ### Spelling correction
        # comment = correct_spelling(comment)

        cleaned.append(comment)

        # Progress indicator keyed on the row label.
        if ind % 10000 == 0:
            print(ind)

    return cleaned

In [6]:
%%time
# Work on a copy so df_raw keeps the original comments; only the comment
# column of df_new is replaced with the cleaned text.
df_new = df_raw.copy()
df_new.comment = preprocess(df_raw)

0
CPU times: user 30 ms, sys: 1 ms, total: 31 ms
Wall time: 30.6 ms


In [None]:
# Print the first five comments before (df_raw) and after (df_new) preprocessing.
for i in range(5):
    print("***Comment {}***".format(i))
    print(df_raw.comment[i], "\n", df_new.comment[i], "\n")

## Save the new dataset

In [None]:
# Write the cleaned frame (without the description/text columns) to CSV.
# NOTE(review): this is the same path the "Load data" cell reads from, and df_new is derived
# from the 100-row slice of df_raw — running this overwrites the input file. Confirm intended.
df_new.drop(['description', 'text'], axis=1).to_csv("./data/" + "wh_data_prepared.csv", index=False)

## Negation

In [None]:
### 45 and 151

# NOTE(review): imports in the middle of the notebook; spacy is already imported in the first cell.
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from negspacy.negation import Negex

# Load the Med7 model by package name.
nlp = spacy.load("en_core_med7_lg")

# Negation detector configured with the "en_clinical" term set.
negex = Negex(nlp, language = "en_clinical")
# negex = Negex(nlp, language = "en_clinical", 
#               preceding_negations = ["denies", "no", "non", "not", "without", "unable"], 
#               termination = [".", "-", ";", "+", "and", "aox3", "but", "complains", "did", 
#                              "except", "has", "per", "pt", "reports", "secondary", "states"])

# Register negex as the final pipeline component.
# NOTE(review): passing a component object to add_pipe looks like the spaCy v2 calling
# convention — confirm the installed spaCy version.
nlp.add_pipe(negex, last=True)

In [None]:
# Run the complaint terms through the pipeline to get Doc objects usable as phrase patterns.
complaints = list(nlp.pipe(["INJURY", "MARKINGS"]))
matcher = PhraseMatcher(nlp.vocab)
# Register all patterns under the key "complaints".
# NOTE(review): the None second argument is presumably the spaCy v2 on_match slot — confirm.
matcher.add("complaints", None, *complaints)

def add_complaints(doc):
    """Pipeline component that tags phrase-matcher hits as "complaints" entities.

    Runs the module-level ``matcher`` over ``doc``, wraps every match in a
    ``Span`` labelled "complaints", adds those spans to ``doc.ents`` and
    returns the same ``doc`` so the pipeline can continue.
    """
    matches = matcher(doc)
    spans = [Span(doc, start, end, label="complaints") for match_id, start, end in matches]
    # doc.ents is a tuple, so "doc.ents += spans" (tuple + list) raises
    # TypeError; build a list first and assign it back.
    # NOTE(review): spaCy rejects overlapping entity spans — if a complaint can
    # overlap an entity from the NER step this assignment will raise.
    doc.ents = list(doc.ents) + spans
    return doc

In [None]:
# Insert the complaint tagger directly after the "ner" component, then show the pipeline order.
nlp.add_pipe(add_complaints, after="ner")
print(nlp.pipe_names)

In [None]:
# Sample sentences for the negation demo. Exactly one must be active, because
# the next cell reads `text`; swap the comment markers to try another one.
text = "Foot is red swollen warm to the touch, denies fever at home . Hx multiple infections"
# text = "She does not like houses but denies Apple at home."
# text = "A patient denied taking Magnesium hydroxide 400mg/5ml but had suspension PO of total 30ml bid for the next 5 days"


In [None]:
# Run the pipeline on the sample sentence.
doc = nlp(text)

# For every entity print its text, its label and the negation flag stored in the negex extension.
for ent in doc.ents:
    print(ent.text, ent.label_, ent._.negex)

## Create vocab using Med7

In [None]:
# Step 1: create a custom word frequency list

# Load the model trained on MIMIC data
nlp = en_core_med7_lg.load()

# Create an empty spellchecker object and
# initialise it with the words known to the model
spell_med7 = SpellChecker(language=None)
spell_med7.word_frequency.load_words(list(nlp.vocab.strings))

# Check to see how many unique and total words in the vocab
print(spell_med7.word_frequency.unique_words, spell_med7.word_frequency.total_words)

def add_to_vocab(comment):
    """Append the words of ``comment`` that ``spell_med7`` knows to ``vocab``.

    Splits the comment on whitespace and extends the module-level ``vocab``
    list with whatever ``spell_med7.known`` returns for those words.
    Returns None; the result is the side effect on ``vocab``.
    """
    words = comment.split()
    vocab.extend(spell_med7.known(words))

# Apply the function to each preprocessed triage comment.
# df_new is the frame holding the cleaned comments.
vocab = []
df_new.comment.apply(add_to_vocab)

# Save the vocab (a pickled list, despite the .txt extension)
with open('vocab.txt', 'wb') as f:
    pickle.dump(vocab, f)

In [None]:
# Step 2: spelling correction
# Load the custom word frequency list.
# The file is written by Step 1 of this notebook; pickle.load executes
# arbitrary code, so never point this at an untrusted file.
with open('vocab.txt', 'rb') as f:
    vocab = pickle.load(f)

# Create an empty spell checker object and
# initialise it with our vocabulary
spell = SpellChecker(language=None)
spell.word_frequency.load_words(vocab)

# Check to see how many unique and total words the custom checker holds
print(spell.word_frequency.unique_words, spell.word_frequency.total_words)

def correct_spelling(comment):
    """Return ``comment`` with words unknown to ``spell`` replaced by its best guess.

    The comment is split on whitespace; words that ``spell`` does not know are
    replaced with ``spell.correction(word)``, all other words are kept. The
    words are re-joined with single spaces.
    """
    words = comment.split()
    # Compute the unknown set once per comment instead of once per word.
    unknown = spell.unknown(words)
    # correction() can return None when it has no candidate; keep the
    # original word in that case so the join does not fail.
    corrected_words = [(spell.correction(word) or word) if word in unknown else word
                       for word in words]
    return ' '.join(corrected_words)

# Apply the function to each triage comment
df_new.comment = df_new.comment.apply(correct_spelling)

In [None]:
# Load the Med7 model; its vocabulary seeds the spell checker below.
nlp = en_core_med7_lg.load()

In [None]:
# Start from an empty dictionary (language=None) and load every string in the model's vocab.
spell_med7 = SpellChecker(language=None)
spell_med7.word_frequency.load_words(list(nlp.vocab.strings))

In [None]:
# (unique words, total words) loaded into the Med7-based checker.
spell_med7.word_frequency.unique_words, spell_med7.word_frequency.total_words

In [None]:
def add_to_vocab(comment):
    """Extend the module-level ``vocab`` list with the words of ``comment`` known to ``spell_med7``.

    The comment is split on whitespace. Nothing is returned; ``vocab`` is
    modified in place.
    """
    vocab.extend(spell_med7.known(comment.split()))

In [None]:
%%time
# Reset the accumulator, then collect the known words of every cleaned comment.
# add_to_vocab fills `vocab` as a side effect and returns None.
vocab = []
df_new.comment.apply(add_to_vocab)

In [None]:
# Persist the collected word list as a pickle (binary, despite the .txt extension).
with open('./data/vocab.txt', 'wb') as f:
    pickle.dump(vocab, f)

## Spelling correction

### Load vocab

In [7]:
# Load the pickled word list written by the vocab-building section.
# NOTE(review): pickle.load can execute arbitrary code — only load files this notebook produced.
with open ('./data/vocab.txt', 'rb') as f:
    vocab = pickle.load(f)

In [8]:
# Spell checker whose dictionary contains only the words from our own vocab list.
spell = SpellChecker(language=None)
spell.word_frequency.load_words(vocab)

In [9]:
# (unique words, total words) in the custom dictionary.
spell.word_frequency.unique_words, spell.word_frequency.total_words

(24771, 1772242)

In [None]:
# Word -> frequency mapping ordered from rarest to most frequent.
# NOTE(review): this displays the whole dictionary; consider slicing before display.
{k: v for k, v in sorted(spell.word_frequency.dictionary.items(), key=lambda item: item[1])}

In [None]:
# Number of dictionary words that were seen exactly once.
rare_tokens = sum(
    1 for frequency in spell.word_frequency.dictionary.values() if frequency == 1
)

rare_tokens

### Process comments

In [12]:
# Snapshot of the vocabulary as a set. `vocab` is a list with one entry per
# word occurrence, so "word not in vocab" scans the whole list for every word
# of every comment; a set makes each lookup constant time.
_VOCAB_WORDS = set(vocab)


def correct_spelling(comment):
    """Return ``comment`` with out-of-vocabulary words replaced by ``spell``'s best guess.

    The comment is split on whitespace; words present in the vocabulary are
    kept, every other word is replaced with ``spell.correction(word)``. The
    words are re-joined with single spaces.
    """
    # correction() can return None when it finds no candidate; fall back to
    # the original word so the join never sees None.
    return ' '.join(
        word if word in _VOCAB_WORDS else (spell.correction(word) or word)
        for word in comment.split()
    )

In [None]:
%%time
# Replace each cleaned comment with its spell-corrected version (overwrites df_new.comment).
df_new.comment = df_new.comment.apply(correct_spelling)

# df_new.drop(['description', 'text'], axis=1).to_csv("./data/" + "wh_data_corrected.csv", index=False)

## Find rare unigrams and common bigrams

In [145]:
# Load the training set and show its (rows, columns) shape.
df_train = pd.read_csv("./data/rmh_train.csv")
df_train.shape

(311544, 10)

In [146]:
def word_count(data, ngram_range=(1,1)):
    """Count n-gram occurrences across a collection of texts.

    Parameters
    ----------
    data : iterable of str
        Documents to tokenise. Tokens are maximal runs of non-whitespace
        characters (``token_pattern=r'\\S+'``); NLTK English stop words are
        removed.
    ngram_range : tuple of (int, int), default (1, 1)
        Smallest and largest n-gram size, passed to ``CountVectorizer``.

    Returns
    -------
    collections.Counter
        Maps each n-gram to its total count over all documents.
    """
    vectorizer = CountVectorizer(stop_words=stopwords.words('english'),
                                 ngram_range=ngram_range,
                                 token_pattern=r'\S+'
                                )
    vectors = vectorizer.fit_transform(data)
    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = list(vectorizer.get_feature_names_out())
    else:
        terms = list(vectorizer.get_feature_names())
    # Column sums of the document-term matrix, flattened to 1-D.
    counts = np.asarray(vectors.sum(axis=0)).ravel()
    return Counter(dict(zip(terms, counts)))

In [None]:
# Unigram counts over the comments.
# NOTE(review): `df` is not assigned in any visible cell — presumably df_new or df_train was meant; confirm.
unigrams = word_count(df.comment)
len(unigrams)

In [147]:
# Bigram counts over the `entities` column of the training set, and the number of distinct bigrams.
bigrams = word_count(df_train.entities, (2, 2))
len(bigrams)

955481

In [155]:
# Bigram counts as an integer array. The builtin `int` replaces the `np.int`
# alias, which has been removed from NumPy; the array is built once and reused.
bigram_counts = np.fromiter(bigrams.values(), dtype=int)

# Counts above the 99th percentile define the "most common" bigrams.
cut_off = np.quantile(bigram_counts, 0.99)
print("Cut-off:", cut_off)
n_bigrams = (bigram_counts > cut_off).sum()
print("%d most common bigrams" % n_bigrams)
most_common_bigrams = [item[0] for item in bigrams.most_common(n_bigrams)]

Cut-off: 55.0
9432 most common bigrams


['abdo pain',
 'chest pain',
 't2 dm',
 'phx nil',
 'nil loc',
 'sudden onset',
 'neck pain',
 'pain radiating',
 'pt expect',
 'pain nil',
 'pain worse',
 'pt states',
 'pmhx nil',
 'pain post',
 'o/a gcs',
 'flank pain',
 'left arm',
 'mechanical fall',
 'nil relief',
 'epigastric pain',
 'left sided',
 'pain phx',
 'hr regular',
 'central chest',
 'nausea vomiting',
 'productive cough',
 'pain +',
 'lower abdo',
 'n v',
 'altered sensation',
 'lower pain',
 'pain left',
 'self harm',
 'nil phx',
 'pain right',
 'pmh nil',
 'referred gp',
 'speaking sentences',
 'neurovasc intact',
 'pain swelling',
 'hx nil',
 'facial droop',
 'chest tightness',
 'shoulder pain',
 '+ v',
 'non distressed',
 'left leg',
 'n +',
 'nil pain',
 'km hr',
 'sore throat',
 'hs loc',
 'sao2 ra',
 'nil sob',
 'right sided',
 'post fall',
 'headstrike loc',
 'worse inspiration',
 'phx ht',
 'chest infection',
 'left shoulder',
 'blurred vision',
 'denies pain',
 'slurred speech',
 'nausea nil',
 'right hand',

In [133]:
# 99th percentile of the bigram counts (builtin `int` replaces the removed `np.int` alias).
cut_off = np.quantile(np.fromiter(bigrams.values(), dtype=int), 0.99)
print("Cut-off:", cut_off)

# rare_tokens = []

# # Cut off 89% if "<4" and 91.5% if "<=4"
# for token, count in bigrams.items():
#     if count <= cut_off:
#         rare_tokens.append(token)

# print("Number of rare tokens:", len(rare_tokens))
# print("Percentage of rare tokens: {:.2f}%".format(len(rare_tokens) / len(bigrams) * 100))

# for item in rare_tokens:
#     bigrams.pop(item)
    
# print("Number of tokens left:", len(bigrams))

Cut-off: 55.0


In [75]:
# All bigrams with their counts, most frequent first.
bigrams.most_common()

[('abdo pain', 21115),
 ('chest pain', 18004),
 ('t2 dm', 12190),
 ('phx nil', 11657),
 ('nil loc', 8194),
 ('sudden onset', 7922),
 ('neck pain', 7455),
 ('pain radiating', 6946),
 ('pt expect', 6885),
 ('pain nil', 6635),
 ('pain worse', 6346),
 ('pt states', 6232),
 ('pmhx nil', 5684),
 ('pain post', 5098),
 ('o/a gcs', 5054),
 ('flank pain', 4972),
 ('left arm', 4873),
 ('mechanical fall', 4431),
 ('nil relief', 4430),
 ('epigastric pain', 4336),
 ('left sided', 4322),
 ('pain phx', 4300),
 ('hr regular', 4261),
 ('central chest', 4152),
 ('nausea vomiting', 4108),
 ('productive cough', 4084),
 ('pain +', 4043),
 ('lower abdo', 4025),
 ('n v', 3788),
 ('altered sensation', 3776),
 ('lower pain', 3740),
 ('pain left', 3638),
 ('self harm', 3402),
 ('nil phx', 3401),
 ('pain right', 3397),
 ('pmh nil', 3388),
 ('referred gp', 3385),
 ('speaking sentences', 3366),
 ('neurovasc intact', 3309),
 ('pain swelling', 3289),
 ('hx nil', 3258),
 ('facial droop', 3146),
 ('chest tightness', 31

In [79]:
# Number of training rows whose `entities` text contains the substring "hx".
df_train.entities.str.contains("hx").sum()

155483

In [None]:
# Histogram of bigram counts, zoomed to counts between 20,000 and 30,000.
# NOTE(review): `plt` is not imported in any visible cell — presumably matplotlib.pyplot;
# confirm and add the import to the first cell.
plt.hist(bigrams.values(), bins=100);
plt.xlim([20000, 30000]);
plt.ylim([0, 10]);

## Synonyms

In [38]:
# Number of rows containing a literal "+". Raw string so the regex escape
# is not parsed as an (invalid) Python string escape.
df_train.entities.str.contains(r"\+").sum()

24243

In [39]:
# Rows containing a literal "+". Raw string so the regex escape is not
# parsed as an (invalid) Python string escape.
df_train.entities[df_train.entities.str.contains(r"\+")]

27                                 playing soccer knee locked pain + fall ground nil hs tx penthrane morphine
73        represent alert intermittent pic seen yesterday referred gp today ongoing l lateral pic + fever ...
77                                   etoh + drugs unsure hours fell taxi nil headstrike loc feeling weak post
80        woke feeling nauseated following tooth extraction thurs a/w slight ^ lower jaw swelling + shakes...
87                                                                     dizziness + unsteady gait ph pacemaker
                                                         ...                                                 
311513                     found street av aggressive + confronting people street pmh unknown known john cade
311514    fall home r/t ms + mobility aids hit head shoulder loc incomplete recall events haematoma head s...
311525                 etoh foosh left arm slight numbness fingers pain + limited movement swelling elbow phx
311528    

In [None]:
# NOTE(review): import in the middle of the notebook, and the path points outside this
# notebook's ./data directory — confirm the file location.
import json

# Load the synonyms file.
with open('../Final/data/synonym.json') as json_file:
    synonyms = json.load(json_file)

In [None]:
# Display the loaded synonyms.
synonyms

## Character n-grams w/o preprocessing

In [46]:
# NOTE(review): `i` is whatever value an earlier loop left in the kernel; this looks like
# leftover interactive debugging and does not reproduce on a fresh run.
i

45432

In [None]:
# TF-IDF over character n-grams (2 to 4 characters) of the raw, lower-cased comments.
# stop_words is not passed: scikit-learn applies stop words only when
# analyzer == 'word', so with analyzer='char' the argument had no effect.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2,4))
vectors = vectorizer.fit_transform(df_raw.comment.str.lower())

In [None]:
# Size of the fitted character n-gram vocabulary.
# NOTE(review): this rebinds the global `vocab`, which earlier cells use for the pickled word list.
vocab = vectorizer.vocabulary_
len(vocab)

## Tokenize

In [None]:
# Word-level TF-IDF over the preprocessed comments, with NLTK English stop words removed.
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
vectors = vectorizer.fit_transform(df_new.comment)

In [None]:
# Size of the fitted word vocabulary.
# NOTE(review): this rebinds the global `vocab`, which earlier cells use for the pickled word list.
vocab = vectorizer.vocabulary_
len(vocab)