In [1]:
import pandas as pd
import re
import spacy
from spellchecker import SpellChecker
import pickle
import time
import utility_functions as utils

In [2]:
df = pd.read_csv("./data/wh_data_raw.csv")
df.head()

Unnamed: 0,SISH,SI,SH,length,text
0,1,1,0,194,"MENTAL STATE - ALTERATION IN. ETOH AFFECTED, W..."
1,2,0,1,145,POISONING / OVERDOSE- 30MG DIAZEPAM+ETOH PHX R...
2,1,1,0,269,MENTAL STATE - ALTERATION IN - FOUND LYING ON ...
3,1,1,0,396,"MENTAL STATE - SUICIDE ATTEMPT / RISK, PT WALK..."
4,0,0,0,297,PT ACTING ABNORMALLY SINCE HAVING MISCARRIAGE ...


In [None]:
utils.find_pattern(df, "text", "\.mainta")

In [3]:
def preprocess(text):
    
    # Convert to lower case
    text = text.lower()
    
    # Remove "\x7f"
    pattern = re.compile(r"\x7f")
    text = pattern.sub(r" ", text)
    # "`/c" to "with"
    pattern = re.compile("`/c")
    text = pattern.sub(r" ", text)
    # Remove backslashes
    pattern = re.compile("\d\\\\\d")
    text = pattern.sub(r"/", text)
    pattern = re.compile("\\\\")
    text = pattern.sub(r" ", text)
    # Fix "patientexpect"
    pattern = re.compile(r"patientexpect")
    text = pattern.sub(r" patient expect ", text)
    
    # "l)" to "left"
    pattern = re.compile("(^|\W)l\)")
    text = pattern.sub(r"\1 left ", text)
    # "r)" to "right"
    pattern = re.compile("(^|\W)r\)")
    text = pattern.sub(r"\1 right ", text)
    # "@" to "at"
    pattern = re.compile("@")
    text = pattern.sub(r" at ", text)
    # "#" to "fractured" if not followed by number
    pattern = re.compile("#(?!\d)")
    text = pattern.sub(r" fracture ", text)
    # "+ve" to "positive"
    pattern = re.compile("\+ve(?![a-z])")
    text = pattern.sub(r" positive ", text)
    # "-ve" to "positive"
    pattern = re.compile("\-ve(?![a-z])")
    text = pattern.sub(r" negative ", text)
    # "co operative" and "co-operative" to "cooperative"
    pattern = re.compile("co\sop|co-op")
    text = pattern.sub(r"coop", text)
    # "r/ship" to relationship
    pattern = re.compile("r/ships?")
    text = pattern.sub(r" relationship ", text)
    # Remove "+" after digit
    pattern = re.compile("(\d)\+")
    text = pattern.sub(r"\1 ", text)
    
    # Remove parentheses
    pattern = re.compile("\((.*)\)[,\.]?")
    text = pattern.sub(r" , \1, ", text)
    # Remove curly brackets
    pattern = re.compile("\((.*)\)")
    text = pattern.sub(r" . \1. ", text)
    
    # 1. Replace "preg" by "pregnant"
    pattern = re.compile("preg$|preg\.?(\W)")
    text = pattern.sub(r" pregnant \1", text)
    
    # 2. Replace "reg" by "regular"
    pattern = re.compile("irreg$|irreg\.?(\W)")
    text = pattern.sub(r" irregular \1", text)
    pattern = re.compile("reg$|reg\.?(\W)")
    text = pattern.sub(r" regular \1", text)
    
    # 3. Normalise respiratory rate
    pattern = re.compile("([^a-z])rr(?![a-z])|resp\srate|resp\W?(?=\d)")
    text = pattern.sub(r"\1 rr ", text)
    
    # 4. Normalise oxygen saturation
    pattern = re.compile("sp\s?[o0]2|sp2|spo02|sa\s?[o0]2|sats?\W{0,3}(?=\d)")
    text = pattern.sub(r" sao2 ", text) 
    pattern = re.compile("([^a-z])sp\W{0,3}(?=[19])")
    text = pattern.sub(r"\1 sao2 ", text)
    
    # 5. Normilise temperature
    pattern = re.compile("([^a-z])t(emp)?\W{0,3}(?=[34]\d)")
    text = pattern.sub(r"\1 temp ", text)

    # 6. Normalise hours
    pattern = re.compile("([^a-z])hrs|([^a-z])hours")
    text = pattern.sub(r"\1 hours ", text)
     
    # 7. Normalise heart rate
    pattern = re.compile("([^a-z])hr(?![a-z])")
    text = pattern.sub(r"\1 hr ", text)
    
    # 8. Normalise GCS
    pattern = re.compile("gsc")
    text = pattern.sub(r"gcs", text)
    
    # 9. Normalise on arrival
    pattern = re.compile("o/a|on arrival|on assessment")
    text = pattern.sub(r" o/a ", text)

    # Add spaces around "bp"
    pattern = re.compile("([^a-z])bp(?![a-z])")
    text = pattern.sub(r"\1 bp ", text)
    
    # Add spaces around "bmp", "bsl", "gcs"
    pattern = re.compile("(bpm|bsl|gcs)")
    text = pattern.sub(r" \1 ", text)
    
    # Remove duplicated punctuation marks [-/+_,?.] and spaces
    pattern = re.compile("-{2,}")
    text = pattern.sub(r"-", text)
    pattern = re.compile("/{2,}")
    text = pattern.sub(r"/", text)
    pattern = re.compile("\+{2,}")
    text = pattern.sub(r"+", text)
    pattern = re.compile("_{2,}")
    text = pattern.sub(r"_", text)
    pattern = re.compile(",{2,}")
    text = pattern.sub(r",", text)  
    pattern = re.compile("\?{2,}")
    text = pattern.sub(r"?", text)
    pattern = re.compile("\.{2,}")
    text = pattern.sub(r".", text)
    pattern = re.compile("\s{2,}")
    text = pattern.sub(r" ", text)
    
    return text

In [4]:
%%time
# Preprocess comments
df['text_clean'] = df.text.apply(preprocess)

CPU times: user 10.9 s, sys: 32.8 ms, total: 10.9 s
Wall time: 11 s


### Scispacy tokenizer

In [5]:
# Load scispacy model for tokenization
nlp = spacy.load("en_core_sci_sm", disable=['tagger', 'parser', 'ner'])

In [6]:
%%time
df['tok1'] = list(nlp.pipe(df.text_clean))

CPU times: user 11.7 s, sys: 496 ms, total: 12.2 s
Wall time: 12.2 s


### Custom tokenizer

In [None]:
# Load custom tokenizer from the file
from custom_tokenizer import combined_rule_tokenizer
nlp.tokenizer = combined_rule_tokenizer(nlp)

In [15]:
# Alternatively, use the code below
from typing import List

from spacy.lang import char_classes
from spacy.symbols import ORTH
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
from spacy.language import Language

def remove_new_lines(text: str) -> str:
    """Used to preprocess away new lines in the middle of words. This function
       is intended to be called on a raw string before it is passed through a
       spaCy pipeline
    @param text: a string of text to be processed
    """
    text = text.replace("-\n\n", "")
    text = text.replace("- \n\n", "")
    text = text.replace("-\n", "")
    text = text.replace("- \n", "")
    return text


def combined_rule_prefixes() -> List[str]:
    """Helper function that returns the prefix pattern for the tokenizer.
    It is a helper function to accomodate spacy tests that only test
    prefixes.
    """
    # add lookahead assertions for brackets (may not work properly for unbalanced brackets)
    prefix_punct = char_classes.PUNCT.replace("|", " ")
    prefix_punct = prefix_punct.replace(r"\(", r"\((?![^\(\s]+\)\S+)")
    prefix_punct = prefix_punct.replace(r"\[", r"\[(?![^\[\s]+\]\S+)")
    prefix_punct = prefix_punct.replace(r"\{", r"\{(?![^\{\s]+\}\S+)")

    prefixes = (
        ["§", "%", "=", r"\+", "-", "/"]
        + ["\.(?![0-9])"]
        + char_classes.split_chars(prefix_punct)
        + char_classes.LIST_ELLIPSES
        + char_classes.LIST_QUOTES
        + char_classes.LIST_CURRENCY
        + char_classes.LIST_ICONS
    )
    return prefixes


def combined_rule_tokenizer(nlp: Language) -> Tokenizer:
    """Creates a custom tokenizer on top of spaCy's default tokenizer. The
    intended use of this function is to replace the tokenizer in a spaCy
    pipeline like so:
         nlp = spacy.load("some_spacy_model")
         nlp.tokenizer = combined_rule_tokenizer(nlp)
    @param nlp: a loaded spaCy model
    """
    # remove the first hyphen to prevent tokenization of the normal hyphen
    hyphens = char_classes.HYPHENS.replace("-|", "", 1)

    infixes = (
        char_classes.LIST_ELLIPSES
        + char_classes.LIST_ICONS
        + [
            r"×",  # added this special x character to tokenize it separately
            r"(?<=[0-9])[+\-\*^](?=[0-9-])",
            r"(?<=[{al}])\.(?=[{au}])".format(
                al=char_classes.ALPHA_LOWER, au=char_classes.ALPHA_UPPER
            ),
            r"(?<=[{a}]),(?=[{a}])".format(a=char_classes.ALPHA),
            r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(
                a=char_classes.ALPHA, h=hyphens
            ),
            # removed / to prevent tokenization of /
            r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=char_classes.ALPHA),
        ]
    )

    prefixes = combined_rule_prefixes()

    # add the last apostrophe
    quotes = char_classes.LIST_QUOTES.copy() + ["’"]

    # add lookbehind assertions for brackets (may not work properly for unbalanced brackets)
    suffix_punct = char_classes.PUNCT.replace("|", " ")
    # These lookbehinds are commented out because they are variable width lookbehinds, and as of spacy 2.1,
    # spacy uses the re package instead of the regex package. The re package does not support variable width
    # lookbehinds. Hacking spacy internals to allow us to use the regex package is doable, but would require
    # creating our own instance of the language class, with our own Tokenizer class, with the from_bytes method
    # using the regex package instead of the re package
    # suffix_punct = suffix_punct.replace(r"\)", r"(?<!\S+\([^\)\s]+)\)")
    # suffix_punct = suffix_punct.replace(r"\]", r"(?<!\S+\[[^\]\s]+)\]")
    # suffix_punct = suffix_punct.replace(r"\}", r"(?<!\S+\{[^\}\s]+)\}")

    suffixes = (
        char_classes.split_chars(suffix_punct)
        + char_classes.LIST_ELLIPSES
        + quotes
        + char_classes.LIST_ICONS
        + ["'s", "'S", "’s", "’S", "’s", "’S"]
        + ["-", "/", "\+"]
        + [
            r"(?<=[0-9])\+",
            r"(?<=°[FfCcKk])\.",
            r"(?<=[0-9])(?:{})".format(char_classes.CURRENCY),
            # this is another place where we used a variable width lookbehind
            # so now things like 'H3g' will be tokenized as ['H3', 'g']
            # previously the lookbehind was (^[0-9]+)
            r"(?<=[0-9])(?:{u})".format(u=char_classes.UNITS),
            r"(?<=[0-9{}{}(?:{})])\.".format(
                char_classes.ALPHA_LOWER, r"%²\-\)\]\+", "|".join(quotes)
            ),
            # add |\d to split off the period of a sentence that ends with 1D.
            r"(?<=[{a}|\d][{a}])\.".format(a=char_classes.ALPHA_UPPER),
        ]
    )

    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(prefixes)
    suffix_re = compile_suffix_regex(suffixes)

    tokenizer_exceptions = nlp.Defaults.tokenizer_exceptions.copy()

    tokenizer = Tokenizer(
        nlp.vocab,
        tokenizer_exceptions,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=nlp.tokenizer.token_match,
    )
    return tokenizer

In [9]:
%%time
df['tok2'] = list(nlp.pipe(df.text_clean))

CPU times: user 10.3 s, sys: 473 ms, total: 10.8 s
Wall time: 10.8 s


In [10]:
# Inspect tokens
# i = 29681 # "-daughter"
# i = 11170 # "/risk"
# i = 24789 # "351-", "/harm"
# i = 689 # "domestic/"
# i = 137 # ".195"
# i = 4212 # ".maintaining"
i = 53393
for t in df.loc[i, 'tok1']:
    print(t)

mental
state
-
alteration
in-
{
patient
expect
:
paranoid
,
increasing
thouights
of
being
controlled
,
prev
admissions
to
saapu
with
same
.
lmo
will
speak
to
emh
directlyprev
drug
abuse
}
smoked
ice
tonight
.
a+o
,
nwob
rr
18
,
sao2
99
%
hr
131
,
very
elevated
,
si
and
intent
sh
but
nil
plan
,
"
plan
to
get
better
"
good
eye
contact
,
well
dressed
.
appears
paranoid
.
auditry
hallucinations
,
p/w/d
,
strong
radial
.
gcs
15
.


In [11]:
for t in df.loc[i, 'tok2']:
    print(t)

mental
state
-
alteration
in
-
{
patient
expect
:
paranoid
,
increasing
thouights
of
being
controlled
,
prev
admissions
to
saapu
with
same
.
lmo
will
speak
to
emh
directlyprev
drug
abuse
}
smoked
ice
tonight
.
a+o
,
nwob
rr
18
,
sao2
99
%
hr
131
,
very
elevated
,
si
and
intent
sh
but
nil
plan
,
"
plan
to
get
better
"
good
eye
contact
,
well
dressed
.
appears
paranoid
.
auditry
hallucinations
,
p/w/d
,
strong
radial
.
gcs
15
.


In [12]:
# Check for any unforeseen changes
df.index[df.apply(lambda x: len(x.tok1) != len(x.tok2), axis=1)]

Int64Index([    1,    20,    23,    24,    25,    29,    31,    40,    65,
               78,
            ...
            53365, 53376, 53378, 53380, 53381, 53384, 53388, 53389, 53390,
            53393],
           dtype='int64', length=11577)

In [13]:
df.shape

(53394, 8)