In [1]:
import numpy as np
import pandas as pd
import json
from pathlib import Path
import re
import spacy

In [2]:
medical_terms = {
    "&": "and",
    "***": "",
    ">>": "clinical diagnosis",
    "@": "at",
    "+": "with",
    "?": "unknown if",
    "abd": "abdomen",
    "af": "accidental fall",
    "afib": "atrial fibrillation",
    "aki": "acute kidney injury",
    "am": "morning",
    "ams": "altered mental status",
    "bac": "blood alcohol content",
    "bal": "blood alcohol level,",
    "biba": "brought in by ambulance",
    "c/o": "complains of",
    "chi": "closed-head injury",
    "clsd": "closed",
    "cpk": "creatine phosphokinase",
    "cva": "cerebral vascular accident",
    "dx": "clinical diagnosis",
    "ecf": "extended-care facility",
    "er": "emergency room",
    "etoh": "ethyl alcohol",
    "eval": "evaluation",
    "fd": "fall detected",
    "ft": "foot",
    "fx": "fracture",
    "fxs": "fractures",
    "glf": "ground level fall",
    "h/o": "history of",
    "htn": "hypertension",
    "hx": "history of",
    "inj": "injury",
    "inr": "international normalized ratio",
    "intox": "intoxication",
    "l": "left",
    "lac": "laceration",
    "loc": "loss of consciousness",
    "lt": "left",
    "mech": "mechanical",
    "mult": "multiple",
    "n.h.": "nursing home",
    "nh": "nursing home",
    "p/w": "presents with",
    "pm": "afternoon",
    "pt": "patient",
    "pta": "prior to arrival",
    "pts": "patient's",
    "px": "physical examination", # not "procedure",
    "r": "right",
    "r/o": "rules out",
    "rt": "right",
    "s'd&f": "slipped and fell",
    "s/p": "after",
    "sah": "subarachnoid hemorrhage",
    "sdh": "acute subdural hematoma",
    "sts": "sit-to-stand",
    "t'd&f": "tripped and fell",
    "tr": "trauma",
    "uti": "urinary tract infection",
    "w/": "with",
    "w/o": "without",
    "wks": "weeks"
}

In [3]:
def add_space_after(text, target=","):
    replaced_text = re.sub(r'({})(\S)'.format(re.escape(target)), r'{} \2'.format(target),text)
    return replaced_text

def add_space_before(text, target=","):
    replaced_text = re.sub(r'(\S)({})'.format(re.escape(target)), r'\1 {}'.format(target), text)
    return replaced_text

# cleanning
def clean_narrative(text):
    # lowercase everything
    if pd.isna(text):
        return "NA"
    else:     
        text = text.lower()
        
        # unglue DX
        regex_dx = r"([ˆ\W]*(dx)[ˆ\W]*)"
        text = re.sub(regex_dx, r". dx: ", text)
    
        # remove age and sex identifications
        ## regex to capture age and sex (not perfect but captures almost all of the cases)
        regex_age_sex = r"(\d+)\s*?(yof|yf|yo\s*female|yo\s*f|yom|ym|yo\s*male|yo\s*m)"
        age_sex_match = re.search(regex_age_sex, text)
    
        ## format age and sex
        if age_sex_match:
            age = age_sex_match.group(1)
            sex = age_sex_match.group(2)
            
            # probably not best practice but it works with this data
            if "f" in sex:
                #text = text.replace(age_sex_match.group(0), f"{age} years old female")
                text = text.replace(age_sex_match.group(0), f"patient")
            elif "m" in sex:
                #text = text.replace(age_sex_match.group(0), f"{age} years old male")
                text = text.replace(age_sex_match.group(0), f"patient")
        
        # translate medical terms
        for term, replacement in medical_terms.items():
            if term == "@" or term == ">>" or term == "&" or term == "***" or term == "+" or term == "?":
                pattern = fr"({re.escape(term)})"
                text = re.sub(pattern, f" {replacement} ", text) # force spaces around replacement
                
            else:
                pattern = fr"(?<!-)\b({re.escape(term)})\b(?!-)"
                text = re.sub(pattern, replacement, text)
                     
        text = add_space_after(text, target=",") 
        text = add_space_after(text, target=";") 
        text = add_space_after(text, target=":") 
        text = add_space_before(text, target="--")  
        text = add_space_after(text, target="--")
        text = add_space_before(text, target="/")  
        text = add_space_after(text, target="/")
        text = re.sub("-", " ", text)
        text = re.sub("\.", " ", text)
        text = re.sub(r"([0-9]+(\.[0-9]+)?)",r" \1 ", text).strip()
        # remove extra white spaces
        
        text = re.sub(r'\s+', ' ', text)
    
        return text.upper()

### load variable mapping

In [4]:
with Path("variable_mapping.json").open("r") as f:
    mapping = json.load(f, parse_int=True)

In [5]:
# convert the encoded values in the mapping to integers since they get read in as strings
for c in mapping.keys():
    mapping[c] = {int(k): v for k, v in mapping[c].items()}

### load primary data

In [6]:
df = pd.read_csv(
    "corrected_narrative_primary.csv",
    # set columns that can be null to nullable ints
    dtype={"body_part_2": "Int64", "diagnosis_2": "Int64"},
)
df.head()

Unnamed: 0,cpsc_case_number,narrative,treatment_date,age,sex,race,other_race,hispanic,diagnosis,other_diagnosis,...,body_part,body_part_2,disposition,location,fire_involvement,alcohol,drug,product_1,product_2,product_3
0,190103269,PATIENT FELL TO THE FLOOR AT THE NURSING HOME ...,2019-01-01,94,1,0,,0,62,,...,75,,4,5,0,0,0,1807,0,0
1,190103270,PATIENT FELL IN THE SHOWER AT HOME AND SUSTAIN...,2019-01-01,86,1,0,,0,62,,...,75,,4,1,0,0,0,611,0,0
2,190103273,PATIENT WAS GETTING UP FROM THE COUCH AND FELL...,2019-01-01,87,2,0,,0,53,,...,32,,4,1,0,0,0,679,1807,0
3,190103291,PATIENT WAS AT A FRIENDS HOUSE AND SLIPPED ON ...,2019-01-01,67,2,0,,0,57,,...,33,,1,1,0,0,0,1807,0,0
4,190103294,PATIENT WAS STANDING ON A STEP STOOL AND FELL ...,2019-01-01,70,2,0,,0,57,,...,33,,1,1,0,0,0,620,0,0


### replace numeric values with corresponding strings

In [7]:
decoded_df = df.copy()

for col in mapping.keys():
    decoded_df[col] = decoded_df[col].map(mapping[col])

In [8]:
# ensure mappings were applied correctly by checking that the number of missing values did not change
assert (decoded_df.isnull().sum() == df.isnull().sum()).all()

In [9]:
random_sample = decoded_df.sample(n=5000, replace=False, random_state=42)

In [10]:
random_sample_sub = random_sample.iloc[:,0:2]

In [11]:
random_sample_sub['narrative'] = random_sample_sub['narrative'].apply(lambda x: clean_narrative(x))

In [12]:
random_sample_sub.head()

Unnamed: 0,cpsc_case_number,narrative
80147,220302739,PATIENT WAS SITTING IN A WHEELCHAIR LEANING FO...
48158,210101788,PATIENT WAS PLAYING WITH HER GRANDCHILD WHEN S...
112225,221250671,PATIENT TRIPPED GETTING OUT OF AN ELEVATOR AND...
50792,210212098,PATIENT SLIPPED ON NYLON SOCKS WHILE COMING OU...
115089,230214236,PATIENT PRESENTS AFTER A SYNCOPAL EPISODE WITH...


# NLP

In [13]:
print(spacy.__version__)

3.6.1


# My Model

Corrected data

In [14]:
fall = pd.read_csv('fall_set4_corrected_narrative.csv')

In [15]:
fall.head()

Unnamed: 0,key_entry,span,label
0,PATIENT HISTORY OF OF DEMENTIA AFTER POSSIBLE ...,FRAME FELL ON HER NOSE,AO
1,PATIENT WITH FALL FROM BED CLINICAL DIAGNOSIS:...,FALL FROM BED,CHR
2,PATIENT FROM HOME WITH A FALL FROM HIS BED TO ...,FALL FROM HIS BED,CHR
3,PATIENT COMPLAINS OF INJURY TO HEAD / FACE AFT...,FALL OFF THE TOLIET,CHR
4,PATIENT FALL OFF TOILET ONTO FLOOR AND COMPLAI...,FALL OFF TOILET,CHR


In [16]:
# Replace leading whitespaces using a regular expression
fall['span'] = fall['span'].str.replace(r'^\s+', '', regex=True)

In [17]:
fall_nas = fall[fall['label'].isna()]
len(fall_nas)

60

In [18]:
fall_labeled = fall[fall['label'].isna() == False]

In [19]:
fall_labeled.head(2)

Unnamed: 0,key_entry,span,label
0,PATIENT HISTORY OF OF DEMENTIA AFTER POSSIBLE ...,FRAME FELL ON HER NOSE,AO
1,PATIENT WITH FALL FROM BED CLINICAL DIAGNOSIS:...,FALL FROM BED,CHR


In [20]:
all_labels = fall_labeled.label.unique()
all_labels

array(['AO', 'CHR', 'TRS', 'COL', 'DIZ', 'LAD', 'MED', 'OBJ', 'RCH', 'SF',
       'SHW', 'SO', 'STR', 'SU', 'WT'], dtype=object)

In [21]:
labels_to_use = ['CHR', 'TRS', 'LAD', 'OBJ', 'RCH', 'SF', 'SHW', 'SO', 'STR', 'SU', 'WT']
# labels_to_use = ['SO']

In [22]:
fall_labeled_sub = fall_labeled[fall_labeled['label'].isin(labels_to_use)]

In [23]:
key_list = fall_labeled_sub.key_entry.unique()
len(key_list)

355

In [24]:
fall_labeled[fall_labeled.key_entry == key_list[1]]

Unnamed: 0,key_entry,span,label
2,PATIENT FROM HOME WITH A FALL FROM HIS BED TO ...,FALL FROM HIS BED,CHR


In [25]:
nlp = spacy.blank("en")
# special_case = [{Token.ORTH: "CLOSED-HEAD"}]
# nlp.tokenizer.add_special_case("CLOSED-HEAD", special_case)

from spacy.tokens import Span

# keeping span token lengths to appropriately set config
token_lengths = []
label_list = []

docs=[] # this will hold the processed strings and spans
for k in key_list:
    doc = nlp(k)
    temp_df = fall_labeled_sub[fall_labeled_sub.key_entry == k]

    if len(temp_df)==1:              
        span_text = temp_df.iloc[0,1]
        temp_label = temp_df.iloc[0,2]       
        span_start_char = k.find(span_text)
        span_end_char = span_start_char + len(span_text)

        # Finding the start and end tokens using character offsets
        start_token = None
        end_token = None
        for token in doc:
            if token.idx == span_start_char:
                 start_token = token.i
            if token.idx + len(token.text) == span_end_char:
                end_token = token.i
                break
        if start_token is not None and end_token is not None:
            temp_start = start_token
            temp_end = end_token + 1
            doc.spans["sc"] = [Span(doc, temp_start, temp_end, temp_label)]
            docs.append(doc)
            token_lengths.append(temp_end - temp_start)
            label_list.append(temp_label)
        else:
            print(k, "span=", span_text,"couldn't find tokens")
    else:
        print("temp_df has length > 1")
        span_list = []
        for ent in range(len(temp_df)):
            span_text = temp_df.iloc[ent,1]
            temp_label = temp_df.iloc[ent,2]
            span_start_char = k.find(span_text)
            span_end_char = span_start_char + len(span_text)
            print(span_start_char, span_end_char)

            # Finding the start and end tokens using character offsets
            start_token = None
            end_token = None
            for token in doc:
                if token.idx == span_start_char:
                     start_token = token.i
                if token.idx + len(token.text) == span_end_char:
                    end_token = token.i
                    break
            if start_token is not None and end_token is not None:
                temp_start = start_token
                temp_end = end_token + 1             
                span_list.append(Span(doc, temp_start, temp_end, temp_label))
                token_lengths.append(temp_end - temp_start)
                label_list.append(temp_label)
            else:
                print(k, "span=",span_text, "couldn't find tokens")
        
        doc.spans["sc"] = span_list
    docs.append(doc)
          

temp_df has length > 1
25 40
41 64
temp_df has length > 1
25 40
83 107
temp_df has length > 1
27 45
50 78
temp_df has length > 1
72 94
165 201
temp_df has length > 1
38 72
73 103
temp_df has length > 1
85 98
110 122
temp_df has length > 1
28 41
42 57
temp_df has length > 1
8 21
22 39
temp_df has length > 1
8 24
45 54
temp_df has length > 1
16 35
36 62
temp_df has length > 1
8 27
28 45
temp_df has length > 1
8 23
91 132
temp_df has length > 1
8 23
24 34
temp_df has length > 1
8 23
24 49
temp_df has length > 1
58 73
78 99
temp_df has length > 1
9 24
51 64
temp_df has length > 1
8 23
39 47
temp_df has length > 1
8 23
42 51
temp_df has length > 1
32 49
61 73
temp_df has length > 1
9 31
122 133
temp_df has length > 1
37 51
61 75
temp_df has length > 1
35 52
9 30
53 89
temp_df has length > 1
8 25
43 64
temp_df has length > 1
62 79
81 96
temp_df has length > 1
13 25
26 39
temp_df has length > 1
31 48
84 101
temp_df has length > 1
24 34
56 70
temp_df has length > 1
19 36
37 49
temp_df has leng

In [26]:
len(docs)

568

In [27]:
print(np.min(token_lengths), np.max(token_lengths), np.median(token_lengths))

2 13 4.0


In [28]:
np.quantile(token_lengths, q =[0.05,0.95])

array([2., 7.])

In [29]:
pd.unique(label_list)

  pd.unique(label_list)


array(['CHR', 'SF', 'SO', 'SU', 'RCH', 'WT', 'TRS', 'LAD', 'OBJ', 'STR',
       'SHW'], dtype=object)

Make training and test sets with the docs

In [30]:
from spacy.tokens import DocBin

In [31]:
doc_bin = DocBin(docs=docs[0:400])

In [32]:
doc_bin.to_disk("./testtrain_230929_mult_labs.spacy")

In [33]:
doc_bin = DocBin(docs=docs[400:])
doc_bin.to_disk("./testdev_230929_mult_labs.spacy")

python -m spacy init config ./config.cfg --lang en --pipeline spancat

## Using model

In [42]:
nlp_spancat = spacy.load("/Users/wendyphillips/Documents/Computing/WendysPython/Falling_analysis/code/outputs230929_mult_labs2/model-best")

In [43]:
nlp_spancat.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x141c42760>),
 ('spancat_singlelabel',
  <spacy.pipeline.spancat.SpanCategorizer at 0x141c98040>)]

In [44]:
test_text = random_sample_sub.iloc[1101,1]
doc = nlp_spancat(test_text)
doc.text

'PATIENT GOING TO RESTROOM WHEN HE GOT HIS FEET TRIPPED UP AND FELL INTO THE DOOR CLINICAL DIAGNOSIS: NECK PAIN, EPISTAXIS, FALL'

In [45]:
doc.spans

{'sc': [FELL INTO THE DOOR]}

In [46]:
for span in doc.spans["sc"]:
    print(span.label_, span.start, span.end, span.text)

SO 12 16 FELL INTO THE DOOR


In [47]:
random_sample_sub2 = random_sample_sub.iloc[1100:1200,]
for text in random_sample_sub2['narrative']:
    print(text)
    doc = nlp_spancat(text)
    for span in doc.spans["sc"]:
        print(span.label_, span.start, span.end, span.text)

PATIENT WITHHEAD AND NECK PAIN AND RIGHT FOREARM LACERATION AFTER A MECHANICAL FALL PATIENT WAS PUSHING HIS LAWNMOWER DOWN A HILL AT HIS DAUGHTERS HOUSE AND FELL, STRIKING RIGHT FOREARM CLINICAL DIAGNOSIS: CERV SPINE FRACTURE; RIGHT FOREARM LACERATION
PATIENT GOING TO RESTROOM WHEN HE GOT HIS FEET TRIPPED UP AND FELL INTO THE DOOR CLINICAL DIAGNOSIS: NECK PAIN, EPISTAXIS, FALL
SO 12 16 FELL INTO THE DOOR
PATIENT FELL IN FLOORT A HOME, LANDED ON HIP CLINICAL DIAGNOSIS: HIP FRACTURE
SF 7 10 LANDED ON HIP
PATIENT PATIENT WAS GETTING OUT OF BED TO USE THE BATHROOM WHEN HE FELL TO THE FLOOR HAVING BILATERAL FOOT PAIN AND THINKS HE HIT HIS HEAD HE IS ACTING ALTERED CLINICAL DIAGNOSIS: CLOSED HEAD INJURY, BILATERAL FOOT PAIN
TRS 3 7 GETTING OUT OF BED
SU 24 27 HIT HIS HEAD
PATIENT TRIPPED OVER HER CAT HIT LOW BACK ON TOILET CLINICAL DIAGNOSIS: FRACTURE LUMBAR SPINE
SO 5 10 HIT LOW BACK ON TOILET
OBJ 1 5 TRIPPED OVER HER CAT
PATIENT TRIPPED ON RUG, FELL IN FLOOR AT HOME CLINICAL DIAGNOSIS: FOR

SO 29 33 HIT HEAD ON SINK
PATIENT WENT TO SIT ON A CHAIR AND MISSED IT AND FELL DOWN A FLIGHT OF STAIRS ONTO HEAD CLINICAL DIAGNOSIS: CLOSED HEAD INJURY
STR 11 17 FELL DOWN A FLIGHT OF STAIRS
SF 17 19 ONTO HEAD
TRS 1 4 WENT TO SIT
PATIENT REPORTS FALLING OUT OF BED THIS MORNING INJURING HIP CLINICAL DIAGNOSIS: FRACTURE PELVIS
CHR 2 6 FALLING OUT OF BED
PATIENT FELL GETTING OUT OF A CHAIR LANDING ON LEFT HIP ON TILE FLOOR CLINICAL DIAGNOSIS: HIP FRACTURE
SF 7 11 LANDING ON LEFT HIP
TRS 2 7 GETTING OUT OF A CHAIR
PATIENT FROM HOME FELL TO THE FLOOR TRANSFERRING FROM CHAIR TO BED COMPLAINS OF HIP / THIGH PAIN WITH USE FOR ATRIAL FIBRILLATION CLINICAL DIAGNOSIS: HYPONATREMIA, LOWER EXTREMITY HEMATOMAS
TRS 7 10 TRANSFERRING FROM CHAIR
PATIENT PRESENTS FROM HOME PATIENT STATES HE FELL DOWN APPROXIMATELY 7 STEPS 1 DAY PRIOR CLINICAL DIAGNOSIS: MULTIPLE ABRASIONS LOWER LEG
STR 7 12 FELL DOWN APPROXIMATELY 7 STEPS
PATIENT, ON FOR ATRIAL FIBRILLATION IN ADDITION TO , LIVES HOME WITH SON WHO WASN

In [48]:
random_sample_sub2 = random_sample_sub.iloc[1100:2000,]

# Create an empty DataFrame with column names
output_df = pd.DataFrame(columns=['text', 'span_label', 'span_text'])

for text in random_sample_sub2['narrative']:
    doc = nlp_spancat(text)
    
    if len(doc.spans["sc"]) == 0:
        df2 = pd.DataFrame([[text, "NA", "NA"]], columns=['text', 'span_text', 'span_label'])
        # Append the new row to the DataFrame
        output_df = pd.concat([output_df, df2])
    else:
        for span in doc.spans["sc"]:
        # Create a new row as a dictionary
        
            df2 = pd.DataFrame([[text, span.label_, span.text]], columns=['text', 'span_text', 'span_label'])
            # Append the new row to the DataFrame
            output_df = pd.concat([output_df, df2])

In [49]:
output_df.to_csv("predictions_mult_cat2.csv")