In [11]:
import numpy as np
import pandas as pd
import json
from pathlib import Path
import re
import spacy

In [92]:
medical_terms = {
    "&": "and",
    "***": "",
    ">>": "clinical diagnosis",
    "@": "at",
    "+":"with",
    "abd": "abdomen",
    "af": "accidental fall",
    "afib": "atrial fibrillation",
    "aki": "acute kidney injury",
    "am": "morning",
    "ams": "altered mental status",
    "bac": "blood alcohol content",
    "bal": "blood alcohol level,",
    "biba": "brought in by ambulance",
    "c/o": "complains of",
    "chi": "closed-head injury",
    "clsd": "closed",
    "cpk": "creatine phosphokinase",
    "cva": "cerebral vascular accident",
    "dx": "clinical diagnosis",
    "ecf": "extended-care facility",
    "er": "emergency room",
    "etoh": "ethyl alcohol",
    "eval": "evaluation",
    "fd": "fall detected",
    "fx": "fracture",
    "fxs": "fractures",
    "glf": "ground level fall",
    "h/o": "history of",
    "htn": "hypertension",
    "hx": "history of",
    "inj": "injury",
    "inr": "international normalized ratio",
    "intox": "intoxication",
    "l": "left",
    "loc": "loss of consciousness",
    "lt": "left",
    "mech": "mechanical",
    "mult": "multiple",
    "n.h.": "nursing home",
    "nh": "nursing home",
    "p/w": "presents with",
    "pm": "afternoon",
    "pt": "patient",
    "pta": "prior to arrival",
    "pts": "patient's",
    "px": "physical examination", # not "procedure",
    "r": "right",
    "r/o": "rules out",
    "rt": "right",
    "s'd&f": "slipped and fell",
    "s/p": "after",
    "sah": "subarachnoid hemorrhage",
    "sdh": "acute subdural hematoma",
    "sts": "sit-to-stand",
    "t'd&f": "tripped and fell",
    "tr": "trauma",
    "uti": "urinary tract infection",
    "w/": "with",
    "w/o": "without",
    "wks": "weeks"
}

def add_space_after(text, target=","):
    replaced_text = re.sub(r'({})(\S)'.format(re.escape(target)), r'{} \2'.format(target),text)
    return replaced_text

def add_space_before(text, target=","):
    replaced_text = re.sub(r'(\S)({})'.format(re.escape(target)), r'\1 {}'.format(target), text)
    return replaced_text

# cleanning
def clean_narrative(text):
    # lowercase everything
    if pd.isna(text):
        return "NA"
    else:     
        text = text.lower()
        
        # unglue DX
        regex_dx = r"([ˆ\W]*(dx)[ˆ\W]*)"
        text = re.sub(regex_dx, r". dx: ", text)
    
        # remove age and sex identifications
        ## regex to capture age and sex (not perfect but captures almost all of the cases)
        regex_age_sex = r"(\d+)\s*?(yof|yf|yo\s*female|yo\s*f|yom|ym|yo\s*male|yo\s*m)"
        age_sex_match = re.search(regex_age_sex, text)
    
        ## format age and sex
        if age_sex_match:
            age = age_sex_match.group(1)
            sex = age_sex_match.group(2)
            
            # probably not best practice but it works with this data
            if "f" in sex:
                #text = text.replace(age_sex_match.group(0), f"{age} years old female")
                text = text.replace(age_sex_match.group(0), f"patient")
            elif "m" in sex:
                #text = text.replace(age_sex_match.group(0), f"{age} years old male")
                text = text.replace(age_sex_match.group(0), f"patient")
        
        # translate medical terms
        for term, replacement in medical_terms.items():
            if term == "@" or term == ">>" or term == "&" or term == "***" or term == "+" :
                pattern = fr"({re.escape(term)})"
                text = re.sub(pattern, f" {replacement} ", text) # force spaces around replacement
                
            else:
                pattern = fr"(?<!-)\b({re.escape(term)})\b(?!-)"
                text = re.sub(pattern, replacement, text)
                
        text = add_space_after(text, target=".")        
        text = add_space_after(text, target=",") 
        text = add_space_after(text, target=";") 
        text = add_space_after(text, target=":") 
        text = add_space_before(text, target="--")  
        text = add_space_after(text, target="--")
        text = re.sub("-", " ", text)
        # remove extra white spaces
        text = re.sub(r'\s+', ' ', text)
    
        return text.upper()

### load variable mapping

In [58]:
with Path("variable_mapping.json").open("r") as f:
    mapping = json.load(f, parse_int=True)

In [59]:
# convert the encoded values in the mapping to integers since they get read in as strings
for c in mapping.keys():
    mapping[c] = {int(k): v for k, v in mapping[c].items()}

### load primary data

In [60]:
df = pd.read_csv(
    "primary_data.csv",
    # set columns that can be null to nullable ints
    dtype={"body_part_2": "Int64", "diagnosis_2": "Int64"},
)
df.head()

Unnamed: 0,cpsc_case_number,narrative,treatment_date,age,sex,race,other_race,hispanic,diagnosis,other_diagnosis,...,body_part,body_part_2,disposition,location,fire_involvement,alcohol,drug,product_1,product_2,product_3
0,190103269,94YOM FELL TO THE FLOOR AT THE NURSING HOME ON...,2019-01-01,94,1,0,,0,62,,...,75,,4,5,0,0,0,1807,0,0
1,190103270,86YOM FELL IN THE SHOWER AT HOME AND SUSTAINED...,2019-01-01,86,1,0,,0,62,,...,75,,4,1,0,0,0,611,0,0
2,190103273,87YOF WAS GETTING UP FROM THE COUCH AND FELL T...,2019-01-01,87,2,0,,0,53,,...,32,,4,1,0,0,0,679,1807,0
3,190103291,67YOF WAS AT A FRIENDS HOUSE AND SLIPPED ON WA...,2019-01-01,67,2,0,,0,57,,...,33,,1,1,0,0,0,1807,0,0
4,190103294,70YOF WAS STANDING ON A STEP STOOL AND FELL OF...,2019-01-01,70,2,0,,0,57,,...,33,,1,1,0,0,0,620,0,0


### replace numeric values with corresponding strings

In [61]:
decoded_df = df.copy()

for col in mapping.keys():
    decoded_df[col] = decoded_df[col].map(mapping[col])

In [62]:
# ensure mappings were applied correctly by checking that the number of missing values did not change
assert (decoded_df.isnull().sum() == df.isnull().sum()).all()

In [63]:
random_sample = decoded_df.sample(n=5000, replace=False, random_state=42)

In [64]:
random_sample_sub = random_sample.iloc[:,0:2]

In [93]:
random_sample_sub['narrative'] = random_sample_sub['narrative'].apply(lambda x: clean_narrative(x))

In [94]:
random_sample_sub.head()

Unnamed: 0,cpsc_case_number,narrative
80147,220302739,PATIENT WAS SITTING IN A WHEELCHAIR LEANING FO...
48158,210101788,PATIENT WAS PLAYING WITH HER GRANDCHILD WHEN S...
112225,221250671,PATIENT TRIPPED GETTING OUT OF AN ELEVATOR AND...
50792,210212098,PATIENT SLIPPED ON NYLON SOCKS WHILE COMING OU...
115089,230214236,PATIENT PRESENTS AFTER A SYNCOPAL EPISODE WITH...


# NLP

In [5]:
print(spacy.__version__)

3.6.1


# My Model

Corrected data

In [6]:
fall = pd.read_csv('corrected_fall_set4.csv')

In [7]:
fall.head()

Unnamed: 0,key_entry,span,label
0,94YOF HX OF DEMENTIA S/P POSSIBLE UNWITNESSED ...,FRAME FELL ON HER NOSE,AO
1,70YOF WITH FALL FROM BED DX: LT INTEROTROCHANT...,FALL FROM BED,CHR
2,81 YOM FROM HOME WITH A FALL FROM HIS BED TO T...,FALL FROM HIS BED,CHR
3,70 YOF C/O INJ TO HEAD/FACE AFTER FALL OFF THE...,FALL OFF THE TOLIET,CHR
4,80YOF FALL OFF TOILET ONTO FLOOR AND C/O UNSUR...,FALL OFF TOILET,CHR


In [8]:
# Replace leading whitespaces using a regular expression
fall['span'] = fall['span'].str.replace(r'^\s+', '', regex=True)

In [12]:
print(fall.key_entry[0])
clean_narrative(fall.key_entry[0])

94YOF HX OF DEMENTIA S/P POSSIBLE UNWITNESSED BUT SUSPECTED MECHANICAL FALL. STAFF AT ASSISTED LIVING NOTICED BLOOD ON PATIENTS NOSE. WENT TO ROOM AND NOTICED A BROKEN PICTURE FRAME. PATIENT STATES THE FRAME FELL ON HER NOSE DX NASAL ABRASION, RT KNEE ABRASION, GLF


'PATIENT HISTORY OF OF DEMENTIA AFTER POSSIBLE UNWITNESSED BUT SUSPECTED MECHANICAL FALL. STAFF AT ASSISTED LIVING NOTICED BLOOD ON PATIENTS NOSE. WENT TO ROOM AND NOTICED A BROKEN PICTURE FRAME. PATIENT STATES THE FRAME FELL ON HER NOSE. CLINICAL DIAGNOSIS: NASAL ABRASION, RIGHT KNEE ABRASION, GROUND LEVEL FALL'

In [13]:
fall['key_entry'] = fall['key_entry'].apply(lambda x: clean_narrative(x))

In [14]:
fall['span'] = fall['span'].apply(lambda x: clean_narrative(x))

In [15]:
fall.head(4)

Unnamed: 0,key_entry,span,label
0,PATIENT HISTORY OF OF DEMENTIA AFTER POSSIBLE ...,FRAME FELL ON HER NOSE,AO
1,PATIENT WITH FALL FROM BED. CLINICAL DIAGNOSIS...,FALL FROM BED,CHR
2,PATIENT FROM HOME WITH A FALL FROM HIS BED TO ...,FALL FROM HIS BED,CHR
3,PATIENT COMPLAINS OF INJURY TO HEAD/FACE AFTER...,FALL OFF THE TOLIET,CHR


In [16]:
fall_nas = fall[fall['label'].isna()]
len(fall_nas)

60

In [17]:
fall_labeled = fall[fall['label'].isna() == False]

In [18]:
fall_labeled.head(2)

Unnamed: 0,key_entry,span,label
0,PATIENT HISTORY OF OF DEMENTIA AFTER POSSIBLE ...,FRAME FELL ON HER NOSE,AO
1,PATIENT WITH FALL FROM BED. CLINICAL DIAGNOSIS...,FALL FROM BED,CHR


In [36]:
all_labels = fall_labeled.label.unique()
all_labels

array(['AO', 'CHR', 'TRS', 'COL', 'DIZ', 'LAD', 'MED', 'OBJ', 'RCH', 'SF',
       'SHW', 'SO', 'STR', 'SU', 'WT'], dtype=object)

In [41]:
labels_to_use = ['CHR', 'TRS', 'LAD', 'OBJ', 'RCH', 'SF', 'SHW', 'SO', 'STR', 'SU', 'WT']

In [42]:
fall_labeled_sub = fall_labeled[fall_labeled['label'].isin(labels_to_use)]

In [43]:
key_list = fall_labeled_sub.key_entry.unique()
len(key_list)

355

In [20]:
fall_labeled[fall_labeled.key_entry == key_list[1]]

Unnamed: 0,key_entry,span,label
1,PATIENT WITH FALL FROM BED. CLINICAL DIAGNOSIS...,FALL FROM BED,CHR


In [46]:
nlp = spacy.blank("en")
# special_case = [{Token.ORTH: "CLOSED-HEAD"}]
# nlp.tokenizer.add_special_case("CLOSED-HEAD", special_case)

from spacy.tokens import Span

# keeping span token lengths to appropriately set config
token_lengths = []
label_list = []

docs=[] # this will hold the processed strings and spans
for k in key_list:
    doc = nlp(k)
    temp_df = fall_labeled_sub[fall_labeled_sub.key_entry == k]

    if len(temp_df)==1:              
        span_text = temp_df.iloc[0,1]
        temp_label = temp_df.iloc[0,2]       
        span_start_char = k.find(span_text)
        span_end_char = span_start_char + len(span_text)

        # Finding the start and end tokens using character offsets
        start_token = None
        end_token = None
        for token in doc:
            if token.idx == span_start_char:
                 start_token = token.i
            if token.idx + len(token.text) == span_end_char:
                end_token = token.i
                break
        if start_token is not None and end_token is not None:
            temp_start = start_token
            temp_end = end_token + 1
            doc.spans["sc"] = [Span(doc, temp_start, temp_end, temp_label)]
            docs.append(doc)
            token_lengths.append(temp_end - temp_start)
            label_list.append(temp_label)
        else:
            print(k, "span=", span_text,"couldn't find tokens")
    else:
        print("temp_df has length > 1")
        span_list = []
        for ent in range(len(temp_df)):
            span_text = temp_df.iloc[ent,1]
            temp_label = temp_df.iloc[ent,2]
            span_start_char = k.find(span_text)
            span_end_char = span_start_char + len(span_text)
            print(span_start_char, span_end_char)

            # Finding the start and end tokens using character offsets
            start_token = None
            end_token = None
            for token in doc:
                if token.idx == span_start_char:
                     start_token = token.i
                if token.idx + len(token.text) == span_end_char:
                    end_token = token.i
                    break
            if start_token is not None and end_token is not None:
                temp_start = start_token
                temp_end = end_token + 1             
                span_list.append(Span(doc, temp_start, temp_end, temp_label))
                token_lengths.append(temp_end - temp_start)
                label_list.append(temp_label)
            else:
                print(k, "span=",span_text, "couldn't find tokens")
        
        doc.spans["sc"] = span_list
    docs.append(doc)
          

temp_df has length > 1
25 40
41 64
temp_df has length > 1
25 40
84 108
temp_df has length > 1
27 45
50 78
temp_df has length > 1
65 87
159 195
temp_df has length > 1
38 72
73 103
temp_df has length > 1
86 99
111 123
temp_df has length > 1
28 41
42 57
temp_df has length > 1
8 21
22 39
temp_df has length > 1
8 24
45 54
temp_df has length > 1
16 35
36 62
temp_df has length > 1
8 27
28 45
temp_df has length > 1
8 23
91 132
temp_df has length > 1
8 23
24 34
temp_df has length > 1
8 23
24 49
temp_df has length > 1
58 73
78 99
temp_df has length > 1
9 24
51 64
temp_df has length > 1
8 23
39 47
temp_df has length > 1
8 23
42 51
temp_df has length > 1
32 49
61 73
temp_df has length > 1
9 31
112 123
temp_df has length > 1
37 51
61 75
temp_df has length > 1
35 52
9 30
53 89
temp_df has length > 1
8 25
43 64
temp_df has length > 1
62 79
81 96
temp_df has length > 1
13 25
26 39
temp_df has length > 1
31 48
84 101
temp_df has length > 1
24 34
56 70
temp_df has length > 1
19 36
37 49
temp_df has leng

In [47]:
len(docs)

568

In [48]:
print(np.min(token_lengths), np.max(token_lengths), np.median(token_lengths))

2 13 4.0


In [49]:
np.quantile(token_lengths, q =[0.05,0.95])

array([2., 7.])

In [52]:
pd.unique(label_list)

  pd.unique(label_list)


array(['CHR', 'SF', 'SO', 'SU', 'RCH', 'WT', 'TRS', 'LAD', 'OBJ', 'STR',
       'SHW'], dtype=object)

Make training and test sets with the docs

In [53]:
from spacy.tokens import DocBin

In [54]:
doc_bin = DocBin(docs=docs[0:400])

In [55]:
doc_bin.to_disk("./train_230929.spacy")

In [56]:
doc_bin = DocBin(docs=docs[500:])
doc_bin.to_disk("./dev_230929.spacy")

python -m spacy init config ./config.cfg --lang en --pipeline spancat

## Using model

In [84]:
nlp = spacy.load("/Users/wendyphillips/Documents/Computing/WendysPython/Falling_analysis/code/outputs2309229a/model-best")

In [95]:
random_sample_sub.iloc[1000,1]

'HEAD INJURY/91YOWF WAS AMBULATING IN CHURCH WHEN LOST HER FOOT ON A STEP AND FELL DOWN 1 STEP, FALLING AND STRIKING HER FOREHEAD ON THE FLOOR. NO LOSS OF CONSCIOUSNESS BUT COMPLAINS OF HEADACHE.'

In [96]:
doc = nlp(random_sample_sub.iloc[1000,1])
doc.text

'HEAD INJURY/91YOWF WAS AMBULATING IN CHURCH WHEN LOST HER FOOT ON A STEP AND FELL DOWN 1 STEP, FALLING AND STRIKING HER FOREHEAD ON THE FLOOR. NO LOSS OF CONSCIOUSNESS BUT COMPLAINS OF HEADACHE.'

In [97]:
doc.spans

{'sc': [FELL DOWN 1 STEP, STRIKING HER FOREHEAD ON THE FLOOR, ON A STEP]}

In [98]:
for span in doc.spans["sc"]:
    print(span.label_, span.start, span.end, span.text)

STR 14 18 FELL DOWN 1 STEP
SF 21 27 STRIKING HER FOREHEAD ON THE FLOOR
LAD 10 13 ON A STEP


In [100]:
random_sample_sub2 = random_sample_sub.iloc[1100:1200,]
for text in random_sample_sub2['narrative']:
    print(text)
    doc = nlp(text)
    for span in doc.spans["sc"]:
        print(span.label_, span.start, span.end, span.text)

PATIENT WITHHEAD AND NECK PAIN AND RIGHT FOREARM LAC AFTER A MECHANICAL FALL. PATIENT WAS PUSHING HIS LAWNMOWER DOWN A HILL AT HIS DAUGHTERS HOUSE AND FELL, STRIKING RIGHT FOREARM. CLINICAL DIAGNOSIS: CERV SPINE FRACTURE; RIGHT FOREARM LAC.
PATIENT GOING TO RESTROOM WHEN HE GOT HIS FEET TRIPPED UP AND FELL INTO THE DOOR. CLINICAL DIAGNOSIS: NECK PAIN, EPISTAXIS, FALL
SO 12 16 FELL INTO THE DOOR
PATIENT FELL IN FLOORT A HOME, LANDED ON HIP. CLINICAL DIAGNOSIS: HIP FRACTURE
SF 7 10 LANDED ON HIP
PATIENT PATIENT WAS GETTING OUT OF BED TO USE THE BATHROOM WHEN HE FELL TO THE FLOOR HAVING BILATERAL FOOT PAIN AND THINKS HE HIT HIS HEAD HE IS ACTING ALTERED. CLINICAL DIAGNOSIS: CLOSED HEAD INJURY, BILATERAL FOOT PAIN
TRS 3 7 GETTING OUT OF BED
PATIENT TRIPPED OVER HER CAT HIT LOW BACK ON TOILET. CLINICAL DIAGNOSIS: FRACTURE LUMBAR SPINE
SO 5 10 HIT LOW BACK ON TOILET
OBJ 1 5 TRIPPED OVER HER CAT
PATIENT TRIPPED ON RUG, FELL IN FLOOR AT HOME. CLINICAL DIAGNOSIS: FOREARM ABRASION, RIB STRAIN
OB

CHR 11 18 FELL FROM THE EDGE OF HIS BED
PATIENT PATIENT PULLING HEAVY GARBAGE CAN WHEN HE STEPPED WRONG AND FELL ONTO DRIVEWAY HAVING RIGHT HAND LACERATION 1 TO 2CM. CLINICAL DIAGNOSIS: LACERATION OF HAND, SUTURE OF SKIN WOUND, FALL
SF 11 14 FELL ONTO DRIVEWAY
PATIENT. UPPER ARM PAIN AFTER AFTER PATIENT WAS RUNNING IN HER HOUSE TO GRAB AN ITEM WHEN SHE HAD HER FOOT STOPPED AND TRIPPED ON HER LEFT SHOULDER, RECENTLY GOT NEW CARPET. CLINICAL DIAGNOSIS: CLOSED DISPLACED FRACTURE OF SURGICAL NECK OF LEFT HUMERUS
PATIENT HERE FOR EVALUATION AFTER A FALL. SHE LOST HER BALANCE IN THE SHOWER AND HER LEGS GOT TWISTED UP. SHE FELL, STRIKING THE POSTERIOR PORTION OF HER HEAD AGAINST THE WALL AND HER CHIN ON HER WALKER. HAS HEMATOMA OVER OCCIPUT. CLINICAL DIAGNOSIS: FALL, HEMATOMA, CHIN LACERATION
SHW 11 15 BALANCE IN THE SHOWER
PATIENT COMPLAINS OF WRIST FRACTURE AFTER SLIP AND FALL TO WET FLOOR AT HOME. CLINICAL DIAGNOSIS: RIGHT WRIST FRACTURE
PATIENT REPORTS GROUND LEVEL FALL 2 WEEKS AGO WHEN F

In [101]:
random_sample_sub2 = random_sample_sub.iloc[1100:2000,]

# Create an empty DataFrame with column names
output_df = pd.DataFrame(columns=['text', 'span_label', 'span_text'])

for text in random_sample_sub2['narrative']:
    doc = nlp(text)
    
    if len(doc.spans["sc"]) == 0:
        df2 = pd.DataFrame([[text, "NA", "NA"]], columns=['text', 'span_label', 'span_text'])
        # Append the new row to the DataFrame
        output_df = pd.concat([output_df, df2])
    else:
        for span in doc.spans["sc"]:
        # Create a new row as a dictionary
        
            df2 = pd.DataFrame([[text, span.label_, span.text]], columns=['text', 'span_label', 'span_text'])
            # Append the new row to the DataFrame
            output_df = pd.concat([output_df, df2])

In [102]:
output_df.to_csv("predictions_230929.csv")