In [1]:
import numpy as np
import pandas as pd
import json
from pathlib import Path
import re
import spacy

### load variable mapping

In [2]:
with Path("variable_mapping.json").open("r") as f:
    mapping = json.load(f, parse_int=True)

In [3]:
# convert the encoded values in the mapping to integers since they get read in as strings
for c in mapping.keys():
    mapping[c] = {int(k): v for k, v in mapping[c].items()}

### load primary data

In [4]:
df = pd.read_csv(
    "corrected_narrative_primary.csv",
    # set columns that can be null to nullable ints
    dtype={"body_part_2": "Int64", "diagnosis_2": "Int64"},
)
df.head()

Unnamed: 0,cpsc_case_number,narrative,treatment_date,age,sex,race,other_race,hispanic,diagnosis,other_diagnosis,...,body_part,body_part_2,disposition,location,fire_involvement,alcohol,drug,product_1,product_2,product_3
0,190103269,PATIENT FELL TO THE FLOOR AT THE NURSING HOME ...,2019-01-01,94,1,0,,0,62,,...,75,,4,5,0,0,0,1807,0,0
1,190103270,PATIENT FELL IN THE SHOWER AT HOME AND SUSTAIN...,2019-01-01,86,1,0,,0,62,,...,75,,4,1,0,0,0,611,0,0
2,190103273,PATIENT WAS GETTING UP FROM THE COUCH AND FELL...,2019-01-01,87,2,0,,0,53,,...,32,,4,1,0,0,0,679,1807,0
3,190103291,PATIENT WAS AT A FRIENDS HOUSE AND SLIPPED ON ...,2019-01-01,67,2,0,,0,57,,...,33,,1,1,0,0,0,1807,0,0
4,190103294,PATIENT WAS STANDING ON A STEP STOOL AND FELL ...,2019-01-01,70,2,0,,0,57,,...,33,,1,1,0,0,0,620,0,0


### replace numeric values with corresponding strings

In [5]:
decoded_df = df.copy()

for col in mapping.keys():
    decoded_df[col] = decoded_df[col].map(mapping[col])

In [224]:
decoded_df.head(3)

Unnamed: 0,cpsc_case_number,narrative,treatment_date,age,sex,race,other_race,hispanic,diagnosis,other_diagnosis,...,body_part,body_part_2,disposition,location,fire_involvement,alcohol,drug,product_1,product_2,product_3
0,190103269,PATIENT FELL TO THE FLOOR AT THE NURSING HOME ...,2019-01-01,94,MALE,N.S.,,Unk/Not stated,62 - INTERNAL INJURY,,...,75 - HEAD,,4 - TREATED AND ADMITTED/HOSPITALIZED,PUBLIC,NO/?,No/Unk,No/Unk,1807 - FLOORS OR FLOORING MATERIALS,0 - None,0 - None
1,190103270,PATIENT FELL IN THE SHOWER AT HOME AND SUSTAIN...,2019-01-01,86,MALE,N.S.,,Unk/Not stated,62 - INTERNAL INJURY,,...,75 - HEAD,,4 - TREATED AND ADMITTED/HOSPITALIZED,HOME,NO/?,No/Unk,No/Unk,611 - BATHTUBS OR SHOWERS,0 - None,0 - None
2,190103273,PATIENT WAS GETTING UP FROM THE COUCH AND FELL...,2019-01-01,87,FEMALE,N.S.,,Unk/Not stated,"53 - CONTUSIONS, ABR.",,...,32 - ELBOW,,4 - TREATED AND ADMITTED/HOSPITALIZED,HOME,NO/?,No/Unk,No/Unk,"679 - SOFAS, COUCHES, DAVENPORTS, DIVANS OR ST...",1807 - FLOORS OR FLOORING MATERIALS,0 - None


In [6]:
# ensure mappings were applied correctly by checking that the number of missing values did not change
assert (decoded_df.isnull().sum() == df.isnull().sum()).all()

In [7]:
random_sample = decoded_df.sample(n=5000, replace=False, random_state=42)

In [8]:
random_sample_sub = random_sample.iloc[:,0:2]

In [9]:
random_sample_sub.head()

Unnamed: 0,cpsc_case_number,narrative
80147,220302739,PATIENT WAS SITTING IN A WHEELCHAIR LEANING FO...
48158,210101788,PATIENT WAS PLAYING WITH HER GRANDCHILD WHEN S...
112225,221250671,PATIENT TRIPPED GETTING OUT OF AN ELEVATOR AND...
50792,210212098,PATIENT SLIPPED ON NYLON SOCKS WHILE COMING OU...
115089,230214236,PATIENT PRESENTS AFTER A SYNCOPAL EPISODE WITH...


# NLP

In [10]:
print(spacy.__version__)

3.6.1


# My Model

Corrected data

In [207]:
fall = pd.read_csv('predictions_strikes_CORRECTED.csv')

In [208]:
fall.head()

Unnamed: 0,key_entry,span,label
0,"*PATIENT, DIZZY AN LIGHTHEADED AN FELL DOWN, F...",,
1,"*PATIENT, FELL DOWN ESCALATOR AT MALL, FELL BA...",LANDED ON SIDE BODY,SF
2,"*PATIENT, FELL OUT OF BED THIS MORNING, HIT UP...",HIT UPPER ARM,SU
3,"*PATIENT, FELL OUT OF BED, FOUND FACE DOWN ON ...",,
4,"*PATIENT, GOING DOWN STAIRCASE FELL STEPS, LAN...",LANDED ON KNEE,SF


In [209]:
fall.tail(3)

Unnamed: 0,key_entry,span,label
1148,"PATIENT, WAS WALKING WHEN HIS SLIPPERS SLIPPED...",IMPACTING FACE,SU
1149,"PATIENT, WITH PROGRESSIVE WAKNESS SINCE RECENT...",STRIKING HEAD ONTO BEDSIDE TABLE,SO
1150,"PATIENT, WOKE UP TO FEED HER CAT WHEN FELT DIZ...",STRIKING HEAD ONTO THE SINK,SO


In [210]:
fall.drop_duplicates(inplace=True)
len(fall)

1144

In [211]:
fall_nas = fall[fall['span'].isna()]
fall_nas.head()

Unnamed: 0,key_entry,span,label
0,"*PATIENT, DIZZY AN LIGHTHEADED AN FELL DOWN, F...",,
3,"*PATIENT, FELL OUT OF BED, FOUND FACE DOWN ON ...",,
6,"*PATIENT, OPENING BATHROOM DOOR SLIPPED AN FEL...",,
9,"*PATIENT, USING ON CELLPHONE TRIPPED AN FELL I...",,
18,74 Y O W F SHOULDER SPRAIN FELL WALKIING THROU...,,


In [212]:
len(fall_nas)

418

In [213]:
all_labels = fall.label.unique()
all_labels

array([nan, 'SF', 'SU', 'SO'], dtype=object)

In [214]:
# subset to only the labels you want to build model with
# note: this chunk of code will eliminate a lot of useful texts without these labels
#    it would be good to modify it so that it doesn't
labels_to_use = ['SO', 'SU', 'SF']
fall_sub = fall[fall['label'].isin(labels_to_use)]

# add back in the NA rows
fall_all = pd.concat((fall_sub, fall_nas))
fall_all.tail(3)

Unnamed: 0,key_entry,span,label
1101,"PATIENT, HAD 4 GLASSES OF WINE WITH DINNER, BL...",,
1109,"PATIENT, ON FOR ACCIDENTAL FALL, FELL OUT OF B...",,
1134,"PATIENT, TRIPPED AND FELL DOWN STAIRS LANDING ...",,


In [215]:
fall_all_random = fall_all.sample(n=len(fall_all), replace=False, random_state=99)
fall_all_random.tail(3)

Unnamed: 0,key_entry,span,label
674,"PATIENT ON FOR ATRIAL FIBRILLATION, FELL DOWN ...",,
615,PATIENT HISTORY OF OF DEMENTIA PRESENTS AFTER ...,,
1057,PATIENT WAS WEARING SLIPPERS WHEN SHE TRIPPED ...,LANDING ONTO LEFT HIP,SF


In [216]:
key_list = fall_all_random.key_entry.unique()
len(key_list)

1129

In [217]:
fall_all_random[fall_all_random.key_entry == key_list[1]]

Unnamed: 0,key_entry,span,label
89,PATIENT BROUGHT IN BY EMS WITH COMPLAINS OF FA...,HIT HIS HEAD ON EXERCISE EQUIPMENT,SO


In [218]:
nlp = spacy.blank("en")
# special_case = [{Token.ORTH: "CLOSED-HEAD"}]
# nlp.tokenizer.add_special_case("CLOSED-HEAD", special_case)

from spacy.tokens import Span

# keeping span token lengths to appropriately set config
token_lengths = []
label_list = []

docs=[] # this will hold the processed strings and spans
for k in key_list:
    print(k)
    doc = nlp(k)
    temp_df = fall_all_random[fall_all_random.key_entry == k]

    if len(temp_df)==1:    
        if pd.isna(temp_df.iloc[0,2]):
            doc.spans["sc"] = []
            docs.append(doc)
        else:     
            span_text = temp_df.iloc[0,1]
            temp_label = temp_df.iloc[0,2]       
            span_start_char = k.find(span_text)
            span_end_char = span_start_char + len(span_text)
    
            # Finding the start and end tokens using character offsets
            start_token = None
            end_token = None
            for token in doc:
                if token.idx == span_start_char:
                     start_token = token.i
                if token.idx + len(token.text) == span_end_char:
                    end_token = token.i
                    break
            if start_token is not None and end_token is not None:
                temp_start = start_token
                temp_end = end_token + 1
                doc.spans["sc"] = [Span(doc, temp_start, temp_end, temp_label)]
                docs.append(doc)
                token_lengths.append(temp_end - temp_start)
                label_list.append(temp_label)
            else:
                print(k, "span=", span_text,"couldn't find tokens")
    else:
        print("temp_df has length > 1")
        span_list = []
        for ent in range(len(temp_df)):
            span_text = temp_df.iloc[ent,1]
            temp_label = temp_df.iloc[ent,2]
            span_start_char = k.find(span_text)
            span_end_char = span_start_char + len(span_text)
            print(span_start_char, span_end_char)

            # Finding the start and end tokens using character offsets
            start_token = None
            end_token = None
            for token in doc:
                if token.idx == span_start_char:
                     start_token = token.i
                if token.idx + len(token.text) == span_end_char:
                    end_token = token.i
                    break
            if start_token is not None and end_token is not None:
                temp_start = start_token
                temp_end = end_token + 1             
                span_list.append(Span(doc, temp_start, temp_end, temp_label))
                token_lengths.append(temp_end - temp_start)
                label_list.append(temp_label)
            else:
                print(k, "span=",span_text, "couldn't find tokens")
        
        doc.spans["sc"] = span_list
    docs.append(doc)
          

PATIENT FELL TO THE FLOOR AT HOME CLINICAL DIAGNOSIS: RIGHT HIP FRACTURE
PATIENT BROUGHT IN BY EMS WITH COMPLAINS OF FALL EMS REPORTS PATIENT FELL AND HIT HIS HEAD ON EXERCISE EQUIPMENT CLINICAL DIAGNOSIS: ACUTE SUBDURAL HEMATOMA; FALL; NECK PAIN; ACUTE MIDLINE THORACIC BACK PAIN; CONTUSION OF FOREHEAD
PATIENT FELL DOWN SEVERAL STEPS CLINICAL DIAGNOSIS: SUBARCHNOID HEMORRHAGE
PATIENT WAS GETTING OUT OF BED WHEN SHE FELL TO THE FLOOR ONTO HEAD CLINICAL DIAGNOSIS: CLOSED HEAD INJURY; CONTUSION TO RIGHT SHOULDER
PATIENT LEFT KNEE CONTUSION AFTER SLIP AND FALL DOWN STEPS LANDING ON KNEE CLINICAL DIAGNOSIS: CONTUSION LEFT KNEE, FALL
PATIENT STATES SHE WAS LEAVING HER PHYSICAL THERAPY APPOINTMENT ON THE CARPET ON THE WAY OUT SHE STRUCK THE TOP OF HER HEAD ON THE DESK SHE DENIES LOSS OF CONSCIOUSNESS CLINICAL DIAGNOSIS: OTHER SPECIFIED INJURY OF HEAD
PATIENT PRESENTS AFTER SHE WAS WALKING UP THE STAIRS, STRUCK HER RIGHT FOREHEAD, AND FELL ONTO HER RIGHT WRIST AND RIGHT SHOULDER CLINICAL DIAGN

In [111]:
def build_model(input):
    """
    This function requires as input a dataframe with columns exactly in this order:
    key_entry, span, label
    The first column must be named 'key_entry'
    """
    nlp = spacy.blank("en")
    # special_case = [{Token.ORTH: "CLOSED-HEAD"}]
    # nlp.tokenizer.add_special_case("CLOSED-HEAD", special_case)

    from spacy.tokens import Span

    # keeping span token lengths to appropriately set config
    token_lengths = []
    label_list = []

    docs=[] # this will hold the processed strings and spans
    for k in key_list:
        doc = nlp(k)
        temp_df = fall_all_random[fall_all_random.key_entry == k]

        if len(temp_df)==1:    
            if pd.isna(temp_df.iloc[0,2]):
                doc.spans["sc"] = []
                docs.append(doc)
            else:     
                span_text = temp_df.iloc[0,1]
                temp_label = temp_df.iloc[0,2]       
                span_start_char = k.find(span_text)
                span_end_char = span_start_char + len(span_text)

                # Finding the start and end tokens using character offsets
                start_token = None
                end_token = None
                for token in doc:
                    if token.idx == span_start_char:
                         start_token = token.i
                    if token.idx + len(token.text) == span_end_char:
                        end_token = token.i
                        break
                if start_token is not None and end_token is not None:
                    temp_start = start_token
                    temp_end = end_token + 1
                    doc.spans["sc"] = [Span(doc, temp_start, temp_end, temp_label)]
                    docs.append(doc)
                    token_lengths.append(temp_end - temp_start)
                    label_list.append(temp_label)
                else:
                    print(k, "span=", span_text,"couldn't find tokens")
        else:
            print("temp_df has length > 1")
            span_list = []
            for ent in range(len(temp_df)):
                span_text = temp_df.iloc[ent,1]
                temp_label = temp_df.iloc[ent,2]
                span_start_char = k.find(span_text)
                span_end_char = span_start_char + len(span_text)
                print(span_start_char, span_end_char)

                # Finding the start and end tokens using character offsets
                start_token = None
                end_token = None
                for token in doc:
                    if token.idx == span_start_char:
                         start_token = token.i
                    if token.idx + len(token.text) == span_end_char:
                        end_token = token.i
                        break
                if start_token is not None and end_token is not None:
                    temp_start = start_token
                    temp_end = end_token + 1             
                    span_list.append(Span(doc, temp_start, temp_end, temp_label))
                    token_lengths.append(temp_end - temp_start)
                    label_list.append(temp_label)
                else:
                    print(k, "span=",span_text, "couldn't find tokens")

            doc.spans["sc"] = span_list
        docs.append(doc)
    return({"docs":docs, "token_lengths": token_lengths, "labels": label_list})      

In [112]:
temp = build_model(fall_all_random)

temp_df has length > 1
45 65
70 89
*PATIENT, FELL FROM WHEELCHAIR STRIKED BACK OF HEAD, UNWITNESS, FOUND ON FLOOR BY BROTHER CLINICAL DIAGNOSIS: HEAD LACERATION span= STRIKEBACK OF HEAD couldn't find tokens
temp_df has length > 1
78 90
91 107
temp_df has length > 1
127 156
53 96
temp_df has length > 1
48 59
93 118
temp_df has length > 1
40 58
70 82


In [114]:
len(temp['docs'])

904

In [219]:
len(token_lengths)

724

In [220]:
print(np.min(token_lengths), np.max(token_lengths), np.median(token_lengths))

2 11 4.0


In [221]:
np.quantile(token_lengths, q =[0.05,0.95])

array([2., 8.])

In [222]:
pd.unique(np.array(label_list))

array(['SO', 'SF', 'SU'], dtype='<U2')

Make training and test sets with the docs

In [97]:
from spacy.tokens import DocBin

In [223]:
doc_bin = DocBin(docs=docs[0:650])
doc_bin.to_disk("./train_230930_strike2.spacy")

doc_bin = DocBin(docs=docs[650:])
doc_bin.to_disk("./dev_230930_strike2.spacy")

python -m spacy init config ./config.cfg --lang en --pipeline spancat

## Using model

In [266]:
nlp_spancat = spacy.load("/Users/wendyphillips/Documents/Computing/WendysPython/Falling_analysis/code/outputs230929_mult_labs1017_HP/model-best")

In [267]:
nlp_spancat.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x17bcb5d00>),
 ('spancat_singlelabel',
  <spacy.pipeline.spancat.SpanCategorizer at 0x17bc9c160>)]

In [268]:
test_text = random_sample_sub.iloc[2102,1]
doc = nlp_spancat(test_text)
doc.text

'PATIENT HISTORY OF CIRRHOSIS, YESTERDAY WHILE UNKNOWN IF ETHYL ALCOHOL INTOXICATED, BLOOD ALCOHOL LEVEL, NOT DONE, TRIPPED AND FELL WHILE MOVING A CHAIR STRIKING HEAD WITHO LOSS OF CONSCIOUSNESS, WITH ARM SKIN TEAR NOW CLINICAL DIAGNOSIS: INTRAPARENCHYMAL HEMORRHAGE'

In [269]:
doc.spans

{'sc': [STRIKING HEAD, MOVING A CHAIR]}

In [270]:
for span in doc.spans["sc"]:
    print(span.label_, span.start, span.end, span.text)

SU 27 29 STRIKING HEAD
TRS 24 27 MOVING A CHAIR


In [271]:
random_sample_sub.head()

Unnamed: 0,cpsc_case_number,narrative
80147,220302739,PATIENT WAS SITTING IN A WHEELCHAIR LEANING FO...
48158,210101788,PATIENT WAS PLAYING WITH HER GRANDCHILD WHEN S...
112225,221250671,PATIENT TRIPPED GETTING OUT OF AN ELEVATOR AND...
50792,210212098,PATIENT SLIPPED ON NYLON SOCKS WHILE COMING OU...
115089,230214236,PATIENT PRESENTS AFTER A SYNCOPAL EPISODE WITH...


In [121]:
random_sample_sub2 = random_sample_sub.iloc[2000:2050,]
for text in random_sample_sub2['narrative']:
    print(text)
    doc = nlp_spancat(text)
    for span in doc.spans["sc"]:
        print(span.label_, span.start, span.end, span.text)

PATIENT FELL IN FLOOR AT HOME CLINICAL DIAGNOSIS: COVID, FEMUR FRACTURE
PATIENT WAS WALKING OUTSIDE AND TRIPPED AND FELL OVER AN IRRIGATION PIPE IN HER YARD CLINICAL DIAGNOSIS: LACERATION OF RIGHT EYEBROW
PATIENT WITH FALL FROM TOILET, JUST RETURNED FROM REHAB FROM SPINAL FUSION PATIENT WAS ON TOILET EARLIER TODAY AND FELL HE HIT HIS HEAD AND HAD LACERATIONS ON FOREHEAD AND LEFT EYEBROW CLINICAL DIAGNOSIS: FACIAL LACERATION
SU 22 25 HIT HIS HEAD
PATIENT, FELL FROM STANDING AT SNF LANDING ONTO HARD FLOOR CLINICAL DIAGNOSIS: CLOSED DISPLACED SUBTROCHANTERIC FRACTURE OF RIGHT FEMUR
SF 7 11 LANDING ONTO HARD FLOOR
PATIENT FELL TO THE FLOOR AT THE NURSING HOME LANDING ONTO RIGHT HIP CLINICAL DIAGNOSIS: HIP FRACTURE
SF 9 13 LANDING ONTO RIGHT HIP
PATIENT HAD A SLIP AND FALL AFTER PUTTING ON A BRAND NEW PAIR OF SHOES AND FELL DOWN A FLIGHT OF STAIRS CLINICAL DIAGNOSIS: CONCUSSION
PATIENT, TRIPPED OVER THE DOG WHILE HIKING AND FELL STRIKING HEAD ONTO ROCK WITHOUT LOSS OF CONSCIOUS, WITH FOREHE

In [126]:
random_sample_sub2 = random_sample_sub.iloc[2000:3500,]

# Create an empty DataFrame with column names
output_df = pd.DataFrame(columns=['text', 'span_label', 'span_text'])

for text in random_sample_sub2['narrative']:
    doc = nlp_spancat(text)
    
    if len(doc.spans["sc"]) == 0:
        df2 = pd.DataFrame([[text, "NA", "NA"]], columns=['text', 'span_label', 'span_text'])
        # Append the new row to the DataFrame
        output_df = pd.concat([output_df, df2])
    else:
        for span in doc.spans["sc"]:
        # Create a new row as a dictionary
        
            df2 = pd.DataFrame([[text, span.label_, span.text]], columns=['text', 'span_label', 'span_text'])
            # Append the new row to the DataFrame
            output_df = pd.concat([output_df, df2])

In [127]:
output_df.to_csv("predictions_strikes.csv")

In [44]:
pred_set_10k = decoded_df.sample(n=10000, replace=False, random_state=99)

In [45]:
pred_set_10k = pred_set_10k.iloc[:,0:2]
pred_set_10k.head()

Unnamed: 0,cpsc_case_number,narrative
42814,201024479,PATIENT WAS ON A LADDER WITH A HAND SAW TRYING...
2482,190227298,PATIENT HAD A TRIP AND FALL TO THE FLOOR AT TH...
105671,221060254,"PATIENT SYNCOPAL EPISODE WITH KNIFE IN HAND, F..."
59128,210609664,PATIENT REPORTS SHE WAS GOING TO WALK ACROSS T...
71980,211207838,"PATIENT SITTING IN CHAIR, IT COLLAPSED AND SHE..."


In [272]:
# Create an empty DataFrame with column names
output_df = pd.DataFrame(columns=['cpsc_case_number','text', 'span_label', 'span_text'])

for row in decoded_df.iloc[:,0:2].iterrows():
    cpsc = (row[1]['cpsc_case_number'])
    text = (row[1]['narrative'])
    doc = nlp_spancat(text)

    
    if len(doc.spans["sc"]) == 0:
        df2 = pd.DataFrame([[cpsc,text, "NA", "NA"]], columns=['cpsc_case_number','text', 'span_text', 'span_label'])
        # Append the new row to the DataFrame
        output_df = pd.concat([output_df, df2])
    else:
        for span in doc.spans["sc"]:
        # Create a new row as a dictionary
        
            df2 = pd.DataFrame([[cpsc,text, span.label_, span.text]], columns=['cpsc_case_number','text', 'span_label', 'span_text'])
            # Append the new row to the DataFrame
            output_df = pd.concat([output_df, df2])

In [273]:
output_df.head()

Unnamed: 0,cpsc_case_number,text,span_label,span_text
0,190103269,PATIENT FELL TO THE FLOOR AT THE NURSING HOME ...,,
0,190103270,PATIENT FELL IN THE SHOWER AT HOME AND SUSTAIN...,SHW,FELL IN THE SHOWER
0,190103273,PATIENT WAS GETTING UP FROM THE COUCH AND FELL...,,
0,190103291,PATIENT WAS AT A FRIENDS HOUSE AND SLIPPED ON ...,OBJ,SLIPPED ON WATER
0,190103294,PATIENT WAS STANDING ON A STEP STOOL AND FELL ...,LAD,STANDING ON A STEP STOOL


In [274]:
len(output_df)

144338

In [275]:
# Pivot the DataFrame
def tuple_aggregator(series):
    return tuple(series)

pivot_output_df = output_df.pivot_table(index=['cpsc_case_number', 'text'], columns='span_label', values='span_text', aggfunc=tuple_aggregator).reset_index()
pivot_output_df.columns.name = None 
pivot_output_df.head()
# Fill NA values with a default value if needed
# pivot_df.fillna(np.nan, inplace=True)


Unnamed: 0,cpsc_case_number,text,CHR,LAD,NA,OBJ,RCH,SF,SHW,SO,STR,SU,TRS,WT
0,190103269,PATIENT FELL TO THE FLOOR AT THE NURSING HOME ...,,,"(NA,)",,,,,,,,,
1,190103270,PATIENT FELL IN THE SHOWER AT HOME AND SUSTAIN...,,,,,,,"(FELL IN THE SHOWER,)",,,,,
2,190103273,PATIENT WAS GETTING UP FROM THE COUCH AND FELL...,,,"(NA,)",,,,,,,,,
3,190103291,PATIENT WAS AT A FRIENDS HOUSE AND SLIPPED ON ...,,,,"(SLIPPED ON WATER,)",,,,,,,,
4,190103294,PATIENT WAS STANDING ON A STEP STOOL AND FELL ...,,"(STANDING ON A STEP STOOL,)",,,,,,,,,,


In [276]:
# Make a copy of the original DataFrame
binary_df = pivot_output_df.copy()

# Update the DataFrame to have 0 for NaN and 1 for actual values
# We're excluding the first two columns ('A' and 'B') as they are index columns
binary_df.iloc[:, 2:] = binary_df.iloc[:, 2:].map(lambda x: 0 if x == 'NA' or pd.isna(x) else 1)
binary_df.head()


Unnamed: 0,cpsc_case_number,text,CHR,LAD,NA,OBJ,RCH,SF,SHW,SO,STR,SU,TRS,WT
0,190103269,PATIENT FELL TO THE FLOOR AT THE NURSING HOME ...,0,0,1,0,0,0,0,0,0,0,0,0
1,190103270,PATIENT FELL IN THE SHOWER AT HOME AND SUSTAIN...,0,0,0,0,0,0,1,0,0,0,0,0
2,190103273,PATIENT WAS GETTING UP FROM THE COUCH AND FELL...,0,0,1,0,0,0,0,0,0,0,0,0
3,190103291,PATIENT WAS AT A FRIENDS HOUSE AND SLIPPED ON ...,0,0,0,1,0,0,0,0,0,0,0,0
4,190103294,PATIENT WAS STANDING ON A STEP STOOL AND FELL ...,0,1,0,0,0,0,0,0,0,0,0,0


In [277]:
combo_df = pd.merge(binary_df, decoded_df, how="left", on="cpsc_case_number")

In [278]:
combo_sub = combo_df[(combo_df['SO']==1) | (combo_df['SF']==1) | (combo_df['SU']==1)]

In [279]:
combo_sub.columns

Index(['cpsc_case_number', 'text', 'CHR', 'LAD', 'NA', 'OBJ', 'RCH', 'SF',
       'SHW', 'SO', 'STR', 'SU', 'TRS', 'WT', 'narrative', 'treatment_date',
       'age', 'sex', 'race', 'other_race', 'hispanic', 'diagnosis',
       'other_diagnosis', 'diagnosis_2', 'other_diagnosis_2', 'body_part',
       'body_part_2', 'disposition', 'location', 'fire_involvement', 'alcohol',
       'drug', 'product_1', 'product_2', 'product_3'],
      dtype='object')

In [280]:
combo_sub2 = combo_sub[['SF','SO', 'SU', 'age', 'disposition']]
combo_sub2.head()

Unnamed: 0,SF,SO,SU,age,disposition
6,0,1,0,74,1 - TREATED/EXAMINED AND RELEASED
7,1,0,0,76,1 - TREATED/EXAMINED AND RELEASED
11,1,0,0,98,1 - TREATED/EXAMINED AND RELEASED
15,0,0,1,87,1 - TREATED/EXAMINED AND RELEASED
19,0,1,0,75,1 - TREATED/EXAMINED AND RELEASED


In [281]:
combo_sub2.groupby('SO')['disposition'].value_counts(normalize=True)

SO  disposition                          
0   1 - TREATED/EXAMINED AND RELEASED        0.622716
    4 - TREATED AND ADMITTED/HOSPITALIZED    0.333259
    5 - HELD FOR OBSERVATION                 0.019412
    2 - TREATED AND TRANSFERRED              0.018967
    6 - LEFT WITHOUT BEING SEEN              0.005646
1   1 - TREATED/EXAMINED AND RELEASED        0.716430
    4 - TREATED AND ADMITTED/HOSPITALIZED    0.236642
    2 - TREATED AND TRANSFERRED              0.020429
    5 - HELD FOR OBSERVATION                 0.020050
    6 - LEFT WITHOUT BEING SEEN              0.006448
Name: proportion, dtype: float64

In [282]:
combo_sub2.groupby('SF')['disposition'].value_counts(normalize=True)

SF  disposition                          
0   1 - TREATED/EXAMINED AND RELEASED        0.689180
    4 - TREATED AND ADMITTED/HOSPITALIZED    0.264609
    5 - HELD FOR OBSERVATION                 0.020365
    2 - TREATED AND TRANSFERRED              0.018670
    6 - LEFT WITHOUT BEING SEEN              0.007176
1   1 - TREATED/EXAMINED AND RELEASED        0.611559
    4 - TREATED AND ADMITTED/HOSPITALIZED    0.344920
    2 - TREATED AND TRANSFERRED              0.020444
    5 - HELD FOR OBSERVATION                 0.018717
    6 - LEFT WITHOUT BEING SEEN              0.004360
Name: proportion, dtype: float64