# Investigating clinical notes regarding falls in older adults
Author: Wendy Phillips

## Problem statement

Falls in older adults are frequent and can have major health impacts.

## Data description

Clinical notes and metadata associated with 111,000 visits to the doctor were analyzed.

## Analytic approach

Natural Language Processing (NLP) was applied to the clinical notes to extract informative content. Specifically, the NLP tool spaCy was used to build a model that could identify and label specific spans of the text that held information of interest. 

In [None]:
import numpy as np
import pandas as pd
import json
from pathlib import Path
import re
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
from spacy.tokens import Span
from spacy.tokens import DocBin
import collections

In [None]:
print(spacy.__version__)

## Import data 

#### Load variable mapping

In [None]:
# Load the lookup tables that translate encoded column values into labels.
with Path("primary_data/variable_mapping.json").open("r") as f:
    # `parse_int` expects a callable (e.g. `int`), not a bool. Passing True would
    # raise TypeError on the first JSON integer encountered, so it is not passed.
    mapping = json.load(f)

# JSON object keys are always read as strings, so convert the encoded keys to ints.
mapping = {
    col: {int(code): label for code, label in codes.items()}
    for col, codes in mapping.items()
}

#### Load primary data

In [None]:
# Columns that can be null are read as nullable ints
nullable_int_columns = {"body_part_2": "Int64", "diagnosis_2": "Int64"}
df = pd.read_csv("primary_data/primary_data.csv", dtype=nullable_int_columns)

#### Replace numeric values with corresponding strings

In [None]:
# Work on a copy so the encoded frame `df` is left untouched
decoded_df = df.copy()

# Replace the codes of every mapped column with their labels
for col, code_to_label in mapping.items():
    decoded_df[col] = decoded_df[col].map(code_to_label)

# Check on the df
decoded_df.head(1)

In [None]:
# ensure mappings were applied correctly by checking that the number of missing values did not change
assert (decoded_df.isnull().sum() == df.isnull().sum()).all()

Because the narrative text will be a main focus of this analysis, it deserves some investigation. First, create a column that holds how many characters are in each narrative text string.

In [None]:
# Add character length column
decoded_df['narrative_characters'] = decoded_df['narrative'].str.len()

# Exploratory plot of distribution
sns.kdeplot(decoded_df['narrative_characters'])

Find the value that occurs most frequently, called the mode, which is the point at the top of the curve. Also find the median of the character lengths.

In [None]:
print(decoded_df['narrative_characters'].mode())
print(decoded_df['narrative_characters'].median())

By plotting this for the male and female sexes separately, we can see how the two compare to each other.

In [None]:
sns.displot(decoded_df, x = 'narrative_characters', hue = 'sex', kind = 'kde', fill = True)
plt.show()

In [None]:
# Save to file if one wants to come back at this point
# decoded_df.to_csv("decoded_primary_data_with_char_lens.csv")

## Narrative manipulation
Convert narrative text so that the most common abbreviations become words

In [None]:
# Abbreviation -> expansion lookup used by clean_narrative() on the lowercased text.
# clean_narrative() applies the entries in insertion order, so an abbreviation that
# contains a shorter one as a prefix (e.g. "w/o" vs "w/") has to be listed first.
medical_terms = {
    "&": "and",
    "***": "",
    ">>": "clinical diagnosis",
    "@": "at",
    "+": "with",
    "?": "unknown if",
    "abd": "abdomen",
    "af": "accidental fall",
    "afib": "atrial fibrillation",
    "alf": "assisted living facility",
    "aki": "acute kidney injury",
    "am": "morning",
    "ams": "altered mental status",
    "bac": "blood alcohol content",
    "biba": "brought in by ambulance",
    "bwd": "backwards",
    "c/o": "complains of",
    "chi": "closed-head injury",
    "clsd": "closed",
    "cpk": "creatine phosphokinase",
    "cva": "cerebral vascular accident",
    "dn": "down",
    "dtr": "daughter",
    "dx": "clinical diagnosis",
    "ecf": "extended care facility",
    "er": "emergency room",
    "etoh": "ethyl alcohol",
    "eval": "evaluation",
    "fd": "found",
    "ft": "foot",
    "fx": "fracture",
    "fxs": "fractures",
    "fwd": "forwards",
    "glf": "ground level fall",
    "h/o": "history of",
    "hr": "hours",
    "htn": "hypertension",
    "hx": "history of",
    "inj": "injury",
    "inr": "international normalized ratio",
    "intox": "intoxication",
    "lac": "laceration",
    "loc": "loss of consciousness",
    "lt": "left",
    "mech": "mechanical",
    "mult": "multiple",
    "n h ": "nursing home",
    "nh": "nursing home",
    "p/w": "presents with",
    "pm": "afternoon",
    "pt": "patient",
    "pta": "prior to arrival",
    "pts": "patient's",
    "px": "physical examination", # deliberately not expanded to "procedure"
    "r/o": "rules out",
    "rt": "right",
    "s/p": "after",
    "sah": "subarachnoid hemorrhage",
    "sdh": "acute subdural hematoma",
    "sts": "sit to stand",
    "tr": "trauma",
    "uti": "urinary tract infection",
    "unwit'd": "unwitnessed",
    "w/o": "without",
    "w/": "with",
    "wks": "weeks"
}

def add_space_after(text, target=","):
    """Insert a space after each `target` that is directly followed by a non-space.

    Args:
        text: String to process.
        target: Literal substring (not a regex) that should be followed by a space.

    Returns:
        A copy of `text` in which "<target>X" becomes "<target> X" for every
        non-whitespace character X.
    """
    pattern = r'({})(\S)'.format(re.escape(target))
    # A callable replacement inserts `target` literally. A replacement template
    # would interpret backslash sequences contained in `target` as escapes
    # (a backslash followed by "n" would turn into a real newline).
    return re.sub(pattern, lambda m: "{} {}".format(target, m.group(2)), text)

def add_space_before(text, target=","):
    """Insert a space before each `target` that directly follows a non-space.

    Args:
        text: String to process.
        target: Literal substring (not a regex) that should be preceded by a space.

    Returns:
        A copy of `text` in which "X<target>" becomes "X <target>" for every
        non-whitespace character X.
    """
    pattern = r'(\S)({})'.format(re.escape(target))
    # A callable replacement inserts `target` literally. A replacement template
    # would interpret backslash sequences contained in `target` as escapes
    # (a backslash followed by "n" would turn into a real newline).
    return re.sub(pattern, lambda m: "{} {}".format(m.group(1), target), text)

# cleaning
def clean_narrative(text):
    """Normalize one narrative string.

    Steps, in order: lowercase; separate "dx" from its neighbours; replace the
    age/sex token found by the first regex match (e.g. "72 yof") with "patient";
    put spaces around punctuation, hyphens and numbers; expand the abbreviations
    in `medical_terms`; put spaces around "/"; collapse whitespace; uppercase.

    Args:
        text: Narrative string, or a missing value.

    Returns:
        The cleaned narrative in upper case, or the string "NA" when `text`
        is missing.
    """
    if pd.isna(text):
        return "NA"

    # lowercase everything
    text = text.lower()

    # unglue DX
    # NOTE(review): the character inside the brackets appears to be a modifier
    # circumflex rather than the ASCII caret "^", so the classes are not negated:
    # the pattern swallows the non-word characters around "dx". Left unchanged;
    # confirm the intent before editing.
    regex_dx = r"([ˆ\W]*(dx)[ˆ\W]*)"
    text = re.sub(regex_dx, r". dx: ", text)

    # remove age and sex identifications
    ## regex to capture age and sex (not perfect but captures almost all of the cases)
    regex_age_sex = r"(\d+)\s*?(yof|yf|yo\s*female|yo\s*f|yom|ym|yo\s*male|yo\s*m)"
    age_sex_match = re.search(regex_age_sex, text)
    if age_sex_match:
        # Every alternative of the pattern contains "f" or "m", so a match always
        # identifies a sex; the whole token is replaced by a neutral word.
        text = text.replace(age_sex_match.group(0), "patient")

    # make sure punctuation is followed by a space
    for mark in (",", ";", ":"):
        text = add_space_after(text, target=mark)
    text = add_space_before(text, target="--")
    text = add_space_after(text, target="--")

    # hyphens become spaces, numbers are surrounded by spaces, periods become spaces
    text = text.replace("-", " ")
    text = re.sub(r"([0-9]+(\.[0-9]+)?)", r" \1 ", text).strip()
    text = text.replace(".", " ")

    # The below two are separated from the dictionary because of the '&' character conflict
    #   with the translate medical terms section use of '&'
    text = text.replace("t'd&f", "tripped and fell")
    text = text.replace("s'd&f", "slipped and fell")

    # translate medical terms
    # NOTE(review): terms ending in a non-word character ("w/", "n h ") only match
    # when a word character follows, and the expansion is then glued to it
    # ("w/cane" -> "withcane"). Behaviour kept as is; confirm whether it is intended.
    symbol_terms = {"@", ">>", "&", "***", "+", "?"}
    for term, replacement in medical_terms.items():
        if term in symbol_terms:
            # symbols have no word boundaries: match anywhere and
            # force spaces around replacement
            pattern = fr"({re.escape(term)})"
            text = re.sub(pattern, f" {replacement} ", text)
        else:
            # whole-word match that is not attached to a hyphen
            pattern = fr"(?<!-)\b({re.escape(term)})\b(?!-)"
            text = re.sub(pattern, replacement, text)

    # This done after translate medical terms because some use a '/'
    text = add_space_before(text, target="/")
    text = add_space_after(text, target="/")

    # remove extra white spaces
    text = re.sub(r'\s+', ' ', text)

    return text.upper()

Check how well that function works to change some complex text.

In [None]:
text = '72 YOF SLIPPED&FELL OFF 4FT LADDER ONTO THE FLOOR THIS AM DTR FD DN+LOC>>L-3, L-4 FRACTURE, RT RIB FRACTURES X 3'
print("Original text:", text)
print("Clean text:", clean_narrative(text))

While it would probably be better for things like "L3" to remain together, I will sacrifice that for the easy advantage of separating other number-character strings that would be better off separated.

In [None]:
# Subset the data to just case id and narrative strings.
# .copy() makes the subset an independent frame, so the column assignment below
# does not write into a slice of `df` (which triggers SettingWithCopyWarning).
df_narratives = df.iloc[:, 0:2].copy()

# Apply the text cleaning function to all narrative entries
df_narratives['narrative'] = df_narratives['narrative'].apply(clean_narrative)

In [None]:
# Keep the untouched narrative under its own name so it does not collide
# with the cleaned 'narrative' column during the merge
decoded_renamed = decoded_df.rename(columns={"narrative": "narrative_original"})

# Attach the decoded metadata to each cleaned narrative
df_final = df_narratives.merge(decoded_renamed, how="left", on="cpsc_case_number")

# Check head to verify
df_final.head(2)

In [None]:
# Save to file if one wants to come back at this point
# df_final.to_csv("corrected_narrative_primary.csv", index=False)

## Model training

Import semi-manually labeled training data

In [None]:
labeled_sets = pd.read_csv('secondary_data/labeled_training_data.csv')
labeled_sets.head(2)

Check how long the imported data frame is

In [None]:
len(labeled_sets)

Check how many unique cases are contained in the data set

In [None]:
key_list = labeled_sets.cpsc_case_number.unique()
len(key_list)

Check how many cases in the training set have no labeled span. (Note, this is also an important component of one's training set!)

In [None]:
labeled_sets['span'].isna().sum()

Randomly shuffle the key list in case there is some bias in the order of the entries

In [None]:
# Fixed seed so the shuffled case order is repeatable
np.random.seed(99)
# shuffle() reorders the array in place
np.random.shuffle(key_list)
key_list

The following chunk of code goes through the labeled training set and converts it into a list of spaCy `Doc` objects, one per case.

In [None]:
def _token_bounds(doc, start_char, end_char):
    """Translate a character range into token indices of `doc`.

    Returns:
        (start, end) token indices with `end` exclusive, when one token starts
        exactly at `start_char` and one token ends exactly at `end_char`;
        otherwise None.
    """
    start_token = None
    end_token = None
    for token in doc:
        if token.idx == start_char:
            start_token = token.i
        if token.idx + len(token.text) == end_char:
            end_token = token.i
            break
    if start_token is None or end_token is None:
        return None
    return start_token, end_token + 1


nlp = spacy.blank("en")

# keeping span token lengths to appropriately set config
token_lengths = []

# Keeping labels in case I want to check
label_list = []

# This will hold the processed string and span docs
docs = []

for case in key_list:

    # Subset to just one case
    case_rows = labeled_sets[labeled_sets.cpsc_case_number == case]

    # Use the narrative of the case's first row and process it into a doc
    nar = case_rows.iloc[0, 1]
    doc = nlp(nar)

    single_row = len(case_rows) == 1
    if not single_row:
        print("temp_df has length > 1")

    span_list = []
    keep_doc = True
    for span_text, temp_label in zip(case_rows.iloc[:, 2], case_rows.iloc[:, 3]):
        if pd.isna(span_text):
            # No labeled span on this row: nothing to locate. (Previously a missing
            # span in a multi-row case crashed inside str.find.)
            if single_row:
                print("this entry has no label")
            continue

        # Locate the span text in the narrative, then snap it to token boundaries
        span_start_char = nar.find(span_text)
        bounds = None
        if span_start_char != -1:
            bounds = _token_bounds(doc, span_start_char, span_start_char + len(span_text))

        if bounds is None:
            print(nar, "span=", span_text, "couldn't find tokens")
            # A single-row case whose only span cannot be aligned is left out of
            # `docs`; multi-row cases keep whatever spans could be aligned.
            if single_row:
                keep_doc = False
            continue

        temp_start, temp_end = bounds
        span_list.append(Span(doc, temp_start, temp_end, temp_label))
        token_lengths.append(temp_end - temp_start)
        label_list.append(temp_label)

    if keep_doc:
        doc.spans["sc"] = span_list
        docs.append(doc)

In [None]:
len(docs)

In [None]:
for doc in docs[0:1]:
    print(doc, doc.spans)

In [None]:
len(token_lengths)

Check on length of tokens in spans to set appropriate parameters in training config file

In [None]:
print(np.min(token_lengths), np.max(token_lengths), np.median(token_lengths))

In [None]:
np.quantile(token_lengths, q =[0.05,0.95])

Want to split the data into train and test such that a single case is not split between the two. Check two consecutive rows to make sure they are separate cases.

In [None]:
docs[1100]

In [None]:
docs[1101]

Make training and test sets with the docs, saving in spacy format.

In [None]:
# The first 1101 docs go to the training file, the remainder to the dev file
train_bin = DocBin(docs=docs[:1101])
train_bin.to_disk("./train_falling.spacy")

dev_bin = DocBin(docs=docs[1101:])
dev_bin.to_disk("./dev_falling.spacy")

Initialize the training config file. This will get some manual adjustments.

In [None]:
!python -m spacy init config configs/config_spancat_singlelabel.cfg --lang en --pipeline spancat_singlelabel --force

These adjustments were made to the above config file:

[components.spancat_singlelabel.suggester]   
@misc = "spacy.ngram_range_suggester.v1"  
min_size = 2 <---  
max_size = 12 <---  

[training.score_weights]  
spans_sc_f = 0.3 <---  
spans_sc_p = 0.5 <---  
spans_sc_r = 0.2 <---  

Run the training!

In [None]:
!python -m spacy train configs/config_spancat_singlelabel.cfg --paths.train train_falling.spacy --paths.dev dev_falling.spacy --training.eval_frequency 100  --system.seed 99 --output spacy_falling_model/

## Using model 

Load the model and check its component steps.

In [None]:
nlp_spancat = spacy.load("spacy_falling_model/model-best")

In [None]:
nlp_spancat.pipeline

**Test model on a subset of the samples**

Take a random sample for testing purposes

In [None]:
random_sample = df_final.sample(n=5000, replace=False, random_state=42)

# Subset to just first two columns containing case # and narrative
random_sample_sub = random_sample.iloc[:,0:2]

# Check the result
random_sample_sub.head(2)

In [None]:
cols = ['cpsc_case_number', 'text', 'span_text', 'span_label']

# Collect plain rows and build the frame once at the end: concatenating inside
# the loop copies the whole frame on every iteration.
records = []
for _, row in random_sample_sub.iloc[1000:1050, 0:2].iterrows():
    cpsc = row['cpsc_case_number']
    text = row['narrative']
    doc = nlp_spancat(text)

    predicted_spans = doc.spans["sc"]
    if len(predicted_spans) == 0:
        # Keep narratives without any predicted span, marked with "NA"
        records.append([cpsc, text, "NA", "NA"])
    else:
        # One output row per predicted span
        records.extend([cpsc, text, span.text, span.label_] for span in predicted_spans)

output_df = pd.DataFrame(records, columns=cols)

In [None]:
output_df.head()

## Run model on all samples

Before running on all samples, double check everything is in working order.

In [None]:
decoded_df.iloc[0:3:,0:2]

In [None]:
# Run the model on the first 12 rows and print the predicted spans.
# NOTE(review): this reads 'narrative' from decoded_df, while the clean_narrative()
# output lives in df_final - confirm which text the model should receive.
for _, record in decoded_df.iloc[0:12, 0:2].iterrows():
    cpsc = record['cpsc_case_number']
    text = record['narrative']
    doc = nlp_spancat(text)
    print(doc.spans["sc"])

In [None]:
cols = ['cpsc_case_number', 'text', 'span_text', 'span_label']

# Collect plain rows and build the frame once at the end: concatenating inside
# the loop copies the whole frame on every iteration, which is quadratic over
# the full data set.
# NOTE(review): this reads 'narrative' from decoded_df, while the clean_narrative()
# output lives in df_final - confirm which text the model should receive.
records = []
for _, row in decoded_df.iloc[:, 0:2].iterrows():
    cpsc = row['cpsc_case_number']
    text = row['narrative']
    doc = nlp_spancat(text)

    predicted_spans = doc.spans["sc"]
    if len(predicted_spans) == 0:
        # Keep narratives without any predicted span, marked with "NA"
        records.append([cpsc, text, "NA", "NA"])
    else:
        # One output row per predicted span
        records.extend([cpsc, text, span.text, span.label_] for span in predicted_spans)

output_df = pd.DataFrame(records, columns=cols)

In [None]:
output_df.head()

In [None]:
len(output_df)

In [None]:
# Save to file for coming back and starting up at this point
# output_df.to_csv("predictions_falling_full_set.csv")

In [None]:
# Resume from the saved predictions. The file must already exist (the to_csv call
# that writes it is commented out above). The slice skips the first CSV column,
# presumably the index written by to_csv - verify against the saved file.
saved_predictions = pd.read_csv("predictions_falling_full_set.csv")
output_df = saved_predictions.iloc[:, 1:5]
output_df.head(8)

### Process predicted spans into dataframes  

Pivot the DataFrame so that each span label is a column and the span texts are the entries in those columns. A single narrative could have two occurrences of the same span label associated with it. Therefore, when collapsing each case into a single row, make a tuple with the individual span text entries.

In [None]:
def tuple_aggregator(series):
    """Collapse the values of one group into a single tuple, keeping their order."""
    return (*series,)

# Pivot the DataFrame: one row per (case, narrative), one column per span label,
# each cell holding the tuple of span texts found for that label
pivot_output_df = output_df.pivot_table(
    index=['cpsc_case_number', 'text'],
    columns='span_label',
    values='span_text',
    aggfunc=tuple_aggregator,
)
pivot_output_df = pivot_output_df.reset_index()
pivot_output_df.columns.name = None
pivot_output_df.head(3)

In [None]:
# Work on a copy so the pivoted span texts stay available
binary_df = pivot_output_df.copy()

# From the third column on: 1 where a span is present, 0 where the cell is
# missing or holds the 'NA' placeholder
label_cells = binary_df.iloc[:, 2:]
binary_df.iloc[:, 2:] = label_cells.map(
    lambda cell: 1 if not (cell == 'NA' or pd.isna(cell)) else 0
)
binary_df.head()

In [None]:
combo_df = pd.merge(binary_df, decoded_df, how="left", on="cpsc_case_number")
combo_df.columns

### Words within labeled spans

In [None]:
SO_entries = output_df[output_df['span_label'] == 'SO']
len(SO_entries)

In [None]:
SO_entries.head()

In [None]:
def flatten_list(nested_list):
    """Concatenate the items of every inner iterable into one flat list (one level deep)."""
    flat = []
    for sublist in nested_list:
        flat.extend(sublist)
    return flat

In [None]:
OBJ_entries = output_df[output_df['span_label'] == 'OBJ']

In [None]:
# Words left out of the OBJ tally
list_remove_OBJ = [
    'TRIPPED', 'OVER', 'A', 'AN', 'ON', 'THE', 'HER',
    'OF', 'HIS', 'TRIPPING', 'FALL', 'FELL', 'SLIPPED', 'OWN', 'FEET',
]
ignored_obj_words = set(list_remove_OBJ)

# One list per OBJ span (span text is the third column): its distinct words
# minus the ignored ones
OBJ_words = [
    list(set(span_text.split(' ')) - ignored_obj_words)
    for span_text in OBJ_entries.iloc[:, 2]
]

In [None]:
# Count in how many OBJ spans each word occurs (words are distinct within a span)
OBJ_word_counts = collections.Counter(flatten_list(OBJ_words))

# Deliberately not named `df`: that name holds the primary data loaded at the top
# of the notebook, and rebinding it here would break re-running earlier cells.
OBJ_counts_df = (
    pd.DataFrame(list(OBJ_word_counts.items()), columns=['element', 'count'])
    .sort_values('count', ascending=False)
    .set_index('element')
)

# Keep the 13 most frequent words, ordered by ascending count
OBJ_plot_subset = OBJ_counts_df.head(13).sort_values('count')

In [None]:
OBJ_plot_subset

In [None]:
# Horizontal bar chart of the most frequent OBJ words, saved as PDF
ax = OBJ_plot_subset.plot.barh(width=0.8)
ax.set_ylabel('')
ax.set_xlabel('Count')
ax.set_title('Object causing fall')
ax.get_legend().remove()
ax.invert_yaxis()
ax.grid(axis='x', linestyle='--', linewidth=0.7, alpha=0.7)
ax.figure.savefig("Object_causing.pdf", format="pdf", pad_inches=0.5, bbox_inches='tight')

In [None]:
# Words left out of the SO tally
list_remove = [
    'A', 'HEAD', 'HIT', 'THE', 'ON', 'HITTING', 'STRIKING',
    'HER', 'OF', 'AGAINST', 'STRUCK', 'ONTO', 'HIS', 'INTO',
    'FACE', 'BACK', 'SIDE', 'CHEST', 'RIGHT', 'LEFT', 'EDGE',
    'FOREHEAD', 'STAND', 'FRAME', 'AND', 'COFFEE', 'LANDED', 'ARM',
]
ignored_so_words = set(list_remove)

# One list per SO span (span text is the third column): its distinct words
# minus the ignored ones
SO_words = [
    list(set(span_text.split(' ')) - ignored_so_words)
    for span_text in SO_entries.iloc[:, 2]
]

In [None]:
sns.set_theme(style="white")

# Count in how many SO spans each word occurs (words are distinct within a span)
SO_word_counts = collections.Counter(flatten_list(SO_words))

# Deliberately not named `df`: that name holds the primary data loaded at the top
# of the notebook, and rebinding it here would break re-running earlier cells.
SO_counts_df = (
    pd.DataFrame(list(SO_word_counts.items()), columns=['element', 'count'])
    .sort_values('count', ascending=False)
    .set_index('element')
)

# Keep the 15 most frequent words, ordered by ascending count
SO_plot_subset = SO_counts_df.head(15).sort_values('count')
SO_plot_subset

In [None]:
# Horizontal bar chart of the most frequent SO words, saved as PDF
ax = SO_plot_subset.plot.barh(width=0.8)
ax.set_ylabel('')
ax.set_xlabel('Count')
ax.set_title('Object struck')
ax.get_legend().remove()
ax.invert_yaxis()
ax.grid(axis='x', linestyle='--', linewidth=0.7, alpha=0.7)
ax.figure.savefig("Object_struck.pdf", format="pdf", pad_inches=0.5, bbox_inches='tight')

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,4))
# fig.suptitle('Falls involving objects', size=20, y=1.02)

# Each panel: (axes, data, bar colour, colour of text and frame, title)
panels = [
    (ax1, OBJ_plot_subset, '#4893ff', '#0001ac', "Object instigating fall"),
    (ax2, SO_plot_subset, '#7cd357', '#216326', "Object struck"),
]
for axis, counts, bar_color, accent, title in panels:
    counts.plot.barh(width=0.8, ax=axis, color=bar_color)
    axis.legend().set_visible(False)
    axis.set_ylabel('')
    axis.tick_params(labelsize=12)
    axis.set_title(title, fontsize=16, color=accent)
    axis.set_xlabel('Count', fontsize=14, color=accent)
    axis.grid(True, which='both', axis='x', linestyle='--', linewidth=0.8, color='lightgray')
    axis.tick_params(axis='y', labelcolor=accent)
    axis.tick_params(axis='x', labelcolor=accent)
    for spine in ['left', 'right', 'top', 'bottom']:
        axis.spines[spine].set_edgecolor(accent)

plt.subplots_adjust(wspace=0.35)  #
plt.savefig("Objects_associated_with_falls.pdf", format="pdf", pad_inches = 0.25, bbox_inches = 'tight')

### Clean labels for plotting

In [None]:
# Strip the leading "<number> - " prefix from the category labels
for label_col in ('diagnosis', 'disposition'):
    combo_df[label_col] = combo_df[label_col].str.replace(r'\d+ - ', '', regex=True)

# Change the diagnosis labels to title case
combo_df['diagnosis'] = combo_df['diagnosis'].str.title()
print(combo_df['disposition'].unique())

In [None]:
# Shorten the disposition names, keeping general meaning
label_mapping = {'TREATED AND ADMITTED/HOSPITALIZED': 'Admitted', 
                 'TREATED/EXAMINED AND RELEASED': 'Released', 
                 'HELD FOR OBSERVATION': 'Held', 
                 'TREATED AND TRANSFERRED': 'Transferred',  
                 'LEFT WITHOUT BEING SEEN': 'Unseen', }
combo_df['disposition'] = combo_df['disposition'].map(label_mapping)
print(combo_df['disposition'].unique())

In [None]:
sns.displot(combo_df, x = 'narrative_characters', hue = 'disposition', kind = 'kde')

Gather rows for specific strike types

In [None]:
object_involved = combo_df[combo_df['OBJ'] == 1]
len(object_involved)

In [None]:
# Flags for each strike label
hit_object = combo_df['SO'] == 1
hit_floor = combo_df['SF'] == 1
hit_unknown = combo_df['SU'] == 1

# Keep only the cases carrying exactly one of the three strike labels
struck_object = combo_df[hit_object & ~hit_floor & ~hit_unknown]
struck_floor = combo_df[~hit_object & hit_floor & ~hit_unknown]
struck_unknown = combo_df[~hit_object & ~hit_floor & hit_unknown]
print(len(struck_floor), len(struck_object), len(struck_unknown))

Bind the struck object and struck floor sets back together for plotting

In [None]:
struck_object = struck_object.assign(struck = 'Object')
struck_floor = struck_floor.assign(struck = 'Floor')
striking_df = pd.concat([struck_object, struck_floor ], ignore_index=True) 
striking_df.head(3)

Plot disposition distribution for each category

In [None]:
# Count cases per (struck, disposition) and spread the dispositions into columns
disposition_by_strike = (
    striking_df[['disposition', 'struck']]
    .groupby(['struck', 'disposition'])
    .size()
    .reset_index(name='count')
    .pivot(columns='disposition', index='struck', values='count')
)
disposition_by_strike.columns.name = ''

# Fixed column order for plotting
col_order = ['Released', 'Admitted', 'Held', 'Transferred', 'Unseen']
strike_disp_counts = disposition_by_strike[col_order]

In [None]:
#sns.set_palette('colorblind')
plt.style.use('tableau-colorblind10')

# Create the figure and hand its axes to pandas. Without `ax=`, DataFrame.plot
# opens a figure of its own, which leaves the (8, 4) figure empty and unused.
fig, ax = plt.subplots(figsize=(8, 4))

# Convert to relative proportions: each row as percentages of its row total
strike_disp_pct = strike_disp_counts.div(strike_disp_counts.sum(axis=1), axis=0) * 100
strike_disp_pct.plot(kind='bar', stacked=True, fontsize=14, edgecolor='none', ax=ax)
ax.legend(bbox_to_anchor=(1, 1), loc=2)
ax.set_ylabel("Percent", size=16)
ax.set_xlabel("")
fig.savefig("Disposition_by_strike.pdf", format="pdf", pad_inches=0.5, bbox_inches='tight')

In [None]:
diag_cols_to_use = ['Internal Injury',
 'Fracture',
 'Laceration',
 'Contusions, Abr.',
 'Avulsion',
 'Hematoma',
 'Concussion',
 'Strain, Sprain',
 'Dislocation']

In [None]:
# Rows whose diagnosis is not one of the individually plotted categories
striking_other = striking_df[~striking_df['diagnosis'].isin(diag_cols_to_use)]

In [None]:
# Number of "other" diagnoses per strike type: count per (struck, diagnosis),
# then add the counts up within each strike type. The result is indexed by
# 'struck' and has the single column 'Other'.
other_diag_counts = (
    striking_other.groupby(['struck', 'diagnosis'])
    .size()
    .groupby(level='struck')
    .sum()
    .to_frame('Other')
)
other_diag_counts

In [None]:
# Count cases per (struck, diagnosis) for the individually plotted diagnoses
# and spread the diagnoses into columns
in_named_diagnoses = striking_df['diagnosis'].isin(diag_cols_to_use)
diag_counts_wide = (
    striking_df.loc[in_named_diagnoses, ['diagnosis', 'struck']]
    .groupby(['struck', 'diagnosis'])
    .size()
    .reset_index(name='count')
    .pivot(columns='diagnosis', index='struck', values='count')
)
diag_counts_wide.columns.name = ''

# Fixed column order, then append the 'Other' column
strike_diag_counts = diag_counts_wide[diag_cols_to_use].merge(
    other_diag_counts, left_on='struck', right_on='struck'
)
strike_diag_counts

In [None]:
sns.set_style("whitegrid")
#sns.set_palette('deep')
plt.style.use('tableau-colorblind10')

# Create the figure and hand its axes to pandas. Without `ax=`, DataFrame.plot
# opens a figure of its own, which leaves the (8, 4) figure empty and unused.
fig, ax = plt.subplots(figsize=(8, 4))

# Each row as percentages of its row total
strike_diag_pct = strike_diag_counts.div(strike_diag_counts.sum(axis=1), axis=0) * 100
strike_diag_pct.plot(kind='bar', stacked=True, fontsize=14, edgecolor='none', ax=ax)
ax.legend(bbox_to_anchor=(1, 1), loc=2)
ax.set_ylabel("Percent", size=16)
ax.set_xlabel("")
fig.savefig("Diagnosis_by_strike.pdf", format="pdf", pad_inches=0.5, bbox_inches='tight')

In [None]:
# Disposition (left) and diagnosis (right) shares side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,4))
for axis, counts in ((ax1, strike_disp_counts), (ax2, strike_diag_counts)):
    row_percent = counts.apply(lambda x: x*100/sum(x), axis = 1)
    row_percent.plot(kind = 'bar', ax = axis, stacked = True, fontsize = 14, edgecolor = 'none')
for axis in (ax1, ax2):
    axis.legend().set_bbox_to_anchor((1, 1))
plt.subplots_adjust(wspace=1.25)  #
plt.savefig("Struck_object_Disp_Diag.pdf", format="pdf", pad_inches = 0.25, bbox_inches = 'tight')

Look at frequencies of disposition and diagnosis by strike type

In [None]:
sf_disp = struck_floor['disposition'].value_counts()
sf_disp

In [None]:
so_disp = struck_object['disposition'].value_counts()
print(so_disp["Released"], so_disp["Admitted"])

Use an odds ratio computed from a contingency table to determine whether striking the floor carries a higher risk of hospitalization than striking an object.

In [None]:
from scipy.stats.contingency import odds_ratio

# 2x2 table: rows = disposition (admitted, released), columns = struck (floor, object)
admission_table = [
    [sf_disp["Admitted"], so_disp["Admitted"]],
    [sf_disp["Released"], so_disp["Released"]],
]
res = odds_ratio(admission_table)
print(res.statistic, res.confidence_interval(confidence_level=0.95))

The odds of being admitted/hospitalized if a patient strikes the floor are 1.xx times (95% CI = 1.xx-1.xx) that of being admitted if a patient strikes an object.

In [None]:
struck_unknown['disposition'].value_counts(normalize=True)

In [None]:
so_diag = struck_object['diagnosis'].value_counts()

In [None]:
sf_diag = struck_floor['diagnosis'].value_counts()

In [None]:
sf_diag.sum() - sf_diag["Fracture"]

In [None]:
# 2x2 table: rows = (fracture, any other diagnosis), columns = struck (floor, object)
fracture_table = [
    [sf_diag["Fracture"], so_diag["Fracture"]],
    [sf_diag.sum() - sf_diag["Fracture"], so_diag.sum() - so_diag["Fracture"]],
]
res = odds_ratio(fracture_table)
print(res.statistic, res.confidence_interval(confidence_level=0.95))

In [None]:
# 2x2 table: rows = (laceration, any other diagnosis), columns = struck (floor, object)
laceration_table = [
    [sf_diag["Laceration"], so_diag["Laceration"]],
    [sf_diag.sum() - sf_diag["Laceration"], so_diag.sum() - so_diag["Laceration"]],
]
res = odds_ratio(laceration_table)
print(res.statistic, res.confidence_interval(confidence_level=0.95))

In [None]:
struck_unknown['diagnosis'].value_counts(normalize=True)

In [None]:
# another possible version of a stacked bar plot
# struck_floor.groupby('disposition')['diagnosis'].value_counts(normalize=True).unstack('diagnosis').plot.bar(stacked=True)

In [None]:
import ipywidgets as widgets
widgets.ColorPicker(concise=False, description='Pick a color',value='blue',disabled=False)