In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
from icd9cms import search
import os
import pandas as pd
from collections import defaultdict
import json
from itertools import chain
import ipywidgets as widgets
from IPython.display import display
import re
from collections import Counter
from collections import defaultdict
import difflib
from differ import html_diffs, align_seqs, sentencize, diff_ratio, diff_quick_ratio, show_diffs

In [6]:
def _icd9_title(code):
    """Return the long description for an ICD-9 code, or the short one if the long one is empty."""
    node = search(code)  # single lookup per code; both descriptions are read from the same result
    return node.long_desc or node.short_desc


# Count how often each primary ICD-9 code occurs (most frequent first), attach a
# human-readable title, and drop every code starting with 'V3'.
codes = pd.read_csv('/Users/tom/phd/summariser/primary_codes.csv')
sorted_primary_codes_df = (
    codes['icd9_code']
    .value_counts()
    .rename_axis('code')
    # Name the count column explicitly instead of renaming columns by position afterwards.
    .reset_index(name='count')
    .assign(long_title=lambda df: df['code'].apply(_icd9_title))
    .loc[lambda df: ~df.code.str.startswith('V3')]
)

In [7]:
# Show the rows whose code starts with '41401'.
sorted_primary_codes_df.loc[lambda df: df.code.str.startswith('41401')]

Unnamed: 0,code,count,long_title
1,41401,3498,Coronary atherosclerosis of native coronary ar...


In [8]:
# First five rows (in the table's existing order) whose code starts with '41'.
sorted_primary_codes_df.loc[lambda df: df.code.str.startswith('41')].head(5)

Unnamed: 0,code,count,long_title
1,41401,3498,Coronary atherosclerosis of native coronary ar...
4,41071,1751,"Subendocardial infarction, initial episode of ..."
15,41041,482,Acute myocardial infarction of other inferior ...
16,41011,482,Acute myocardial infarction of other anterior ...
20,41519,343,Other pulmonary embolism and infarction


In [424]:
# Titles of every code that starts with '434', as a plain list.
sorted_primary_codes_df.loc[lambda df: df.code.str.startswith('434')]['long_title'].tolist()

['Cerebral embolism with cerebral infarction',
 'Cerebral artery occlusion, unspecified with cerebral infarction',
 'Cerebral thrombosis with cerebral infarction',
 'Cerebral artery occlusion, unspecified without mention of cerebral infarction',
 'Cerebral embolism without mention of cerebral infarction']

### 41401 Notes

In [11]:
# Load the notes CSV named after code 41401 (presumably the notes of admissions
# with that primary ICD-9 code — confirm against how the file was exported).
notes = pd.read_csv('/Users/tom/phd/summariser/data/41401_notes.csv')

In [12]:
# Keep only discharge summaries, leaving out rows described as an addendum.
discharge_notes = notes.loc[
    lambda df: (df.category == 'Discharge summary') & (df.description != 'Addendum')
]

In [14]:
splitter = '------\n------\n------\n'

# Column order of the two frames returned by split_notes; applied explicitly so
# the frames keep their schema even when no admission qualifies.
_CARE_NOTE_COLS = ['hadm_id', 'text', 'first_time', 'last_time', 'categories', 'descriptions', 'icd_code']
_DISCHARGE_NOTE_COLS = ['hadm_id', 'text']


def _note_time(note):
    """Return a note row's charttime, or its chartdate when charttime is missing or empty."""
    charttime = note.charttime
    # A missing value is NaN, and NaN is truthy, so a plain `charttime or chartdate`
    # never falls back to chartdate; test for missing explicitly.
    if pd.notna(charttime) and charttime:
        return charttime
    return note.chartdate


def split_notes(notes):
    """Split a notes table into per-admission care notes and discharge summaries.

    Notes are ordered by hadm_id, chartdate and charttime and grouped by hadm_id.
    An admission is kept only if it has at least one non-addendum discharge
    summary and at least one note of any other category.

    Args:
        notes: DataFrame with the columns hadm_id, chartdate, charttime,
            category, description, text and icd9_code.

    Returns:
        (care_notes_df, discharge_notes_df), one row per kept admission each:
        - care_notes_df: hadm_id, text (all non-discharge notes joined with
          `splitter`), first_time / last_time (charttime of the first / last
          care note, falling back to its chartdate), categories, descriptions
          (lists, in note order) and icd_code (icd9_code of the first care note).
        - discharge_notes_df: hadm_id and text of the first discharge summary.
    """
    care_notes = defaultdict(list)
    discharge_notes = defaultdict(list)
    ordered_notes = notes.sort_values(['hadm_id', 'chartdate', 'charttime'])
    for _, df in ordered_notes.groupby('hadm_id'):
        is_discharge = df.category == 'Discharge summary'
        # ignore addendums - as not sure what section they are in.
        dis_notes = df[is_discharge & (df.description != 'Addendum')].text.tolist()
        care_df = df[~is_discharge]
        if len(dis_notes) == 0 or care_df.shape[0] == 0:
            continue

        hadm_id = care_df.hadm_id.iloc[0]
        first_note, last_note = care_df.iloc[0], care_df.iloc[-1]

        # pick first as there are rarely 2 summaries??
        discharge_notes['hadm_id'].append(hadm_id)
        discharge_notes['text'].append(dis_notes[0])

        care_notes['hadm_id'].append(hadm_id)
        care_notes['text'].append(care_df.text.str.cat(sep=splitter))
        care_notes['first_time'].append(_note_time(first_note))
        care_notes['last_time'].append(_note_time(last_note))
        care_notes['categories'].append(care_df.category.tolist())
        care_notes['descriptions'].append(care_df.description.tolist())
        care_notes['icd_code'].append(care_df.icd9_code.iloc[0])
    care_notes_df = pd.DataFrame(dict(care_notes), columns=_CARE_NOTE_COLS)
    discharge_notes_df = pd.DataFrame(dict(discharge_notes), columns=_DISCHARGE_NOTE_COLS)
    return care_notes_df, discharge_notes_df

In [18]:
# Build the per-admission care-note and discharge-summary frames from the loaded notes.
care_notes_df, discharge_notes_df = split_notes(notes)

In [None]:
## note types
# Distinct note categories, collected only from admissions whose category list
# has no 'Physician ' entry.
{
    category
    for categories in care_notes_df.categories
    if 'Physician ' not in categories
    for category in categories
}

In [34]:
# Label every care note as '<category>:<description>' (category stripped of
# surrounding whitespace), giving one list of labels per admission.
care_notes_df['cat_descs'] = care_notes_df.apply(
    lambda row: [
        f'{cat.strip()}:{desc}'
        for cat, desc in zip(row['categories'], row['descriptions'])
    ],
    axis=1,
)

In [99]:
# Column order of the frame returned by compute_diff; applied explicitly so an
# empty result still carries the columns downstream code groups on.
_DIFF_RESULT_COLS = [
    'diff_ratios', 'max_ratios', 'min_ratios', 'redundant_toks', 'total_tokens',
    'avg_diff_ratio', 'avg_txt_len', 'hadm_id', 'category', 'description',
]


def compute_diff(notes: pd.DataFrame) -> pd.DataFrame:
    """Compare each note with the next note of the same type in the same admission.

    Notes are ordered by hadm_id, chartdate, charttime, category and description,
    then grouped by (hadm_id, category, description). Within every group holding
    at least two notes, each note is passed to `diff_ratio` together with its
    successor. `diff_ratio` is expected to return six values, taken here as
    (ratio, std-dev, max ratio, min ratio, redundant token count, total token
    count) — that reading comes from how the result is unpacked; confirm against
    the `differ` module. The std-dev value is not reported.

    Args:
        notes: DataFrame with hadm_id, chartdate, charttime, category,
            description and text columns.

    Returns:
        One row per group with two or more notes: diff_ratios (list, one entry
        per consecutive pair), max_ratios / min_ratios (extremes over the
        pairs), redundant_toks / total_tokens (sums over the pairs),
        avg_diff_ratio (mean of diff_ratios), avg_txt_len (mean character length
        of the group's notes), and the hadm_id, category, description key.
        If no group qualifies, the frame is empty but keeps these columns.
    """
    internote_sort_cols = ['hadm_id', 'chartdate', 'charttime', 'category', 'description']
    internote_group_keys = ['hadm_id', 'category', 'description']
    groups_gen = notes.sort_values(internote_sort_cols).groupby(internote_group_keys)
    records = []
    for (hadm_id, category, description), group in groups_gen:
        texts = group.text.tolist()
        if len(texts) < 2:
            # a lone note has nothing to be compared with
            continue
        ratios, max_ratios, min_ratios, redundant_toks, total_tokens = [], [], [], [], []
        # pair every note with the one that follows it
        for earlier, later in zip(texts, texts[1:]):
            ratio, _std_dev, max_ratio, min_ratio, redundant_toks_len, total_toks = \
                diff_ratio(earlier, later)
            ratios.append(ratio)
            max_ratios.append(max_ratio)
            min_ratios.append(min_ratio)
            redundant_toks.append(redundant_toks_len)
            total_tokens.append(total_toks)
        records.append({
            'diff_ratios': ratios,
            'max_ratios': max(max_ratios),
            'min_ratios': min(min_ratios),
            'redundant_toks': sum(redundant_toks),
            'total_tokens': sum(total_tokens),
            'avg_diff_ratio': sum(ratios) / len(ratios),
            'avg_txt_len': sum(len(text) for text in texts) / len(texts),
            'hadm_id': hadm_id,
            'category': category,
            'description': description,
        })
    return pd.DataFrame(records, columns=_DIFF_RESULT_COLS)

In [108]:
# Column order of the frame returned by compute_avgs.
_GROUP_AVG_COLS = [
    'cat_desc', 'redundant_toks', 'total_toks', 'avg_txt_len',
    'macro_avg', 'micro_avg', 'num_instances',
]


def compute_avgs(results_df: pd.DataFrame, min_instances: int = 5, top_n: int = 20) -> pd.DataFrame:
    """Aggregate per-admission diff statistics by note type.

    Rows of `results_df` are grouped by (category, description). Groups whose
    label contains 'Discharge summary' are dropped, as are groups that do not
    have strictly more than `min_instances` pairwise comparisons. The remaining
    groups are ordered by number of comparisons, largest first, and cut to
    `top_n` rows.

    Args:
        results_df: frame with category, description, diff_ratios (list per
            row), redundant_toks, total_tokens, avg_diff_ratio and avg_txt_len
            columns, as produced by compute_diff.
        min_instances: a group is kept only if its comparison count is strictly
            greater than this value.
        top_n: maximum number of rows returned.

    Returns:
        DataFrame with cat_desc ('<category>:<description>'), redundant_toks and
        total_toks (sums), avg_txt_len (mean of the per-row averages), macro_avg
        (mean of the per-row avg_diff_ratio), micro_avg (mean over every
        individual pairwise ratio) and num_instances (number of pairwise
        ratios). An empty input yields an empty frame with these columns.
    """
    if results_df.empty:
        # Nothing to group; also covers a frame that has no columns at all.
        return pd.DataFrame(columns=_GROUP_AVG_COLS)

    records = []
    for (category, description), group in results_df.groupby(['category', 'description']):
        pair_ratios = list(chain.from_iterable(group.diff_ratios))
        records.append({
            'cat_desc': f'{category}:{description}',
            'redundant_toks': sum(group.redundant_toks),
            'total_toks': sum(group.total_tokens),
            'avg_txt_len': sum(group.avg_txt_len) / len(group.avg_txt_len),
            'macro_avg': sum(group.avg_diff_ratio) / group.avg_diff_ratio.shape[0],
            'micro_avg': sum(pair_ratios) / len(pair_ratios),
            'num_instances': len(pair_ratios),
        })
    all_groups = pd.DataFrame(records, columns=_GROUP_AVG_COLS)
    if all_groups.empty:
        return all_groups

    without_discharge = all_groups[~all_groups.cat_desc.str.contains('Discharge summary')]
    frequent = without_discharge[without_discharge['num_instances'] > min_instances]
    return (
        frequent
        .sort_values('num_instances', ascending=False)
        .reset_index(drop=True)
        .head(top_n)
    )

In [88]:
# Mean of the per-group average diff ratios.
# NOTE(review): results_df has to exist already when this cell runs; the only
# visible assignment is in a later cell — confirm the intended cell order.
avg_ratios = results_df.avg_diff_ratio
print('Token level / global note redundancy with concurrent notes:')
print(f'Overall Average Internote type redundnacy:{avg_ratios.sum() / avg_ratios.shape[0]}')

Token level / global note redundancy with concurrent notes:
Overall Average Internote type redundnacy:0.25971102225247733


### 43411 Note Analysis

In [83]:
# Load the notes CSV named after code 43411 (the variable name suggests stroke
# admissions — confirm against how the file was exported).
stroke_notes = pd.read_csv('/Users/tom/phd/summariser/data/43411_notes.csv')

In [101]:
# Pairwise diff statistics between consecutive same-type notes of each admission.
results_df = compute_diff(stroke_notes)

In [109]:
# Roll the per-admission statistics up to one row per category/description pair.
group_avgs = compute_avgs(results_df)

In [113]:
# Share of tokens counted as redundant for each note type.
group_avgs['redundant_ratio'] = group_avgs.redundant_toks.div(group_avgs.total_toks)

In [115]:
# Note types ranked by redundant-token share, highest first (at most 20 rows).
(
    group_avgs
    .sort_values(by='redundant_ratio', ascending=False)
    .reset_index(drop=True)
    .head(20)
)

Unnamed: 0,cat_desc,redundant_toks,total_toks,avg_txt_len,macro_avg,micro_avg,num_instances,redundant_ratio
0,Physician :Physician Resident Admission Note,28842.0,33109,12525.819048,0.856518,0.809448,19,0.871123
1,Physician :Physician Surgical Progress Note,3181.0,4347,5587.0,0.809635,0.764214,7,0.731769
2,Physician :Physician Attending Progress Note,14380.0,21148,5941.186706,0.532457,0.536417,31,0.67997
3,Physician :Intensivist Note,100737.0,159909,5958.690231,0.691003,0.639584,211,0.629965
4,Respiratory :Respiratory Care Shift Note,7263.0,12124,1230.545915,0.404803,0.443782,79,0.59906
5,Nursing:Nursing Transfer Note,30799.0,53260,2230.841242,0.598621,0.550831,159,0.578276
6,Physician :Physician Resident Progress Note,40675.0,73517,6890.886364,0.551072,0.467104,94,0.553273
7,Nursing:Nursing Progress Note,64447.0,136297,1291.039215,0.442277,0.419754,620,0.472842
8,Radiology:PORTABLE ABDOMEN,923.0,2992,1084.420513,0.27408,0.259291,23,0.308489
9,Radiology:CHEST PORT. LINE PLACEMENT,1237.0,4903,1240.14386,0.203611,0.219498,34,0.252295


In [None]:
# Pull the text that follows a "brief hospital course:" heading (case-insensitive)
# out of one discharge summary, up to the first blank-line break, then cut it at
# the next "\n\n<something>:" heading-like line if one occurs inside the match.
# NOTE(review): `idx` is not assigned in this cell — it is expected to be set beforehand.
ptn = re.compile(r'(?<=brief\shospital\scourse\:)(?:.*\n)+?(?=\n\n)', flags=re.I)
summary_text = discharge_notes_df.iloc[idx].text
m = ptn.search(summary_text)
if m is None:
    # re.search returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on m.start().
    raise ValueError(f'no "brief hospital course:" section found in discharge summary at position {idx}')
hosp_course = re.split(r'(\n\n.*\:\n?)', m.group(0))[0]
print(hosp_course)

In [68]:
# Two toy inputs for trying out the diff helpers: sent_b is sent_a with its
# trailing "how about some more stuff" removed.
sent_a = "this is some sentence, it's the same. As the other sentence. how about some more stuff"
sent_b = "this is some sentence, it's the same. As the other sentence. "

In [69]:
# Run diff_ratio on the toy pair; the value it returns is shown as the cell output.
diff_ratio(sent_a, sent_b)

[7.0, 4.0, 0.0]


(0.6666666666666666, 0.4714045207910317, 1.0, 0.0, 11.0)

In [57]:
# Call show_diffs (imported from differ) on the same toy pair.
show_diffs(sent_a, sent_b)