In [50]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
%autoreload 2

In [220]:
from icd9cms import search
import os
import pandas as pd
from collections import defaultdict
import json
from itertools import chain
import ipywidgets as widgets
from IPython.display import display
import re
from collections import Counter
from collections import defaultdict
import difflib
from differ import html_diffs, align_seqs, sentencize, diff_ratio

In [6]:
# Primary ICD-9 code per admission. Codes are read as strings so leading zeros survive
# and the `.str` prefix filters keep working even if every code happens to be numeric.
codes = pd.read_csv('/Users/tom/phd/summariser/primary_codes.csv', dtype={'icd9_code': str})


def _code_title(code):
    """Return the long (else short) description for an ICD-9 code, or None if the lookup finds nothing."""
    node = search(code)  # one lookup per code instead of two
    # NOTE(review): assumes icd9cms.search returns None for unknown codes - confirm.
    if node is None:
        return None
    return node.long_desc or node.short_desc


# Code frequency table (most common first), annotated with titles.
# Codes starting with 'V3' are dropped from the table.
sorted_primary_codes_df = (
    codes['icd9_code']
    .value_counts()
    .rename_axis('code')
    .reset_index(name='count')  # explicit name: the default differs between pandas versions
    .assign(long_title=lambda df: df['code'].apply(_code_title))
    .loc[lambda df: ~df['code'].str.startswith('V3')]
)

In [7]:
# Rows whose code starts with '41401'.
sorted_primary_codes_df.loc[lambda df: df.code.str.startswith('41401')]

Unnamed: 0,code,count,long_title
1,41401,3498,Coronary atherosclerosis of native coronary ar...


In [8]:
# First five rows (in table order) whose code starts with '41'.
sorted_primary_codes_df.loc[lambda df: df.code.str.startswith('41')].head(5)

Unnamed: 0,code,count,long_title
1,41401,3498,Coronary atherosclerosis of native coronary ar...
4,41071,1751,"Subendocardial infarction, initial episode of ..."
15,41041,482,Acute myocardial infarction of other inferior ...
16,41011,482,Acute myocardial infarction of other anterior ...
20,41519,343,Other pulmonary embolism and infarction


In [424]:
# Titles of every code starting with '434'.
sorted_primary_codes_df.loc[lambda df: df.code.str.startswith('434')].long_title.tolist()

['Cerebral embolism with cerebral infarction',
 'Cerebral artery occlusion, unspecified with cerebral infarction',
 'Cerebral thrombosis with cerebral infarction',
 'Cerebral artery occlusion, unspecified without mention of cerebral infarction',
 'Cerebral embolism without mention of cerebral infarction']

### 41401 Notes

In [11]:
# Clinical notes for the 41401 analysis (presumably one row per note - confirm).
# `split_notes` reads the hadm_id, chartdate, charttime, category, description,
# text and icd9_code columns of this frame.
notes = pd.read_csv('/Users/tom/phd/summariser/41401_notes.csv')

In [415]:
# Main discharge summaries only; rows described as 'Addendum' are excluded.
discharge_notes = notes.loc[
    lambda df: df.category.eq('Discharge summary') & df.description.ne('Addendum')
]

In [14]:
# Separator inserted between consecutive care notes when they are concatenated.
splitter = '------\n------\n------\n'

# Column layouts of the two frames returned by `split_notes`.
_CARE_COLUMNS = ['hadm_id', 'text', 'first_time', 'last_time', 'categories', 'descriptions', 'icd_code']
_DISCHARGE_COLUMNS = ['hadm_id', 'text']


def _note_time(note):
    """Return the note's charttime, or its chartdate when charttime is missing."""
    charttime = note.charttime
    # Empty CSV cells load as NaN, and NaN is truthy, so `charttime or chartdate`
    # never falls back. Test for a missing value explicitly instead.
    if pd.isna(charttime) or charttime == '':
        return note.chartdate
    return charttime


def split_notes(notes):
    """Pair each admission's care notes with its discharge summary.

    Notes are ordered by hadm_id, chartdate and charttime, then grouped by
    admission. Admissions without a non-addendum discharge summary, or without
    any non-discharge note, are skipped.

    Parameters
    ----------
    notes : pd.DataFrame
        Must contain hadm_id, chartdate, charttime, category, description,
        text and icd9_code columns.

    Returns
    -------
    (care_notes_df, discharge_notes_df) : tuple of pd.DataFrame
        One row per retained admission, in the same admission order in both.
        care_notes_df holds the care notes joined with `splitter`, the time of
        the first and last care note, the per-note categories and descriptions,
        and the ICD-9 code. discharge_notes_df holds the first discharge
        summary text. Both frames carry their columns even when empty.
    """
    care_notes = {col: [] for col in _CARE_COLUMNS}
    discharge_notes = {col: [] for col in _DISCHARGE_COLUMNS}

    ordered = notes.sort_values(['hadm_id', 'chartdate', 'charttime'])
    for hadm_id, df in ordered.groupby('hadm_id'):
        is_discharge = df.category == 'Discharge summary'
        # Addenda are ignored because it is unclear which section they belong to.
        dis_notes = df[is_discharge & (df.description != 'Addendum')].text.tolist()
        care_df = df[~is_discharge]
        if not dis_notes or care_df.empty:
            continue

        discharge_notes['hadm_id'].append(hadm_id)
        # Take the first summary; admissions rarely have two.
        discharge_notes['text'].append(dis_notes[0])

        care_notes['hadm_id'].append(hadm_id)
        care_notes['text'].append(care_df.text.str.cat(sep=splitter))
        care_notes['first_time'].append(_note_time(care_df.iloc[0]))
        care_notes['last_time'].append(_note_time(care_df.iloc[-1]))
        care_notes['categories'].append(care_df.category.tolist())
        care_notes['descriptions'].append(care_df.description.tolist())
        care_notes['icd_code'].append(care_df.icd9_code.iloc[0])

    care_notes_df = pd.DataFrame(care_notes)
    discharge_notes_df = pd.DataFrame(discharge_notes)
    return care_notes_df, discharge_notes_df

In [18]:
# One row per admission that has both a non-addendum discharge summary and at
# least one other note; the two frames list admissions in the same order.
care_notes_df, discharge_notes_df = split_notes(notes)

In [None]:
# Note categories seen in admissions that have no 'Physician ' note at all.
{
    category
    for admission_categories in care_notes_df.categories
    if 'Physician ' not in admission_categories
    for category in admission_categories
}

In [34]:
def _cat_desc_labels(row):
    """Build 'category:description' labels for one admission's notes (category whitespace stripped)."""
    return [
        f'{category.strip()}:{description}'
        for category, description in zip(row.categories, row.descriptions)
    ]


care_notes_df['cat_descs'] = care_notes_df.apply(_cat_desc_labels, axis=1)

In [416]:
def compute_diff(notes: pd.DataFrame) -> pd.DataFrame:
    """Score consecutive notes of the same type within each admission.

    Notes are ordered by hadm_id, chartdate, charttime, category and
    description, then grouped by (hadm_id, category, description). For every
    group holding more than one note, `diff_ratio` (from the local `differ`
    module) is applied to each pair of neighbouring note texts.

    Returns one row per such group with columns:
    diff_ratios (list of pairwise ratios), avg_diff_ratio (their mean),
    avg_txt_len (mean character length of the group's notes), hadm_id,
    category and description.
    """
    ordered = notes.sort_values(['hadm_id', 'chartdate', 'charttime', 'category', 'description'])
    results = defaultdict(list)
    for key, group in ordered.groupby(['hadm_id', 'category', 'description']):
        # A single note has no neighbour to compare against.
        if group.shape[0] <= 1:
            continue
        texts = group.text.tolist()
        ratios = [diff_ratio(earlier, later) for earlier, later in zip(texts, texts[1:])]
        text_lengths = [len(text) for text in texts]
        hadm_id, category, description = key

        results['diff_ratios'].append(ratios)
        results['avg_diff_ratio'].append(sum(ratios) / len(ratios))
        results['avg_txt_len'].append(sum(text_lengths) / len(text_lengths))
        results['hadm_id'].append(hadm_id)
        results['category'].append(category)
        results['description'].append(description)
    return pd.DataFrame(results)

In [425]:
def compute_avgs(results_df: pd.DataFrame, min_instances: int = 5, top_n: int = 20) -> pd.DataFrame:
    """Summarise redundancy per note type ('category:description').

    Parameters
    ----------
    results_df : pd.DataFrame
        Output of `compute_diff`; the category, description, avg_diff_ratio
        and diff_ratios columns are read.
    min_instances : int, default 5
        Keep only note types with strictly more than this many pairwise
        comparisons.
    top_n : int, default 20
        Maximum number of note types returned.

    Returns
    -------
    pd.DataFrame
        Columns cat_desc, macro_avg, micro_avg, num_instances, sorted by
        num_instances descending. macro_avg is 1 minus the mean of the
        per-group average ratios; micro_avg is 1 minus the mean over all
        pairwise ratios. Note types whose label contains 'Discharge summary'
        are excluded. An empty input yields an empty frame with these columns.
    """
    columns = ['cat_desc', 'macro_avg', 'micro_avg', 'num_instances']
    # An empty result frame has no columns, so grouping it would raise KeyError.
    if results_df.empty:
        return pd.DataFrame(columns=columns)

    records = []
    for (category, description), group in results_df.groupby(['category', 'description']):
        pair_ratios = list(chain.from_iterable(group.diff_ratios))
        records.append({
            'cat_desc': f'{category}:{description}',
            'macro_avg': 1 - sum(group.avg_diff_ratio) / group.avg_diff_ratio.shape[0],
            'micro_avg': 1 - sum(pair_ratios) / len(pair_ratios),
            'num_instances': len(pair_ratios),
        })
    if not records:
        return pd.DataFrame(columns=columns)

    group_avgs = pd.DataFrame(records, columns=columns)
    is_discharge = group_avgs.cat_desc.str.contains('Discharge summary')
    group_avgs = group_avgs[~is_discharge & (group_avgs['num_instances'] > min_instances)]
    return (
        group_avgs
        .sort_values('num_instances', ascending=False)
        .reset_index(drop=True)
        .head(top_n)
    )

In [401]:
# Overall redundancy = 1 - mean of the per-group average diff ratios.
# NOTE(review): `results_df` must already exist when this cell runs; in this notebook
# it is only assigned in a later cell, so run that cell first.
print('Token level / global note redundancy with concurrent notes:')
print(f'Overall Average Internote type redundancy:{1 - (results_df.avg_diff_ratio.sum() / results_df.avg_diff_ratio.shape[0])}')

Token level / global note redundancy with concurrent notes:
Overall Average Internote Redundnacy:0.918539924817974


In [364]:
def show_diffs(a, b):
    """Display the diff between texts `a` and `b` as rendered HTML in the cell output.

    The markup comes from `html_diffs` in the local `differ` module
    (presumably an HTML string - confirm against that module).
    """
    # The notebook's import cell brings in `display` but not `HTML`, so calling
    # this on a fresh kernel raised NameError. Import it where it is used.
    from IPython.display import HTML

    display(HTML(html_diffs(a, b)))

### 43411 Note Analysis

In [419]:
# Notes for the 43411 analysis (presumably the same schema as the 41401 notes - confirm).
# `compute_diff` reads hadm_id, chartdate, charttime, category, description and text.
stroke_notes = pd.read_csv('/Users/tom/phd/summariser/43411_notes.csv')

In [420]:
%%time
# Pairwise diff ratios between consecutive same-type notes per admission,
# then the per note-type summary. The recorded run took about 44 s wall time.
results_df = compute_diff(stroke_notes)
group_avgs = compute_avgs(results_df)

CPU times: user 44.2 s, sys: 197 ms, total: 44.4 s
Wall time: 44.5 s


In [426]:
# compute_avgs already sorts by num_instances (descending), resets the index and
# keeps at most 20 rows, so this chain re-applies the same ordering for display.
group_avgs.sort_values('num_instances', ascending=False).reset_index(drop=True).head(20)

Unnamed: 0,cat_desc,macro_avg,micro_avg,num_instances
0,Nursing/other:Report,0.98204,0.97995,1889
1,Radiology:CHEST (PORTABLE AP),0.932936,0.93271,837
2,Nursing:Nursing Progress Note,0.897988,0.911357,620
3,ECG:Report,0.848381,0.844022,512
4,Radiology:CT HEAD W/O CONTRAST,0.944593,0.944366,373
5,Physician :Intensivist Note,0.943441,0.945627,211
6,Nursing:Nursing Transfer Note,0.934466,0.939193,159
7,Physician :Physician Resident Progress Note,0.946314,0.940744,94
8,Respiratory :Respiratory Care Shift Note,0.74339,0.710084,79
9,Echo:Report,0.978231,0.978041,77


In [None]:
# Pull the "Brief Hospital Course" section out of one discharge summary.
# NOTE(review): `idx` is not assigned anywhere in the visible notebook - set it to the
# row position of the summary to inspect before running this cell.
ptn = re.compile(r'(?<=brief\shospital\scourse\:)(?:.*\n)+?(?=\n\n)', flags=re.I)
summary_text = discharge_notes_df.iloc[idx].text
m = ptn.search(summary_text)
if m is None:
    # A summary without this heading used to crash on `m.start()`; report None instead.
    hosp_course = None
else:
    # Keep only the text before the first blank line that is followed by a line
    # containing a colon (treated here as the next section heading).
    hosp_course = re.split(r'(\n\n.*\:\n?)', m.group(0))[0]
print(hosp_course)