In [1]:
%load_ext autoreload

%autoreload 2

from icd9cms import search
import os
import numpy as np
import pandas as pd
from collections import defaultdict
import json
from itertools import chain
import ipywidgets as widgets
from IPython.display import display
import re
from collections import Counter
from collections import defaultdict
import difflib
from differ import html_diffs, align_seqs, sentencize, diff_ratio, show_diffs
from datasets import list_metrics, load_metric

In [7]:
codes = pd.read_csv('../data/primary_codes.csv')
sorted_primary_codes_df =codes['icd9_code'].value_counts().rename_axis('code').reset_index()
sorted_primary_codes_df['long_title'] = sorted_primary_codes_df['code'].apply(lambda c: search(c).long_desc or search(c).short_desc)
sorted_primary_codes_df.columns = ['code', 'count', 'long_title']
sorted_primary_codes_df = sorted_primary_codes_df[~sorted_primary_codes_df.code.str.startswith('V3')]

In [8]:
sorted_primary_codes_df[sorted_primary_codes_df.code.str.startswith('41401')]

Unnamed: 0,code,count,long_title
1,41401,3498,Coronary atherosclerosis of native coronary ar...


In [9]:
sorted_primary_codes_df[sorted_primary_codes_df.code.str.startswith('41')].head(5)

Unnamed: 0,code,count,long_title
1,41401,3498,Coronary atherosclerosis of native coronary ar...
4,41071,1751,"Subendocardial infarction, initial episode of ..."
15,41041,482,Acute myocardial infarction of other inferior ...
16,41011,482,Acute myocardial infarction of other anterior ...
20,41519,343,Other pulmonary embolism and infarction


In [10]:
sorted_primary_codes_df[sorted_primary_codes_df.code.str.startswith('434')].long_title.tolist()

['Cerebral embolism with cerebral infarction',
 'Cerebral artery occlusion, unspecified with cerebral infarction',
 'Cerebral thrombosis with cerebral infarction',
 'Cerebral artery occlusion, unspecified without mention of cerebral infarction',
 'Cerebral embolism without mention of cerebral infarction']

### 41401 Notes

In [30]:
notes = pd.read_csv('../data/41401_notes.csv')

In [31]:
discharge_notes = notes[(notes.category == 'Discharge summary') & (notes.description != 'Addendum')]

In [11]:
splitter = '------\n------\n------\n'
def split_notes(notes):
    care_notes = defaultdict(list)
    discharge_notes = defaultdict(list)
    for adm, df in notes.sort_values(['hadm_id', 'chartdate', 'charttime']).groupby('hadm_id'):
        dis_notes = df[(df.category == 'Discharge summary') & (df.description != 'Addendum')].text.tolist()
        care_df = df[df.category != 'Discharge summary']
        if len(dis_notes) == 0 or care_df.shape[0] == 0:
            continue

        hadm_id = care_df.hadm_id.iloc[0]
        # pick first as there are rarely 2 summaries??

        discharge_notes['hadm_id'].append(hadm_id)
        discharge_notes['text'].append(dis_notes[0])
        # ignore addendums - as not sure what section they are in.

        care_notes['hadm_id'].append(hadm_id)
        care_notes['text'].append(care_df.text.str.cat(sep=splitter))
        care_notes['first_time'].append(care_df.iloc[0].charttime or care_df.iloc[0].chartdate)
        care_notes['last_time'].append(care_df.iloc[-1].charttime or care_df.iloc[-1].chartdate)
        care_notes['categories'].append(care_df.category.tolist())
        care_notes['descriptions'].append(care_df.description.tolist())
        care_notes['icd_code'].append(care_df.icd9_code.iloc[0])
    care_notes_df = pd.DataFrame(care_notes)
    discharge_notes_df = pd.DataFrame(discharge_notes)
    return care_notes_df, discharge_notes_df

In [33]:
care_notes_df, discharge_notes_df = split_notes(notes)

In [34]:
care_notes_df['cat_descs'] = care_notes_df.apply(lambda r: [(f'{c.strip()}:{d}') for c,d in zip(r.categories, r.descriptions)], axis=1)

In [464]:
def compute_diff(notes: pd.DataFrame) -> pd.DataFrame:
    internote_sort_cols = ['hadm_id', 'chartdate', 'charttime', 'category', 'description']
    internote_group_cols = ['hadm_id', 'category', 'description']
    results = defaultdict(list)
    rouge_metric = load_metric('rouge')
    bert_metric = load_metric('bertscore')
    for k, g in notes.groupby(internote_group_cols):
        if g.shape[0] > 1:
            ratios, std_devs, max_ratios, min_ratios, redundant_toks, total_tokens = [], [], [], [], [], []
            for i in range(len(g.text.tolist()) - 1):
                ratio, std_dev, max_ratio, min_ratio, redundant_toks_len, total_toks = \
                    diff_ratio(g.text.iloc[i], g.text.iloc[i+1])
                rouge_metric.add(prediction=g.text.iloc[i+1], reference=g.text.iloc[i])
                bert_metric.add(prediction=g.text.iloc[i+1], reference=g.text.iloc[i])
                ratios.append(ratio)
                std_devs.append(std_dev)
                max_ratios.append(max_ratio)
                min_ratios.append(min_ratio)
                redundant_toks.append(redundant_toks_len)
                total_tokens.append(total_toks)
            results['diff_ratios'].append(ratios)
            results['max_ratios'].append(max(max_ratios))
            results['min_ratios'].append(min(min_ratios))
            results['redundant_toks'].append(sum(redundant_toks))
            results['total_tokens'].append(sum(total_tokens))
            results['avg_diff_ratio'].append(sum(ratios) / len(ratios))
            txt_lens = g.text.apply(len)
            results['avg_txt_len'].append(sum(txt_lens) / len(txt_lens))
            results['hadm_id'].append(k[0])
            results['category'].append(k[1])
            results['description'].append(k[2])
            # compute batched summarisation metrics
            bert_scores = bert_metric.compute(lang='en', rescale_with_baseline=True, model_type='xlnet-base-cased')
            rougeL = rouge_metric.compute(rouge_types=['rougeL'], use_agregator=False)['rougeL']
            _compute_rouge_stats(rougeL, 'recall', results)
            _compute_rouge_stats(rougeL, 'precision', results)
            _compute_bert_score_stats(bert_scores, 'recall', results)
            _compute_bert_score_stats(bert_scores, 'precision', results)
        
    return pd.DataFrame(results)

def _compute_rouge_stats(scores: list, prop: str, results: dict):
    measure = [getattr(l, prop) for l in scores]
    results[f'rg_{prop}'].append(measure)
    results[f'rg_{prop}_avg'].append(np.average(measure))
    results[f'rg_{prop}_med'].append(np.median(measure))
    results[f'rg_{prop}_iqr'].append(np.subtract(*np.percentile(measure, [75, 25])))

def _compute_bert_score_stats(scores: list, prop: str, results):
    results[f'bs_{prop}'].append(scores[prop])
    results[f'bs_{prop}_avg'].append(np.average(scores[prop]))
    results[f'bs_{prop}_med'].append(np.median(scores[prop]))
    results[f'bs_{prop}_iqr'].append(np.subtract(*np.percentile(scores[prop], [75, 25])))

In [375]:
results_df = compute_diff(stroke_notes_subset)

2021-01-14 15:04:44,948: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-a839b99b-ad50-4d10-a8b1-63d145f84ea3-1-0.arrow
2021-01-14 15:04:46,719: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-55ceaab2-48fa-417f-8d2b-a18a718820e6-1-0.arrow
2021-01-14 15:04:47,876: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-bc20af4b-cec7-48eb-9299-629cce5d949d-1-0.arrow
2021-01-14 15:04:48,996: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-36af7843-e5bb-4bfd-b85c-805316fbd13a-1-0.arrow
2021-01-14 15:04:51,947: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-128a36c6-5907-4e4d-a455-ff146f203507-1-0.arrow
2021-01-14 15:04:52,986: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-377dc43b-559f-4677-b8ce-1087238020fc-1-0.arrow
2021-01-14 15:04:54,083: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/de

In [386]:
results_df = pd.concat([rougeL, results_df.loc[:, [c for c in results_df.columns if 'rougeL' in c]]], axis=1)

In [399]:
scores = results_df.loc[:, ['category', 'description', 'avg_diff_ratio', 'rougeL_recall_med', 'rougeL_recall_iqr', 'rougeL_precision_med', 'rougeL_precision_iqr', 
                            'bert-score_recall_med', 'bert-score_recall_iqr', 'bert-score_precision_med', 'bert-score_precision_iqr']]

In [400]:
scores.sort_values(['rougeL_recall_med', 'rougeL_precision_med'], ascending=[False, True])

Unnamed: 0,category,description,avg_diff_ratio,rougeL_recall_med,rougeL_recall_iqr,rougeL_precision_med,rougeL_precision_iqr,bert-score_recall_med,bert-score_recall_iqr,bert-score_precision_med,bert-score_precision_iqr
2,Nursing,Nursing Transfer Note,0.872569,0.997854,0.0,0.621658,0.0,0.982755,0.0,0.744808,0.0
3,Nutrition,Clinical Nutrition Note,0.154612,0.921967,0.323136,0.523822,0.13838,0.916548,0.203272,0.601892,0.248073
4,Physician,Intensivist Note,0.234087,0.651907,0.127212,0.692593,0.11877,0.857477,0.060351,0.853575,0.046447
1,Nursing,Nursing Progress Note,0.440298,0.632599,0.675619,0.666618,0.682953,0.701072,0.424521,0.730515,0.501814
5,Radiology,CHEST (PORTABLE AP),0.08883,0.417902,0.02135,0.54094,0.270825,0.791767,0.009032,0.827279,0.074302
6,Radiology,CT HEAD W/O CONTRAST,0.088893,0.411212,0.231364,0.399956,0.201956,0.75525,0.107351,0.77263,0.083226
7,Social Work,Social Work Progress Note,0.073434,0.383871,0.0,0.162568,0.0,0.702947,0.0,0.552549,0.0
0,ECG,Report,0.186325,0.331579,0.167044,0.23164,0.15938,0.569639,0.144755,0.42831,0.057963


In [402]:
scores.sort_values(['bert-score_recall_med', 'bert-score_precision_med'], ascending=[False, True])

Unnamed: 0,category,description,avg_diff_ratio,rougeL_recall_med,rougeL_recall_iqr,rougeL_precision_med,rougeL_precision_iqr,bert-score_recall_med,bert-score_recall_iqr,bert-score_precision_med,bert-score_precision_iqr
2,Nursing,Nursing Transfer Note,0.872569,0.997854,0.0,0.621658,0.0,0.982755,0.0,0.744808,0.0
3,Nutrition,Clinical Nutrition Note,0.154612,0.921967,0.323136,0.523822,0.13838,0.916548,0.203272,0.601892,0.248073
4,Physician,Intensivist Note,0.234087,0.651907,0.127212,0.692593,0.11877,0.857477,0.060351,0.853575,0.046447
5,Radiology,CHEST (PORTABLE AP),0.08883,0.417902,0.02135,0.54094,0.270825,0.791767,0.009032,0.827279,0.074302
6,Radiology,CT HEAD W/O CONTRAST,0.088893,0.411212,0.231364,0.399956,0.201956,0.75525,0.107351,0.77263,0.083226
7,Social Work,Social Work Progress Note,0.073434,0.383871,0.0,0.162568,0.0,0.702947,0.0,0.552549,0.0
1,Nursing,Nursing Progress Note,0.440298,0.632599,0.675619,0.666618,0.682953,0.701072,0.424521,0.730515,0.501814
0,ECG,Report,0.186325,0.331579,0.167044,0.23164,0.15938,0.569639,0.144755,0.42831,0.057963


In [424]:
category = 'Radiology'
description = 'CT HEAD W/O CONTRAST'
idx = 0

In [425]:
texts = stroke_notes_subset[(stroke_notes_subset.category == category) & (stroke_notes_subset.description == description)].text.tolist()
show_diffs(texts[idx], texts[idx+1])

In [465]:
more_notes_results_df = compute_diff(stroke_notes[stroke_notes.hadm_id.isin([195402, 152943])])

2021-01-14 17:31:27,389: Removing /Users/tom/.cache/huggingface/metrics/bert_score/default/default_experiment-4a37d248-f29f-4b4b-90b8-2e8f61749670-1-0.arrow
2021-01-14 17:31:27,786: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-5e39592f-84e2-4b87-8095-aacd8efe0b83-1-0.arrow
2021-01-14 17:32:18,938: Removing /Users/tom/.cache/huggingface/metrics/bert_score/default/default_experiment-64161000-833a-4b38-b4e1-12e8a8ac58d9-1-0.arrow
2021-01-14 17:32:19,383: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-0dbbd235-4e69-4d2e-ac82-a88e7b474083-1-0.arrow
2021-01-14 17:32:27,764: Removing /Users/tom/.cache/huggingface/metrics/bert_score/default/default_experiment-c052fe8e-ca0d-4640-93d3-394a83f72a73-1-0.arrow
2021-01-14 17:32:27,965: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-3d921f76-5fdd-4cc1-ba43-dc45e9dc591c-1-0.arrow
2021-01-14 17:50:44,340: Removing /Users/tom/.cache/huggingface/metrics/b

In [None]:
more_notes_results_df

In [None]:
# sort and re-rank, set of documents, extract phenotypes to test how excluding 
# the set of redundant document effects phenotype extraction??

In [154]:
stroke_notes_subset = stroke_notes[stroke_notes.hadm_id == 195402]

In [221]:
list(stroke_notes_subset.groupby(['hadm_id', 'category', 'description']))[2][1].text

7139    Atrial fibrillation.  Predominantly ventricula...
7140    Atrial fibrillation with a rapid ventricular r...
7141    Atrial fibrillation with variable ventricular ...
7142    Compared to the previous tracing ventricular f...
7143    Probable atrial fibrillation with semi-regular...
Name: text, dtype: object

In [251]:
sample_diffs = compute_diff(stroke_notes_subset)

2021-01-14 11:32:18,040: Removing /Users/tom/.cache/huggingface/metrics/bert_score/default/default_experiment-1-0.arrow
2021-01-14 11:32:18,047: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
2021-01-14 11:33:13,206: Removing /Users/tom/.cache/huggingface/metrics/bert_score/default/default_experiment-1-0.arrow
2021-01-14 11:33:13,970: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
2021-01-14 11:33:21,868: Removing /Users/tom/.cache/huggingface/metrics/bert_score/default/default_experiment-1-0.arrow
2021-01-14 11:33:22,022: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
2021-01-14 11:33:34,371: Removing /Users/tom/.cache/huggingface/metrics/bert_score/default/default_experiment-1-0.arrow
2021-01-14 11:33:34,490: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
2021-01-14 11:36:11,716: Removing /Users/tom/.cache/huggingf

In [None]:
# plots between max / min... as things progress... - heavy bias between how the clinician records the notes..
# 

In [439]:
more_notes_results_df.columns

Index(['diff_ratios', 'max_ratios', 'min_ratios', 'redundant_toks',
       'total_tokens', 'avg_diff_ratio', 'avg_txt_len', 'hadm_id', 'category',
       'description', 'rg_recall', 'rg_recall_avg', 'rg_recall_med',
       'rg_recall_iqr', 'rg_precision', 'rg_precision_avg', 'rg_precision_med',
       'rg_precision_iqr'],
      dtype='object')

In [None]:
list(chain.from_iterable(more_notes_results_df.rg_recall))

In [478]:
def compute_avgs(results_df: pd.DataFrame) -> pd.DataFrame:
    cat_desc_avg = defaultdict(list)
    for k, df in results_df.groupby(['category', 'description']):
        cat_desc_avg['cat_desc'].append(f'{k[0]}:{k[1]}')
        cat_desc_avg['redundant_toks'].append(sum(df.redundant_toks))
        cat_desc_avg['total_toks'].append(sum(df.total_tokens))
        cat_desc_avg['avg_txt_len'].append(np.average(df.avg_txt_len))
        cat_desc_avg['macro_avg'].append(np.average(df.avg_diff_ratio))
        
        d_r = list(chain.from_iterable(df.diff_ratios))
        cat_desc_avg['micro_avg'].append(np.average(d_r))
        cat_desc_avg['num_instances'].append(len(d_r))
        # micro avgs of median / iqr
        rg_rec = list(chain.from_iterable(df.rg_recall))
        rg_prec = list(chain.from_iterable(df.rg_precision))
        cat_desc_avg['rg_rec_avg'].append(np.average(rg_rec))
        cat_desc_avg['rg_rec_med'].append(np.median(rg_rec))
        cat_desc_avg['rg_rec_iqr'].append(np.subtract(*np.percentile(rg_rec, [75, 25])))
        cat_desc_avg['rg_prec_avg'].append(np.average(rg_prec))
        cat_desc_avg['rg_prec_med'].append(np.median(rg_prec))
        cat_desc_avg['rg_prec_iqr'].append(np.subtract(*np.percentile(rg_prec, [75, 25])))
        
        bs_rec = list(chain.from_iterable(df.bs_recall))
        bs_prec = list(chain.from_iterable(df.bs_precision))
        cat_desc_avg['bs_rec_avg'].append(np.average(bs_rec))
        cat_desc_avg['bs_rec_med'].append(np.median(bs_rec))
        cat_desc_avg['bs_rec_iqr'].append(np.subtract(*np.percentile(bs_rec, [75, 25])))
        cat_desc_avg['bs_prec_avg'].append(np.average(bs_prec))
        cat_desc_avg['bs_prec_med'].append(np.median(bs_prec))
        cat_desc_avg['bs_prec_iqr'].append(np.subtract(*np.percentile(bs_prec, [75, 25])))
    group_avgs = pd.DataFrame(cat_desc_avg)
    group_avgs = group_avgs[~group_avgs.cat_desc.str.contains('Discharge summary')]
    group_avgs = group_avgs[group_avgs['num_instances'] > 5]
    group_avgs = group_avgs.sort_values('num_instances', ascending=False).reset_index(drop=True).head(20)
    return group_avgs

In [479]:
avgs = compute_avgs(more_notes_results_df)

In [482]:
avgs

Unnamed: 0,cat_desc,redundant_toks,total_toks,avg_txt_len,macro_avg,micro_avg,num_instances,rg_rec_avg,rg_rec_med,rg_rec_iqr,rg_prec_avg,rg_prec_med,rg_prec_iqr,bs_rec_avg,bs_rec_med,bs_rec_iqr,bs_prec_avg,bs_prec_med,bs_prec_iqr
0,Nursing:Nursing Progress Note,3083.0,10644,1532.342857,0.260937,0.227994,49,0.345551,0.183099,0.453135,0.348121,0.1875,0.543152,0.496873,0.415269,0.325653,0.494258,0.407386,0.30074
1,Physician :Intensivist Note,2268.0,19917,6768.888889,0.18833,0.169265,24,0.594225,0.59326,0.10183,0.59755,0.601893,0.10616,0.856726,0.865678,0.039429,0.858064,0.862904,0.042023
2,Radiology:CT HEAD W/O CONTRAST,202.0,3299,1934.0,0.085815,0.084276,16,0.400569,0.36734,0.179989,0.471806,0.399956,0.531606,0.732597,0.752343,0.096644,0.739952,0.76899,0.193871
3,Nursing:Nursing Transfer Note,1780.0,3085,3215.5,0.618746,0.421328,9,0.725352,0.997854,0.470588,0.545839,0.612613,0.374493,0.701177,0.963624,0.418861,0.572983,0.604204,0.20379
4,Nutrition:Clinical Nutrition Note,504.0,1656,2180.266667,0.340495,0.278534,6,0.723751,0.921967,0.430212,0.591569,0.553689,0.121907,0.789677,0.916548,0.14537,0.708747,0.741177,0.260565


In [495]:
avgs.loc[:, ['cat_desc', 'micro_avg', 'rg_rec_avg', 'rg_prec_avg', 'bs_rec_avg', 'bs_prec_avg']]

Unnamed: 0,cat_desc,micro_avg,rg_rec_avg,rg_prec_avg,bs_rec_avg,bs_prec_avg
0,Nursing:Nursing Progress Note,0.227994,0.345551,0.348121,0.496873,0.494258
1,Physician :Intensivist Note,0.169265,0.594225,0.59755,0.856726,0.858064
2,Radiology:CT HEAD W/O CONTRAST,0.084276,0.400569,0.471806,0.732597,0.739952
3,Nursing:Nursing Transfer Note,0.421328,0.725352,0.545839,0.701177,0.572983
4,Nutrition:Clinical Nutrition Note,0.278534,0.723751,0.591569,0.789677,0.708747


### Compare Discharge Summary to last note types via Summarisation Metrics
- recall will likely be low, but any non-zero ness is potentially interesting
- do neural methods work at all here??

In [None]:
# only take last note...? Or take all notes of each category
#

In [500]:
rouge_metric.add(prediction='foo bar baz', reference='foo')

In [501]:
rouge_metric.add(prediction='foo bar baz', reference='foo')

In [502]:
rouge_metric.compute(rouge_types=['rougeL'], use_agregator=False)['rougeL']

2021-01-15 00:19:26,721: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow


[Score(precision=0.3333333333333333, recall=1.0, fmeasure=0.5),
 Score(precision=0.3333333333333333, recall=1.0, fmeasure=0.5)]

In [519]:
def calc_diff_metrics(ds: str, prev_notes: dict):
    # calc diff between 
    rouge_metric = load_metric('rouge')
    bert_metric = load_metric('bertscore')
    output = defaultdict(list)
    for k, note_text in prev_notes.items():
        ratio, std_dev, max_ratio, min_ratio, redundant_toks_len, total_toks = diff_ratio(ds, note_text)
        output['ratio'].append(ratio)
        output['cat_desc'].append(':'.join(k))
        rouge_metric.add(prediction=ds, reference=note_text)
        bert_metric.add(prediction=ds, reference=note_text)
#     bert_scores = bert_metric.compute(lang='en', rescale_with_baseline=True, model_type='xlnet-base-cased')
#     output['bs_prec'].extend(bert_scores.precision)
#     output['bs_rec'].extend(bert_scores.recall)
    rougeL = rouge_metric.compute(rouge_types=['rougeL'], use_agregator=False)['rougeL']
    output['rg_prec'].extend([s.precision for s in rougeL])
    output['rg_rec'].extend([s.recall for s in rougeL])
    return pd.DataFrame(output)
    
def calc_discharge_summary_redundancy(notes: pd.DataFrame):
    summaries = notes[notes.category == 'Discharge summary'].loc[:, ['hadm_id', 'text']]
    ds_diffs = []
    for r in summaries.itertuples():
        hadm_notes = notes[(notes.hadm_id == r.hadm_id) & (notes.category != 'Discharge summary')]
        last_notes_per_type = {}
        for k, df_g in hadm_notes.groupby(['category', 'description']):
            last_notes_per_type[':'.join(k)] = df_g.text.iloc[-1]
        ds_diffs.append(calc_diff_metrics(r.text, last_notes_per_type))
    return pd.concat(ds_diffs)

In [518]:
calc_discharge_summary_redundancy(stroke_notes_subset)

2021-01-15 00:24:18,249: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow
2021-01-15 00:24:21,168: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow


In [488]:
stroke_notes[stroke_notes.category == 'Discharge summary']

Unnamed: 0,row_id,subject_id,hadm_id,chartdate,charttime,storetime,category,description,cgid,iserror,text,row_id.1,subject_id.1,hadm_id.1,seq_num,icd9_code
6306,754,62884,112885,2100-08-12 00:00:00,,,Discharge summary,Report,,,Admission Date: [**2100-7-16**] ...,484345,62884,112885,1,43411
6307,493,24650,134784,2149-11-20 00:00:00,,,Discharge summary,Report,,,Admission Date: [**2149-11-5**] ...,275122,24650,134784,1,43411
6308,262,53876,180979,2164-03-27 00:00:00,,,Discharge summary,Report,,,Admission Date: [**2164-3-19**] ...,442902,53876,180979,1,43411
6309,602,25433,144885,2186-01-04 00:00:00,,,Discharge summary,Report,,,Admission Date: [**2185-12-22**] Discha...,285019,25433,144885,1,43411
6310,657,26203,186592,2152-12-02 00:00:00,,,Discharge summary,Report,,,Admission Date: [**2152-11-25**] ...,293426,26203,186592,1,43411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6668,58693,29666,105336,2106-04-06 00:00:00,,,Discharge summary,Addendum,,,"Name: [**Known lastname **],[**Known firstnam...",339754,29666,105336,1,43411
6669,59041,86516,193789,2148-11-26 00:00:00,,,Discharge summary,Addendum,,,"Name: [**Known lastname 339**],[**Known first...",590278,86516,193789,1,43411
6670,59375,30882,132521,2105-09-18 00:00:00,,,Discharge summary,Addendum,,,"Name: [**Known lastname 17498**],[**Known fir...",355278,30882,132521,1,43411
6671,59319,72017,129643,2155-08-25 00:00:00,,,Discharge summary,Addendum,,,"Name: [**Known lastname 4755**],[**Known firs...",525982,72017,129643,1,43411


In [38]:
print('Token level / global note redundancy with concurrent notes:')
print(f'Overall Average Internote type redundnacy:{results_df.avg_diff_ratio.sum() / results_df.avg_diff_ratio.shape[0]}')

Token level / global note redundancy with concurrent notes:


NameError: name 'results_df' is not defined

### 43411 Note Analysis

In [6]:
stroke_notes = pd.read_csv('../data/43411_notes.csv')

In [17]:
stroke_care_notes_df, stroke_discharge_df = split_notes(stroke_notes)

In [18]:
stroke_notes_subset = stroke_notes[stroke_notes.hadm_id.isin(stroke_notes.hadm_id.unique()[0:10])]

In [None]:
results_df = compute_diff(stroke_notes)

In [3]:
results_df = pd.read_pickle('temp_results.pickle')

In [79]:
stroke_notes[(stroke_notes.hadm_id == 100349) & (stroke_notes.category == 'ECG')]

Unnamed: 0,row_id,subject_id,hadm_id,chartdate,charttime,storetime,category,description,cgid,iserror,text,row_id.1,subject_id.1,hadm_id.1,seq_num,icd9_code
7466,238674,94135,100349,2114-07-10 00:00:00,,,ECG,Report,,,Sinus bradycardia. Since the previous tracing...,623980,94135,100349,1,43411
7467,238675,94135,100349,2114-07-04 00:00:00,,,ECG,Report,,,Sinus bradycardia with sinus arrhythmia. Other...,623980,94135,100349,1,43411


In [89]:
results_df.iloc[1].rougeL['rougeL'].mid

Score(precision=0.42266940560951455, recall=0.38509458229752874, fmeasure=0.397148908439231)

In [95]:
results_df

Unnamed: 0,rougeL,diff_ratios,max_ratios,min_ratios,redundant_toks,total_tokens,avg_diff_ratio,avg_txt_len,hadm_id,category,description,rougeL_rec,rougeL_prec
0,"{'rougeL': ((0.3076923076923077, 0.28571428571...",[0.43452380952380953],1.0,0.0,4.0,15,0.434524,102.000000,100349,ECG,Report,0.285714,0.307692
1,"{'rougeL': ((0.3103448275862069, 0.30344827586...","[0.19021853192883126, 0.08999497234791352, 0.2...",1.0,0.0,109.0,359,0.192468,1010.000000,100349,Radiology,CHEST (PORTABLE AP),0.385095,0.422669
2,"{'rougeL': ((0.29992194969995634, 0.2885761183...","[0.21624302874302875, 0.06432178932178932, 0.0...",1.0,0.0,122.0,1172,0.133209,1809.166667,100349,Radiology,CT HEAD W/O CONTRAST,0.384382,0.560188
3,"{'rougeL': ((0.20408163265306123, 0.5882352941...",[0.23611111111111113],1.0,0.0,6.0,22,0.236111,222.000000,100662,ECG,Report,0.588235,0.204082
4,"{'rougeL': ((0.10173527879103708, 0.1150165016...","[0.052083333333333336, 0.041666666666666664, 0...",1.0,0.0,9.0,609,0.061439,906.000000,100662,Nursing/other,Report,0.142737,0.133319
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,"{'rougeL': ((0.43902439024390244, 0.3170731707...","[0.8125, 0.23214285714285715, 0.14285714285714...",1.0,0.0,21.0,79,0.395833,182.500000,199382,ECG,Report,0.620529,0.558781
1131,"{'rougeL': ((0.5216049382716049, 0.36034115138...",[0.14557819934011518],1.0,0.0,54.0,457,0.145578,2605.500000,199382,Echo,Report,0.360341,0.521605
1132,"{'rougeL': ((0.11400334927566057, 0.1001966695...","[0.06666666666666667, 0.10743801652892562, 0.1...",1.0,0.0,110.0,4449,0.092307,706.729730,199382,Nursing/other,Report,0.126103,0.139933
1133,"{'rougeL': ((0.18218623481781376, 0.2834008097...","[0.13262222222222222, 0.11997617469486797]",1.0,0.0,50.0,333,0.126299,1430.666667,199382,Radiology,BY DIFFERENT PHYSICIAN,0.368973,0.276278


In [10]:
comp_metrics = results_df.loc[:, ['hadm_id', 'category', 'description', 'avg_diff_ratio', 'rougeL_rec',  'rougeL_prec']]

In [11]:
comp_metrics.sort_values('rougeL_rec', ascending=False)

Unnamed: 0,hadm_id,category,description,avg_diff_ratio,rougeL_rec,rougeL_prec
247,121824,Physician,Physician Resident Admission Note,0.853598,1.000000,0.769582
98,107913,Physician,Physician Surgical Admission Note,0.996592,1.000000,0.980357
860,180112,Nursing,Nursing Transfer Note,0.932375,1.000000,0.420744
450,140761,Physician,Physician Surgical Admission Note,0.999477,1.000000,0.996753
586,154473,General,Cardiology consult,0.904114,1.000000,0.949074
...,...,...,...,...,...,...
1079,195985,Nursing/other,Report,0.035890,0.073285,0.183257
889,182153,Nursing/other,Report,0.036163,0.066773,0.135223
799,171903,Nursing/other,Report,0.028571,0.060440,0.180328
122,109832,Nursing,Nursing Transfer Note,0.037115,0.049407,0.806452


In [22]:
stroke_notes.category.unique()

array(['Nursing', 'Physician ', 'Rehab Services', 'Nutrition',
       'Respiratory ', 'Social Work', 'General', 'Radiology',
       'Nursing/other', 'Discharge summary', 'Echo', 'ECG', 'Pharmacy',
       'Case Management '], dtype=object)

In [30]:
texts = stroke_notes[(stroke_notes.hadm_id == 121824) & (stroke_notes.category == 'Physician ') & (stroke_notes.description == 'Physician Resident Admission Note')].text.tolist()

In [45]:
group_avgs = compute_avgs(results_df)

In [46]:
group_avgs['redundant_ratio'] = group_avgs['redundant_toks'] / group_avgs.total_toks

In [47]:
group_avgs.sort_values('redundant_ratio', ascending=False).reset_index(drop=True).head(20)

Unnamed: 0,cat_desc,redundant_toks,total_toks,avg_txt_len,macro_avg,micro_avg,num_instances,redundant_ratio
0,Physician :Physician Resident Admission Note,28842.0,33109,12525.819048,0.856518,0.809448,19,0.871123
1,Physician :Physician Surgical Progress Note,3181.0,4347,5587.0,0.809635,0.764214,7,0.731769
2,Physician :Physician Attending Progress Note,14380.0,21148,5941.186706,0.532457,0.536417,31,0.67997
3,Physician :Intensivist Note,100737.0,159909,5958.690231,0.691003,0.639584,211,0.629965
4,Respiratory :Respiratory Care Shift Note,7263.0,12124,1230.545915,0.404803,0.443782,79,0.59906
5,Nursing:Nursing Transfer Note,30799.0,53260,2230.841242,0.598621,0.550831,159,0.578276
6,Physician :Physician Resident Progress Note,40675.0,73517,6890.886364,0.551072,0.467104,94,0.553273
7,Nursing:Nursing Progress Note,64447.0,136297,1291.039215,0.442277,0.419754,620,0.472842
8,Radiology:PORTABLE ABDOMEN,923.0,2992,1084.420513,0.27408,0.259291,23,0.308489
9,Radiology:CHEST PORT. LINE PLACEMENT,1237.0,4903,1240.14386,0.203611,0.219498,34,0.252295


In [91]:
pt = stroke_notes[stroke_notes.hadm_id == 195402]

2021-01-07 10:30:29,573: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow


In [100]:
pt_nurs_notes = pt[pt.description == 'Nursing Progress Note']

In [125]:
def single_group_ratios(pt):
    internote_sort_cols = ['hadm_id', 'chartdate', 'charttime', 'category', 'description']
    internote_group_keys = ['hadm_id', 'category', 'description']
    groups_gen = pt.sort_values(internote_sort_cols).groupby(internote_group_keys)
    results = defaultdict(list)
    metric = load_metric('rouge')
    score = None
    ratios = []
    for k, g in [list(groups_gen)[-1]]:
        if g.shape[0] > 1:
            for i in range(len(g.text.tolist()) - 1):
                metric.add(prediction=g.text.iloc[i], reference=g.text.iloc[i+1])
                ratio, std_dev, max_ratio, min_ratio, redundant_toks_len, total_toks = \
                    diff_ratio(g.text.iloc[i], g.text.iloc[i+1])
                ratios.append(ratio)
            score = metric.compute(rouge_types=['rougeL'])
    return ratios, score

In [126]:
ratios, rouge_scores = single_group_ratios(pt_nurs_notes)

2021-01-07 11:32:03,220: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow


TypeError: index() takes at least 1 argument (0 given)

In [140]:
metric.add(prediction=sent_a, reference=sent_b, 

NameError: name 'sent_a' is not defined

TypeError: _compute() got an unexpected keyword argument 'model'

In [108]:
sum(ratios) / len(ratios)

0.5489322665281713

In [109]:
single_group_ratios(pt)

2021-01-07 10:48:45,551: Removing /Users/tom/.cache/huggingface/metrics/rouge/default/default_experiment-1-0.arrow


([0.07447204531062934],
 {'rouge1': AggregateScore(low=Score(precision=0.29508196721311475, recall=0.6967741935483871, fmeasure=0.4145873320537428), mid=Score(precision=0.29508196721311475, recall=0.6967741935483871, fmeasure=0.4145873320537428), high=Score(precision=0.29508196721311475, recall=0.6967741935483871, fmeasure=0.4145873320537428)),
  'rouge2': AggregateScore(low=Score(precision=0.12722298221614228, recall=0.30097087378640774, fmeasure=0.17884615384615388), mid=Score(precision=0.12722298221614228, recall=0.30097087378640774, fmeasure=0.17884615384615388), high=Score(precision=0.12722298221614228, recall=0.30097087378640774, fmeasure=0.17884615384615388)),
  'rougeL': AggregateScore(low=Score(precision=0.16256830601092895, recall=0.38387096774193546, fmeasure=0.22840690978886755), mid=Score(precision=0.16256830601092895, recall=0.38387096774193546, fmeasure=0.22840690978886755), high=Score(precision=0.16256830601092895, recall=0.38387096774193546, fmeasure=0.2284069097888675

In [48]:
discharge_notes_df = stroke_discharge_df

In [94]:
g.text

43    SW consulted to participate in family/team mee...
34    Team/family meeting to confer on pt\ns son and...
Name: text, dtype: object

### Brief Hospital Course Analysis

In [14]:
# ptn = re.compile(r'(?<=brief\shospi§tal\scourse\:)(?:.*\n)+?(?=\n\n)', flags=re.I)
ptn = re.compile(r'(?<=ospital\scourse\:)(?:.*\n)+?(?=\n\n)', flags=re.I)
def extract_hospital_course(row):
    m = re.search(ptn, row.text)
    if m:
        return re.split(r'(\n\n.*\:\n?)', row.text[m.start():m.end()])[0].strip()
    return None 

In [27]:
pmx_ptn = re.compile(r'(?:past\smedical\shistory\:?)\n?(?:.*\n)+(?=\n)')
def extract_pmx(row):
    m = re.search(pmx_ptn, row.text)
    if m:
        return re.split(r'(\n\n.*\:\n?)', row.text[m.start():m.end()])[0].strip()
    return None

In [15]:
discharge_notes_df['hospital_course_text'] = discharge_notes_df.apply(extract_hospital_course, axis=1)

In [28]:
discharge_notes_df['pmx_text'] = discharge_notes_df.apply(extract_hospital_course, axis=1)

In [25]:
discharge_notes_df[pd.isna(discharge_notes_df.pmx_text)].shape

(262, 4)

In [None]:
print(discharge_notes_df[pd.isna(discharge_notes_df.pmx_text)].text.iloc[1])

In [150]:
metric = load_metric('bertscore')

In [None]:
metric

In [151]:
metric.add(prediction=nut_notes.iloc[0], reference=nut_notes.iloc[1])

In [152]:
metric.compute(lang='en', rescale_with_baseline=True, model_type='xlnet-base-cased')

2021-01-13 12:30:53,610: Lock 140536987522512 acquired on /Users/tom/.cache/huggingface/transformers/06bdb0f5882dbb833618c81c3b4c996a0c79422fa2c95ffea3827f92fc2dba6b.da982e2e596ec73828dbae86525a1870e513bd63aae5a2dc773ccc840ac5c346.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…


2021-01-13 12:30:53,940: Lock 140536987522512 released on /Users/tom/.cache/huggingface/transformers/06bdb0f5882dbb833618c81c3b4c996a0c79422fa2c95ffea3827f92fc2dba6b.da982e2e596ec73828dbae86525a1870e513bd63aae5a2dc773ccc840ac5c346.lock
2021-01-13 12:30:54,218: Lock 140537333938576 acquired on /Users/tom/.cache/huggingface/transformers/df73bc9f8d13bf2ea4dab95624895e45a550a0f0a825e41fc25440bf367ee3c8.d93497120e3a865e2970f26abdf7bf375896f97fde8b874b70909592a6c785c9.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…


2021-01-13 12:30:56,199: Lock 140537333938576 released on /Users/tom/.cache/huggingface/transformers/df73bc9f8d13bf2ea4dab95624895e45a550a0f0a825e41fc25440bf367ee3c8.d93497120e3a865e2970f26abdf7bf375896f97fde8b874b70909592a6c785c9.lock
2021-01-13 12:30:56,821: Lock 140534691015632 acquired on /Users/tom/.cache/huggingface/transformers/9461853998373b0b2f8ef8011a13b62a2c5f540b2c535ef3ea46ed8a062b16a9.3e214f11a50e9e03eb47535b58522fc3cc11ac67c120a9450f6276de151af987.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…


2021-01-13 12:32:41,918: Lock 140534691015632 released on /Users/tom/.cache/huggingface/transformers/9461853998373b0b2f8ef8011a13b62a2c5f540b2c535ef3ea46ed8a062b16a9.3e214f11a50e9e03eb47535b58522fc3cc11ac67c120a9450f6276de151af987.lock
2021-01-13 12:32:51,689: Removing /Users/tom/.cache/huggingface/metrics/bert_score/default/default_experiment-1b50ecca-ea05-4abe-b070-60cd6a8343b4-1-0.arrow


{'precision': tensor([0.9828]),
 'recall': tensor([0.7448]),
 'f1': tensor([0.8541]),
 'hashcode': 'xlnet-base-cased_L5_no-idf_version=0.3.7(hug_trans=4.1.1)-rescaled'}

In [134]:
show_diffs(nut_notes.iloc[0], nut_notes.iloc[1])