In [2]:
%load_ext autoreload

%autoreload 2

In [5]:
import os
os.chdir('../')

In [6]:
import pandas as pd
from clean.clean import append_addendums
import os
import re

In [7]:
notes = pd.read_csv('data/notes.csv')

In [8]:
notes = append_addendums(notes)
codes = pd.read_csv('data/admissions.csv')

In [9]:
top_50_codes = pd.DataFrame(codes.groupby('icd9_code').count().sort_values('hadm_id', ascending=False)['hadm_id'][0:50])

In [10]:
top_50_codes.reset_index(inplace=True)
top_50_codes['desc'] = top_50_codes.apply(lambda x: codes[codes['icd9_code'] == x.icd9_code].short_title.iloc[0], axis=1)
top_50_codes.columns = ['ICD9_Code', 'Count', 'Desc']

In [11]:
top_50_codes.head(20)

Unnamed: 0,ICD9_Code,Count,Desc
0,4019,20703,Hypertension NOS
1,4280,13111,CHF NOS
2,42731,12891,Atrial fibrillation
3,41401,12429,Crnry athrscl natve vssl
4,5849,9119,Acute kidney failure NOS
5,25000,9058,DMII wo cmp nt st uncntr
6,2724,8690,Hyperlipidemia NEC/NOS
7,51881,7497,Acute respiratry failure
8,5990,6555,Urin tract infection NOS
9,53081,6326,Esophageal reflux


In [337]:
DISCHARGE_DIAG_SYNONYMS = [
    'discharge diagnosis',
    'discharge diagnoses',
    'discharge diagnose',
    'final diagnosis',
    'final diagnoses',
    'death diagnosis',
    'death diagnoses',
    'discharge diagnosis list',
    'discharge diagnoses list',
    'death diagnosis',
]

In [338]:
srs = notes.text
srs_hadmid = notes.hadm_id

In [339]:
diag_pattern = '|'.join([f'(?<={term}:)(?: ?\n?\n?.*)(?:.+\n)+' for term in DISCHARGE_DIAG_SYNONYMS])

In [340]:
reg = re.compile(f'({diag_pattern})', re.M | re.I)

In [341]:
# for non matches, that are excluded Series.str.extractAll
single_extract = srs.str.extract(reg)
no_matches = single_extract.loc[single_extract.isnull()[0], :]

In [345]:
all_matches = srs.str.extractall(reg)

In [346]:
groups = list(all_matches[0:100].reset_index().groupby('level_0'))

In [347]:
# tuples of: index of hadm_id,  last match in the discharge_summary
last_matches = [(g.iloc[-1, :].level_0, g.iloc[-1, :][0]) for k, g in all_matches.reset_index().groupby('level_0')]

In [349]:
notes.shape[0]

52726

In [373]:
list(single_extract[pd.isna(single_extract.reset_index()[0])].iterrows())[0][1][0]

nan

In [376]:
no_match_tuples =[(d[0], d[1][0]) for d in single_extract[pd.isna(single_extract.reset_index()[0])].iterrows()]

In [352]:
last_matches[8:11]

[(8,
  "\nPrimary diagnosis:\nSubglottic stenosis\nHosptial acquired pneumonia\n.\nSecondary diagnoses:\n? Adrenal insufficiency\nDown's syndrome\nSeizure disorder\n"),
 (9,
  '\nCervical myelopathy\nC1 tumor with cervical myelopathy\nAcute on chronic diastolic heart failure\n'),
 (11,
  '\n1. Multiple Sclerosis\n2. Urinary Tract Infection, complicated\n3. Hyponatremia\n.\nSecondary:\n1. Chronic Diastolic CHF\n')]

In [377]:
last_matches.extend(no_match_tuples)

In [379]:
sorted_matches = sorted(last_matches, key=lambda x: x[0])

In [382]:
matches = pd.DataFrame(sorted_matches, columns=['idx', 'match'])

In [None]:
matches = matches.drop('idx', axis=1)
matches['note'] = srs
matches['hadm_id'] = srs_hadmid
matches.reset_index(drop=True, inplace=True)

In [467]:
codes

Unnamed: 0.1,Unnamed: 0,hadm_id,icd9_code,short_title,long_title
0,0,169009,9971,Surg compl-heart,"Cardiac complications, not elsewhere classified"
1,1,169009,42731,Atrial fibrillation,Atrial fibrillation
2,2,169009,4295,Chordae tendinae rupture,Rupture of chordae tendineae
3,3,169009,2720,Pure hypercholesterolem,Pure hypercholesterolemia
4,4,169009,4169,Chr pulmon heart dis NOS,"Chronic pulmonary heart disease, unspecified"
5,5,169009,E8789,Abn react-surg proc NOS,Unspecified surgical operations and procedures...
6,6,110320,V3000,Single lb in-hosp w/o cs,"Single liveborn, born in hospital, delivered w..."
7,7,110320,76519,Preterm NEC 2500+g,"Other preterm infants, 2,500 grams and over"
8,8,110320,76528,35-36 comp wks gestation,35-36 completed weeks of gestation
9,9,110320,V292,Obsrv NB suspc resp cond,Observation and evaluation of newborn for susp...


### The matches for diagnosis: , diagnoses: can result in fale positives on sections that are not the final diagnosis. These are marked as false positives.

In [393]:
matches['dd_line_len'] = matches.match.str.split('\n').apply(lambda x: len(x) if type(x) == list else 1)

In [431]:
no_dis_diag = matches[(pd.isna(matches.match)) | (matches['dd_line_len'] > 60)]
dis_diag = matches[(~pd.isna(matches.match)) & (matches['dd_line_len'] <= 60)]

- death of patient: 50
- followed up with primary care dr: 51, 55, 58 (medications on discharge), 
- followed up with non-icu department in hospital: 56, 101960, 101967, 101969
- patient is still in hospital at time of writing:  53
- notes are incomplete: 54, 57 (addendum referred to but not provided in dataset), 61
- addenudems are added, but no clearly defined list still: 101936

In [395]:
# should be possible to further clean this to just include the hospital course section..

### Total Notes with a discharge diag section: 

In [434]:
print(f'Summaries w/ DD Sections:{dis_diag.shape[0]}')
print(f'Summaries w/o DD Sections{no_dis_diag.shape[0]}')
print(f'{round(dis_diag.shape[0] / (dis_diag.shape[0] + no_dis_diag.shape[0]) * 100, 2)}%')

Summaries w/ DD Sections:48898
Summaries w/o DD Sections3828
92.74%


In [465]:
notes.groupby('hadm_id').count().sum()

subject_id    52726
chartdate     52726
text          52726
dtype: int64

## Convert dis_diag to MedCATTrainer Upload Format

In [444]:
medcat_df = dis_diag.loc[:, ['hadm_id', 'match']]
medcat_df['name'] = medcat_df['hadm_id'].apply(lambda i: f'Admission ID:{str(i)}')
medcat_df = medcat_df.drop('hadm_id', axis=1)

In [462]:
medcat_df.columns = ['text', 'name']

In [463]:
medcat_df.to_csv('mimic_medcat.csv')

In [464]:
medcat_df

Unnamed: 0,text,name
0,\nDiabetic keotacidosis\nHematemesis (blood in...,Admission ID:100001
1,\nPeptic ulcer\nGI bleed\n,Admission ID:100003
2,\n1. Multiple myeloma.\n2. Congestive obstruct...,Admission ID:100006
3,\nSmall bowel obstruction\nInternal hernia wit...,Admission ID:100007
4,\nCoronary Artery Disease\nCAD-(AMI [**2143-7-...,Admission ID:100009
5,\nMetastatic RCC/Left renal mass s/p open left...,Admission ID:100010
6,\nS/P scooter v tree\n1. Left eye abrasion\n2....,Admission ID:100011
7,\nCoronary Artery Disease s/p Coronary Bypass ...,Admission ID:100012
8,\nPrimary diagnosis:\nSubglottic stenosis\nHos...,Admission ID:100016
9,\nCervical myelopathy\nC1 tumor with cervical ...,Admission ID:100018


In [456]:
tuis = [l[:-2] for l in open('/Users/tom/phd/cattrainer/all_tuis.txt')]

In [460]:
','.join(tuis)

'T11,T02,T05,T10,T08,T01,T19,T00,T01,T19,T19,T12,T00,T03,T02,T05,T03,T01,T02,T09,T12,T02,T03,T02,T04,T02,T01,T10,T12,T10,T18,T20,T20,T07,T04,T08,T06,T05,T20,T04,T06,T06,T19,T05,T01,T07,T12,T20,T05,T09,T02,T01,T03,T00,T16,T16,T04,T08,T02,T06,T10,T09,T06,T09,T05,T13,T12,T01,T07,T12,T05,T19,T03,T17,T13,T17,T05,T03,T01,T06,T06,T07,T04,T07,T04,T04,T08,T19,T11,T07,T08,T05,T09,T10,T03,T04,T00,T09,T04,T04,T07,T06,T03,T12,T00,T10,T09,T09,T09,T08,T08,T19,T01,T06,T07,T08,T16,T09,T05,T18,T08,T02,T07,T06,T00,T12,T01'

In [398]:
records = [{'hadm_id': hadm_id, 'codes': code_df.icd9_code.tolist()} for hadm_id, code_df in codes.loc[:, ['hadm_id', 'icd9_code']].groupby('hadm_id')]

In [399]:
codes_per_hadm = pd.DataFrame(records)

In [400]:
df = pd.merge(dis_diag, codes_per_hadm, 'inner', 'hadm_id')

In [401]:
df['codes_len'] = df['codes'].apply(len)

In [None]:
no_dis_diag

In [None]:
print(no_dis_diag[~pd.isna(no_dis_diag.match)].note.iloc[6])

In [None]:
print(no_dis_diag[~pd.isna(no_dis_diag.match)].match.iloc[6])

## Listed Diagnosis cleaning:
numbers: 1-20

type listed using 'primary' or 'secondary'

diagnosis: also spread over multiple new lines

1st line should also be listed as primary??

In [32]:
# is numbers present then join new lines that do not start with numbers
def numbers_present(match):
    return '1.' in match or '1)' in match

In [33]:
# split by is numbers and join those lines that are numbered.

In [34]:
dd_with_num = dis_diag[dis_diag.match.apply(numbers_present)]
# join sentences if numbers do not continue. I.e. positive look behind and positive lookahead...

In [35]:
dd_with_no_num = dis_diag[~dis_diag.match.apply(numbers_present)]

In [None]:
[print(m) for m in dd_with_no_num.match[60:90]]

In [None]:
dd_with_no_num

In [None]:
dd_with_num

## Further cleaning of extra metadata lines:
All matches ignore case 

In [39]:
metadir_lines = [
  'primary:',
  'secondary:',
  'primary diagnos(?:i|e)s:?',
  'secondary diagnos(?:i|e)s:?',
  'diagnos(?:i|e)s:',
]
within_dd_reg = re.compile('|'.join(metadir_lines), re.I)

In [40]:
[l for l in dd_with_no_num.match.str.split('\n')[8] if len(l) > 1 and re.search(within_dd_reg, l) == None]

['Subglottic stenosis',
 'Hosptial acquired pneumonia',
 '? Adrenal insufficiency',
 "Down's syndrome",
 'Seizure disorder']

In [41]:
split_dd_no_num = dd_with_no_num.match.str.split('\n').apply(lambda x: [i for i in x if len(i) > 1 and re.search(within_dd_reg, i) == None])

In [42]:
name_repl_reg = re.compile('\[\*\*.*\*\*\]', re.I)

In [43]:
with_id_dd = split_dd_no_num[split_dd_no_num.apply(lambda x: [i for i in x if re.search(name_repl_reg, i) != None]).str.len() > 0]

In [44]:
# clear out the named bracket things... MedCAT will ignore anyways.

In [45]:
# why not just run medcat on this...

In [46]:
# Clean up those sentences that are just ., or blanks or primary diagonses...

In [47]:
num_reg = re.compile('(?:\(?[0-9]\)?\.?)(.*)')
def clean_numbers(match):    
    [re.match(num_reg, s)[1] for s in match.split('\n')]

In [None]:
strs.apply(clean_numbers)

In [49]:
numbers = df[df.match.apply(numbers_present)].reset_index(drop=True)

In [None]:
print(numbers.iloc[13,:].note)

In [51]:
print(numbers.reset_index(drop=True).iloc[13, :].match)


primary diagnosis:
1. respiratory depression
2. altered mental status
3. substance abuse/overdose



In [None]:
print(notes[notes.hadm_id == 100061].text.tolist()[0])

In [None]:
for m in numbers.iloc[10:30, :].itertuples():
    print(f'hadm_id:{m.hadm_id}')
    print(m.match)