# Convert study_id+diagnostic_date to document SID so we can associate SID to ICD codes

In [1]:
import numpy as np
import pandas as pd

In [None]:
# Load the StudyID/ICD table from studyid_icd.csv; later cells read its
# StudyID, DiagDateTime and ICD columns.
studyICDdf = pd.read_csv("studyid_icd.csv")
# Preview the first rows (notebook display only).
studyICDdf.head()

In [3]:
# Load the notes table from Stroke_Discharge_Notes_Adjudicated.csv.
notesdf = pd.read_csv("Stroke_Discharge_Notes_Adjudicated.csv")
# Show the available column names (notebook display only).
notesdf.columns

Index(['StudyID', 'PatientSSN', 'EntryDateTime', 'TIUDocumentSID',
       'TIUStandardTitle', 'ReportText', 'Src'],
      dtype='object')

In [9]:
# Number of distinct StudyID values in the ICD table, shown as a 1-tuple.
studyICDdf["StudyID"].unique().shape

(72008,)

In [10]:
# Quick check that every StudyID in the notes table also appears in the ICD table.
# NOTE: `value in series` tests the Series *index labels*, not its values, so the
# membership test must be done against the values (here via `isin`).
missing_studyids = notesdf.loc[~notesdf.StudyID.isin(studyICDdf.StudyID), "StudyID"]
for studyid in missing_studyids:
    print(studyid)

In [11]:
# Same processing that was used to select the appropriate non-duplicate notes:
# first discard rows containing any missing value, then keep only the first row
# seen for each TIUDocumentSID.
notesdf = notesdf.dropna()
notesdf = notesdf.drop_duplicates(subset='TIUDocumentSID', keep='first')

In [23]:
# DiagDateTime and EntryDateTime do not coincide, so both columns are parsed
# into datetimes here; a later cell builds a date range around EntryDateTime
# and checks which DiagDateTime values fall inside it.
studyICDdf['DiagDateTime'] = pd.to_datetime(studyICDdf['DiagDateTime'])
notesdf['EntryDateTime'] = pd.to_datetime(notesdf['EntryDateTime'])

In [62]:
# Map each note's TIUDocumentSID to the ICD codes recorded for the same StudyID
# at the latest DiagDateTime that lies within +/- 14 days of the note's
# EntryDateTime. Notes with no diagnosis in that window map to an empty list.
window = pd.Timedelta(14, unit='D')

# Split the ICD table by StudyID once instead of re-scanning the whole table
# for every note; rows inside each group keep their original relative order.
icd_by_study = {sid: grp for sid, grp in studyICDdf.groupby('StudyID')}

docidtoicds = dict()
for docid, etime, studyid in zip(notesdf.TIUDocumentSID,
                                 notesdf.EntryDateTime,
                                 notesdf.StudyID):
    icdsubset = icd_by_study.get(studyid)
    if icdsubset is None:
        # No ICD rows at all for this StudyID.
        docidtoicds[docid] = []
        continue
    start_date = etime - window  # subtract 14 days
    end_date = etime + window  # add 14 days
    daterangemask = icdsubset.DiagDateTime.between(start_date, end_date)
    # NaT when nothing falls inside the window; the equality below is then
    # False for every row, which yields an empty list.
    maxdate = icdsubset[daterangemask].DiagDateTime.max()
    lasticds = icdsubset[icdsubset.DiagDateTime == maxdate].ICD.values.tolist()
    docidtoicds[docid] = lasticds
print(len(docidtoicds))

30087


In [63]:
import json
# Persist the document-SID -> ICD-code-list mapping as JSON.
# NOTE(review): the file name says "oldest", but the lists are selected with
# DiagDateTime.max(), i.e. the latest diagnosis date in the window -- confirm
# which is intended before renaming anything downstream.
with open("docidtooldesticds.json", mode='w') as json_fh:
    json.dump(docidtoicds, json_fh)

In [69]:
# Spot check: sorted distinct ICD codes attached to the third document in the mapping.
third_doc_icds = list(docidtoicds.values())[2]
np.unique(third_doc_icds)

array(['B95.4', 'D64.9', 'G47.33', 'G89.29', 'I13.0', 'I33.0', 'I50.31',
       'I63.422', 'I63.431', 'K25.3', 'M25.512', 'N17.9', 'N18.3',
       'N40.0', 'R10.9', 'Z85.528', 'Z90.5'], dtype='<U7')

In [70]:
# ICD-10 codes treated as stroke-related in this notebook, grouped by
# three-character category for readability. Membership is all that matters;
# ordering inside the literal has no effect.
strokeicd10s = {
    # G43
    'G43.601', 'G43.609', 'G43.611', 'G43.619',
    # G45
    'G45.0', 'G45.1', 'G45.2', 'G45.4', 'G45.8', 'G45.9',
    # G46
    'G46.0', 'G46.1', 'G46.2', 'G46.3', 'G46.4', 'G46.5', 'G46.6', 'G46.7',
    'G46.8',
    # I60
    'I60.00', 'I60.01', 'I60.02', 'I60.10', 'I60.11', 'I60.12', 'I60.2',
    'I60.30', 'I60.31', 'I60.32', 'I60.4', 'I60.50', 'I60.51', 'I60.52',
    'I60.6', 'I60.7', 'I60.8', 'I60.9',
    # I61
    'I61.0', 'I61.1', 'I61.2', 'I61.3', 'I61.4', 'I61.5', 'I61.6', 'I61.8',
    'I61.9',
    # I62
    'I62.00', 'I62.01', 'I62.02', 'I62.03', 'I62.1', 'I62.9',
    # I63
    'I63.00', 'I63.011', 'I63.012', 'I63.013', 'I63.019', 'I63.02',
    'I63.031', 'I63.032', 'I63.033', 'I63.039', 'I63.09',
    'I63.10', 'I63.111', 'I63.112', 'I63.113', 'I63.119', 'I63.12',
    'I63.131', 'I63.132', 'I63.133', 'I63.139', 'I63.19',
    'I63.20', 'I63.211', 'I63.212', 'I63.213', 'I63.219', 'I63.22',
    'I63.231', 'I63.232', 'I63.233', 'I63.239', 'I63.29',
    'I63.30', 'I63.311', 'I63.312', 'I63.313', 'I63.319',
    'I63.321', 'I63.322', 'I63.323', 'I63.329',
    'I63.331', 'I63.332', 'I63.333', 'I63.339',
    'I63.341', 'I63.342', 'I63.343', 'I63.349', 'I63.39',
    'I63.40', 'I63.411', 'I63.412', 'I63.413', 'I63.419',
    'I63.421', 'I63.422', 'I63.423', 'I63.429',
    'I63.431', 'I63.432', 'I63.433', 'I63.439',
    'I63.441', 'I63.442', 'I63.443', 'I63.449', 'I63.49',
    'I63.50', 'I63.511', 'I63.512', 'I63.513', 'I63.519',
    'I63.521', 'I63.522', 'I63.523', 'I63.529',
    'I63.531', 'I63.532', 'I63.533', 'I63.539',
    'I63.541', 'I63.542', 'I63.543', 'I63.549', 'I63.59',
    'I63.6', 'I63.8', 'I63.81', 'I63.89', 'I63.9',
    # I65
    'I65.01', 'I65.02', 'I65.03', 'I65.09', 'I65.1', 'I65.21', 'I65.22',
    'I65.23', 'I65.29', 'I65.8', 'I65.9',
    # I66
    'I66.01', 'I66.02', 'I66.03', 'I66.09', 'I66.11', 'I66.12', 'I66.13',
    'I66.19', 'I66.21', 'I66.22', 'I66.23', 'I66.29', 'I66.3', 'I66.8',
    'I66.9',
    # I67
    'I67.1', 'I67.2', 'I67.5', 'I67.6', 'I67.7', 'I67.81', 'I67.82',
    'I67.841', 'I67.848', 'I67.89', 'I67.9',
    # I68
    'I68.0', 'I68.2', 'I68.8',
}

In [90]:
# For every document, reduce its stroke-related ICD codes to their
# three-character categories (the part before the '.'), join the sorted
# categories with '_', and collect the distinct combinations. Documents with
# no stroke-related code contribute the empty string.
uniq_icd_combos = {
    '_'.join(sorted({icd.split('.')[0] for icd in icds if icd in strokeicd10s}))
    for icds in docidtoicds.values()
}

In [93]:
# Display the distinct stroke-category combinations found across documents.
uniq_icd_combos

{'',
 'G43_I60_I63',
 'G43_I61_I63',
 'G43_I63',
 'G45',
 'G45_G46',
 'G45_G46_I63',
 'G45_G46_I65',
 'G45_I60',
 'G45_I61',
 'G45_I62',
 'G45_I62_I65',
 'G45_I63',
 'G45_I63_I65',
 'G45_I63_I66',
 'G45_I65',
 'G45_I65_I66',
 'G45_I65_I66_I67',
 'G45_I65_I67',
 'G45_I66',
 'G45_I67',
 'G46',
 'G46_I60',
 'G46_I61',
 'G46_I61_I63',
 'G46_I63',
 'G46_I63_I65',
 'G46_I63_I65_I67',
 'G46_I63_I67',
 'G46_I65',
 'G46_I65_I67',
 'G46_I66',
 'G46_I67',
 'G46_I67_I68',
 'I60',
 'I60_I61',
 'I60_I61_I62',
 'I60_I61_I62_I68',
 'I60_I61_I63',
 'I60_I61_I63_I67',
 'I60_I61_I67',
 'I60_I61_I68',
 'I60_I62',
 'I60_I62_I63',
 'I60_I62_I68',
 'I60_I63',
 'I60_I63_I65',
 'I60_I63_I67',
 'I60_I65',
 'I60_I65_I67',
 'I60_I66',
 'I60_I67',
 'I60_I68',
 'I61',
 'I61_I62',
 'I61_I62_I63',
 'I61_I62_I63_I67',
 'I61_I63',
 'I61_I63_I65',
 'I61_I63_I68',
 'I61_I65',
 'I61_I65_I67',
 'I61_I65_I67_I68',
 'I61_I66',
 'I61_I67',
 'I61_I67_I68',
 'I61_I68',
 'I62',
 'I62_I63',
 'I62_I63_I65',
 'I62_I63_I65_I67',
 'I