In [36]:
import plotly.express as px
import pandas as pd
import ast
from zensols.mednlp import ApplicationFactory

In [75]:
data_folder = "data/raw/"
d_icd = pd.read_csv(data_folder + 'D_ICD_DIAGNOSES.csv.gz', usecols=['ICD9_CODE', 'SHORT_TITLE', 'LONG_TITLE']) # ICD Code lookup
diagnoses = pd.read_csv(data_folder + 'DIAGNOSES_ICD.csv.gz', usecols=['HADM_ID', 'ICD9_CODE']) # Linkage between ICD codes and Note events
drg = pd.read_csv(data_folder + 'DRGCODES.csv.gz', usecols=['HADM_ID','DESCRIPTION']) # DRG Codes
note_events= pd.read_csv(data_folder + 'NOTEEVENTS.csv.gz', usecols=['HADM_ID','TEXT'], nrows=20000) # Number of rows = 2,083,180 without filter # We will handle this later

In [44]:
# Data Sizes
print(f"D_ICD Shape: {d_icd.shape}")
print(f"DIAGNOSES Shape: {diagnoses.shape}")
print(f"DRG Shape: {drg.shape}")
# print(f"Note Events Shape: {note_events.shape}")

D_ICD Shape: (14567, 3)
DIAGNOSES Shape: (651047, 2)
DRG Shape: (125557, 2)


In [24]:
# Schemas
print(f"D_ICD Schema: {d_icd.columns}")
print(f"DIAGNOSES Schema: {diagnoses.dtypes}")
print(f"DRG Schema: {drg.columns}")
print(f"Note Events Schema: {note_events.columns}")

D_ICD Schema: Index(['ICD9_CODE', 'SHORT_TITLE', 'LONG_TITLE'], dtype='object')
DIAGNOSES Schema: HADM_ID       int64
ICD9_CODE    object
dtype: object
DRG Schema: Index(['HADM_ID', 'DESCRIPTION'], dtype='object')
Note Events Schema: Index(['HADM_ID', 'TEXT'], dtype='object')


In [46]:
note_events['TEXT'] = note_events['TEXT'].apply(lambda x: "\"" + str(x) + "\"")
diagnoses = diagnoses.groupby(['HADM_ID']).agg(tuple).map(list).reset_index()
joined = note_events.join(diagnoses.set_index("HADM_ID"), on=['HADM_ID'], how='inner')
# joined = joined.join(drg.set_index('HADM_ID'), on='HADM_ID', how='inner').groupby(['HADM_ID','TEXT', 'ICD9_CODE']).agg(tuple).map(list).reset_index()
# joined = joined.join(d_icd.set_index('ICD9_CODE'), on='ICD9_CODE', how='inner')

In [26]:
print(joined.columns)
print(joined.shape)
display(joined.head(20))

Index(['HADM_ID', 'TEXT', 'ICD9_CODE'], dtype='object')
(20000, 3)


Unnamed: 0,HADM_ID,TEXT,ICD9_CODE
0,167853,"""Admission Date: [**2151-7-16**] Discha...","[01193, 4254, 42731, 2639, 2762, 5070, 5119, 2..."
245,167853,"""Admission Date: [**2151-7-16**] Discha...","[01193, 4254, 42731, 2639, 2762, 5070, 5119, 2..."
1,107527,"""Admission Date: [**2118-6-2**] Dischar...","[5191, 49121, 51881, 486, 2761, 2449, 311]"
2,167118,"""Admission Date: [**2119-5-4**] ...","[5191, 5185, 496, 2762, 45340, 5533]"
3,196489,"""Admission Date: [**2124-7-21**] ...","[51884, 5849, 34830, 49121, 2760, 4160, 3594, ..."
4,135453,"""Admission Date: [**2162-3-3**] ...","[80506, 5070, 42823, 2930, 4538, E882, 4280, 4..."
5,170490,"""Admission Date: [**2172-3-5**] ...","[2252, 7140, 7102, 4430, 53081, V4364, V4365]"
6,134727,"""Admission Date: [**2112-12-8**] ...","[51881, 486, 41519, 4280, 5779, 4019, V4582, V..."
7,114236,"""Admission Date: [**2150-2-25**] ...","[2252, 4019, 42731, V1046, V4501, V5861]"
8,163469,"""Admission Date: [**2118-8-10**] ...","[4378, 74781, 3485, 9092, 34590]"


In [27]:
display(joined.loc[joined['HADM_ID'] == 100195])
display(joined.loc[joined['HADM_ID'] == 100195]['TEXT'].values[0])
display(len(joined.loc[joined['HADM_ID'] == 100195]['TEXT'].values[0]))

Unnamed: 0,HADM_ID,TEXT,ICD9_CODE
145,100195,"""Admission Date: [**2143-8-23**] Discha...","[41072, 99672, 99812, 41402, 41401, 4019, 2720..."


'"Admission Date: [**2143-8-23**]        Discharge Date: [**2143-8-27**]\n\nDate of Birth:  [**2077-7-13**]        Sex:  M\n\nService:  CCU\n\n\nHISTORY OF PRESENT ILLNESS:  This is a 66-year-old man with\nsevere CAD, status post CABG in [**2135**] with recent PCI to the\nLMCA and SVG to the PDL in [**2143-7-14**].  He presented on\n[**2143-8-23**] for an elective intervention brachytherapy of the\nSVG to PL and native RCA.  The patient reported that he had\nbeen feeling well without chest pain, shortness of breath, or\ndyspnea on exertion.  He was noted to have an ejection\nfraction of greater than 60 percent in [**2143-7-14**].  The\npatient underwent a cardiac catheterization on the morning of\narrival with PCI to the native RCA and 4 stents and\nbrachytherapy to the vein graft.  The patient tolerated the\nprocedure well and approximately 6 hours later developed a\nchest pain noted as 4 out of 10 substernal radiating to his\nthroat and back without shortness of breath, diaphoresis,\

8677

In [28]:
display(d_icd.loc[d_icd['ICD9_CODE'] == '1890'])

Unnamed: 0,ICD9_CODE,SHORT_TITLE,LONG_TITLE
2079,1890,Malig neopl kidney,"Malignant neoplasm of kidney, except pelvis"


In [29]:
diagnoses.dtypes

HADM_ID       int64
ICD9_CODE    object
dtype: object

## Look at Note Format

In [31]:
joined['TEXT'][1]

'"Admission Date:  [**2118-6-2**]       Discharge Date:  [**2118-6-14**]\n\nDate of Birth:                    Sex:  F\n\nService:  MICU and then to [**Doctor Last Name **] Medicine\n\nHISTORY OF PRESENT ILLNESS:  This is an 81-year-old female\nwith a history of emphysema (not on home O2), who presents\nwith three days of shortness of breath thought by her primary\ncare doctor to be a COPD flare.  Two days prior to admission,\nshe was started on a prednisone taper and one day prior to\nadmission she required oxygen at home in order to maintain\noxygen saturation greater than 90%.  She has also been on\nlevofloxacin and nebulizers, and was not getting better, and\npresented to the [**Hospital1 18**] Emergency Room.\n\nIn the [**Hospital3 **] Emergency Room, her oxygen saturation was\n100% on CPAP.  She was not able to be weaned off of this\ndespite nebulizer treatment and Solu-Medrol 125 mg IV x2.\n\nReview of systems is negative for the following:  Fevers,\nchills, nausea, vomiting, nig

### Examine Code Distribution

In [76]:
# Get Distribution of codes from Diagnoses table
def remove_evm_codes(df: pd.DataFrame) -> pd.DataFrame:
    # Drop E, V, and M codes
    df['ICD9_CODE'] = df['ICD9_CODE'].apply(lambda x: str(x))
    output = df[~df['ICD9_CODE'].str.startswith("E")]
    output = output[~output['ICD9_CODE'].str.startswith("V")]   
    output = output[~output['ICD9_CODE'].str.startswith("M")]
    return output

# DIAGNOSES Schema: Index(['HADM_ID', 'ICD9_CODE'], dtype='object')

df = diagnoses[diagnoses['ICD9_CODE'].notna()]

df = remove_evm_codes(df)

# Drop codes that start with E, V, and M
df = remove_evm_codes(df)

# Shorten codes to 3 digits (for now)
df['ICD9_CODE'] = df['ICD9_CODE'].str.slice(0, 3)

# Explode list
df['ICD9_CODE'] = df['ICD9_CODE'].explode('ICD9_CODE').reset_index(drop=True)
df = df.drop(columns=['HADM_ID'])

# Get count for each ecode
df = df.value_counts().reset_index()

display(df.head())




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,ICD9_CODE,count
0,401,18188
1,427,17736
2,428,17586
3,276,17441
4,250,13918


In [77]:
# Plot ALL

fig = px.bar(df.nlargest(columns=['count'], n=500), x='ICD9_CODE', y='count',color='ICD9_CODE', title='Code Counts')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()


In [78]:
# Plot ALL

fig = px.bar(df.nlargest(columns=['count'], n=200), x='ICD9_CODE', y='count',color='ICD9_CODE', title='Code Counts')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

In [82]:
# Plot Scoped to MAX

# Filter out codes out of scope
subset_df = pd.read_csv("./data/joined/dataset_single_001_088.csv.gz")
subset_df.drop(columns=['TEXT'], inplace=True)

# Explode list
subset_df['ICD9_CODE'] = subset_df['ICD9_CODE'].explode('ICD9_CODE').reset_index(drop=True)
display(subset_df.head(3))
subset_df.drop(columns=['HADM_ID'], inplace=True)

# Get count for each ecode
subset_df = subset_df.value_counts().reset_index()



fig = px.bar(subset_df, x='ICD9_CODE', y='count',color='ICD9_CODE', title='Code Counts')
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

Unnamed: 0,HADM_ID,ICD9_CODE
0,100020,['041']
1,100074,"['038', '070']"
2,100099,['041']


## The Long Tail Problem

In [94]:
# The TOP X codes represent what percent of the data? (Total Dataset)
x=260

top_100 = df.nlargest(columns=['count'], n=x)
top_100_sum = top_100['count'].sum()
total_sum = df['count'].sum()

print(f"Top {x} Codes represent {top_100_sum/total_sum*100:.2f}% of the data")

Top 260 Codes represent 95.08% of the data


## Test out MedNLP Package for Note Parsing

In [None]:
# Sample
doc_parser = ApplicationFactory.get_doc_parser()
doc = doc_parser('John was diagnosed with kidney failure')

for tok in doc.tokens: 
    print(tok.norm, tok.pos_, tok.tag_, tok.cui_, tok.detected_name_)

print(doc.entities)

In [None]:
# Sample from MIMIC-III dataset

#print(f"***\n{joined['TEXT'][1]} \n***")

doc_parser = ApplicationFactory.get_doc_parser()
doc = doc_parser(joined['TEXT'][1])

print("**Parsed:**")
# print(doc.entities)
new_note = set([])
for tok in doc.tokens:
    if tok.is_concept and tok.tuis_ in ['T184', 'T047', 'T046', 'T033', 'T037','T191']:
        '''
        Maybe add?:
        T060: Diagnostic Procedure
        T061 Therapeutic or Preventive Procedure
        T033: Finding
        T046: Pathologic Function
        '''
        print(tok, tok.detected_name_, tok.sub_names, tok.pref_name_, tok.tuis_, tok.tui_descs_)
        new_note.add(tok.detected_name_.replace("~"," "))
        new_note.add(tok.pref_name_.lower())

print(f"**New Note:**\n{new_note}")