# Preprocessing of the NRD data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

%matplotlib inline

In [2]:
# Root directory of the NRD data; the file paths in this notebook are built by
# prefixing this string.
# NOTE(review): the next cell assigns `path` again, so on a top-to-bottom run
# this value is overwritten before it is used.
path = '/nfs/turbo/intmed-bnallamo-turbo/wsliu/Data/NRD/'

In [2]:
# NOTE(review): reassigns `path` set in the previous cell (presumably an
# alternate storage mount -- confirm). Run only the cell that matches the
# machine in use; on a top-to-bottom run this is the value in effect.
path = '/nfs/turbo/umms-awaljee/wsliu/Data/NRD/'

In [3]:
from ccs_tools import dx_multi, pr_multi, dx_label, core_cols, core_dtypes_pd, na_values

Readmission labels:

In [4]:
# Readmission label table; the cohort cells below use its KEY_NRD, planned
# and readm30 columns.
readm_short = pd.read_csv('{}cohorts/readm_labels_short.csv'.format(path))

Comorbidities:

In [5]:
# Column names of the headerless 2014 severity file, in file order.
severity_cols = [
    'APRDRG', 'APRDRG_Risk_Mortality', 'APRDRG_Severity',
    'CM_AIDS', 'CM_ALCOHOL', 'CM_ANEMDEF', 'CM_ARTH', 'CM_BLDLOSS',
    'CM_CHF', 'CM_CHRNLUNG', 'CM_COAG', 'CM_DEPRESS', 'CM_DM', 'CM_DMCX',
    'CM_DRUG', 'CM_HTN_C', 'CM_HYPOTHY', 'CM_LIVER', 'CM_LYMPH', 'CM_LYTES',
    'CM_METS', 'CM_NEURO', 'CM_OBESE', 'CM_PARA', 'CM_PERIVASC', 'CM_PSYCH',
    'CM_PULMCIRC', 'CM_RENLFAIL', 'CM_TUMOR', 'CM_ULCER', 'CM_VALVE',
    'CM_WGHTLOSS',
    'HOSP_NRD', 'KEY_NRD',
]

# Human-readable description of each entry of `severity_cols`, same order.
severity_labels = [
    'All Patient Refined DRG',
    'All Patient Refined DRG: Risk of Mortality Subclass',
    'All Patient Refined DRG: Severity of Illness Subclass',
    'AHRQ comorbidity measure: Acquired immune deficiency syndrome',
    'AHRQ comorbidity measure: Alcohol abuse',
    'AHRQ comorbidity measure: Deficiency anemias',
    'AHRQ comorbidity measure: Rheumatoid arthritis/collagen vascular diseases',
    'AHRQ comorbidity measure: Chronic blood loss anemia',
    'AHRQ comorbidity measure: Congestive heart failure',
    'AHRQ comorbidity measure: Chronic pulmonary disease',
    'AHRQ comorbidity measure: Coagulopathy',
    'AHRQ comorbidity measure: Depression',
    'AHRQ comorbidity measure: Diabetes, uncomplicated',
    'AHRQ comorbidity measure: Diabetes with chronic complications',
    'AHRQ comorbidity measure: Drug abuse',
    'AHRQ comorbidity measure: Hypertension (combine uncomplicated and complicated)',
    'AHRQ comorbidity measure: Hypothyroidism',
    'AHRQ comorbidity measure: Liver disease',
    'AHRQ comorbidity measure: Lymphoma',
    'AHRQ comorbidity measure: Fluid and electrolyte disorders',
    'AHRQ comorbidity measure: Metastatic cancer',
    'AHRQ comorbidity measure: Other neurological disorders',
    'AHRQ comorbidity measure: Obesity',
    'AHRQ comorbidity measure: Paralysis',
    'AHRQ comorbidity measure: Peripheral vascular disorders',
    'AHRQ comorbidity measure: Psychoses',
    'AHRQ comorbidity measure: Pulmonary circulation disorders',
    'AHRQ comorbidity measure: Renal failure',
    'AHRQ comorbidity measure: Solid tumor without metastasis',
    'AHRQ comorbidity measure: Peptic ulcer disease excluding bleeding',
    'AHRQ comorbidity measure: Valvular disease',
    'AHRQ comorbidity measure: Weight loss',
    'NRD hospital identifier',
    'NRD record identifier',
]

# Every severity column is parsed as float.
# NOTE(review): this includes the identifiers HOSP_NRD and KEY_NRD; float64
# holds integers exactly only up to 2**53 -- confirm the IDs stay below that.
severity_dtypes_pd = dict.fromkeys(severity_cols, float)

# The file is comma-separated and has no header row, so names are supplied here.
severity = pd.read_csv(
    path + 'raw/2014/NRD_2014_Severity.CSV',
    header=None,
    names=severity_cols,
    dtype=severity_dtypes_pd,
    na_values=na_values,
)

### Cohorts for prediction model:

In [28]:
# `cohort` selects the input file '<cohort>_core_2014.csv' and names the output
# files; `folder` is the output sub-directory under `path`.
cohort, folder = 'pna', 'elder/'

In [29]:
# Load the core records of the selected cohort. The variables keep the `ami`
# prefix whatever value `cohort` has.
ami = pd.read_csv('{}cohorts/{}_core_2014.csv'.format(path, cohort), dtype=core_dtypes_pd)

# Attach the readmission labels; the default inner join on KEY_NRD drops
# records that have no row in `readm_short`.
label_cols = ['KEY_NRD', 'planned', 'readm30']
ami_label = ami.merge(readm_short[label_cols], on='KEY_NRD')

For AMI (excludes stays with `LOS == 0`; run this cell only when `cohort` is AMI):

In [8]:
# Inclusion criteria, one boolean mask per criterion.
# Note: `!=` comparisons evaluate to True for missing values, so records with
# a missing LOS / DISPUNIFORM / DMONTH are kept by those masks.
los_nonzero = ami_label['LOS'] != 0
discharged_alive = ami_label['DIED'] == 0
disp_not_7 = ami_label['DISPUNIFORM'] != 7
not_month_12 = ami_label['DMONTH'] != 12
disp_not_2 = ami_label['DISPUNIFORM'] != 2
age_65_plus = ami_label['AGE'] >= 65

ami_label = ami_label[
    los_nonzero & discharged_alive & disp_not_7
    & not_month_12 & disp_not_2 & age_65_plus
]

For CHF and PNA (excludes records with missing `LOS`; run this cell instead of the AMI filter above):

In [30]:
# Inclusion criteria, one boolean mask per criterion. Unlike the AMI filter,
# LOS only has to be present (a value of 0 is kept).
los_known = ~ami_label['LOS'].isna()
discharged_alive = ami_label['DIED'] == 0
disp_not_7 = ami_label['DISPUNIFORM'] != 7
not_month_12 = ami_label['DMONTH'] != 12
disp_not_2 = ami_label['DISPUNIFORM'] != 2
age_65_plus = ami_label['AGE'] >= 65

ami_label = ami_label[
    los_known & discharged_alive & disp_not_7
    & not_month_12 & disp_not_2 & age_65_plus
]

In [31]:
# (number of records, number of distinct hospitals) after the filter.
(len(ami_label), ami_label['HOSP_NRD'].nunique())

(231839, 1979)

In [102]:
# Number of records that share their NRD_VisitLink with an earlier record,
# i.e. the rows that de-duplication with keep='first' would remove.
int(ami_label.duplicated(subset=['NRD_VisitLink'], keep='first').sum())

6875

In [32]:
# Keep one record per NRD_VisitLink: the first one in the current row order.
ami_label = ami_label.drop_duplicates(subset=['NRD_VisitLink'], keep='first')

# Records per hospital after de-duplication.
hosp_freq = ami_label['HOSP_NRD'].value_counts()

# Hospitals that contribute at least 10 records.
hosp_in = hosp_freq.index[(hosp_freq >= 10).values]

In [11]:
# (hospitals with fewer than 10 records, records belonging to those hospitals)
(
    int((hosp_freq < 10).sum()),
    int((~ami_label['HOSP_NRD'].isin(hosp_in)).sum()),
)

(603, 2092)

In [33]:
# Restrict the cohort to the hospitals listed in `hosp_in`.
from_kept_hospital = ami_label['HOSP_NRD'].isin(hosp_in)
ami_label = ami_label.loc[from_kept_hospital]

In [34]:
# (number of records, number of distinct hospitals) in the final cohort.
(len(ami_label), ami_label['HOSP_NRD'].nunique())

(212365, 1852)

In [35]:
# Persist the filtered cohort as <path><folder>cohorts10/<cohort>/<cohort>_pred.csv.
pred_file = '{0}{1}cohorts10/{2}/{2}_pred.csv'.format(path, folder, cohort)
ami_label.to_csv(pred_file, index=False)

In [91]:
# Reload the saved cohort (lets a later session resume from the CSV).
pred_file = '{0}{1}cohorts10/{2}/{2}_pred.csv'.format(path, folder, cohort)
ami_label = pd.read_csv(pred_file, dtype=core_dtypes_pd)

In [36]:
# Severity/comorbidity rows that belong to records of the cohort.
ami_severity = severity[severity['KEY_NRD'].isin(ami_label['KEY_NRD'])]

# Add only the columns the cohort does not already have; KEY_NRD is kept as
# the join key. The default inner join drops records without a severity row.
severity_only_cols = list(ami_severity.columns.difference(ami_label.columns))
ami_pred = ami_label.merge(ami_severity[severity_only_cols + ['KEY_NRD']], on='KEY_NRD')

In [37]:
# Persist the cohort with comorbidity columns as
# <path><folder>cohorts10/<cohort>/pred_comorb.csv.
comorb_file = '{}{}cohorts10/{}/pred_comorb.csv'.format(path, folder, cohort)
ami_pred.to_csv(comorb_file, index=False)


### Sample sizes of exclusion criteria:

In [51]:
# (number of records, number of distinct hospitals) in `ami_label` as it
# stands when this cell runs.
(len(ami_label), ami_label['HOSP_NRD'].nunique())

(453850, 2008)

In [52]:
# Records with a missing LOS.
int(ami_label['LOS'].isna().sum())

0

In [33]:
# Records with LOS equal to 0.
int((ami_label['LOS'] == 0).sum())

4188

In [53]:
# Records with DIED equal to 1.
int((ami_label['DIED'] == 1).sum())

18648

In [54]:
# Records with discharge disposition code 7.
int((ami_label['DISPUNIFORM'] == 7).sum())

5057

In [55]:
# Records with DMONTH equal to 12.
int((ami_label['DMONTH'] == 12).sum())

47523

In [56]:
# Records with discharge disposition code 2.
int((ami_label['DISPUNIFORM'] == 2).sum())

4839

In [79]:
# Records with a missing PAY1.
# NOTE(review): PAY1 is not part of the cohort filter cells above -- confirm
# whether it is still an exclusion criterion.
int(ami_label['PAY1'].isna().sum())

363

In [57]:
# Records with AGE below 18.
# NOTE(review): the cohort filter cells above require AGE >= 65; confirm which
# age threshold this count is meant to document.
int((ami_label['AGE'] < 18).sum())

28159

In [91]:
# Records that repeat an NRD_VisitLink already seen in an earlier row.
int(ami_label.duplicated(subset=['NRD_VisitLink'], keep='first').sum())

19234

In [85]:
# Records per hospital for the current `ami_label`.
hosp_freq = ami_label['HOSP_NRD'].value_counts()

In [87]:
# Hospitals with fewer than 10 records.
int((hosp_freq < 10).sum())

593

## Cross-Validation Splitting

In [38]:
from sklearn.model_selection import StratifiedKFold

In [39]:
# 10-fold stratified splitter; shuffling with a fixed seed makes the folds
# reproducible.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=24,
)

In [50]:
# From here on `folder` points at the cohorts10 directory itself (earlier
# cells used 'elder/' and appended 'cohorts10/' when building paths).
cohort, folder = 'pna', 'elder/cohorts10/'

In [51]:
# Cohort with comorbidity columns, as written by the preprocessing section.
comorb_file = '{}{}{}/pred_comorb.csv'.format(path, folder, cohort)
data_df = pd.read_csv(comorb_file, dtype=core_dtypes_pd)

In [52]:
# One Series of KEY_NRD values per fold: the record IDs of that fold's test set.
# The split is stratified on HOSP_NRD, so the hospital plays the role of the
# class label.
# `skf.split` yields *positional* row indices, so rows are selected with
# `.iloc`; label-based `.loc` picks the same rows only while `data_df` has
# the default 0..n-1 index.
tst_keys = [
    data_df['KEY_NRD'].iloc[tst_idx]
    for _train_idx, tst_idx in skf.split(data_df, data_df['HOSP_NRD'])
]

In [53]:
# Sanity check: number of distinct hospitals present in each test fold.
for tst_key in tst_keys:
    in_fold = data_df['KEY_NRD'].isin(tst_key)
    print(data_df.loc[in_fold, 'HOSP_NRD'].nunique())

1852
1852
1852
1852
1852
1852
1852
1852
1852
1852


In [54]:
# Write the test-set keys of fold j to <path><folder><cohort>/tst_key<j>.csv.
for j, tst_key in enumerate(tst_keys):
    fold_file = '{}{}{}/tst_key{}.csv'.format(path, folder, cohort, j)
    tst_key.to_csv(fold_file, index=False)

### Code Frequency

In [5]:
# Sub-directory (under `path`) that holds the code-frequency tables read below.
# Note: this reuses the name `folder` from the cross-validation section.
folder = 'multi_space_glove/'

In [6]:
# Frequency table from DX_freq.csv; its `frequency` column is used below.
DX_freq = pd.read_csv('{}{}DX_freq.csv'.format(path, folder))

In [8]:
# Frequency table from PR_freq.csv; its `frequency` column is used below.
PR_freq = pd.read_csv('{}{}PR_freq.csv'.format(path, folder))

In [9]:
# (rows of DX_freq, rows of PR_freq) with frequency greater than 0.
(
    int((DX_freq['frequency'] > 0).sum()),
    int((PR_freq['frequency'] > 0).sum()),
)

(12233, 3722)

In [12]:
# (rows of DX_freq, rows of PR_freq) with frequency strictly greater than 10.
(
    int((DX_freq['frequency'] > 10).sum()),
    int((PR_freq['frequency'] > 10).sum()),
)

(9905, 3183)

In [15]:
# (rows of DX_freq, rows of PR_freq) with frequency of at least 40.
(
    int((DX_freq['frequency'] >= 40).sum()),
    int((PR_freq['frequency'] >= 40).sum()),
)

(8245, 2758)