# Create the train-val-test split

In [1]:
from imports import *

## Tabular data & ECG preprocessing

In [2]:
# UNOS
df_org = pd.read_excel('./datasets/UNOSkidney_part1_updated_death_date2_anonym.xlsx')
print(len(df_org), len(df_org.MRN_DEID.unique()))

# required filtering
df_sel = df_org.iloc[1232:-1]
df_sel = df_sel.dropna(subset=['MRN_DEID'])

df_sel['transplantDate_DEID'] = pd.to_datetime(df_sel['Transplant Date_DEID']).dt.date
df_sel = df_sel.loc[df_sel['transplantDate_DEID']<=datetime.date(2020, 6, 30)]
df_sel = df_sel.sort_values(by='transplantDate_DEID')
df_sel = df_sel.drop_duplicates(subset=['MRN_DEID'], keep='first')
print('patient data read:\t', len(df_sel), len(df_sel.MRN_DEID.unique()))

df_sel = df_sel.fillna('UNKNOWN')
df_sel['Number of Previous Transplants'] = df_sel['Number of Previous Transplants'].replace({'UNKNOWN': -1})
df_sel['HLA Mismatch Level'] = df_sel['HLA Mismatch Level'].replace({'UNKNOWN': -1})
df_sel['CMV IgG'] = df_sel['CMV IgG'].replace({'Not Done': 'UNKNOWN'})
df_sel['Diabetes at Listing'] = df_sel['Diabetes at Listing'].replace({'Type II': 'Yes', 'Type I':'Yes', 'Type Other':'Yes', 'Type Unknown':'Yes'})
df_sel['HBV Surface Antibody Total'] = df_sel['HBV Surface Antibody Total'].replace({'Unknown': 'UNKNOWN'})
df_sel['HCV Serostatus'] = df_sel['HCV Serostatus'].replace({'Not Done': 'UNKNOWN'})
df_sel['Donor CMV IgG (Living Donor)'] = df_sel['Donor CMV IgG (Living Donor)'].replace({'Not Reported': 'UNKNOWN', 'Not Done': 'UNKNOWN'})
df_sel['EBV IgG (Deceased Donor)'] = df_sel['EBV IgG (Deceased Donor)'].replace({'Not Done': 'UNKNWON', 'Not Reported': 'UNKNOWN', 'Indeterminate': 'UNKNOWN', 'Pending': 'UNKNOWN'})
df_sel['Donor History of Diabetes'] = df_sel['Donor History of Diabetes'].replace({'Unknown': 'UNKNOWN',  'Not Reported': 'UNKNOWN'})
df_sel['Donor History of Cancer (Y/N)'] = df_sel['Donor History of Cancer (Y/N)'].replace({'Unknown': 'UNKNOWN',  'Not Reported': 'UNKNOWN'})
df_sel['Donor History of Cigarettes in Past (>20 Pack Years)'] = df_sel['Donor History of Cigarettes in Past (>20 Pack Years)'].replace({'Unknown': 'UNKNOWN',  'Not Reported': 'UNKNOWN'})
df_sel['Donor History of Cocaine Use in Past'] = df_sel['Donor History of Cocaine Use in Past'].replace({'Unknown': 'UNKNOWN',  'Not Reported': 'UNKNOWN'})
df_sel['Donor History of Hypertension'] = df_sel['Donor History of Hypertension'].replace({'YES, 0-5 YEARS': 'Yes' ,'YES, 6-10 YEARS': 'Yes', 'YES, >10 YEARS': 'Yes', 'YES, UNKNOWN DURATION': 'Yes',
                                                                                           'Not Reported': 'UNKNOWN'})
df_sel['Donor History of Other Drug Use in Past'] = df_sel['Donor History of Other Drug Use in Past'].replace({'Unknown': 'UNKNOWN',  'Not Reported': 'UNKNOWN'})
df_sel['Donor HCV Antibody (Deceased Donor)'] = df_sel['Donor HCV Antibody (Deceased Donor)'].replace({'Not Reported': 'UNKNOWN'})
df_sel['Donor HBV Core Antibody (Living and Deceased)'] = df_sel['Donor HBV Core Antibody (Living and Deceased)'].replace({'Not Done': 'UNKNOWN'})
df_sel['Donor HBV Surface Antibody Total (Deceased)'] = df_sel['Donor HBV Surface Antibody Total (Deceased)'].replace({'Not Done': 'UNKNOWN', 'Not Reported': 'UNKNOWN'})
df_sel['Donor HCV Antibody (Living Donor)'] = df_sel['Donor HCV Antibody (Living Donor)'].replace({'Not Done': 'UNKNOWN'})
df_sel['Donor HCV RIBA (Living Donor)'] = df_sel['Donor HCV RIBA (Living Donor)'].replace({'Not Done': 'UNKNOWN', 'Unknown': 'UNKNOWN', 'Not Reported': 'UNKNOWN'})
df_sel['Donor HCV RNA (Living Donor)'] = df_sel['Donor HCV RNA (Living Donor)'].replace({'Not Done': 'UNKNOWN', 'Unknown': 'UNKNOWN', 'Not Reported': 'UNKNOWN'})

df_sel['Race/Ethnicity Category'] = df_sel['Race/Ethnicity Category'].replace({'Amer Ind/Alaska Native': 'UNKNOWN', 'Asian': 'UNKNOWN', 'Multiracial': 'UNKNOWN', 'Native Hawaiian/other Pacific Islander': 'UNKNOWN'})
df_sel['Donor Race/Ethnicity Category'] = df_sel['Donor Race/Ethnicity Category'].replace({'Donor Amer Ind/Alaska Native': 'UNKNOWN', 'Asian': 'UNKNOWN', 'Multiracial': 'UNKNOWN', 'Native Hawaiian/other Pacific Islander': 'UNKNOWN'})

mask = df_sel['Primary Diagnosis at Transplant']=='OTHER SPECIFY'
df_sel.loc[mask, 'Primary Diagnosis at Transplant'] = df_sel.loc[mask, 'Primary Diagnosis at Transplant, Specify']

mask = df_sel['Previous Malignancy Type at Transplant']==' Other, specify'
df_sel.loc[mask, 'Previous Malignancy Type at Transplant'] = df_sel.loc[mask, 'Previous Malignancy Type, Specify at Transplant']

mask = df_sel['Living Donor Relation to Recipient']=='Non-Biological, Other Unrelated Directed Donation: Specify'
df_sel.loc[mask, 'Living Donor Relation to Recipient'] = df_sel.loc[mask, 'Living Donor Relation to Recipient, Specify']

df_sel['Transplant Date_DEID'] = pd.to_datetime(df_sel['Transplant Date_DEID'], format = "%Y-%m-%d", errors = 'ignore') 
df_sel['Graft Failure  Date_DEID'] = pd.to_datetime(df_sel['Graft Failure  Date_DEID'], format = "%Y-%m-%d", errors = 'ignore') 
df_sel['Date of Birth_DEID'] = pd.to_datetime(df_sel['Date of Birth_DEID'], format = "%Y-%m-%d", errors = 'ignore') 
                                                
df_sel['Date GFR Became <=20 Mil/Min (Kidney)_DEID'] = pd.to_datetime(df_sel['Date GFR Became <=20 Mil/Min (Kidney)_DEID'], format = "%Y-%m-%d", errors = 'ignore')
df_sel['Date First Dialyzed (TRR)_DEID'] = pd.to_datetime(df_sel['Date First Dialyzed (TRR)_DEID'], format = "%Y-%m-%d", errors = 'ignore')
df_sel['Date Began Dialysis (WL)_DEID'] = pd.to_datetime(df_sel['Date Began Dialysis (WL)_DEID'], format = "%Y-%m-%d", errors = 'ignore')
                                                
df_sel['Composite Patient Death Date From OPTN Or Verified From External Sources_DEID'] = pd.to_datetime(df_sel['Composite Patient Death Date From OPTN Or Verified From External Sources_DEID'], format = "%Y-%m-%d", errors = 'ignore')

for i,j in df_sel.iterrows():
    dt = df_sel.at[i, 'Transplant Date_DEID']
    dial_1st = df_sel.at[i, 'Date First Dialyzed (TRR)_DEID']
    dial_begin = df_sel.at[i, 'Date Began Dialysis (WL)_DEID']
    death_dt = df_sel.at[i, 'Composite Patient Death Date From OPTN Or Verified From External Sources_DEID']
    birth_dt = df_sel.at[i, 'Date of Birth_DEID']
    graft_dt = df_sel.at[i, 'Graft Failure  Date_DEID']
    if type(dial_1st) is not str:
        df_sel.at[i, 'Date First Dialyzed (TRR)_diff'] = (dt-dial_1st).days
    else:
        df_sel.at[i, 'Date First Dialyzed (TRR)_diff'] = 1e5
    if type(dial_begin) is not str:
        df_sel.at[i, 'Date Began Dialysis (WL)_diff'] = (dt-dial_begin).days
    else:
        df_sel.at[i, 'Date Began Dialysis (WL)_diff'] = 1e5
    if type(death_dt) is not str:
        df_sel.at[i, 'Death Date_diff'] = (death_dt-dt).days
    else:
        df_sel.at[i, 'Death Date_diff'] = 1e5
    if type(graft_dt) is not str:
        df_sel.at[i, 'Graft Failure Date_diff'] = (graft_dt-dt).days
    else:
        df_sel.at[i, 'Graft Failure Date_diff'] = 1e5
                                                
#     if df_sel.at[i, 'Age in Years at Transplant'] == 'UNKNOWN' and birth_dt != 'UNKNOWN':  ### birthdays are unknown too in these cases
#         print(birth_dt)
#         df_sel.at[i, 'Age in Years at Transplant'] = np.round((dt-birth_dt)/365.25)
#         print(i, np.round((dt-birth_dt)/365.25))
    if i%500==0:
        print(i)
bins = [ 0, 365, 2*365, 3*365, 4*365, 5*365, 6*365, 7*365, 8*365, 9*365, 3650, 1e5]
df_sel['Date Began Dialysis (WL)_diff_binned'] = pd.cut(df_sel['Date Began Dialysis (WL)_diff'], bins = bins)
df_sel['Date First Dialyzed (TRR)_diff_binned'] = pd.cut(df_sel['Date First Dialyzed (TRR)_diff'], bins = bins)
df_sel['Date Began Dialysis (WL)_diff_binned'].value_counts(), df_sel['Date First Dialyzed (TRR)_diff_binned'].value_counts()

def map_bmi(x):
    if type(x) is str and x == 'UNKNOWN':
        return 'UNKNOWN'
    elif x<=18.5:
        return 'Underweight'
    elif x<=24.9:
        return 'Healthy'
    elif x<=29.9:
        return 'Overweight'
    else:
        return 'Obese'
    
def map_age(x):
    if type(x) is str and x == 'UNKNOWN':
        return -1
    else:
        return np.floor(x/10)

df_sel['Age in Years at Transplant_binned'] = df_sel['Age in Years at Transplant'].apply(map_age)
df_sel['Donor Age_binned'] = df_sel['Donor Age'].apply(map_age)
df_sel['BMI at Transplant_binned'] = df_sel['BMI at Transplant'].apply(map_bmi)


df_sel['GF_Death_diff'] = df_sel[['Graft Failure Date_diff','Death Date_diff']].min(axis=1)

df_sel['Donor Weight (Kg)'] = df_sel['Donor Weight (Kg)'].replace({'UNKNOWN':-1})
bins = [ -10, 0, 25, 2*25, 3*25, 4*25, 5*25, 6*25, 7*25, 8*25, 9*25]
df_sel['Donor Weight (Kg)_binned'] = pd.cut(df_sel['Donor Weight (Kg)'], bins = bins)


bins = [-1, 0, 25, 50, 75, 100]
df_sel['CPRA at Transplant_binned'] = pd.cut(df_sel['CPRA at Transplant'], bins = bins)

df_sel[ 'Cold Ischemic Time (Hours) (KI,LI)'] = df_sel[ 'Cold Ischemic Time (Hours) (KI,LI)'].replace({'UNKNOWN':-1})
bins = [ -10, 0, 10, 20, 30, 40, 50, 60]
df_sel[ 'Cold Ischemic Time (Hours) (KI,LI)_binned'] = pd.cut(df_sel[ 'Cold Ischemic Time (Hours) (KI,LI)'], bins = bins)

df_sel[ 'Donor Terminal Lab Creatinine'] = df_sel[ 'Donor Terminal Lab Creatinine'].replace({'UNKNOWN':-1})
bins = [ -5, 0, 5, 10, 15]
df_sel[ 'Donor Terminal Lab Creatinine_binned'] = pd.cut(df_sel[ 'Donor Terminal Lab Creatinine'], bins = bins)

df_sel['Most Recent Serum Creatinine at Time of Transplant'] = df_sel['Most Recent Serum Creatinine at Time of Transplant'].replace({'UNKNOWN':-1})
bins = [-5, 0, 5, 10, 15, 20, 25, 30]
df_sel['Most Recent Serum Creatinine at Time of Transplant_binned'] = pd.cut(df_sel['Most Recent Serum Creatinine at Time of Transplant'], bins = bins)

df_sel['Donor Weight (Kg)_binned'].value_counts(), df_sel['CPRA at Transplant_binned'].value_counts(), df_sel[ 'Cold Ischemic Time (Hours) (KI,LI)_binned'].value_counts(),  df_sel[ 'Donor Terminal Lab Creatinine_binned'].value_counts(), df_sel['Most Recent Serum Creatinine at Time of Transplant_binned'].value_counts()

## features
discrete_cols = ['Blood Type', 'CMV IgG', 'Gender',  'Functional Status at Transplant', 'Race/Ethnicity Category', 'Status at Transplant', 'EBV Serostatus', 'Diabetes at Listing', 
                 'Primary Diagnosis at Transplant', 'Pretransplant Dialysis at Transplant', 'Kidney Received on Ice or Pump', 'Multi-Organ Transplant?', 
                 'Number of Previous Transplants',  'Medical Condition at Listing', 'Previous Malignancy at Listing', 
                 'Previous Malignancy at Transplant', 'Previous Malignancy Type at Transplant', 'HLA Mismatch Level', 'HIV Serostatus', 'HCV Serostatus', 'HBV Core Antibody', 'HBV Surface Antibody Total',
                 'History of a Previous Transplant Involving Exact Same Organ as Current Transplant', 'Previous Solid Organ Transplant (Any Organ)', 'Kidney Pump Used?',
                 'Age in Years at Transplant_binned', 'BMI at Transplant_binned',  'Donor Age_binned', 'Date Began Dialysis (WL)_diff_binned', 'CPRA at Transplant_binned', 
                 'Cold Ischemic Time (Hours) (KI,LI)_binned', 'Most Recent Serum Creatinine at Time of Transplant_binned',
                 
                 'Donor Blood Type', 'Donor CMV IgG (Living Donor)', 'Donor Gender', 'Donor Race/Ethnicity Category', 'EBV IgG (Living Donor)', 'EBV IgG (Deceased Donor)', 
                 'Donor Type - Deceased or Living', 'Donation After Circulatory Death?', 'Donor History of Diabetes', 'Donor History of Cancer (Y/N)', 
                 'Donor History of Cigarettes in Past (>20 Pack Years)', 'Donor History of Cocaine Use in Past', 'Donor History of Hypertension', 'Donor History of Other Drug Use in Past', 
                 'Donor HCV Antibody (Deceased Donor)', 'Donor HBV Core Antibody (Living and Deceased)', 'Donor HBV Surface Antibody Total (Deceased)', 'Donor HCV Antibody (Living Donor)', 
                 'Donor HCV NAT (Deceased)', 'Donor HCV RIBA (Living Donor)', 'Donor HCV RNA (Living Donor)', 'Donor Weight (Kg)_binned', 'Donor Terminal Lab Creatinine_binned'
                ]



4649 4516
patient data read:	 2821 2821
1500
2000
2500
3000
3500
4000


In [3]:
# Read dfkey data
dfkey = pd.read_csv('./datasets/ECG_data_NEW_w_cardiac_outcome_2yrs_future_confirmed_anonym.csv')
dfkey['transplantDate_DEID'] = pd.to_datetime(dfkey['transplantDate_DEID']).dt.date
# rename df_cpt and dfkey columns - remove underscore in names - _x columns do not result
dfkey.drop(columns=['MRN_DEID', 'Transplant Date_x_DEID', 'Transplant Date_y_DEID'], inplace=True)
dfkey.rename(columns={'MRN_formatted_DEID': 'MRN_DEID', 'transplantDate_DEID' : 'TransplantDate_DEID'}, inplace=True)

# Read ECG data
ecg = pkl.load(open('./datasets/ECG_data_NEW.pkl', 'rb'))
dfkey_ecg = pd.read_csv('./datasets/ECG_data_NEW_w_cardiac_outcome_2yrs_future_confirmed_w_ImageName_anonym.csv')
dfkey_ecg['transplantDate_DEID'] = pd.to_datetime(dfkey_ecg['transplantDate_DEID']).dt.date
# dfkey['transplantDate'] = pd.to_datetime(dfkey['transplantDate']).dt.date
dfkey_ecg = dfkey_ecg.loc[dfkey_ecg['transplantDate_DEID']<=datetime.date(2020,6,30)]
dfkey_ecg.drop(columns=['MRN_DEID', 'Transplant Date_x_DEID', 'Transplant Date_y_DEID'], inplace=True)
dfkey_ecg.rename(columns={'MRN_formatted_DEID': 'MRN_DEID', 'transplantDate_DEID' : 'TransplantDate_DEID'}, inplace=True)

# get all unique common MRN common in tabular and ecg dataset for split in train, val, test sets
mrn_unique = dfkey.MRN_DEID.unique()
ecg_mrn_unique = dfkey_ecg.MRN_DEID.unique()
common_mrn_unique = np.intersect1d(mrn_unique, ecg_mrn_unique)
df_common_mrn_unique = pd.DataFrame({'common_mrn_unique': common_mrn_unique})

# remove uncommon patients from dfkey and dfkey_ecg 
dfkey_common = pd.merge(dfkey, df_common_mrn_unique, left_on='MRN_DEID', right_on='common_mrn_unique', how='inner')
dfkey_ecg_common = pd.merge(dfkey_ecg, df_common_mrn_unique, left_on='MRN_DEID', right_on='common_mrn_unique', how='inner')


In [4]:
# more processing of discrete columns of UNOS data
df_sel.rename(columns={'transplantDate_DEID' : 'TransplantDate_DEID'}, inplace=True)
# cols_key = [c for c in dfkey_common.columns if c not in discrete_cols]

to_be_removed = ['Gender_Female','Diabetes at Listing_No', 'Pretransplant Dialysis at Transplant_No', 'Multi-Organ Transplant?_NO', 'Donor CMV IgG (Living Donor)_Negative',
 'Donor Gender_Female', 'EBV Serostatus_Negative', 'Kidney Received on Ice or Pump_Ice', 'Previous Malignancy at Listing_No', 'Previous Malignancy at Transplant_No', 'HIV Serostatus_Positive',
 'HCV Serostatus_Negative',
 'HBV Core Antibody_Negative',
 'HBV Surface Antibody Total_Negative',
 'History of a Previous Transplant Involving Exact Same Organ as Current Transplant_No',
 'Previous Solid Organ Transplant (Any Organ)_No',
 'Kidney Pump Used?_No','Donor CMV IgG (Living Donor)_Negative',
 'EBV IgG (Deceased Donor)_UNKNWON',
 'Donor Type - Deceased or Living_Living Donor',
 'Donation After Circulatory Death?_Yes',
 'Donor History of Diabetes_Yes',
 'Donor History of Cancer (Y/N)_Yes',
 'Donor History of Cigarettes in Past (>20 Pack Years)_Yes',
 'Donor History of Cocaine Use in Past_Yes',
 'Donor History of Hypertension_Yes',
 'Donor History of Other Drug Use in Past_Yes',
 'Donor HCV Antibody (Deceased Donor)_Positive',
 'Donor HBV Core Antibody (Living and Deceased)_Positive',
 'Donor HBV Surface Antibody Total (Deceased)_Positive',
 'Donor HCV Antibody (Living Donor)_Positive',
 'Donor HCV NAT (Deceased)_Positive'
] 
npd_discrete = df_sel[discrete_cols].copy().to_numpy()
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(npd_discrete)
feature_names = []
feature_names_wo_unknown = []
feature_names_for_explainer = {}
unknown_idx = []
k = 0
for i in range(len(discrete_cols)):
    c = discrete_cols[i]
    cats = enc.categories_[i]
    feature_names_for_explainer[i] = []
    for cc in cats:
        feature_names.append(c+'_'+str(cc))
        if cc=='UNKNOWN' or cc=='Unknown' or cc==-1:
            unknown_idx.append(k)
        else:
            if c+'_'+str(cc) not in to_be_removed:
                feature_names_wo_unknown.append(c+'_'+str(cc))
                feature_names_for_explainer[i].append(cc)
            else:
                unknown_idx.append(k)
        k+=1
    #print(c, cats, k)
# for c in cont_cols:
#     feature_names.append(c)
#     feature_names_wo_unknown.append(c)
print(len(feature_names), len(feature_names_wo_unknown))

# Convert discrete columns to numerical values
npd_discrete = df_sel[discrete_cols].copy().to_numpy()
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(npd_discrete)
mat = np.asarray(enc.transform(npd_discrete).todense())
mat = np.delete(mat, unknown_idx, axis=1)

# Label encode discrete columns and store the result in mat
mat = df_sel[discrete_cols].apply(LabelEncoder().fit_transform)

# Replace the discrete columns in df_sel with mat
df_sel[discrete_cols] = mat

cols_key = [c for c in dfkey.columns if c not in discrete_cols]
dfkey_unos = dfkey_common[cols_key].merge(df_sel[['MRN_DEID', 'TransplantDate_DEID']+discrete_cols], on=['MRN_DEID', 'TransplantDate_DEID'], how='left')
dfkey_unos.fillna(0, inplace=True)



398 341


## dataset split - train, val, test

In [5]:
# Create train val test split for common MRN_DEID and save them for reproducibility
# Define the proportions for train, validation, and test sets
train_split = 0.7  # 70% for training
val_split = 0.15  # 15% for validation
test_split = 0.15   # 15% for testing

# Calculate the sizes of each split
total_samples = len(common_mrn_unique)
train_size = int(train_split * total_samples)
val_size = int(val_split * total_samples)
test_size = int(test_split * total_samples)

# Create indices for the train, validation, and test splits
np.random.seed(seed)
indices = np.arange(total_samples)
np.random.shuffle(indices)

train_indices = indices[:train_size]
val_indices = indices[train_size:(train_size + val_size)]
test_indices = indices[(train_size + val_size):(train_size + val_size + test_size)]

# Split the original data array based on the indices
mrn_train = common_mrn_unique[train_indices]
mrn_val = common_mrn_unique[val_indices]
mrn_test = common_mrn_unique[test_indices]

# np.savetxt('mrn_train.csv', mrn_train, delimiter=',', fmt='%.0f')
# np.savetxt('mrn_val.csv', mrn_val, delimiter=',', fmt='%.0f')
# np.savetxt('mrn_test.csv', mrn_test, delimiter=',', fmt='%.0f')

# mrn_train = np.loadtxt('mrn_train.csv', delimiter=',', dtype=np.int64)
# mrn_val = np.loadtxt('mrn_val.csv', delimiter=',', dtype=np.int64)
# mrn_test = np.loadtxt('mrn_test.csv', delimiter=',', dtype=np.int64)

In [6]:
mrn_train

array([47054367, 36072161, 37995004, ..., 10716356, 51748141, 84354044])

In [7]:
dfkey_train = dfkey_common.loc[dfkey_common.MRN_DEID.isin(mrn_train)]
dfkey_val = dfkey_common.loc[dfkey_common.MRN_DEID.isin(mrn_val)]
dfkey_test = dfkey_common.loc[dfkey_common.MRN_DEID.isin(mrn_test)]


In [8]:
# Read CPT data
df_cpt = pd.read_csv('./datasets/Procedure_before_transplant_2yrs_history_anonym.csv')
df_cpt['Transplant Date_DEID'] = pd.to_datetime(df_cpt['Transplant Date_DEID']).dt.date
df_cpt.rename(columns={ 'Transplant Date_DEID':'TransplantDate_DEID'}, inplace=True)
cols_cpt = df_cpt.columns[8:-3]

# Read ICD data
df_icd = pd.read_csv('./datasets/ICD9_ICD10_before_transplant_2yrs_history_anonym.csv')
cols_icd = df_icd.columns[8:]
cols_icd = [c for c in cols_icd if c[0] not in 'RSTUVWXYZ']
df_icd['Transplant Date_DEID'] = pd.to_datetime(df_icd['transplantDate_DEID']).dt.date
df_icd.rename(columns={'Transplant Date_DEID':'TransplantDate_DEID'}, inplace=True)



In [9]:
# merge with CPT
dfkey_unos_cpt = dfkey_unos.merge(df_cpt[['MRN_DEID', 'TransplantDate_DEID']+list(cols_cpt)], on=['MRN_DEID', 'TransplantDate_DEID'], how='left')

# merge with ICD - all tabular datasets merged
dfkey_unos_cpt_icd = dfkey_unos_cpt.merge(df_icd[['MRN_DEID', 'TransplantDate_DEID'] + list(cols_icd)], on=['MRN_DEID', 'TransplantDate_DEID'], how='left')

# fill na values to 0
dfkey_unos_cpt_icd[cols_cpt].fillna(0).to_numpy().astype('float32')
dfkey_unos_cpt_icd[cols_icd].fillna(0).to_numpy().astype('float32')


# Merging cpt and icd & selecting relevant columns, which are already selected
df1 = dfkey_unos_cpt_icd[cols_cpt]
df2 = dfkey_unos_cpt_icd[cols_icd]
df3 = dfkey_unos_cpt_icd[discrete_cols]
mrn = dfkey_unos_cpt_icd['MRN_DEID']
y = dfkey_unos_cpt_icd['CardiacFuture_confirmed']
dfkey_tab = pd.concat([df1, df2, df3, mrn, y], axis=1)
dfkey_tab = dfkey_tab.fillna(0)

#  Splitting tabular data into train/val/test
df_train_tab = dfkey_tab.loc[dfkey_tab.MRN_DEID.isin(mrn_train)]
df_val_tab = dfkey_tab.loc[dfkey_tab.MRN_DEID.isin(mrn_val)]
df_test_tab = dfkey_tab.loc[dfkey_tab.MRN_DEID.isin(mrn_test)]

# Splitting df_ecg into train/val/test
x_train_ecg = dfkey_ecg_common.loc[dfkey_ecg_common.MRN_DEID.isin(mrn_train)]
x_val_ecg = dfkey_ecg_common.loc[dfkey_ecg_common.MRN_DEID.isin(mrn_val)]
x_test_ecg = dfkey_ecg_common.loc[dfkey_ecg_common.MRN_DEID.isin(mrn_test)]

x_train_tab = df_train_tab.drop(columns=['MRN_DEID', 'CardiacFuture_confirmed'])
x_val_tab = df_val_tab.drop(columns=['MRN_DEID', 'CardiacFuture_confirmed'])
x_test_tab = df_test_tab.drop(columns=['MRN_DEID', 'CardiacFuture_confirmed'])

y_train_tab = df_train_tab['CardiacFuture_confirmed'].values
y_val_tab = df_val_tab['CardiacFuture_confirmed'].values
y_test_tab = df_test_tab['CardiacFuture_confirmed'].values


In [10]:
# Test distribution of positive samples which is same in all the splits
y_train_tab.sum()/len(y_train_tab), y_val_tab.sum()/len(y_val_tab), y_test_tab.sum()/len(y_test_tab)

(0.04168893639764832, 0.04488778054862843, 0.04738154613466334)

In [11]:
x_train_tab.shape, x_val_tab.shape, x_test_tab.shape, y_train_tab.shape, y_val_tab.shape, y_test_tab.shape

((1871, 452), (401, 452), (401, 452), (1871,), (401,), (401,))

In [12]:
x_train_ecg.shape, x_val_ecg.shape, x_test_ecg.shape

((1871, 147), (401, 147), (401, 147))

In [13]:
# checking no data leakage: same patients in tabular and ecg data across the splits
# Convert patient IDs in each split to sets for easy comparison
train_ids_tab = set(df_train_tab['MRN_DEID'])
val_ids_tab = set(df_val_tab['MRN_DEID'])
test_ids_tab = set(df_test_tab['MRN_DEID'])

train_ids_ecg = set(x_train_ecg['MRN_DEID'])
val_ids_ecg = set(x_val_ecg['MRN_DEID'])
test_ids_ecg = set(x_test_ecg['MRN_DEID'])

# Compare patient IDs in each split
train_match = train_ids_tab == train_ids_ecg
val_match = val_ids_tab == val_ids_ecg
test_match = test_ids_tab == test_ids_ecg

# Print the results
print(f'Train IDs match: {train_match}')
print(f'Validation IDs match: {val_match}')
print(f'Test IDs match: {test_match}')


Train IDs match: True
Validation IDs match: True
Test IDs match: True


In [14]:
type(x_train_tab)

pandas.core.frame.DataFrame

In [15]:
df_train_tab.to_csv('./processed-datasets/df_train_tab.csv')
df_val_tab.to_csv('./processed-datasets/df_val_tab.csv')
df_test_tab.to_csv('./processed-datasets/df_test_tab.csv')
x_train_ecg.to_csv('./processed-datasets/x_train_ecg.csv')
x_val_ecg.to_csv('./processed-datasets/x_val_ecg.csv')
x_test_ecg.to_csv('./processed-datasets/x_test_ecg.csv')