<a href="https://colab.research.google.com/github/victormurcia/VCHAMPS/blob/main/VCHAMPS_Model_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, I'll show how to prepare the model for training. I may also run the training here.

In [None]:
#General utilities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm  # Import tqdm for the progress bar
import math
import glob,shutil,os,warnings,math,time,sys,re
from typing import List
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

#For UUID generation
import uuid

#For Slider viz
import ipywidgets as widgets
from IPython.display import display, clear_output,HTML

#Enable data to be extracted and downloaded from my Google Drive
from google.colab import drive, files
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Google Drive that holds the training parquet files
directory_path = r'/content/drive/MyDrive/VCHAMPS - Final Train Data'

# Work from inside that folder for the rest of the notebook
os.chdir(directory_path)

# Read the working directory back and echo it as a sanity check
cwd = os.getcwd()
print("Current working directory: {}".format(cwd))

Current working directory: /content/drive/MyDrive/VCHAMPS - Final Train Data


In [None]:
# List every parquet file in the training-data folder.
# NOTE: glob order depends on the file system, so do not rely on positions in
# this list other than the two trailing entries appended below.
parquet_files = glob.glob(os.path.join(directory_path, '*.parquet'))

# Work with bare file names while rearranging
file_names = [os.path.basename(file_path) for file_path in parquet_files]

# These two files are moved to the end: demographics static next to last,
# inpatient admissions last
trailing_files = ['demographics_static.parquet', 'inpatient_admissions.parquet']
rearranged_list = [fn for fn in file_names if fn not in trailing_files]
rearranged_list.extend(trailing_files)

# Add file paths back, reusing directory_path instead of repeating the literal folder path
rearranged_file_paths = [os.path.join(directory_path, fn) for fn in rearranged_list]

rearranged_file_paths

['/content/drive/MyDrive/VCHAMPS - Final Train Data/conditions.parquet',
 '/content/drive/MyDrive/VCHAMPS - Final Train Data/demographics_event.parquet',
 '/content/drive/MyDrive/VCHAMPS - Final Train Data/death.parquet',
 '/content/drive/MyDrive/VCHAMPS - Final Train Data/ed_visits.parquet',
 '/content/drive/MyDrive/VCHAMPS - Final Train Data/immunizations.parquet',
 '/content/drive/MyDrive/VCHAMPS - Final Train Data/inpatient_location.parquet',
 '/content/drive/MyDrive/VCHAMPS - Final Train Data/medications_administered.parquet',
 '/content/drive/MyDrive/VCHAMPS - Final Train Data/outpatient_visits.parquet',
 '/content/drive/MyDrive/VCHAMPS - Final Train Data/lab_results.parquet',
 '/content/drive/MyDrive/VCHAMPS - Final Train Data/measurements.parquet',
 '/content/drive/MyDrive/VCHAMPS - Final Train Data/measurements_bp.parquet',
 '/content/drive/MyDrive/VCHAMPS - Final Train Data/inpatient_specialty.parquet',
 '/content/drive/MyDrive/VCHAMPS - Final Train Data/procedures.parquet',


# Combining Encounters
The first thing I'll try is to combine the encounter dataframes, since each row in them represents a hospitalization. I'll optimize the datatypes and remove any unnecessary features.

In [None]:
# Resolve the three encounter tables by file name. Picking them by position from
# a filtered directory listing would mislabel the tables if the listing order
# (which is file-system dependent) ever changed.
encounter_files = [
    os.path.join(directory_path, file_name)
    for file_name in ('ed_visits.parquet', 'outpatient_visits.parquet', 'inpatient_admissions.parquet')
]

# Load each Parquet file into separate Dask DataFrames
ed_visits = dd.read_parquet(encounter_files[0])
outpatient_visits = dd.read_parquet(encounter_files[1])
inpatient_admissions = dd.read_parquet(encounter_files[2])

dataframes = [ed_visits, outpatient_visits, inpatient_admissions]

# Cast every categorical column to plain object dtype before concatenating
# (presumably so columns with different category sets stack cleanly -- TODO confirm)
for df in dataframes:
    categorical_columns = [col for col in df.columns if df[col].dtype.name == 'category']
    for col in categorical_columns:
        df[col] = df[col].astype('object')

# Stack the three encounter tables vertically, drop datetime columns,
# then materialize the result as an in-memory pandas DataFrame
concatenated_df = dd.concat([ed_visits, outpatient_visits, inpatient_admissions])
datetime_columns = concatenated_df.select_dtypes(include='datetime').columns
concatenated_df = concatenated_df.drop(columns=datetime_columns)
concatenated_df = concatenated_df.compute()

# Float columns become int16, with -1 standing in for missing values
float_columns = concatenated_df.select_dtypes(include='float').columns
for col in float_columns:
    concatenated_df[col] = concatenated_df[col].fillna(-1).astype('int16')

# Any remaining NaNs (non-float columns) are also replaced by -1
concatenated_df = concatenated_df.fillna(-1)

# Yes/No flags are encoded as 1/0; rows already filled with -1 keep that sentinel
mapping = {'Yes': 1, 'No': 0}
flag_columns = [
    'Agentorangeflag',
    'Combatflag',
    'Ionizingradiationflag',
    'Serviceconnectedflag',
    'Swasiaconditionsflag',
]
for col in flag_columns:
    concatenated_df[col] = concatenated_df[col].replace(mapping, regex=True).astype('int8')

# Columns to be converted to int8
# NOTE(review): int8 only holds -128..127 -- confirm that count-like columns
# such as pre_hosp_any can never exceed that range.
cols_to_convert = [
    'pre_hosp_any',
    'pre_hosp_cv',
    'readmit_allcause_30d',
    'readmit_allcause_90d',
    'readmit_allcause_180d',
    'readmit_allcause_365d',
    'readmit_CV_30d',
    'readmit_CV_90d',
    'readmit_CV_180d',
    'readmit_CV_365d',
    'mortality_inhosp_allcause',
    'Outpatientreferralflag',
    'cc Status_CC',
    'cc Status_MCC',
    'cc Status_NCC',
    'Discharge disposition_Death with autopsy',
    'Discharge disposition_Death without autopsy',
    'Discharge disposition_Irregular',
    'Discharge disposition_NBC or while ASIH',
    'Discharge disposition_Regular',
    'Discharge disposition_Transfer',
    'rehosp_allcause',
    'Age at admission',
    'Age at visit',
    'Age at ed visit',
    'Died during ed visit',
    'CV diagnosis'
]

# Convert one column at a time so only a single column is copied at once
for col in cols_to_convert:
    concatenated_df[col] = concatenated_df[col].astype('int8')

# 'cc Status' is dropped; its one-hot 'cc Status_*' columns are kept above
concatenated_df = concatenated_df.drop(columns=['cc Status'])

# In these text columns the -1 missing-value sentinel is replaced by the string 'NA'
cols_to_replace = ['code', 'Stop code', 'Discharging unit service', 'Admitting specialty', 'Discharging specialty','diagnosis','Admitting unit service']

for col in cols_to_replace:
    concatenated_df[col] = concatenated_df[col].replace(-1, 'NA')

concatenated_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,pre_hosp_any,pre_hosp_cv,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435812,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,15,0,1,1,1,1,0,0,0,0
910894,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,16,0,1,1,1,1,0,0,0,0
910895,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,17,0,1,1,1,1,0,0,0,0
432541,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,0,0,-1,-1,-1,-1,0,0,0,0


In [None]:
# Inspect the per-column dtypes of the combined encounters frame
concatenated_df.dtypes

Internalpatientid                               int32
Age at ed visit                                  int8
Died during ed visit                             int8
CV diagnosis                                     int8
code                                           object
Encounter ID                                   object
Age at visit                                     int8
Stop code                                      object
Agentorangeflag                                  int8
Combatflag                                       int8
Ionizingradiationflag                            int8
Serviceconnectedflag                             int8
Swasiaconditionsflag                             int8
diagnosis                                      object
Age at admission                                 int8
Admitting unit service                         object
Discharging unit service                       object
Admitting specialty                            object
Discharging specialty       

In [None]:
# Persist the combined encounters frame as a parquet file on the mounted Google Drive
concatenated_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/encounters.parquet')

# Merging conditions
Now that I have the encounters dataframe, I'll start merging it with other dataframes. Here I'll clean the conditions dataframe and then merge it with the encounter df and start building the model df.

In [None]:
# Load the conditions table by file name; indexing the glob-ordered file list
# by position would load a different table if the directory order changed.
conditions = pd.read_parquet(os.path.join(directory_path, 'conditions.parquet'))

# Drop datetime columns and the 'cc Status' column
datetime_columns = conditions.select_dtypes(include='datetime').columns
conditions = conditions.drop(columns=[*datetime_columns, 'cc Status'])

# Convert boolean columns to int8
bool_columns = conditions.select_dtypes(include='bool').columns
for col in bool_columns:
    conditions[col] = conditions[col].fillna(-1).astype('int8')

# Rename column
conditions = conditions.rename(columns={'Diagnosis sequence number or rank': 'DSNR'})

# One hot encode the Diagnosis Sequence or Rank column.
# dtype is pinned to uint8 because newer pandas defaults to bool dummies, which
# would not become float64 after a left merge and so would be skipped by a
# float64 -> int8 clean-up step.
one_hot = pd.get_dummies(conditions['DSNR'], prefix='DSNR', dtype='uint8')
conditions = pd.concat([conditions, one_hot], axis=1)

# Drop the DSNR column now that it is one-hot encoded
conditions = conditions.drop(columns=['DSNR'])
conditions

Unnamed: 0,Internalpatientid,Age at condition documentation,Diagnosis,Problem,code,Encounter ID,DSNR_1,DSNR_10,DSNR_11,DSNR_12,...,DSNR_25,DSNR_3,DSNR_4,DSNR_5,DSNR_6,DSNR_7,DSNR_8,DSNR_9,DSNR_P,DSNR_S
0,1,58,1,0,M159,36c34b45-5879-4cc7-be9b-d733061e30a3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,58,1,0,M199,333ca061-57d1-4fb1-ab99-f142ead909a1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,58,1,0,I10,63a9f5d1-c77c-452c-9b7f-80e86f19d423,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,58,1,0,E782,3e246aaa-2d6d-4d78-a489-db9ef2373d65,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,59,1,0,E782,30ff0057-8fab-4c81-ac1b-034f8c8eedc1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9299995,118516,60,1,0,Z0389,3cf86993-a81b-57c3-964d-0857e5acd137,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9299996,118516,60,1,0,Z0389,ae641a0f-f803-5d7c-aade-62e93064e42b,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9299997,118516,61,1,0,Z0389,8e1c58fe-941f-521f-913e-5930bc72788f,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9299998,118516,62,1,0,E660,802eefc5-514a-54bc-8fbf-ad5b9236e637,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Inspect the per-column dtypes of the cleaned conditions frame
conditions.dtypes

Internalpatientid                  int32
Age at condition documentation      int8
Diagnosis                           int8
Problem                             int8
code                              object
Encounter ID                      object
DSNR_1                             uint8
DSNR_10                            uint8
DSNR_11                            uint8
DSNR_12                            uint8
DSNR_13                            uint8
DSNR_14                            uint8
DSNR_15                            uint8
DSNR_16                            uint8
DSNR_17                            uint8
DSNR_18                            uint8
DSNR_19                            uint8
DSNR_2                             uint8
DSNR_20                            uint8
DSNR_21                            uint8
DSNR_22                            uint8
DSNR_23                            uint8
DSNR_24                            uint8
DSNR_25                            uint8
DSNR_3          

In [None]:
# Left-join the conditions onto the encounters: every encounter row is kept,
# and rows without a matching condition get NaN in the condition columns
condition_merge_keys = ['Internalpatientid', 'Encounter ID', 'code']
model_df = pd.merge(concatenated_df, conditions, how='left', on=condition_merge_keys)
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,DSNR_25,DSNR_3,DSNR_4,DSNR_5,DSNR_6,DSNR_7,DSNR_8,DSNR_9,DSNR_P,DSNR_S
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,,,,,,,,,,
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,,,,,,,,,,


In [None]:
#Modify float columns
float_columns = model_df.select_dtypes(include='float64').columns
for col in float_columns:
  model_df[col] = model_df[col].fillna(-1).astype('int8')

In [None]:
# Cap the number of displayed rows at 10 so the dtype listing is truncated
pd.set_option('display.max_rows', 10)
model_df.dtypes

Internalpatientid        int32
Age at ed visit           int8
Died during ed visit      int8
CV diagnosis              int8
code                    object
                         ...  
DSNR_7                    int8
DSNR_8                    int8
DSNR_9                    int8
DSNR_P                    int8
DSNR_S                    int8
Length: 74, dtype: object

In [None]:
# Display model_df again to check the result of the float -> int8 conversion
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,DSNR_25,DSNR_3,DSNR_4,DSNR_5,DSNR_6,DSNR_7,DSNR_8,DSNR_9,DSNR_P,DSNR_S
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0,0,0,0,0,0,0,0,1,0
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0,0,0,0,0,0,0,0,0,1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0,0,0,0,0,0,0,0,1,0
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0,0,0,0,0,0,0,0,0,1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [None]:
# Checkpoint the current model_df to parquet on the mounted Google Drive
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

# Merging to measurements_bp
Now I'll merge with the measurements bp dataframe. This dataframe has multiple measurements per encounter. However, the only features we need from this df are the map_avg and map_low variables we created. Therefore, I'll group this df by Encounter ID and then proceed to merge with the model df.

In [None]:
# Load the blood-pressure measurements by file name; indexing the glob-ordered
# file list by position would load a different table if the directory order changed.
measurements_bp = pd.read_parquet(os.path.join(directory_path, 'measurements_bp.parquet'))

# Drop datetime columns together with the raw readings and the age column
datetime_columns = measurements_bp.select_dtypes(include='datetime').columns
measurements_bp = measurements_bp.drop(
    columns=[*datetime_columns, 'Diastolic bp', 'Systolic bp', 'Age at measurement bp']
)
measurements_bp

Unnamed: 0,Internalpatientid,Encounter ID,map,map_avg,map_low
0,83189,7f635061-7d06-532d-b462-3bc8a46e67b6,92.666667,81.884354,65.333333
1,83189,7f635061-7d06-532d-b462-3bc8a46e67b6,87.333333,81.884354,65.333333
2,83189,7f635061-7d06-532d-b462-3bc8a46e67b6,81.333333,81.884354,65.333333
3,83189,7f635061-7d06-532d-b462-3bc8a46e67b6,77.333333,81.884354,65.333333
4,83189,7f635061-7d06-532d-b462-3bc8a46e67b6,78.333333,81.884354,65.333333
...,...,...,...,...,...
9999995,73313,de85b144-0bbd-4423-85b3-08757a5e79e8,86.000000,86.000000,86.000000
9999996,144793,d761ab81-f7f9-4aad-ae62-db7e65da9626,83.000000,83.000000,83.000000
9999997,62421,ebcb336a-f816-4e01-b856-cdbeeee41cef,100.666667,100.666667,100.666667
9999998,141195,91d4f847-f532-4964-a229-5afc9fccd4f1,94.000000,94.000000,94.000000


I'll group this df by Encounter ID and drop `map`, since `map_avg` already carries that information and we also have `map_low`.

In [None]:
# Collapse to one row per (patient, encounter), keeping only map_avg and map_low;
# any other value column (e.g. map) is left out of the aggregation
bp_group_keys = ['Internalpatientid', 'Encounter ID']
bp_aggregations = {'map_avg': 'first', 'map_low': 'first'}
measurements_bp = (
    measurements_bp
    .groupby(bp_group_keys)
    .agg(bp_aggregations)
    .reset_index()
)
measurements_bp

Unnamed: 0,Internalpatientid,Encounter ID,map_avg,map_low
0,1,00e485bd-8982-5611-b3fb-fe574552fbed,110.333333,110.333333
1,1,047272c1-378c-502f-958f-ea11da51fab1,109.000000,109.000000
2,1,0ce62d49-ea64-5941-b70c-89b4a1c4cc8a,103.666667,103.666667
3,1,1523ee84-7595-585e-9caf-cbd954cc66b1,106.666667,106.666667
4,1,1bc600f4-fe18-532d-a7e5-0bb5e1de5232,96.666667,96.666667
...,...,...,...,...
5755866,169064,f97b45e1-15de-4094-ba56-8f106d876cda,100.000000,100.000000
5755867,169064,fc797eae-a5f3-4afc-825c-4e7beb359ee9,94.666667,94.666667
5755868,169064,fda04ee4-1480-4294-947d-e27315314c48,96.666667,96.666667
5755869,169064,fe4184b7-1167-4b84-af81-4f0bf703980a,71.666667,71.666667


In [None]:
# Left-join the per-encounter blood-pressure summary; encounters without
# a blood-pressure record get NaN in map_avg / map_low
bp_merge_keys = ['Internalpatientid', 'Encounter ID']
model_df = pd.merge(model_df, measurements_bp, how='left', on=bp_merge_keys)
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,DSNR_4,DSNR_5,DSNR_6,DSNR_7,DSNR_8,DSNR_9,DSNR_P,DSNR_S,map_avg,map_low
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0,0,0,0,0,0,1,0,,
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0,0,0,0,0,0,0,1,,
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0,0,0,0,0,0,1,0,76.333333,76.333333
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0,0,0,0,0,0,0,1,76.333333,76.333333
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,0,0,0,0,0,0,1,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,75.333333,75.333333
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,102.000000,102.000000
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,102.000000,102.000000
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,79.000000,65.000000


In [None]:
# Checkpoint model_df (now including the blood-pressure columns) to parquet
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

# Merging to Measurements
Similar to measurements bp, this dataframe contains multiple entries per encounter. However, I've already constructed the variables of interest, so I just need to group the dataframe by Encounter ID and then merge those variables in.

In [None]:
# Load the measurements table by file name; indexing the glob-ordered file list
# by position would load a different table if the directory order changed.
measurements = pd.read_parquet(os.path.join(directory_path, 'measurements.parquet'))

# Drop datetime columns together with the raw measurement columns and the age column
datetime_columns = measurements.select_dtypes(include='datetime').columns
measurements = measurements.drop(
    columns=[*datetime_columns, 'Result numeric', 'Measurement', 'Age at measurement']
)
measurements

Unnamed: 0,Internalpatientid,Encounter ID,heart_wt_admit,heart_wt_dc,heart_hr_high,heart_hr_avg,id_temp_high,id_temp_avg,id_temp_up,resp_rr_avg,resp_rr_high,discharge_hr,discharge_blood_pressure,BMI
0,1,0ca96b3d-9cca-461d-ba94-b1e8fe8ecfff,,,,,95.804066,,0.0,,,,,
1,9713,2395b56a-b7db-5a4b-869f-5ae3bcc11ee6,,,88.0,83.750000,101.642561,83.750000,1.0,18.500000,21.0,76.0,,
2,97124,a06f4802-c63a-562f-9160-4a65241f4d7e,241.440850,241.440850,66.0,66.000000,96.582804,66.000000,0.0,,,66.0,,
3,107718,e158e91c-6384-5b83-8178-dceb8893a91c,253.359525,254.649458,68.0,63.500000,95.979629,63.500000,0.0,17.000000,18.0,62.0,,
4,96334,8f78627d-d7da-5da5-bdf8-63bdc3d9e132,126.000000,119.626033,93.0,80.800000,99.667202,80.800000,0.0,16.750000,18.0,83.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27199995,56828,22babbdd-2398-5d74-80b9-d93f846b018f,188.290813,216.466102,149.0,73.090909,103.000000,73.090909,1.0,17.509804,23.0,58.0,,21.756812
27199996,128003,6d130e56-fc0b-5f5e-8897-0dc115483bb6,222.310952,241.781791,139.0,101.516129,101.409013,101.516129,1.0,21.592593,35.0,125.0,,
27199997,110343,66ac08ad-edd3-4e67-b875-efee1b188297,,,,,,,,,,,,
27199998,47388,7abb124e-78f9-5eab-8477-8b9386928933,157.357797,158.690935,80.0,66.750000,101.122829,66.750000,1.0,18.750000,21.0,70.0,,


In [None]:
# Collapse to one row per (patient, encounter). GroupBy.first() takes, for each
# column, the first non-null value found within the group.
measurement_group_keys = ['Internalpatientid', 'Encounter ID']
measurements = (
    measurements
    .groupby(measurement_group_keys)
    .first()
    .reset_index()
)
measurements

Unnamed: 0,Internalpatientid,Encounter ID,heart_wt_admit,heart_wt_dc,heart_hr_high,heart_hr_avg,id_temp_high,id_temp_avg,id_temp_up,resp_rr_avg,resp_rr_high,discharge_hr,discharge_blood_pressure,BMI
0,1,00143959-6e5e-4464-833e-44413fba817d,236.021404,236.021404,,,,,,,,,,35.883012
1,1,00238998-8913-4b06-8380-4d5ceed3f252,,,,,,,,18.0,18.0,,,
2,1,00e485bd-8982-5611-b3fb-fe574552fbed,,,73.0,73.0,98.102793,73.0,0.0,,,73.0,,
3,1,010792c4-1bac-42c7-b457-7f26ef5e99d8,,,,,,,,,,,,
4,1,0153602b-c7a6-4670-af53-38bb2c74d92a,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12694188,169064,a272b7a0-6764-4ec5-a0fa-a1ff3dccecbd,,,,,97.074750,,0.0,,,,,
12694189,169064,a446d553-25bd-436b-b474-e2b9ae5ed774,,,,,,,,17.0,17.0,,,
12694190,169064,b19f1a52-0e2a-4fef-8bca-62996c693a63,,,59.0,59.0,,59.0,,,,59.0,,
12694191,169064,c43271c3-2e7f-406b-8fca-513d444290e2,,,,,,,,,,,,


In [None]:
# Left-join the per-encounter measurement summary; encounters without
# measurements get NaN in the added columns
measurement_merge_keys = ['Internalpatientid', 'Encounter ID']
model_df = pd.merge(model_df, measurements, how='left', on=measurement_merge_keys)
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,heart_hr_high,heart_hr_avg,id_temp_high,id_temp_avg,id_temp_up,resp_rr_avg,resp_rr_high,discharge_hr,discharge_blood_pressure,BMI
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,102.0,82.285714,96.489682,82.285714,0.0,18.428571,27.0,102.0,,
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,102.0,82.285714,96.489682,82.285714,0.0,18.428571,27.0,102.0,,
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,,,,,,,,,,
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,86.0,69.000000,102.467201,69.000000,1.0,17.800000,22.0,62.0,,


In [None]:
# Cap the number of displayed rows at 10 so the dtype listing is truncated
pd.set_option('display.max_rows', 10)
model_df.dtypes

Internalpatientid             int32
Age at ed visit                int8
Died during ed visit           int8
CV diagnosis                   int8
code                         object
                             ...   
resp_rr_avg                 float64
resp_rr_high                float64
discharge_hr                float64
discharge_blood_pressure    float64
BMI                         float64
Length: 88, dtype: object

In [None]:
# Checkpoint model_df (now including the measurement columns) to parquet
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

In [None]:
# Count the distinct patient ids present in model_df
model_df['Internalpatientid'].nunique()

105427

# Merge demographic info

In [None]:
# Load the demographics-event table by file name; indexing the glob-ordered
# file list by position would load a different table if the directory order changed.
demographics_event = pd.read_parquet(os.path.join(directory_path, 'demographics_event.parquet'))

# Drop datetime columns and the age column
datetime_columns = demographics_event.select_dtypes(include='datetime').columns
demographics_event = demographics_event.drop(columns=[*datetime_columns, 'Age at update'])

# Convert boolean columns to int8
bool_columns = demographics_event.select_dtypes(include='bool').columns
for col in bool_columns:
    demographics_event[col] = demographics_event[col].fillna(-1).astype('int8')

# Give the generic 'Unknown' column a more specific name
demographics_event = demographics_event.rename(columns={'Unknown': 'MarriedStatus_Unknown'})
demographics_event

Unnamed: 0,Internalpatientid,Divorced,Married,Never married,Separated,Single,MarriedStatus_Unknown,Widowed,Highly rural,Rural,Urban,Encounter ID
0,100028,0,1,0,0,0,0,0,0,0,1,e0f58601-833e-5982-b624-175beb0cfc77
1,100032,0,1,0,0,0,0,0,0,1,0,26960966-c254-5cdd-a22b-e4305f53dba4
2,100046,0,1,0,0,0,0,0,0,0,1,0c514ba1-9295-4acd-b397-a07f3b76fb37
3,100071,0,0,0,0,0,0,1,0,0,1,dd9f49d1-856a-4659-975a-6cb78622be7c
4,100091,0,0,1,0,0,0,0,0,0,1,b747ba8a-8696-44a2-8f35-57a046e551d2
...,...,...,...,...,...,...,...,...,...,...,...,...
133247,99898,0,1,0,0,0,0,0,0,0,1,3281f665-f906-42f4-a83a-272898de4763
133248,9995,0,1,0,0,0,0,0,0,1,0,302180f8-67b4-420e-a7cb-30efd7065455
133249,99950,0,1,0,0,0,0,0,0,0,1,3959fd06-b3f6-4982-9ec6-bd594de84193
133250,9998,1,0,0,0,0,0,0,0,0,1,8304a1cb-b7c6-4d7f-880d-4e671e386c84


In [None]:
# Left-join the demographic flags; encounters with no matching
# demographics row get NaN in the added columns
demographics_merge_keys = ['Internalpatientid', 'Encounter ID']
model_df = pd.merge(model_df, demographics_event, how='left', on=demographics_merge_keys)
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,Divorced,Married,Never married,Separated,Single,MarriedStatus_Unknown,Widowed,Highly rural,Rural,Urban
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,,,,,,,,,,
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,,,,,,,,,,
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,,,,,,,,,,
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,,,,,,,,,,


I'm going to clear some variables before moving forward to free up a bit of RAM

In [None]:
# List of variables to delete
variables_to_delete = ['measurements_bp', 'demographics_event', 'conditions', 'concatenated_df', 'grouped_df',
                       'ed_visits', 'outpatient_visits', 'inpatient_admissions', 'dataframes', 'df',
                       'categorical_columns', 'col', 'concatenated_df', 'datetime_columns', 'float_columns',
                       'mapping', 'cols_to_convert', 'cols_to_replace']

# Delete the specified variables
for var_name in variables_to_delete:
    if var_name in globals():
        del globals()[var_name]

In [None]:
#Convert columns to int8 and fill NaNs with -1
columns_to_convert = ['Divorced', 'Married', 'Never married', 'Separated', 'Single',
                      'MarriedStatus_Unknown', 'Widowed', 'Highly rural', 'Rural', 'Urban']

model_df[columns_to_convert] = model_df[columns_to_convert].fillna(-1).astype('int8')

In [None]:
# Display model_df to check the demographic columns after the int8 conversion
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,Divorced,Married,Never married,Separated,Single,MarriedStatus_Unknown,Widowed,Highly rural,Rural,Urban
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [None]:
# Checkpoint model_df (now including the demographic columns) to parquet
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

In [None]:
# Cap the number of displayed rows at 10 so the dtype listing is truncated
pd.set_option('display.max_rows', 10)
model_df.dtypes

Internalpatientid         int32
Age at ed visit            int8
Died during ed visit       int8
CV diagnosis               int8
code                     object
                          ...  
MarriedStatus_Unknown      int8
Widowed                    int8
Highly rural               int8
Rural                      int8
Urban                      int8
Length: 98, dtype: object

# Merging inpatient specialty

In [None]:
# Load the inpatient-specialty table by file name; indexing the glob-ordered
# file list by position would load a different table if the directory order changed.
inpatient_specialty = pd.read_parquet(os.path.join(directory_path, 'inpatient_specialty.parquet'))

# Drop datetime columns together with the age, raw specialty and counts columns
datetime_columns = inpatient_specialty.select_dtypes(include='datetime').columns
inpatient_specialty = inpatient_specialty.drop(
    columns=[*datetime_columns, 'Age at specialty', 'Specialty', 'counts']
)
inpatient_specialty

Unnamed: 0,Internalpatientid,Encounter ID,genMed,hospice,homeCare,homelessRecovery,rehab,snf,psych,obs,...,loc_rehab,loc_snf,loc_psych,loc_obs,loc_drug,loc_stepdown,loc_icu,loc_other,ICU_days,stepdown_days
0,1,e8f395d3-c8d4-5cf4-a686-34352c9e47c3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,,
1,100095,98f745fb-5f07-4f4c-86f4-36979464dca3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,,
2,10064,437a3995-2112-5b1c-865b-6ba64c5bd0a1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,,
3,100751,63d9b053-ec7a-5c93-bc23-f52d522208c9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,,
4,101245,33037e3c-db21-57dc-89b0-fc1e09ebd5fc,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40073402,99689,99375e10-7fdc-5328-b8cb-3f4fb64796fe,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,,
40073403,99689,99375e10-7fdc-5328-b8cb-3f4fb64796fe,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,,
40073404,58306,56ec12dd-131a-4e22-a0b1-d6026f6ca121,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,,
40073405,58306,b24abd9c-f525-442b-92eb-b712b2c626a8,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,,


In [None]:
# Collapse to one row per (patient, encounter). GroupBy.first() keeps, for every column,
# the first non-null value found within the group, so the kept row can mix values from several source rows.
inpatient_specialty = inpatient_specialty.groupby(['Internalpatientid', 'Encounter ID']).first().reset_index()
inpatient_specialty

Unnamed: 0,Internalpatientid,Encounter ID,genMed,hospice,homeCare,homelessRecovery,rehab,snf,psych,obs,...,loc_rehab,loc_snf,loc_psych,loc_obs,loc_drug,loc_stepdown,loc_icu,loc_other,ICU_days,stepdown_days
0,1,0ce62d49-ea64-5941-b70c-89b4a1c4cc8a,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,
1,1,4ab4ae77-f40e-4ade-b4c1-c1574dc2041f,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,
2,1,52d5e3bc-aced-53a4-b8a4-4a458e55601f,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,,
3,1,aee47e87-cab8-5ca7-9947-21cb2daf476b,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,,
4,1,d164c341-63a3-5729-9bf1-a6b54bae4a74,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856199,169062,caba65f7-390e-40b2-8857-5a2ad2fc072a,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,,
856200,169062,da784870-55cd-51f3-84f4-9686c00291f9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,,
856201,169062,e6460fae-0e3b-5fda-a016-f36cdd654819,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,,
856202,169062,f6956488-7e4c-5967-a95f-208aae167c25,0,0,0,0,0,0,0,1,...,0,0,0,1,0,1,0,0,,0


In [None]:
# Reload the checkpointed model frame from Drive before merging the specialty features.
model_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,Divorced,Married,Never married,Separated,Single,MarriedStatus_Unknown,Widowed,Highly rural,Rural,Urban
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [None]:
# Left join: every model_df row is kept; rows whose (patient, encounter) pair has no
# specialty record get NaN in the new columns (which also turns the integer flags into floats).
model_df = model_df.merge(inpatient_specialty,on=['Internalpatientid', 'Encounter ID'], how='left')
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,loc_rehab,loc_snf,loc_psych,loc_obs,loc_drug,loc_stepdown,loc_icu,loc_other,ICU_days,stepdown_days
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,,0
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,,0
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,,


In [None]:
# Cap the rendered output at 10 rows.
pd.set_option('display.max_rows', 10)
# Uncomment to inspect the dtypes of the last 26 columns (presumably the specialty columns just merged in):
#model_df.dtypes[-26:]

In [None]:
# Base specialty flags; each one also has a 'loc_'-prefixed counterpart in the merged frame.
specialty_flags = ['genMed', 'hospice', 'homeCare', 'homelessRecovery', 'rehab', 'snf',
                   'psych', 'obs', 'drug', 'stepdown', 'icu', 'other']

# Full list of flag columns to down-cast: the base flags followed by their 'loc_' variants.
columns_to_convert = specialty_flags + ['loc_' + flag for flag in specialty_flags]

# The left merge left NaN where an encounter had no specialty record:
# use -1 as the "no record" sentinel and store the flags compactly as int8.
model_df[columns_to_convert] = model_df[columns_to_convert].fillna(-1).astype('int8')

In [None]:
columns_to_convert = ['ICU_days', 'stepdown_days']
# Fill NaNs with -1 only. Unlike the flag columns, these day counts are NOT cast to int8;
# they keep whatever dtype they already have.
model_df[columns_to_convert] = model_df[columns_to_convert].fillna(-1)

In [None]:
# Preview model_df after filling the specialty columns.
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,loc_rehab,loc_snf,loc_psych,loc_obs,loc_drug,loc_stepdown,loc_icu,loc_other,ICU_days,stepdown_days
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0,0,0,0,0,0,0,1,-1,-1
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0,0,0,0,0,0,0,1,-1,-1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,0,0,0,0,0,0,0,1,-1,-1
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0,0,0,1,0,1,0,0,-1,0
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0,0,0,1,0,1,0,0,-1,0
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,1,1,0,0,0,0,0,1,-1,-1


In [None]:
# Checkpoint: overwrite the saved model frame so the next section can reload it from this path.
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

# Merging Inpatient Location
Open question: is there anything else we want to do with this table? For now, I one-hot encode its `Service` column.

In [None]:
# Reload the checkpointed model frame from Drive before merging the inpatient-location features.
model_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,loc_rehab,loc_snf,loc_psych,loc_obs,loc_drug,loc_stepdown,loc_icu,loc_other,ICU_days,stepdown_days
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0,0,0,0,0,0,0,1,-1,-1
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0,0,0,0,0,0,0,1,-1,-1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,0,0,0,0,0,0,0,1,-1,-1
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0,0,0,1,0,1,0,0,-1,0
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0,0,0,1,0,1,0,0,-1,0
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,1,1,0,0,0,0,0,1,-1,-1


In [None]:
# Load the inpatient-location table.
# NOTE(review): index 5 depends on the order of rearranged_file_paths, which is built from
# glob output — confirm it really points at the inpatient-location parquet file.
inpatient_location = pd.read_parquet(rearranged_file_paths[5])

# Remove datetime columns, give 'Service' a table-specific name, and drop the age column.
datetime_columns = inpatient_location.select_dtypes(include='datetime').columns
inpatient_location = (
    inpatient_location
    .drop(columns=datetime_columns)
    .rename(columns={'Service': 'inpatient_location_service'})
    .drop(columns=['Age at location'])
)

# One-hot encode the service column and swap the original column for its dummies.
one_hot = pd.get_dummies(inpatient_location['inpatient_location_service'], prefix='inpatient_location_service')
inpatient_location = pd.concat(
    [inpatient_location.drop(columns=['inpatient_location_service']), one_hot],
    axis=1,
)
inpatient_location

Unnamed: 0,Internalpatientid,Died at location,Encounter ID,inpatient_location_service_BLIND REHAB,inpatient_location_service_DOMICILIARY,inpatient_location_service_INTERMEDIATE MED,inpatient_location_service_MEDICINE,inpatient_location_service_NEUROLOGY,inpatient_location_service_NHCU,inpatient_location_service_NON-COUNT,inpatient_location_service_PSYCHIATRY,inpatient_location_service_REHAB MEDICINE,inpatient_location_service_SPINAL CORD INJURY,inpatient_location_service_SURGERY
0,100005,0,6acbda18-63a1-5086-a5f2-64bbb198632e,0,0,0,1,0,0,0,0,0,0,0
1,100008,0,ef5e041b-4e3a-5cba-afa8-de036ba335d4,0,0,0,0,0,0,0,0,0,0,1
2,100015,0,c90d9a73-08d3-5217-aa5f-69225425b1eb,0,0,0,1,0,0,0,0,0,0,0
3,100015,0,bdcc756b-a0be-509c-aedd-f366602ee3a9,0,0,0,1,0,0,0,0,0,0,0
4,100026,0,c8750041-50d2-5e6c-8587-a641615a1182,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
830348,99974,0,89bfbaf3-d368-4ca5-b5bd-cd8f08e891b7,0,0,0,1,0,0,0,0,0,0,0
830349,99981,0,72f1a0a9-3887-5b36-9c34-7fa4bc871441,0,0,0,0,0,0,0,0,0,0,1
830350,99983,0,ced196f3-133d-59a9-9040-9b3601fa2189,0,1,0,0,0,0,0,0,0,0,0
830351,9999,0,eaf7690b-8b9d-5a21-ab1b-ae476aa4b7ce,0,0,0,0,0,0,0,0,0,0,1


In [None]:
# Key columns that identify an encounter; every other column is treated as a 0/1 indicator.
columns_to_exclude = ['Encounter ID', 'Internalpatientid']
indicator_columns = [col for col in inpatient_location.columns if col not in columns_to_exclude]

# Function to convert values to 1 or 0
def convert_to_binary(value):
    """Return 1 when ``value`` is at least 1, otherwise 0 (scalar form of the rule applied below)."""
    return 1 if value >= 1 else 0

# Collapse to one row per (encounter, patient): take the max of 'Died at location' and sum
# the one-hot service columns. The two rules are kept in separate dict entries on purpose:
# previously the 'max' entry was silently overwritten by a duplicate 'sum' key for the same column.
aggregations = {
    'Died at location': 'max',
    **{col: 'sum' for col in indicator_columns if col != 'Died at location'},
}
inpatient_location = inpatient_location.groupby(
    ['Encounter ID', 'Internalpatientid'], as_index=False
).agg(aggregations)

# Turn the aggregated counts back into 0/1 flags ("at least one row had it").
# This is the vectorised equivalent of applying convert_to_binary to every cell:
# any value >= 1 becomes 1, everything else (including NaN) becomes 0.
inpatient_location[indicator_columns] = inpatient_location[indicator_columns].ge(1).astype('int64')
inpatient_location

Unnamed: 0,Encounter ID,Internalpatientid,Died at location,inpatient_location_service_BLIND REHAB,inpatient_location_service_DOMICILIARY,inpatient_location_service_INTERMEDIATE MED,inpatient_location_service_MEDICINE,inpatient_location_service_NEUROLOGY,inpatient_location_service_NHCU,inpatient_location_service_NON-COUNT,inpatient_location_service_PSYCHIATRY,inpatient_location_service_REHAB MEDICINE,inpatient_location_service_SPINAL CORD INJURY,inpatient_location_service_SURGERY
0,00004f7e-d083-5c82-871e-8208e40efd8e,91722,0,0,0,0,1,0,0,0,0,0,0,0
1,00007d37-6979-51f6-81cb-2cf4b98f5266,12449,0,0,0,1,1,0,0,0,0,0,0,0
2,00009fa6-cfee-56a4-805c-b6a30b94beb9,49340,0,0,0,0,1,0,0,0,0,0,0,0
3,0000a3ac-05fd-5703-8011-cc0afeab322c,129997,0,0,0,0,0,0,0,1,0,0,0,0
4,0000beb8-ba2d-5922-8476-2077e03f6363,151320,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
553919,ffffd350-0a98-57de-92dd-b680c7ed041f,25848,0,0,0,0,1,0,0,0,0,0,0,0
553920,ffffd8a9-78c4-4dca-94a3-7429da17c099,157043,0,0,0,0,1,0,0,0,0,0,0,0
553921,fffff6dd-1aea-4218-8990-38fc1a69230c,121856,0,0,0,0,0,0,1,0,0,0,0,0
553922,fffffc48-b17e-59f9-8409-5355dab6445f,10372,0,0,0,0,0,0,0,0,0,0,0,1


In [None]:
# Distribution of the 'Died at location' flag across the per-encounter rows.
inpatient_location['Died at location'].value_counts()

0    542185
1     11739
Name: Died at location, dtype: int64

In [None]:
# Left join: every model_df row is kept; rows whose (patient, encounter) pair has no
# inpatient-location record get NaN in the new columns.
model_df = model_df.merge(inpatient_location,on=['Internalpatientid', 'Encounter ID'], how='left')
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,inpatient_location_service_DOMICILIARY,inpatient_location_service_INTERMEDIATE MED,inpatient_location_service_MEDICINE,inpatient_location_service_NEUROLOGY,inpatient_location_service_NHCU,inpatient_location_service_NON-COUNT,inpatient_location_service_PSYCHIATRY,inpatient_location_service_REHAB MEDICINE,inpatient_location_service_SPINAL CORD INJURY,inpatient_location_service_SURGERY
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Show up to 15 rows and list the dtypes of the last 15 columns of model_df.
pd.set_option('display.max_rows', 15)
model_df.dtypes[-15:]

loc_other                                         int8
ICU_days                                         Int32
stepdown_days                                    Int32
Died at location                                  int8
inpatient_location_service_BLIND REHAB            int8
inpatient_location_service_DOMICILIARY            int8
inpatient_location_service_INTERMEDIATE MED       int8
inpatient_location_service_MEDICINE               int8
inpatient_location_service_NEUROLOGY              int8
inpatient_location_service_NHCU                   int8
inpatient_location_service_NON-COUNT              int8
inpatient_location_service_PSYCHIATRY             int8
inpatient_location_service_REHAB MEDICINE         int8
inpatient_location_service_SPINAL CORD INJURY     int8
inpatient_location_service_SURGERY                int8
dtype: object

In [None]:
# Service categories that were one-hot encoded from the inpatient-location table.
location_services = ['BLIND REHAB', 'DOMICILIARY', 'INTERMEDIATE MED', 'MEDICINE', 'NEUROLOGY',
                     'NHCU', 'NON-COUNT', 'PSYCHIATRY', 'REHAB MEDICINE', 'SPINAL CORD INJURY',
                     'SURGERY']

# Columns to down-cast: the death flag plus one dummy column per service.
columns_to_convert = ['Died at location'] + [
    'inpatient_location_service_' + service for service in location_services
]

# The left merge left NaN where an encounter had no location record:
# use -1 as the "no record" sentinel and store the flags compactly as int8.
model_df[columns_to_convert] = model_df[columns_to_convert].fillna(-1).astype('int8')

In [None]:
# Preview model_df after converting the inpatient-location columns.
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,inpatient_location_service_DOMICILIARY,inpatient_location_service_INTERMEDIATE MED,inpatient_location_service_MEDICINE,inpatient_location_service_NEUROLOGY,inpatient_location_service_NHCU,inpatient_location_service_NON-COUNT,inpatient_location_service_PSYCHIATRY,inpatient_location_service_REHAB MEDICINE,inpatient_location_service_SPINAL CORD INJURY,inpatient_location_service_SURGERY
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0,0,1,0,0,0,0,0,0,0
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0,0,1,0,0,0,0,0,0,0
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,0,0,1,0,0,0,0,0,0,0
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0,0,0,0,0,1,0,0,0,0
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0,0,0,0,0,1,0,0,0,0
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Checkpoint: overwrite the saved model frame so the next section can reload it from this path.
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

# Merging Labs

In [None]:
# Reload the checkpointed model frame from Drive before merging the lab features.
model_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

In [None]:
# Load the lab-results table.
# NOTE(review): index 8 depends on the order of rearranged_file_paths, which is built from
# glob output — confirm it really points at the lab-results parquet file.
lab_results = pd.read_parquet(rearranged_file_paths[8])

# Discard datetime columns and the raw descriptive columns that are not used as features.
datetime_columns = lab_results.select_dtypes(include='datetime').columns
unused_lab_columns = ['Age at lab test', 'Result numeric', 'desc', 'concept', 'unit', 'range_min',
                      'range_max', 'Specimen source', 'ESR_ULN', 'hsCRP_ULN', 'CRP_ULN']
lab_results = lab_results.drop(columns=datetime_columns).drop(columns=unused_lab_columns)

# Missing stimulant-use flags become -1 so the column can be stored as int8.
# NOTE(review): this happens before the groupby below, so a -1 in the first row of an
# encounter counts as "non-null" for first() and wins over later real values — confirm intended.
lab_results['drug_stimulant_use'] = lab_results['drug_stimulant_use'].fillna(-1).astype('int8')

# Treat +/- infinity as missing.
lab_results = lab_results.replace([np.inf, -np.inf], np.nan)

# One row per (patient, encounter): first() keeps the first non-null value of each column.
lab_results = lab_results.groupby(['Internalpatientid', 'Encounter ID'], as_index=False).first()

# 'liver_heptaocellular_product' gets its own fill value (1188) instead of the generic sentinel.
# TODO(review): document where 1188 comes from.
lab_results['liver_heptaocellular_product'] = lab_results['liver_heptaocellular_product'].fillna(1188)

# Every other float64 lab column uses -100 as the "missing" sentinel: the labs are either always
# positive or, when negative, comparatively small in magnitude, so -100 cannot be a real value.
float_columns = lab_results.select_dtypes(include='float64').columns.drop('liver_heptaocellular_product')
lab_results[float_columns] = lab_results[float_columns].fillna(-100)

lab_results

Unnamed: 0,Internalpatientid,Encounter ID,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium,...,heme_ida,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer,liver_heptaocellular_product,discharge_creatinine,discharge_lactate
0,1,003ea922-181b-427a-b4e1-581f765e4d3f,-100.0,-100.0,-100.0,-100.0,0.954925,-100.0,-100.000000,-100.0,...,-100.0,0,-100.0,-100.0,-100.0,0,-100.0,1188.0,-100.0,-100.0
1,1,00877775-f80c-4ba7-b36f-a84b01fe6f5e,-100.0,-100.0,-100.0,-100.0,-100.000000,-100.0,-100.000000,-100.0,...,-100.0,0,-100.0,-100.0,-100.0,0,-100.0,1188.0,-100.0,-100.0
2,1,01652571-e526-4474-85e5-cf2dd063e9e1,-100.0,-100.0,-100.0,-100.0,-100.000000,-100.0,-100.000000,-100.0,...,-100.0,0,-100.0,-100.0,-100.0,0,-100.0,1188.0,-100.0,-100.0
3,1,03226ee6-be62-421a-b097-3eaa01a53285,-100.0,-100.0,-100.0,-100.0,-100.000000,-100.0,-100.000000,-100.0,...,-100.0,0,-100.0,-100.0,-100.0,0,-100.0,1188.0,-100.0,-100.0
4,1,0519dde0-c2d6-453f-bc82-a552bfc7051f,-100.0,-100.0,-100.0,-100.0,0.896554,-100.0,-100.000000,-100.0,...,-100.0,0,-100.0,-100.0,-100.0,0,-100.0,1188.0,-100.0,-100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13516447,169064,ee2a38ae-d621-4f3f-816b-210f09fcaee7,-100.0,-100.0,-100.0,-100.0,-100.000000,-100.0,-100.000000,-100.0,...,-100.0,0,-100.0,-100.0,-100.0,0,-100.0,25.0,-100.0,-100.0
13516448,169064,ef63b549-7f29-441a-aee1-a60f26e4de78,-100.0,-100.0,-100.0,-100.0,1.114847,-100.0,-100.000000,-100.0,...,-100.0,0,-100.0,-100.0,-100.0,0,-100.0,1188.0,-100.0,-100.0
13516449,169064,fc4b01db-ab3a-479b-8de1-c4b126d11251,-100.0,-100.0,-100.0,-100.0,-100.000000,-100.0,-100.000000,-100.0,...,-100.0,0,-100.0,-100.0,-100.0,0,-100.0,1188.0,-100.0,-100.0
13516450,169064,fd214a67-d620-4ba5-a6aa-14486114c87a,-100.0,-100.0,-100.0,-100.0,1.577394,-100.0,-100.000000,-100.0,...,-100.0,0,-100.0,-100.0,-100.0,0,-100.0,1188.0,-100.0,-100.0


I'm not sure whether we are using the `Specimen source` column for anything, so I drop it for now.

In [None]:
# Left join: every model_df row is kept; rows whose (patient, encounter) pair has no
# lab record get NaN in all of the lab columns.
model_df = model_df.merge(lab_results,on=['Internalpatientid', 'Encounter ID'], how='left')
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,heme_ida,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer,liver_heptaocellular_product,discharge_creatinine,discharge_lactate
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-100.0,0.0,-100.000000,-100.0,-100.000000,0.0,-100.0,1188.0,-100.0,-100.0
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-100.0,0.0,-100.000000,-100.0,-100.000000,0.0,-100.0,1188.0,-100.0,-100.0
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-100.0,0.0,1.201152,-100.0,-100.000000,0.0,-100.0,11.0,-100.0,-100.0
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-100.0,0.0,1.201152,-100.0,-100.000000,0.0,-100.0,11.0,-100.0,-100.0
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-100.0,0.0,-100.000000,-100.0,-100.000000,0.0,-100.0,18.0,-100.0,-100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,,,,,,,,,,
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,-100.0,0.0,-100.000000,-100.0,0.002281,0.0,-100.0,1558.0,-100.0,-100.0


In [None]:
# The left merge introduced NaN for encounters without lab rows; restore the two integer
# flag columns to int8, using -1 as the "no lab record" sentinel.
# NOTE(review): the float lab columns are left as NaN for those rows, whereas missing values
# inside lab_results were filled with -100 — confirm this difference is intended.
model_df['drug_stimulant_use'] = model_df['drug_stimulant_use'].fillna(-1).astype('int8')
model_df['id_inflamed_up'] = model_df['id_inflamed_up'].fillna(-1).astype('int8')
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,heme_ida,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer,liver_heptaocellular_product,discharge_creatinine,discharge_lactate
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-100.0,0,-100.000000,-100.0,-100.000000,0,-100.0,1188.0,-100.0,-100.0
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-100.0,0,-100.000000,-100.0,-100.000000,0,-100.0,1188.0,-100.0,-100.0
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-100.0,0,1.201152,-100.0,-100.000000,0,-100.0,11.0,-100.0,-100.0
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-100.0,0,1.201152,-100.0,-100.000000,0,-100.0,11.0,-100.0,-100.0
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-100.0,0,-100.000000,-100.0,-100.000000,0,-100.0,18.0,-100.0,-100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,,-1,,,,-1,,,,
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,-1,,,,-1,,,,
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,-1,,,,-1,,,,
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,-100.0,0,-100.000000,-100.0,0.002281,0,-100.0,1558.0,-100.0,-100.0


In [None]:
# Checkpoint: overwrite the saved model frame so the next section can reload it from this path.
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

# Merging Medications Ordered

In [None]:
# Reload the checkpointed model frame from Drive before merging the medications-ordered features.
model_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

In [None]:
# Load the medications-ordered table.
# NOTE(review): index 12 depends on the order of rearranged_file_paths, which is built from
# glob output — confirm it really points at the medications-ordered parquet file.
medications_ordered = pd.read_parquet(rearranged_file_paths[12])

# Drop datetime columns and the raw descriptive columns that are not used as features.
datetime_columns = medications_ordered.select_dtypes(include='datetime').columns
medications_ordered = medications_ordered.drop(columns=datetime_columns)
medications_ordered = medications_ordered.drop(columns=['Age at med ordered','Ordered medication atc 5','desc','keep','Stop reason'])

# One-hot encode the 'Order status' column and replace it with its dummy columns.
one_hot = pd.get_dummies(medications_ordered['Order status'], prefix='Order_status')
medications_ordered = pd.concat([medications_ordered.drop(columns=['Order status']), one_hot], axis=1)

# Key columns that identify an encounter; everything else is an indicator column.
columns_to_exclude = ['Encounter ID', 'Internalpatientid']

# One row per (encounter, patient): sum the indicators across that encounter's order rows.
medications_ordered = medications_ordered.groupby(['Encounter ID', 'Internalpatientid']).sum().reset_index()

# Turn the summed counts back into 0/1 flags ("at least one order had it").
# Vectorised threshold: any value >= 1 becomes 1, everything else becomes 0. This gives the
# same result as a per-element Python lambda, without a Python call per cell.
indicator_columns = medications_ordered.columns.difference(columns_to_exclude)
medications_ordered[indicator_columns] = medications_ordered[indicator_columns].ge(1).astype('int64')

medications_ordered

Unnamed: 0,Internalpatientid,Encounter ID,acei,arb,arni,sglt2i,mra,gdmtBB,loopDiuretic,thiazideDiuretic,...,Order_status_complete,Order_status_delayed,Order_status_discontinued,Order_status_discontinued/edit,Order_status_expired,Order_status_hold,Order_status_lapsed,Order_status_pending,Order_status_renewed,Order_status_unreleased
0,61915,a2a885ac-a147-4155-ab65-b8c3450f3a81,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,33870,9b3f3f81-e4f3-464c-9223-dd7d8bd57138,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,48767,42cc3283-ea83-516f-b70f-2a350af4ee97,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,124600,b569f7a3-22fd-5cb1-a425-4a1ad5ce2cba,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,40729,58381482-5e6c-5a32-b896-362bceb6a945,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2769209,85129,75ff2567-d719-4056-b926-47ed10498490,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2769210,115905,9d092042-9850-4c39-b679-fd77864c011d,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2769211,99976,268db7d2-57d8-4bd7-98de-8cb528d28c91,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2769212,167678,6d017556-0ba4-437a-862f-8e3436df9c4d,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
# Left join: every model_df row is kept; rows whose (patient, encounter) pair has no
# medication order get NaN in the new columns.
model_df = model_df.merge(medications_ordered,on=['Internalpatientid', 'Encounter ID'], how='left')
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,Order_status_complete,Order_status_delayed,Order_status_discontinued,Order_status_discontinued/edit,Order_status_expired,Order_status_hold,Order_status_lapsed,Order_status_pending,Order_status_renewed,Order_status_unreleased
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,,,,,,,,,,
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,,,,,,,,,,
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,,,,,,,,,,
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,,,,,,,,,,


In [None]:
# Medication indicator columns added by the medications-ordered merge.
medication_columns = ['acei', 'arb', 'arni', 'sglt2i', 'mra', 'gdmtBB', 'loopDiuretic',
                      'thiazideDiuretic', 'p2y12i', 'aspirin', 'doac', 'warfarin', 'amio',
                      'sotalol', 'dofetilide', 'dobutamine', 'dopamine', 'milrinone',
                      'mexiletine', 'dronedarone', 'antiarrhythmic1c', 'nonDihydropyridineCAB',
                      'bicarbonate', 'phosBinder', 'he_ppx', 'asa5', 'immunoMod',
                      'methotrexate', 'tnfA_inh', 'cni', 'mTor_inh', 'prednisone', 'nicotine',
                      'parkinsons', 'pHTN_Rx', 'insulin', 'alzheimers_Rx', 'methadone', 'oxygen',
                      'hcq', 'rituximab', 'cyclophosphamide', 'doxorubicin', 'cisplatin',
                      'pyrimidineAM', 'topoisomerasei', 'norepinephrine', 'vasopressin',
                      'enoxaparin', 'cilostazol', 'dpp4i', 'midodrine', 'nsaids',
                      'thiazolidinediones', 'chestPainRx', 'IMiDs', 'checkpointinhibitors']

# One-hot order-status columns from the same merge; these keep their names (no prefix).
order_status_columns = ['Order_status_active', 'Order_status_cancelled', 'Order_status_complete',
                        'Order_status_delayed', 'Order_status_discontinued',
                        'Order_status_discontinued/edit', 'Order_status_expired', 'Order_status_hold',
                        'Order_status_lapsed', 'Order_status_pending', 'Order_status_renewed',
                        'Order_status_unreleased']

# Columns to fill and convert
cols_to_fill_and_convert = medication_columns + order_status_columns

# Fill NaN values with -1 and convert to int8. This must run BEFORE the rename below:
# once the columns carry the 'MO_' prefix, looking them up by their old names raises KeyError.
# Done one column at a time to avoid materialising a large temporary frame.
for col in cols_to_fill_and_convert:
    model_df[col] = model_df[col].fillna(-1).astype('int8')

# Add the 'MO_' prefix to the medication columns only.
# (Presumably this keeps them distinct from same-named columns in other medication tables — confirm.)
model_df.rename(columns={col: 'MO_' + col for col in medication_columns}, inplace=True)

In [None]:
# Preview model_df after converting the medications-ordered columns.
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,Order_status_complete,Order_status_delayed,Order_status_discontinued,Order_status_discontinued/edit,Order_status_expired,Order_status_hold,Order_status_lapsed,Order_status_pending,Order_status_renewed,Order_status_unreleased
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0,0,1,0,0,0,0,0,0,0
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0,0,1,0,0,0,0,0,0,0
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [None]:
# Show up to 20 rows and list the dtypes of the last 20 columns of model_df.
pd.set_option('display.max_rows', 20)
model_df.dtypes[-20:]

MO_cilostazol                     int8
MO_dpp4i                          int8
MO_midodrine                      int8
MO_nsaids                         int8
MO_thiazolidinediones             int8
MO_chestPainRx                    int8
MO_IMiDs                          int8
MO_checkpointinhibitors           int8
Order_status_active               int8
Order_status_cancelled            int8
Order_status_complete             int8
Order_status_delayed              int8
Order_status_discontinued         int8
Order_status_discontinued/edit    int8
Order_status_expired              int8
Order_status_hold                 int8
Order_status_lapsed               int8
Order_status_pending              int8
Order_status_renewed              int8
Order_status_unreleased           int8
dtype: object

In [None]:
# Checkpoint: overwrite the saved model frame so the next section can reload it from this path.
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

# Merging Medications Administered

In [None]:
# Reload the checkpointed model frame from Drive before merging the medications-administered features.
model_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

In [None]:
# Load the collapsed MAR table and discard the leftover index column written by the CSV export.
mar = pd.read_csv('/content/drive/MyDrive/collapsedMAR.csv').drop(columns=['Unnamed: 0'])
mar

Unnamed: 0,Count,desc,name,route,acei,arb,arni,sglt2i,mra,gdmtBB,...,midazolam,propofol,precedex,solumedrol,amphotericinB,mexiletine,dronedarone,antiarrhythmic1c,ndpCCB_po,ndpCCB_IV
0,23201,diltiazem - tab,diltiazem,tab,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,20309,"diltiazem - cap,sa",diltiazem,"cap,sa",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,5814,"diltiazem ; triamcinolone - aerosol,oral",diltiazem ; triamcinolone,"aerosol,oral",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2992,"verapamil - tab,sa",verapamil,"tab,sa",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1407,verapamil - tab,verapamil,tab,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4603,1,"itraconazole - inj,soln",itraconazole,"inj,soln",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4604,1,"ixekizumab - inj,soln",ixekizumab,"inj,soln",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4605,1,kanamycin - inj,kanamycin,inj,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4606,1,kaolin - liquid,kaolin,liquid,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Load the medications-administered table.
# NOTE(review): index 6 depends on the order of rearranged_file_paths, which is built from
# glob output — confirm it really points at the medications-administered parquet file.
medications_administered = pd.read_parquet(rearranged_file_paths[6])

datetime_columns = medications_administered.select_dtypes(include='datetime').columns

# Columns needed only to perform the join with `mar` (or not used as features afterwards).
join_and_raw_columns = ['Administered medication atc 5', 'Dose form', 'Count', 'Dose unit administered',
                        'name', 'desc', 'route', 'Dose administered']

# Drop datetime/age columns, attach the medication-class flags from `mar` by matching
# (medication, dose form) against (name, route), then remove the join/raw columns.
# NOTE(review): merge defaults to an inner join, so administrations with no match in `mar`
# are dropped — confirm this is intended.
medications_administered = (
    medications_administered
    .drop(columns=datetime_columns)
    .drop(columns=['Age at med administration'])
    .merge(mar, left_on=['Administered medication atc 5', 'Dose form'], right_on=['name', 'route'])
    .drop(columns=join_and_raw_columns)
)

medications_administered

Unnamed: 0,Internalpatientid,Administration status,Encounter ID,acei,arb,arni,sglt2i,mra,gdmtBB,loopDiureticPO,...,midazolam,propofol,precedex,solumedrol,amphotericinB,mexiletine,dronedarone,antiarrhythmic1c,ndpCCB_po,ndpCCB_IV
0,78275,Given,6a0188e5-a4fe-5a56-bdfe-27b4533ccd40,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,137733,Given,2b3a6590-6266-5441-bdee-e7a63a57e787,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,138906,Given,c5c2ed5f-57aa-51eb-8efc-9c239dba4e5f,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,112283,Given,f15865a3-e139-4a9d-ade1-8f39f5a4ae4f,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,59445,Given,d8fc2c26-36b5-5ded-a090-9e889ab21c80,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,11724,Given,6434d72d-d84b-554f-b809-e89acac7d7a1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9999996,92570,Given,21944a47-9752-5c77-a96e-c4900a4d650e,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9999997,165097,Given,1dcb8fa9-e942-56fe-bb0c-6b3a0dc47663,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9999998,63425,Completed (infusion orders),ccb0bb72-a52b-5fc3-8842-968e876839c4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# One-hot encode the 'Administration status' column
one_hot = pd.get_dummies(medications_administered['Administration status'], prefix='Administration_status')
medications_administered = pd.concat([medications_administered, one_hot], axis=1)
# Drop the original 'Administration status' column; its indicator columns replace it
medications_administered = medications_administered.drop(columns=['Administration status'])

# Key columns that identify an encounter; every other column is a flag to aggregate
columns_to_exclude = ['Encounter ID', 'Internalpatientid']

# Collapse to one row per (encounter, patient) by summing the flag columns
medications_administered = medications_administered.groupby(columns_to_exclude).sum().reset_index()

# Binarize the summed flags: any value >= 1 becomes 1, everything else 0.
# A vectorized comparison replaces the per-element Python lambda, which is very slow
# on millions of rows; the result is cast to int64 to match what the lambda produced.
for col in medications_administered.columns.difference(columns_to_exclude):
    medications_administered[col] = (medications_administered[col] >= 1).astype('int64')
medications_administered

Unnamed: 0,Encounter ID,Internalpatientid,acei,arb,arni,sglt2i,mra,gdmtBB,loopDiureticPO,loopDiureticIV,...,ndpCCB_IV,Administration_status_Completed (infusion orders),Administration_status_Given,Administration_status_Held,Administration_status_Infusing (infusion orders),Administration_status_Missing dose,Administration_status_Not given,Administration_status_Refused,Administration_status_Removed (patch taken off),Administration_status_Stopped (infusion orders)
0,00000536-a2ae-43d1-a558-94d1dd971ba6,21709,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,000007d8-15ca-481d-a788-2700c26b414d,116954,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0000138c-4bb3-43e6-a59c-faac708e4a49,82552,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,00002068-4370-49c9-b6e6-70b2abd4d9f3,85561,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,00002652-7189-471f-aa2a-7a25cc6cfeee,145579,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2886027,ffffeae8-b286-4240-9374-f79b4a7f8255,38100,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2886028,fffff60c-1a77-4fcd-922a-6bcdecf32082,119754,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2886029,fffffa48-3e27-42b7-94c3-57731cf84c7a,163768,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2886030,fffffc48-b17e-59f9-8409-5355dab6445f,10372,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [None]:
# Downcast every int64 column except the patient id to int8
int64_columns = [
    col
    for col in medications_administered.select_dtypes(include='int64').columns
    if col != 'Internalpatientid'
]
medications_administered = medications_administered.astype({col: 'int8' for col in int64_columns})
medications_administered

Unnamed: 0,Encounter ID,Internalpatientid,acei,arb,arni,sglt2i,mra,gdmtBB,loopDiureticPO,loopDiureticIV,...,ndpCCB_IV,Administration_status_Completed (infusion orders),Administration_status_Given,Administration_status_Held,Administration_status_Infusing (infusion orders),Administration_status_Missing dose,Administration_status_Not given,Administration_status_Refused,Administration_status_Removed (patch taken off),Administration_status_Stopped (infusion orders)
0,00000536-a2ae-43d1-a558-94d1dd971ba6,21709,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,000007d8-15ca-481d-a788-2700c26b414d,116954,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0000138c-4bb3-43e6-a59c-faac708e4a49,82552,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,00002068-4370-49c9-b6e6-70b2abd4d9f3,85561,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,00002652-7189-471f-aa2a-7a25cc6cfeee,145579,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2886027,ffffeae8-b286-4240-9374-f79b4a7f8255,38100,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2886028,fffff60c-1a77-4fcd-922a-6bcdecf32082,119754,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2886029,fffffa48-3e27-42b7-94c3-57731cf84c7a,163768,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2886030,fffffc48-b17e-59f9-8409-5355dab6445f,10372,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [None]:
# Medication-class flag columns that came from the `mar` lookup table
cols_to_fill_and_convert = [
    'acei', 'arb', 'arni', 'sglt2i', 'mra', 'gdmtBB',
    'loopDiureticPO', 'loopDiureticIV', 'thiazideDiuretic', 'p2y12iPO', 'cangrelor', 'doac',
    'warfarin', 'amioPO', 'amioIV', 'norepinephrine', 'vasopressin', 'epinephrine',
    'phenylephrine', 'sotalol', 'dofetilide', 'dobutamine', 'dopamine', 'milrinone',
    'nitroprusside', 'esmolol', 'midazolam', 'propofol', 'precedex', 'solumedrol',
    'amphotericinB', 'mexiletine', 'dronedarone', 'antiarrhythmic1c', 'ndpCCB_po', 'ndpCCB_IV',
]

# Add the 'MA_' prefix to the selected columns
ma_rename_map = {col: 'MA_' + col for col in cols_to_fill_and_convert}
medications_administered = medications_administered.rename(columns=ma_rename_map)
medications_administered

Unnamed: 0,Encounter ID,Internalpatientid,MA_acei,MA_arb,MA_arni,MA_sglt2i,MA_mra,MA_gdmtBB,MA_loopDiureticPO,MA_loopDiureticIV,...,MA_ndpCCB_IV,Administration_status_Completed (infusion orders),Administration_status_Given,Administration_status_Held,Administration_status_Infusing (infusion orders),Administration_status_Missing dose,Administration_status_Not given,Administration_status_Refused,Administration_status_Removed (patch taken off),Administration_status_Stopped (infusion orders)
0,00000536-a2ae-43d1-a558-94d1dd971ba6,21709,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,000007d8-15ca-481d-a788-2700c26b414d,116954,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0000138c-4bb3-43e6-a59c-faac708e4a49,82552,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,00002068-4370-49c9-b6e6-70b2abd4d9f3,85561,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,00002652-7189-471f-aa2a-7a25cc6cfeee,145579,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2886027,ffffeae8-b286-4240-9374-f79b4a7f8255,38100,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2886028,fffff60c-1a77-4fcd-922a-6bcdecf32082,119754,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2886029,fffffa48-3e27-42b7-94c3-57731cf84c7a,163768,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2886030,fffffc48-b17e-59f9-8409-5355dab6445f,10372,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [None]:
# Left-join the per-encounter medication flags onto the model dataframe; rows with no
# matching (patient, encounter) pair get NaN in the newly added columns
model_df = pd.merge(
    model_df,
    medications_administered,
    how='left',
    on=['Internalpatientid', 'Encounter ID'],
)
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,MA_ndpCCB_IV,Administration_status_Completed (infusion orders),Administration_status_Given,Administration_status_Held,Administration_status_Infusing (infusion orders),Administration_status_Missing dose,Administration_status_Not given,Administration_status_Refused,Administration_status_Removed (patch taken off),Administration_status_Stopped (infusion orders)
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,,,,,,,,,,
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,,,,,,,,,,
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,,,,,,,,,,
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Display the model dataframe for inspection
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,Order_status_complete,Order_status_delayed,Order_status_discontinued,Order_status_discontinued/edit,Order_status_expired,Order_status_hold,Order_status_lapsed,Order_status_pending,Order_status_renewed,Order_status_unreleased
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0,0,1,0,0,0,0,0,0,0
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,0,0,1,0,0,0,0,0,0,0
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [None]:
# Columns added by the medications-administered merge: the MA_ medication flags
# followed by the one-hot administration-status indicators
cols_to_fill_and_convert = [
    'MA_acei', 'MA_arb', 'MA_arni', 'MA_sglt2i', 'MA_mra', 'MA_gdmtBB',
    'MA_loopDiureticPO', 'MA_loopDiureticIV', 'MA_thiazideDiuretic',
    'MA_p2y12iPO', 'MA_cangrelor', 'MA_doac', 'MA_warfarin', 'MA_amioPO',
    'MA_amioIV', 'MA_norepinephrine', 'MA_vasopressin', 'MA_epinephrine',
    'MA_phenylephrine', 'MA_sotalol', 'MA_dofetilide', 'MA_dobutamine',
    'MA_dopamine', 'MA_milrinone', 'MA_nitroprusside', 'MA_esmolol',
    'MA_midazolam', 'MA_propofol', 'MA_precedex', 'MA_solumedrol',
    'MA_amphotericinB', 'MA_mexiletine', 'MA_dronedarone',
    'MA_antiarrhythmic1c', 'MA_ndpCCB_po', 'MA_ndpCCB_IV',
    'Administration_status_Completed (infusion orders)',
    'Administration_status_Given',
    'Administration_status_Held',
    'Administration_status_Infusing (infusion orders)',
    'Administration_status_Missing dose',
    'Administration_status_Not given',
    'Administration_status_Refused',
    'Administration_status_Removed (patch taken off)',
    'Administration_status_Stopped (infusion orders)',
]

# Replace NaN with the sentinel -1, then store each column as int8
for column_name in cols_to_fill_and_convert:
    model_df[column_name] = model_df[column_name].fillna(-1).astype('int8')
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,MA_ndpCCB_IV,Administration_status_Completed (infusion orders),Administration_status_Given,Administration_status_Held,Administration_status_Infusing (infusion orders),Administration_status_Missing dose,Administration_status_Not given,Administration_status_Refused,Administration_status_Removed (patch taken off),Administration_status_Stopped (infusion orders)
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,0,0,1,0,1,0,0,0,0,0


In [None]:
# Checkpoint: write the model dataframe (now including the medications-administered columns) to Drive
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

# Merging procedures

In [None]:
# Reload the model dataframe from its parquet checkpoint
model_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

In [None]:
# Load the procedure-code lookup table and discard the index column that was saved into the CSV
proc_codes = pd.read_csv('/content/drive/MyDrive/keepProcs.csv')
proc_codes = proc_codes.drop(columns=['Unnamed: 0'])
# Keep only the codes whose 'keep' value is non-zero; the 'keep' column itself is then dropped
proc_codes = proc_codes.loc[proc_codes['keep'] != 0].drop(columns=['keep'])
proc_codes

Unnamed: 0,code,desc,tMCS,ecmo,lvad,tsfx,iabp,dialysis,ett,trach,...,cvc,aline,chestTube,tips,chemo,homeInotropes,cerebralThrombectomy,valvuloplasty,exLap,le_amputation
0,93456,CATHETER PLACEMENT IN CORONARY ARTERY(S) FOR C...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,93457,CATHETER PLACEMENT IN CORONARY ARTERY(S) FOR C...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,93453,COMBINED RIGHT AND LEFT HEART CATHETERIZATION ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,93460,CATHETER PLACEMENT IN CORONARY ARTERY(S) FOR C...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,93461,CATHETER PLACEMENT IN CORONARY ARTERY(S) FOR C...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27891,041D0J2,Bypass Left Common Iliac Artery to Mesenteric ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27892,041H09H,Bypass Right External Iliac Artery to Right Fe...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27893,5250F,ASTHMA DISCHARGE PLAN PROVIDED TO PATIENT (AST...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27894,041K09H,Bypass Right Femoral Artery to Right Femoral A...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Read all procedure parquet partitions lazily with dask
procedures = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/procedures/*.parquet')

# Remove the datetime columns and the age at procedure before merging
datetime_columns = procedures.select_dtypes(include='datetime').columns
procedures = procedures.drop(columns=datetime_columns).drop(columns=['Age at procedure'])

# Attach the procedure flags from `proc_codes`, matching on description and code.
# This is an inner join (the merge default), so procedures absent from proc_codes are dropped.
procedures = procedures.merge(
    proc_codes,
    left_on=['Procedure code description', 'Procedure code'],
    right_on=['desc', 'code'],
)

# Drop the join keys, then materialize the dask graph as a pandas DataFrame
procedures = procedures.drop(columns=['Procedure code', 'Procedure code description', 'code', 'desc'])
procedures = procedures.compute()
procedures

Unnamed: 0,Internalpatientid,Encounter ID,tMCS,ecmo,lvad,tsfx,iabp,dialysis,ett,trach,...,cvc,aline,chestTube,tips,chemo,homeInotropes,cerebralThrombectomy,valvuloplasty,exLap,le_amputation
0,122022,bf8fe5c6-8baa-5c50-87b7-4af72928bc2a,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,13208,404de072-a15c-40cb-8759-1ecfa13a8dc2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,134703,4a8e923e-9c2f-5c63-85e1-7e289cb34eb5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,157503,41e4c4b7-8704-4a1d-ba3c-a63122ebd310,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,60583,53b67455-8cc5-57c3-89ab-b37c1c5c48a0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
952,118244,26c4b84f-5aa6-4fe6-8810-38b59c43b412,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
953,70679,bccea6e2-246c-4ca2-a5e4-def84ccce68f,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
954,116753,c5fbe189-0465-4fd7-b095-0eedb54960b1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
955,115863,3100642b-a46b-4c79-b56f-217858fd62e3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Key columns that identify an encounter; every other column is a procedure flag
columns_to_exclude = ['Encounter ID', 'Internalpatientid']

# Collapse to one row per (encounter, patient) by summing the flag columns
procedures = procedures.groupby(columns_to_exclude).sum().reset_index()

# Binarize the summed flags: any value >= 1 becomes 1, everything else 0.
# A vectorized comparison replaces the per-element Python lambda, which is slow on
# hundreds of thousands of rows; the result is cast to int64 to match what the lambda produced.
for col in procedures.columns.difference(columns_to_exclude):
    procedures[col] = (procedures[col] >= 1).astype('int64')
procedures

Unnamed: 0,Encounter ID,Internalpatientid,tMCS,ecmo,lvad,tsfx,iabp,dialysis,ett,trach,...,cvc,aline,chestTube,tips,chemo,homeInotropes,cerebralThrombectomy,valvuloplasty,exLap,le_amputation
0,00002881-3cc4-5525-b3b6-a2efcb874fbe,128845,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0000350b-e0e8-4a20-8dc8-e1d198457e77,33946,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0000445a-40f1-48e8-8f08-3589152c6add,117656,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,00006db2-8435-451d-ab72-6525165a4758,124220,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,00009f38-3488-43de-bc9e-30b5e87408ed,134571,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
833855,ffff8eb2-1a09-4b06-bd91-c8c025cc2e80,103515,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
833856,ffff9d3a-1d72-5c0d-b7c6-93a289d1cf90,69168,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
833857,ffffc9cc-9b5e-40df-80f5-1dff3db0c93e,156086,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
833858,ffffcd5c-9c25-52bf-b3c8-9ace8c407297,135384,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Left-join the per-encounter procedure flags onto the model dataframe; rows with no
# matching (patient, encounter) pair get NaN in the newly added columns
model_df = pd.merge(
    model_df,
    procedures,
    how='left',
    on=['Internalpatientid', 'Encounter ID'],
)
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,cvc,aline,chestTube,tips,chemo,homeInotropes,cerebralThrombectomy,valvuloplasty,exLap,le_amputation
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,,,,,,,,,,
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,,,,,,,,,,
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,,,,,,,,,,
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,,,,,,,,,,
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,,,,,,,,,,


In [None]:
# Show the dtypes of the last 50 columns
model_df.dtypes.iloc[-50:]

In [None]:
# Procedure flag columns added by the procedures merge
cols_to_fill_and_convert = [
    'tMCS', 'ecmo', 'lvad', 'tsfx', 'iabp', 'dialysis', 'ett', 'trach',
    'corAngio', 'rhc', 'cardioMEMS', 'pci', 'tpa_STEMI', 'icd', 'crtd',
    'pericardiocentesis', 'pericardiotomy',
    'aorticValveSurgery', 'mitralValveSurgery', 'tricuspidValveSurgery',
    'cabg', 'ventriculotomy',
    'heartTransplant', 'renalTransplant', 'liverTransplant',
    'thrombectomy_lysis', 'neurosrugery', 'peg_tube', 'plex', 'cvc', 'aline',
    'chestTube', 'tips', 'chemo', 'homeInotropes', 'cerebralThrombectomy',
    'valvuloplasty', 'exLap', 'le_amputation',
]

# Replace NaN with the sentinel -1, then store each column as int8
for column_name in cols_to_fill_and_convert:
    model_df[column_name] = model_df[column_name].fillna(-1).astype('int8')
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,cvc,aline,chestTube,tips,chemo,homeInotropes,cerebralThrombectomy,valvuloplasty,exLap,le_amputation
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [None]:
# Checkpoint: write the model dataframe (now including the procedure columns) to Drive
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

# Merge demographics static

In [None]:
# Locate the demographics file by name instead of by a fixed list position: a hard-coded
# index is only correct for one exact number of parquet files in the directory.
demographics_path = next(
    path for path in rearranged_file_paths
    if os.path.basename(path) == 'demographics_static.parquet'
)
demographics_static = pd.read_parquet(demographics_path)

# Drop datetime columns
datetime_columns = demographics_static.select_dtypes(include='datetime').columns
demographics_static = demographics_static.drop(columns=datetime_columns)

# Store boolean columns as int8
bool_columns = demographics_static.select_dtypes(include='bool').columns
for col in bool_columns:
    demographics_static[col] = demographics_static[col].fillna(-1).astype('int8')

# One-hot encode the 'Ethnicity' column
one_hot = pd.get_dummies(demographics_static['Ethnicity'], prefix='Ethnicity')
demographics_static = pd.concat([demographics_static, one_hot], axis=1)

# Drop the original 'Ethnicity' column; its indicator columns replace it
demographics_static = demographics_static.drop(columns=['Ethnicity'])

# Race indicator columns to rename
cols_to_rename = ['Asian', 'Asian ; Other', 'Black or African American',
                  'Black or African American ; Asian', 'Black or African American ; Other',
                  'Black or African American ; White', 'Black or African American ; White ; Asian',
                  'Black or African American ; White ; Other', 'Other', 'White', 'White ; Asian',
                  'White ; Asian ; Other', 'White ; Other']

# Prefix the race indicator columns with 'Race_'
demographics_static = demographics_static.rename(columns={col: 'Race_' + col for col in cols_to_rename})

# Missing veteran flags become the sentinel -1 before the int8 cast
demographics_static['Veteran flag'] = demographics_static['Veteran flag'].fillna(-1).astype('int8')
demographics_static

Unnamed: 0,Internalpatientid,Veteran flag,Female,Male,Race_Asian,Race_Asian ; Other,Race_Black or African American,Race_Black or African American ; Asian,Race_Black or African American ; Other,Race_Black or African American ; White,Race_Black or African American ; White ; Asian,Race_Black or African American ; White ; Other,Race_Other,Race_White,Race_White ; Asian,Race_White ; Asian ; Other,Race_White ; Other,Ethnicity_Hispanic or Latino,Ethnicity_Not Hispanic or Latino
0,168674,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
1,168681,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,168696,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,168711,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
4,168720,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106760,168624,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
106761,168626,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
106762,168628,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
106763,168645,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [None]:
# Inspect the column dtypes of the prepared demographics table
demographics_static.dtypes

Internalpatientid                                 int32
Veteran flag                                       int8
Female                                             int8
Male                                               int8
Race_Asian                                         int8
Race_Asian ; Other                                 int8
Race_Black or African American                     int8
Race_Black or African American ; Asian             int8
Race_Black or African American ; Other             int8
Race_Black or African American ; White             int8
Race_Black or African American ; White ; Asian     int8
Race_Black or African American ; White ; Other     int8
Race_Other                                         int8
Race_White                                         int8
Race_White ; Asian                                 int8
Race_White ; Asian ; Other                         int8
Race_White ; Other                                 int8
Ethnicity_Hispanic or Latino                    

In [None]:
# Left-join the per-patient demographics onto the model dataframe; patients with no
# demographics row get NaN in the newly added columns
model_df = pd.merge(
    model_df,
    demographics_static,
    how='left',
    on=['Internalpatientid'],
)
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,Race_Black or African American ; White,Race_Black or African American ; White ; Asian,Race_Black or African American ; White ; Other,Race_Other,Race_White,Race_White ; Asian,Race_White ; Asian ; Other,Race_White ; Other,Ethnicity_Hispanic or Latino,Ethnicity_Not Hispanic or Latino
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,,,,,,,,,,
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,,,,,,,,,,
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Show the dtypes of the last 20 columns
model_df.dtypes.iloc[-20:]

exLap                                             int8
le_amputation                                     int8
Veteran flag                                      int8
Female                                            int8
Male                                              int8
Race_Asian                                        int8
Race_Asian ; Other                                int8
Race_Black or African American                    int8
Race_Black or African American ; Asian            int8
Race_Black or African American ; Other            int8
Race_Black or African American ; White            int8
Race_Black or African American ; White ; Asian    int8
Race_Black or African American ; White ; Other    int8
Race_Other                                        int8
Race_White                                        int8
Race_White ; Asian                                int8
Race_White ; Asian ; Other                        int8
Race_White ; Other                                int8
Ethnicity_

In [None]:
# Demographic columns added by the demographics merge
cols_to_modify = [
    'Veteran flag', 'Female', 'Male',
    'Race_Asian', 'Race_Asian ; Other',
    'Race_Black or African American',
    'Race_Black or African American ; Asian',
    'Race_Black or African American ; Other',
    'Race_Black or African American ; White',
    'Race_Black or African American ; White ; Asian',
    'Race_Black or African American ; White ; Other',
    'Race_Other', 'Race_White', 'Race_White ; Asian', 'Race_White ; Asian ; Other',
    'Race_White ; Other',
    'Ethnicity_Hispanic or Latino', 'Ethnicity_Not Hispanic or Latino',
]

# Replace NaN with the sentinel -1, then store each column as int8
for column_name in cols_to_modify:
    model_df[column_name] = model_df[column_name].fillna(-1).astype('int8')
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,Race_Black or African American ; White,Race_Black or African American ; White ; Asian,Race_Black or African American ; White ; Other,Race_Other,Race_White,Race_White ; Asian,Race_White ; Asian ; Other,Race_White ; Other,Ethnicity_Hispanic or Latino,Ethnicity_Not Hispanic or Latino
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0,0,0,0,0,0,0,1,0,1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0,0,0,0,0,0,0,1,0,1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0,0,0,0,1,0,0,0,0,1
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,0,0,0,0,1,0,0,0,0,1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,0,0,0,0,1,0,0,0,0,1


In [None]:
# Checkpoint: write the model dataframe (now including the demographics columns) to Drive
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

In [None]:
# Limit displayed rows to 10 so the dtype listing is truncated (this setting persists for later cells)
pd.options.display.max_rows = 10
model_df.dtypes

Finally, I'll move the outcome variables to the very end of the model dataframe for easier access during training

In [None]:
# Reload the model dataframe from its parquet checkpoint
model_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

In [None]:
# Outcome columns that should sit at the very end of the dataframe, in this order
cols_to_move = [
    'readmit_allcause_30d', 'readmit_allcause_90d', 'readmit_allcause_180d', 'readmit_allcause_365d',
    'readmit_CV_30d', 'readmit_CV_90d', 'readmit_CV_180d', 'readmit_CV_365d',
    'mortality_inhosp_CV', 'mortality_inhosp_allcause',
]

# New column order: every non-outcome column in its current order, then the outcomes
outcome_names = set(cols_to_move)
cols = [name for name in model_df.columns if name not in outcome_names] + cols_to_move

# Reindex the DataFrame with the new column order
model_df = model_df[cols]
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,mortality_inhosp_allcause
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,1,1,1,1,0,0,0,0,0,0
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,1,1,1,1,0,0,0,0,0,0
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,1,1,1,1,0,0,0,0,0,0
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,-1,-1,-1,-1,0,0,0,0,0,0


In [None]:
# Save the reordered dataframe back to the same parquet file it was loaded from
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')

I'll also create a version where all the object columns are dropped, since I won't be using them

In [None]:
# Pull out the object-dtype columns to inspect what is about to be dropped
object_columns = model_df.select_dtypes(include=['object'])
object_columns

Unnamed: 0,code,Encounter ID,Stop code,diagnosis,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty
0,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,,,,,,
1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,,,,,,
2,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,,,,,,
3,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,,,,,,
4,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,,,,,,
...,...,...,...,...,...,...,...,...
18012027,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,,alcohol abuse,MEDICINE,MEDICINE,"PULMONARY, TUBERCULOSIS",Not specified (no value)
18012028,E440,f6956488-7e4c-5967-a95f-208aae167c25,,moderate protein-calorie malnutrition,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value)
18012029,F101,f6956488-7e4c-5967-a95f-208aae167c25,,alcohol abuse,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value)
18012030,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,,osteoarthritis unspecified site,SURGERY,SURGERY,NH SHORT STAY SKILLED NURSING,GENERAL SURGERY


In [None]:
# Keep every column whose dtype is not object
model_df = model_df.select_dtypes(exclude=['object'])
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,Age at visit,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,...,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,mortality_inhosp_allcause
0,101689,64,0,1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,107210,71,0,0,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,107866,61,0,0,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,-1,0,-1,-1,-1,-1,...,1,1,1,1,0,0,0,0,0,0
18012028,169062,-1,-1,0,-1,0,-1,-1,-1,-1,...,1,1,1,1,0,0,0,0,0,0
18012029,169062,-1,-1,0,-1,0,-1,-1,-1,-1,...,1,1,1,1,0,0,0,0,0,0
18012030,169064,-1,-1,0,-1,0,-1,-1,-1,-1,...,-1,-1,-1,-1,0,0,0,0,0,0


I'll also remove the Internalpatientid column since it won't be used

In [None]:
# Remove the Internalpatientid column from the dataframe
model_df = model_df.drop(columns='Internalpatientid')
model_df

Unnamed: 0,Age at ed visit,Died during ed visit,CV diagnosis,Age at visit,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,Age at admission,...,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,mortality_inhosp_allcause
0,64,0,1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,64,0,1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,71,0,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,71,0,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,61,0,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,-1,-1,0,-1,0,-1,-1,-1,-1,73,...,1,1,1,1,0,0,0,0,0,0
18012028,-1,-1,0,-1,0,-1,-1,-1,-1,73,...,1,1,1,1,0,0,0,0,0,0
18012029,-1,-1,0,-1,0,-1,-1,-1,-1,73,...,1,1,1,1,0,0,0,0,0,0
18012030,-1,-1,0,-1,0,-1,-1,-1,-1,82,...,-1,-1,-1,-1,0,0,0,0,0,0


In [None]:
# Count missing values per column and keep only the columns containing at least one NaN
missing_counts = model_df.isna().sum()
nan_cols = missing_counts[missing_counts != 0]

# This column receives its own fill value below, so it is left out of the generic fill
exclude_cols = ['liver_heptaocellular_product']

# Remaining NaN-containing columns, in their original order
selected_columns = nan_cols.index[~nan_cols.index.isin(exclude_cols)].tolist()

# Fill NaN values with -100, one column at a time
for col in selected_columns:
    model_df[col] = model_df[col].fillna(value=-100)

# NaNs in this column are filled with 1188 instead of -100
model_df['liver_heptaocellular_product'] = model_df['liver_heptaocellular_product'].fillna(value=1188)
model_df

Series([], dtype: int64)


In [None]:
# Display the dataframe after the NaN fill
model_df

Unnamed: 0,Age at ed visit,Died during ed visit,CV diagnosis,Age at visit,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,Age at admission,...,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,mortality_inhosp_allcause
0,64,0,1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,64,0,1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,71,0,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,71,0,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,61,0,0,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,-1,-1,0,-1,0,-1,-1,-1,-1,73,...,1,1,1,1,0,0,0,0,0,0
18012028,-1,-1,0,-1,0,-1,-1,-1,-1,73,...,1,1,1,1,0,0,0,0,0,0
18012029,-1,-1,0,-1,0,-1,-1,-1,-1,73,...,1,1,1,1,0,0,0,0,0,0
18012030,-1,-1,0,-1,0,-1,-1,-1,-1,82,...,-1,-1,-1,-1,0,0,0,0,0,0


In [None]:
# Save the numeric-only, NaN-filled dataframe under a separate file name
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model_final.parquet')

In [None]:
# Reload model.parquet, which was saved before the object columns and
# Internalpatientid were dropped, replacing the trimmed model_df in memory
model_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model.parquet')
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,mortality_inhosp_allcause
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,1,1,1,1,0,0,0,0,0,0
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,1,1,1,1,0,0,0,0,0,0
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,1,1,1,1,0,0,0,0,0,0
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,-1,-1,-1,-1,0,0,0,0,0,0


In [None]:
# Inspect every row whose Internalpatientid equals 1
model_df_filt = model_df.loc[model_df['Internalpatientid'].eq(1)]
model_df_filt

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,mortality_inhosp_allcause
8674,1,75,0,0,E835,cd71eee5-0ea9-511c-89ef-fb54a37c7758,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
11887,1,79,0,1,I504,e8f395d3-c8d4-5cf4-a686-34352c9e47c3,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
11888,1,79,0,1,S700,e8f395d3-c8d4-5cf4-a686-34352c9e47c3,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
19002,1,76,0,0,R079,dc7d3573-3569-5524-a3b1-61dc48f7f610,-1,,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
406537,1,-1,-1,-1,Z790,cf27f2bd-1de6-5267-a75a-5334c38ea74b,74,CLINICAL PHARMACY,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16803745,1,-1,-1,0,N998,aee47e87-cab8-5ca7-9947-21cb2daf476b,-1,,0,-1,...,0,0,0,1,0,0,0,0,0,0
16803746,1,-1,-1,0,N998,aee47e87-cab8-5ca7-9947-21cb2daf476b,-1,,0,-1,...,0,0,0,1,0,0,0,0,0,0
16803747,1,-1,-1,0,N998,aee47e87-cab8-5ca7-9947-21cb2daf476b,-1,,0,-1,...,0,0,0,1,0,0,0,0,0,0
16803748,1,-1,-1,0,I489,d164c341-63a3-5729-9bf1-a6b54bae4a74,-1,,0,-1,...,0,0,0,1,0,0,0,0,0,0


In [None]:
# Show the names of the last ten columns of the filtered frame
model_df_filt.columns[-10:]

Index(['readmit_allcause_30d', 'readmit_allcause_90d', 'readmit_allcause_180d',
       'readmit_allcause_365d', 'readmit_CV_30d', 'readmit_CV_90d',
       'readmit_CV_180d', 'readmit_CV_365d', 'mortality_inhosp_CV',
       'mortality_inhosp_allcause'],
      dtype='object')

In [None]:
# Outcome columns to summarise per patient
agg_columns = [
    'readmit_allcause_30d',
    'readmit_allcause_90d',
    'readmit_allcause_180d',
    'readmit_allcause_365d',
    'readmit_CV_30d',
    'readmit_CV_90d',
    'readmit_CV_180d',
    'readmit_CV_365d',
    'mortality_inhosp_CV',
    'mortality_inhosp_allcause',
]

# One row per 'Internalpatientid', holding the maximum of each outcome column
# over all of that patient's rows
grouped_df = model_df.groupby(by='Internalpatientid')[agg_columns].agg('max')
grouped_df

Unnamed: 0_level_0,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,mortality_inhosp_allcause
Internalpatientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,0,0,1,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,0,0
3,-1,-1,-1,-1,0,0,0,0,0,0
4,-1,-1,-1,-1,0,0,0,0,0,0
5,-1,-1,-1,-1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
169057,1,1,1,1,1,1,1,1,0,0
169060,1,1,1,1,1,1,1,1,0,0
169061,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
169062,1,1,1,1,0,0,0,0,0,0


In [None]:
# Outcome columns to collapse to one value per patient
agg_columns = ['readmit_allcause_30d', 'readmit_allcause_90d', 'readmit_allcause_180d',
               'readmit_allcause_365d', 'readmit_CV_30d', 'readmit_CV_90d',
               'readmit_CV_180d', 'readmit_CV_365d', 'mortality_inhosp_CV',
               'mortality_inhosp_allcause']

# One row per 'Internalpatientid': the maximum of each outcome column over all of
# that patient's rows
grouped_df = model_df.groupby('Internalpatientid')[agg_columns].max()

# Step 1: mark -1 as missing (NaN) so it can be imputed below
grouped_df = grouped_df.replace(-1, float('nan'))

# Step 2: for each column, find the most frequent strictly positive value.
# This is a single value per column computed across ALL patients; it is not
# computed separately for each patient.
most_frequent_positives = grouped_df[grouped_df > 0].mode()

# Step 3: build the per-column fill values. A column with no positive value has
# no mode (NaN, or an empty result when no column has one); fall back to the -1
# sentinel there so the int8 cast below cannot fail on leftover NaNs.
if most_frequent_positives.empty:
    fill_values = pd.Series(-1, index=agg_columns)
else:
    fill_values = most_frequent_positives.iloc[0].fillna(-1)

# Step 4: impute the missing entries with the per-column fill value.
# NOTE(review): because only values > 0 feed the mode, every patient whose
# outcome was -1 is assigned a positive label here (including the mortality
# columns) -- confirm this is the intended labelling before training on it.
grouped_df = grouped_df.fillna(fill_values)

# The columns converted to 'int8' are exactly the aggregated outcome columns
cols_to_convert = list(agg_columns)

# Convert specified columns to 'int8' (they became float when NaN was introduced)
grouped_df[cols_to_convert] = grouped_df[cols_to_convert].astype('int8')

grouped_df

Unnamed: 0_level_0,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,mortality_inhosp_allcause
Internalpatientid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,0,0,1,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,0,0
3,1,1,1,1,0,0,0,0,0,0
4,1,1,1,1,0,0,0,0,0,0
5,1,1,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
169057,1,1,1,1,1,1,1,1,0,0
169060,1,1,1,1,1,1,1,1,0,0
169061,1,1,1,1,1,1,1,1,1,1
169062,1,1,1,1,0,0,0,0,0,0


In [None]:
# Update values in 'model_df' using 'grouped_df'
# Index model_df by patient id (in place) so that update() can align grouped_df's
# per-patient rows with model_df's rows by index label
model_df.set_index('Internalpatientid', inplace=True)
# update() modifies model_df in place: for matching index labels and shared column
# names, model_df's values are overwritten with grouped_df's non-NaN values.
# NOTE(review): the displayed result shows the outcome columns as floats after this
# step; a later cell casts them back to int8.
model_df.update(grouped_df)

# Reset the index to its original state if needed
# (turns 'Internalpatientid' back into a regular column, again in place)
model_df.reset_index(inplace=True)

model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,mortality_inhosp_allcause
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Distribution of readmit_CV_30d over the per-patient rows of grouped_df
grouped_df['readmit_CV_30d'].value_counts()

0    58371
1    47056
Name: readmit_CV_30d, dtype: int64

In [None]:
# Outcome columns whose dtype is changed to 'int8'
cols_to_convert = [
    'readmit_allcause_30d',
    'readmit_allcause_90d',
    'readmit_allcause_180d',
    'readmit_allcause_365d',
    'readmit_CV_30d',
    'readmit_CV_90d',
    'readmit_CV_180d',
    'readmit_CV_365d',
    'mortality_inhosp_CV',
    'mortality_inhosp_allcause',
]

# Cast just those columns and assign them back; all other columns are untouched
model_df[cols_to_convert] = model_df.loc[:, cols_to_convert].astype(dtype='int8')
model_df

Unnamed: 0,Internalpatientid,Age at ed visit,Died during ed visit,CV diagnosis,code,Encounter ID,Age at visit,Stop code,Agentorangeflag,Combatflag,...,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,mortality_inhosp_allcause
0,101689,64,0,1,I502,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0,1,1,1,0,0,0,0,0,0
1,101689,64,0,1,N189,2afc637f-1e93-5ce7-b817-0784cebc77a1,-1,,-1,-1,...,0,1,1,1,0,0,0,0,0,0
2,107210,71,0,0,A419,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,1,1,1,1,1,1,1,1,0,0
3,107210,71,0,0,I959,0d3f5146-915f-5e92-b3e0-0ea9f4aa0141,-1,,-1,-1,...,1,1,1,1,1,1,1,1,0,0
4,107866,61,0,0,R918,efb622f7-8c68-5b26-ae9f-09147ff0dce7,-1,,-1,-1,...,1,1,1,1,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,169062,-1,-1,0,F101,e6460fae-0e3b-5fda-a016-f36cdd654819,-1,,0,-1,...,1,1,1,1,0,0,0,0,0,0
18012028,169062,-1,-1,0,E440,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,1,1,1,1,0,0,0,0,0,0
18012029,169062,-1,-1,0,F101,f6956488-7e4c-5967-a95f-208aae167c25,-1,,0,-1,...,1,1,1,1,0,0,0,0,0,0
18012030,169064,-1,-1,0,M199,0c52ac5b-8578-50a3-9f26-fbdf68dad075,-1,,0,-1,...,1,1,1,1,0,0,0,0,0,0


In [29]:
# Save the dataframe with the updated outcome columns as a separate V2 file
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model_V2.parquet')

In [30]:
# Keep only the non-object columns, then remove the Internalpatientid column
model_df = model_df.select_dtypes(exclude=['object'])
model_df = model_df.drop(columns='Internalpatientid')

# Count missing values per column and keep only the columns containing at least one NaN
missing_counts = model_df.isna().sum()
nan_cols = missing_counts[missing_counts != 0]

# This column receives its own fill value below, so it is left out of the generic fill
exclude_cols = ['liver_heptaocellular_product']

# Remaining NaN-containing columns, in their original order
selected_columns = nan_cols.index[~nan_cols.index.isin(exclude_cols)].tolist()

# Fill NaN values with -100, one column at a time
for col in selected_columns:
    model_df[col] = model_df[col].fillna(value=-100)

# NaNs in this column are filled with 1188 instead of -100
model_df['liver_heptaocellular_product'] = model_df['liver_heptaocellular_product'].fillna(value=1188)
model_df

Unnamed: 0,Age at ed visit,Died during ed visit,CV diagnosis,Age at visit,Agentorangeflag,Combatflag,Ionizingradiationflag,Serviceconnectedflag,Swasiaconditionsflag,Age at admission,...,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,mortality_inhosp_allcause
0,64,0,1,-1,-1,-1,-1,-1,-1,-1,...,0,1,1,1,0,0,0,0,0,0
1,64,0,1,-1,-1,-1,-1,-1,-1,-1,...,0,1,1,1,0,0,0,0,0,0
2,71,0,0,-1,-1,-1,-1,-1,-1,-1,...,1,1,1,1,1,1,1,1,0,0
3,71,0,0,-1,-1,-1,-1,-1,-1,-1,...,1,1,1,1,1,1,1,1,0,0
4,61,0,0,-1,-1,-1,-1,-1,-1,-1,...,1,1,1,1,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18012027,-1,-1,0,-1,0,-1,-1,-1,-1,73,...,1,1,1,1,0,0,0,0,0,0
18012028,-1,-1,0,-1,0,-1,-1,-1,-1,73,...,1,1,1,1,0,0,0,0,0,0
18012029,-1,-1,0,-1,0,-1,-1,-1,-1,73,...,1,1,1,1,0,0,0,0,0,0
18012030,-1,-1,0,-1,0,-1,-1,-1,-1,82,...,1,1,1,1,0,0,0,0,0,0


In [31]:
# Save the numeric-only, NaN-filled V2 dataframe to Google Drive
model_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Model DF/model_final_V2.parquet')