<a href="https://colab.research.google.com/github/victormurcia/VCHAMPS/blob/main/VCHAMPS_Harmonizing_Inpatient_Specialty_Level_of_Care.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#General utilities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm  # Import tqdm for the progress bar
import math
import glob,shutil,os,warnings,math,time,sys,re
from typing import List
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

#For UUID generation
import uuid

#For Slider viz
import ipywidgets as widgets
from IPython.display import display, clear_output,HTML

#Enable data to be extracted and downloaded from my Google Drive
from google.colab import drive, files
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Specify the path to the desired directory
directory_path = r'/content/drive/MyDrive/VCHAMPS - Train Cleaned'

# Change the current working directory to the desired directory
os.chdir(directory_path)

# Verify the current working directory
cwd = os.getcwd()

print(f"Current working directory: {cwd}")

Current working directory: /content/drive/MyDrive/VCHAMPS - Train Cleaned


In [3]:
#Instantiate level of care df
levelofcare_df = pd.read_csv('/content/drive/MyDrive/levelOfCare.csv')
levelofcare_df = levelofcare_df.drop('Unnamed: 0',axis=1)
levelofcare_df

Unnamed: 0,Specialty,counts,genMed,hospice,homeCare,homelessRecovery,rehab,snf,psych,obs,drug,stepdown,icu,other
0,GEN INTERMEDIATE PSYCH,12289,1,0,0,0,0,0,1,0,0,0,0,0
1,GEN MEDICINE (ACUTE),6372,1,0,0,0,0,0,0,0,0,0,0,0
2,GENERAL(ACUTE MEDICINE),249188,1,0,0,0,0,0,0,0,0,0,0,0
3,zGENERAL(ACUTE MEDICINE,15,1,0,0,0,0,0,0,0,0,0,0,0
4,HOSPICE FOR ACUTE CARE,7215,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,STROKE UNIT,6282,0,0,0,0,0,0,0,0,0,0,0,1
122,THORACIC SURGERY,9610,0,0,0,0,0,0,0,0,0,0,0,1
123,TRANSPLANTATION,11296,0,0,0,0,0,0,0,0,0,0,0,1
124,UROLOGY,13155,0,0,0,0,0,0,0,0,0,0,0,1


In [4]:
inpatient_specialty_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/inpatient_specialty.parquet')
inpatient_specialty_df

Unnamed: 0,Internalpatientid,Age at specialty,Specialty start date,Specialty end date,Specialty,Encounter ID
0,1,79,2022-12-31 05:41:51,2023-01-01 23:06:29,DERMATOLOGY,e8f395d3-c8d4-5cf4-a686-34352c9e47c3
1,100001,87,2012-07-21 23:00:00,2012-07-22 19:18:47,PM&R TRANSITIONAL REHAB,13f5cb00-3104-519d-89ef-33c7a85f2de8
2,100015,57,2002-07-16 06:40:13,2002-07-16 06:40:13,INTERMEDIATE MEDICINE,7348e73e-71bd-404f-bf10-177f0d249226
3,100015,58,2004-02-21 11:01:35,2004-02-22 14:56:40,DOMICILIARY PTSD,da52c625-e2bd-5031-ba84-a938baa4f96e
4,100019,81,1999-10-24 10:44:24,1999-10-31 10:20:31,PLASTIC SURGERY,fba82c7b-9bf6-599e-85cb-c65ed3831022
...,...,...,...,...,...,...
725466,99993,58,2007-02-18 00:35:16,2007-02-18 04:55:16,METABOLIC,ec0d7f6c-983e-5aa9-8474-d948f36a52f4
725467,99994,83,2016-10-18 10:35:16,2016-10-21 11:02:57,NH SHORT STAY DEMENTIA CARE,4b258fe8-25ab-57b4-8485-8313022fc335
725468,99994,86,2019-08-30 08:38:03,2019-09-05 09:28:48,"PULMONARY, NON-TB",c3900bc9-5e28-5def-8584-7f06a5bd9be3
725469,99996,56,2011-08-17 00:10:52,2011-08-17 00:10:52,GENERAL(ACUTE MEDICINE),502e87d1-488b-4849-bc7f-a7845e8966af


In [5]:
# Merge df1 and df2 on 'Specialty' column
inpatient_specialty_merged_df = pd.merge(inpatient_specialty_df, levelofcare_df, on='Specialty')
inpatient_specialty_merged_df

Unnamed: 0,Internalpatientid,Age at specialty,Specialty start date,Specialty end date,Specialty,Encounter ID,counts,genMed,hospice,homeCare,homelessRecovery,rehab,snf,psych,obs,drug,stepdown,icu,other
0,1,79,2022-12-31 05:41:51,2023-01-01 23:06:29,DERMATOLOGY,e8f395d3-c8d4-5cf4-a686-34352c9e47c3,11480,0,0,0,0,0,0,0,0,0,0,0,1
1,100095,84,2004-07-25 18:00:57,2004-07-26 08:04:01,DERMATOLOGY,98f745fb-5f07-4f4c-86f4-36979464dca3,11480,0,0,0,0,0,0,0,0,0,0,0,1
2,10064,51,2009-03-01 05:45:08,2009-03-01 07:23:26,DERMATOLOGY,437a3995-2112-5b1c-865b-6ba64c5bd0a1,11480,0,0,0,0,0,0,0,0,0,0,0,1
3,100751,65,2016-02-10 16:57:15,2016-02-14 13:43:38,DERMATOLOGY,63d9b053-ec7a-5c93-bc23-f52d522208c9,11480,0,0,0,0,0,0,0,0,0,0,0,1
4,101245,65,2015-12-27 20:54:57,2015-12-28 05:53:45,DERMATOLOGY,33037e3c-db21-57dc-89b0-fc1e09ebd5fc,11480,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435450,99689,74,2001-03-26 15:34:13,2001-03-26 17:13:06,"ZZSUBST ABUSE STAR I,II,II",99375e10-7fdc-5328-b8cb-3f4fb64796fe,6,0,0,0,0,0,0,0,0,1,0,0,0
1435451,99689,74,2001-03-26 17:09:34,2001-03-28 18:35:00,"ZZSUBST ABUSE STAR I,II,II",99375e10-7fdc-5328-b8cb-3f4fb64796fe,6,0,0,0,0,0,0,0,0,1,0,0,0
1435452,58306,52,2000-06-04 19:51:59,2000-06-11 18:08:46,"ZZSUBST ABUSE STAR I,II,II",56ec12dd-131a-4e22-a0b1-d6026f6ca121,6,0,0,0,0,0,0,0,0,1,0,0,0
1435453,58306,52,2000-06-02 19:22:06,2000-06-04 19:43:46,"ZZSUBST ABUSE STAR I,II,II",b24abd9c-f525-442b-92eb-b712b2c626a8,6,0,0,0,0,0,0,0,0,1,0,0,0


In [6]:
# Select the columns to be sum-collapsed and converted to binary
columns_to_collapse = ['genMed', 'hospice', 'homeCare', 'homelessRecovery', 'rehab',
                       'snf', 'psych', 'obs', 'drug', 'stepdown', 'icu', 'other']

# Group by 'Encounter ID' and sum-collapse the specified columns
grouped_df = inpatient_specialty_merged_df.groupby('Encounter ID')[columns_to_collapse].sum()

# Convert each dummy variable back to binary (if value > 0, 1, else 0)
binary_df = grouped_df.applymap(lambda x: 1 if x > 0 else 0)

# Add 'loc_' prefix to the column names
binary_df.columns = ['loc_' + col for col in binary_df.columns]
binary_df

Unnamed: 0_level_0,loc_genMed,loc_hospice,loc_homeCare,loc_homelessRecovery,loc_rehab,loc_snf,loc_psych,loc_obs,loc_drug,loc_stepdown,loc_icu,loc_other
Encounter ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0000347e-d7c6-52ff-b34b-0bc2c34c7f30,0,0,0,0,0,0,0,0,0,0,0,1
00007d37-6979-51f6-81cb-2cf4b98f5266,0,0,0,0,1,1,1,0,0,0,0,0
00008679-5bec-4d10-9511-74523a1cfd0d,0,0,0,0,0,0,0,1,0,0,0,0
000088c0-420b-4671-8b1d-90eed7d420f9,0,0,0,0,0,0,0,0,0,0,0,1
00008dd0-fe30-4782-8081-9de560716b24,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
ffffbff6-5789-53d9-b060-36a4ea4646d9,0,0,1,0,1,0,1,0,0,0,0,0
ffffd350-0a98-57de-92dd-b680c7ed041f,1,0,0,0,0,0,0,1,0,0,0,0
ffffd7c4-a2c3-44cd-a27b-101ae712f139,1,0,0,0,0,0,0,0,0,0,0,0
fffffc48-b17e-59f9-8409-5355dab6445f,0,1,0,0,0,0,0,0,0,0,0,1


In [7]:
# Merge binary_df with inpatient_specialty_merged_df on 'Encounter ID'
inpatient_specialty_merged_df = pd.merge(inpatient_specialty_merged_df, binary_df, on='Encounter ID', how='left')
inpatient_specialty_merged_df

Unnamed: 0,Internalpatientid,Age at specialty,Specialty start date,Specialty end date,Specialty,Encounter ID,counts,genMed,hospice,homeCare,...,loc_homeCare,loc_homelessRecovery,loc_rehab,loc_snf,loc_psych,loc_obs,loc_drug,loc_stepdown,loc_icu,loc_other
0,1,79,2022-12-31 05:41:51,2023-01-01 23:06:29,DERMATOLOGY,e8f395d3-c8d4-5cf4-a686-34352c9e47c3,11480,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,100095,84,2004-07-25 18:00:57,2004-07-26 08:04:01,DERMATOLOGY,98f745fb-5f07-4f4c-86f4-36979464dca3,11480,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,10064,51,2009-03-01 05:45:08,2009-03-01 07:23:26,DERMATOLOGY,437a3995-2112-5b1c-865b-6ba64c5bd0a1,11480,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,100751,65,2016-02-10 16:57:15,2016-02-14 13:43:38,DERMATOLOGY,63d9b053-ec7a-5c93-bc23-f52d522208c9,11480,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,101245,65,2015-12-27 20:54:57,2015-12-28 05:53:45,DERMATOLOGY,33037e3c-db21-57dc-89b0-fc1e09ebd5fc,11480,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435450,99689,74,2001-03-26 15:34:13,2001-03-26 17:13:06,"ZZSUBST ABUSE STAR I,II,II",99375e10-7fdc-5328-b8cb-3f4fb64796fe,6,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1435451,99689,74,2001-03-26 17:09:34,2001-03-28 18:35:00,"ZZSUBST ABUSE STAR I,II,II",99375e10-7fdc-5328-b8cb-3f4fb64796fe,6,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1435452,58306,52,2000-06-04 19:51:59,2000-06-11 18:08:46,"ZZSUBST ABUSE STAR I,II,II",56ec12dd-131a-4e22-a0b1-d6026f6ca121,6,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1435453,58306,52,2000-06-02 19:22:06,2000-06-04 19:43:46,"ZZSUBST ABUSE STAR I,II,II",b24abd9c-f525-442b-92eb-b712b2c626a8,6,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# icu days

In [8]:
# Select rows where 'loc_icu' is equal to 1
icu_rows_df = inpatient_specialty_merged_df[inpatient_specialty_merged_df['loc_icu'] == 1]
# Calculate duration of ICU stay for each encounter
icu_rows_df['ICU_days'] = (icu_rows_df['Specialty end date'] - icu_rows_df['Specialty start date']).dt.days
icu_rows_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  icu_rows_df['ICU_days'] = (icu_rows_df['Specialty end date'] - icu_rows_df['Specialty start date']).dt.days


Unnamed: 0,Internalpatientid,Age at specialty,Specialty start date,Specialty end date,Specialty,Encounter ID,counts,genMed,hospice,homeCare,...,loc_homelessRecovery,loc_rehab,loc_snf,loc_psych,loc_obs,loc_drug,loc_stepdown,loc_icu,loc_other,ICU_days
21,102845,77,2009-08-26 05:47:08,2009-09-11 23:46:22,DERMATOLOGY,512c199e-98a2-58f5-a0bd-b4591faa2417,11480,0,0,0,...,1,0,0,0,0,0,0,1,1,16
324,110603,67,2011-01-26 15:51:19,2011-01-27 13:10:53,DERMATOLOGY,4d19a385-ed38-5ce6-8497-2828a010932f,11480,0,0,0,...,0,0,0,0,0,0,0,1,1,0
363,102440,83,2010-12-16 22:11:30,2010-12-17 06:18:26,DERMATOLOGY,589f00df-7071-55ac-856a-492c9e02cc5a,11480,0,0,0,...,0,0,1,1,0,0,0,1,1,0
434,103594,81,2022-01-26 02:04:27,2022-01-27 11:45:26,DERMATOLOGY,cac3f000-18f6-5e4d-961b-30292509ec11,11480,0,0,0,...,0,0,0,1,0,0,0,1,1,1
518,112665,80,2001-10-13 04:10:13,2001-11-15 18:53:10,DERMATOLOGY,e4d7a4bd-11af-5d7e-8256-60916656259c,11480,0,0,0,...,0,0,0,0,0,0,0,1,1,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435287,96627,71,2002-12-22 16:44:18,2002-12-23 13:36:12,ZZSUBSTANCE ABUSE INTERMEDCARE,cfcc4e38-10d6-5356-81f4-584e1481b942,304,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1435305,100213,66,2019-12-09 22:46:37,2019-12-11 06:16:33,ZZDRUG DEPENDENCE TRMT UNIT,2ea028e5-19c4-57b1-bf46-554bca5771fb,120,0,0,0,...,0,1,0,0,0,1,0,1,1,1
1435309,113276,64,2015-08-07 20:49:40,2015-08-08 00:34:39,ZZDRUG DEPENDENCE TRMT UNIT,fef57d78-cca7-55a1-a835-4d6a5c4a03f5,120,0,0,0,...,0,0,0,0,0,1,0,1,1,0
1435312,113276,64,2015-08-05 21:10:27,2015-08-06 23:34:14,ZZDRUG DEPENDENCE TRMT UNIT,fef57d78-cca7-55a1-a835-4d6a5c4a03f5,120,0,0,0,...,0,0,0,0,0,1,0,1,1,1


In [9]:
# Merge icu_stay_df with inpatient_specialty_merged_df on 'Encounter ID'
inpatient_specialty_merged_df = pd.merge(inpatient_specialty_merged_df, icu_rows_df[['Encounter ID', 'ICU_days']], on='Encounter ID', how='left')
inpatient_specialty_merged_df

Unnamed: 0,Internalpatientid,Age at specialty,Specialty start date,Specialty end date,Specialty,Encounter ID,counts,genMed,hospice,homeCare,...,loc_homelessRecovery,loc_rehab,loc_snf,loc_psych,loc_obs,loc_drug,loc_stepdown,loc_icu,loc_other,ICU_days
0,1,79,2022-12-31 05:41:51,2023-01-01 23:06:29,DERMATOLOGY,e8f395d3-c8d4-5cf4-a686-34352c9e47c3,11480,0,0,0,...,0,0,0,0,0,0,0,0,1,
1,100095,84,2004-07-25 18:00:57,2004-07-26 08:04:01,DERMATOLOGY,98f745fb-5f07-4f4c-86f4-36979464dca3,11480,0,0,0,...,0,0,0,0,0,0,0,0,1,
2,10064,51,2009-03-01 05:45:08,2009-03-01 07:23:26,DERMATOLOGY,437a3995-2112-5b1c-865b-6ba64c5bd0a1,11480,0,0,0,...,0,0,0,0,0,0,0,0,1,
3,100751,65,2016-02-10 16:57:15,2016-02-14 13:43:38,DERMATOLOGY,63d9b053-ec7a-5c93-bc23-f52d522208c9,11480,0,0,0,...,0,0,0,0,0,0,0,0,1,
4,101245,65,2015-12-27 20:54:57,2015-12-28 05:53:45,DERMATOLOGY,33037e3c-db21-57dc-89b0-fc1e09ebd5fc,11480,0,0,0,...,0,0,0,0,0,0,0,0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1773126,99689,74,2001-03-26 15:34:13,2001-03-26 17:13:06,"ZZSUBST ABUSE STAR I,II,II",99375e10-7fdc-5328-b8cb-3f4fb64796fe,6,0,0,0,...,0,0,0,0,0,1,0,0,0,
1773127,99689,74,2001-03-26 17:09:34,2001-03-28 18:35:00,"ZZSUBST ABUSE STAR I,II,II",99375e10-7fdc-5328-b8cb-3f4fb64796fe,6,0,0,0,...,0,0,0,0,0,1,0,0,0,
1773128,58306,52,2000-06-04 19:51:59,2000-06-11 18:08:46,"ZZSUBST ABUSE STAR I,II,II",56ec12dd-131a-4e22-a0b1-d6026f6ca121,6,0,0,0,...,0,0,0,0,0,1,0,0,0,
1773129,58306,52,2000-06-02 19:22:06,2000-06-04 19:43:46,"ZZSUBST ABUSE STAR I,II,II",b24abd9c-f525-442b-92eb-b712b2c626a8,6,0,0,0,...,0,0,0,0,0,1,0,0,0,


# stepdown days

In [10]:
# Select rows where 'loc_icu' is equal to 1
stepdown_rows_df = inpatient_specialty_merged_df[inpatient_specialty_merged_df['loc_stepdown'] == 1]
# Calculate duration of ICU stay for each encounter
stepdown_rows_df['stepdown_days'] = (stepdown_rows_df['Specialty end date'] - stepdown_rows_df['Specialty start date']).dt.days
stepdown_rows_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stepdown_rows_df['stepdown_days'] = (stepdown_rows_df['Specialty end date'] - stepdown_rows_df['Specialty start date']).dt.days


Unnamed: 0,Internalpatientid,Age at specialty,Specialty start date,Specialty end date,Specialty,Encounter ID,counts,genMed,hospice,homeCare,...,loc_rehab,loc_snf,loc_psych,loc_obs,loc_drug,loc_stepdown,loc_icu,loc_other,ICU_days,stepdown_days
7,102128,66,2013-03-12 05:13:12,2013-03-12 08:58:42,DERMATOLOGY,1bcea015-3a08-580a-a1ff-62c381bffd6f,11480,0,0,0,...,0,0,0,0,0,1,0,1,,0
42,104735,71,2013-05-11 13:05:55,2013-05-13 23:09:21,DERMATOLOGY,58283f91-1c24-57c0-86e6-19e8aa35624c,11480,0,0,0,...,0,0,0,0,0,1,0,1,,2
60,104007,67,2015-03-27 04:50:12,2015-03-28 19:41:33,DERMATOLOGY,5e6730aa-df80-5073-b700-ae8e39ea389e,11480,0,0,0,...,0,0,0,0,0,1,0,1,,1
72,105095,76,2019-07-27 01:55:10,2019-07-27 02:08:57,DERMATOLOGY,0adee2c4-f8b4-50e8-b92f-10be572d14b2,11480,0,0,0,...,0,0,0,0,0,1,0,1,,0
135,104735,72,2013-12-20 19:55:00,2013-12-22 06:30:13,DERMATOLOGY,de9aae37-fa71-571c-a3c5-48e9e5935b6f,11480,0,0,0,...,0,0,0,0,0,1,0,1,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1772944,54717,73,2017-01-02 20:03:29,2017-01-02 20:03:29,CARDIAC STEP DOWN UNIT,23ef2682-8c20-58b1-a5ad-bd22aecdbe45,16,0,0,0,...,0,1,0,0,0,1,0,0,,0
1772945,53283,75,2022-09-07 09:22:10,2022-09-07 09:22:10,CARDIAC STEP DOWN UNIT,7934559d-c294-5706-9e41-c7659cfc7f56,16,0,0,0,...,0,0,0,0,0,1,0,0,,0
1772946,53283,76,2024-02-23 03:53:45,2024-02-23 03:53:45,CARDIAC STEP DOWN UNIT,9860a9d9-1e1f-583c-a3a4-16f3f1e6de62,16,0,0,0,...,0,0,0,0,0,1,0,0,,0
1772947,155757,81,2024-05-19 20:33:12,2024-05-19 20:33:12,CARDIAC STEP DOWN UNIT,01a4c662-b42e-5284-890d-58176055a241,16,0,0,0,...,0,0,0,0,0,1,0,0,,0


In [11]:
# Merge icu_stay_df with inpatient_specialty_merged_df on 'Encounter ID'
inpatient_specialty_merged_df = pd.merge(inpatient_specialty_merged_df, stepdown_rows_df[['Encounter ID', 'stepdown_days']], on='Encounter ID', how='left')
inpatient_specialty_merged_df

Unnamed: 0,Internalpatientid,Age at specialty,Specialty start date,Specialty end date,Specialty,Encounter ID,counts,genMed,hospice,homeCare,...,loc_rehab,loc_snf,loc_psych,loc_obs,loc_drug,loc_stepdown,loc_icu,loc_other,ICU_days,stepdown_days
0,1,79,2022-12-31 05:41:51,2023-01-01 23:06:29,DERMATOLOGY,e8f395d3-c8d4-5cf4-a686-34352c9e47c3,11480,0,0,0,...,0,0,0,0,0,0,0,1,,
1,100095,84,2004-07-25 18:00:57,2004-07-26 08:04:01,DERMATOLOGY,98f745fb-5f07-4f4c-86f4-36979464dca3,11480,0,0,0,...,0,0,0,0,0,0,0,1,,
2,10064,51,2009-03-01 05:45:08,2009-03-01 07:23:26,DERMATOLOGY,437a3995-2112-5b1c-865b-6ba64c5bd0a1,11480,0,0,0,...,0,0,0,0,0,0,0,1,,
3,100751,65,2016-02-10 16:57:15,2016-02-14 13:43:38,DERMATOLOGY,63d9b053-ec7a-5c93-bc23-f52d522208c9,11480,0,0,0,...,0,0,0,0,0,0,0,1,,
4,101245,65,2015-12-27 20:54:57,2015-12-28 05:53:45,DERMATOLOGY,33037e3c-db21-57dc-89b0-fc1e09ebd5fc,11480,0,0,0,...,0,0,0,0,0,0,0,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40073402,99689,74,2001-03-26 15:34:13,2001-03-26 17:13:06,"ZZSUBST ABUSE STAR I,II,II",99375e10-7fdc-5328-b8cb-3f4fb64796fe,6,0,0,0,...,0,0,0,0,1,0,0,0,,
40073403,99689,74,2001-03-26 17:09:34,2001-03-28 18:35:00,"ZZSUBST ABUSE STAR I,II,II",99375e10-7fdc-5328-b8cb-3f4fb64796fe,6,0,0,0,...,0,0,0,0,1,0,0,0,,
40073404,58306,52,2000-06-04 19:51:59,2000-06-11 18:08:46,"ZZSUBST ABUSE STAR I,II,II",56ec12dd-131a-4e22-a0b1-d6026f6ca121,6,0,0,0,...,0,0,0,0,1,0,0,0,,
40073405,58306,52,2000-06-02 19:22:06,2000-06-04 19:43:46,"ZZSUBST ABUSE STAR I,II,II",b24abd9c-f525-442b-92eb-b712b2c626a8,6,0,0,0,...,0,0,0,0,1,0,0,0,,


In [12]:
inpatient_specialty_merged_df['ICU_days'].value_counts()

0.0       14215864
1.0        5578767
2.0        2464655
3.0        2365730
4.0        1895139
            ...   
262.0            1
305.0            1
238.0            1
306.0            1
1557.0           1
Name: ICU_days, Length: 432, dtype: int64

In [13]:
inpatient_specialty_merged_df['stepdown_days'].value_counts()

0.0      14230622
1.0       5583805
2.0       2468731
3.0       2367454
4.0       1896758
           ...   
620.0           1
581.0           1
387.0           1
544.0           1
197.0           1
Name: stepdown_days, Length: 435, dtype: int64

In [14]:
inpatient_specialty_merged_df.dtypes

Internalpatientid                int32
Age at specialty                  int8
Specialty start date    datetime64[ns]
Specialty end date      datetime64[ns]
Specialty                       object
Encounter ID                    object
counts                           int64
genMed                           int64
hospice                          int64
homeCare                         int64
homelessRecovery                 int64
rehab                            int64
snf                              int64
psych                            int64
obs                              int64
drug                             int64
stepdown                         int64
icu                              int64
other                            int64
loc_genMed                       int64
loc_hospice                      int64
loc_homeCare                     int64
loc_homelessRecovery             int64
loc_rehab                        int64
loc_snf                          int64
loc_psych                

In [15]:
columns_to_convert = ['genMed', 'hospice', 'homeCare', 'homelessRecovery', 'rehab', 'snf', 'psych', 'obs',
                      'drug', 'stepdown', 'icu', 'other', 'loc_genMed', 'loc_hospice', 'loc_homeCare',
                      'loc_homelessRecovery', 'loc_rehab', 'loc_snf', 'loc_psych', 'loc_obs', 'loc_drug',
                      'loc_stepdown', 'loc_icu', 'loc_other']

inpatient_specialty_merged_df[columns_to_convert] = inpatient_specialty_merged_df[columns_to_convert].astype('int8')
inpatient_specialty_merged_df

Unnamed: 0,Internalpatientid,Age at specialty,Specialty start date,Specialty end date,Specialty,Encounter ID,counts,genMed,hospice,homeCare,...,loc_rehab,loc_snf,loc_psych,loc_obs,loc_drug,loc_stepdown,loc_icu,loc_other,ICU_days,stepdown_days
0,1,79,2022-12-31 05:41:51,2023-01-01 23:06:29,DERMATOLOGY,e8f395d3-c8d4-5cf4-a686-34352c9e47c3,11480,0,0,0,...,0,0,0,0,0,0,0,1,,
1,100095,84,2004-07-25 18:00:57,2004-07-26 08:04:01,DERMATOLOGY,98f745fb-5f07-4f4c-86f4-36979464dca3,11480,0,0,0,...,0,0,0,0,0,0,0,1,,
2,10064,51,2009-03-01 05:45:08,2009-03-01 07:23:26,DERMATOLOGY,437a3995-2112-5b1c-865b-6ba64c5bd0a1,11480,0,0,0,...,0,0,0,0,0,0,0,1,,
3,100751,65,2016-02-10 16:57:15,2016-02-14 13:43:38,DERMATOLOGY,63d9b053-ec7a-5c93-bc23-f52d522208c9,11480,0,0,0,...,0,0,0,0,0,0,0,1,,
4,101245,65,2015-12-27 20:54:57,2015-12-28 05:53:45,DERMATOLOGY,33037e3c-db21-57dc-89b0-fc1e09ebd5fc,11480,0,0,0,...,0,0,0,0,0,0,0,1,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40073402,99689,74,2001-03-26 15:34:13,2001-03-26 17:13:06,"ZZSUBST ABUSE STAR I,II,II",99375e10-7fdc-5328-b8cb-3f4fb64796fe,6,0,0,0,...,0,0,0,0,1,0,0,0,,
40073403,99689,74,2001-03-26 17:09:34,2001-03-28 18:35:00,"ZZSUBST ABUSE STAR I,II,II",99375e10-7fdc-5328-b8cb-3f4fb64796fe,6,0,0,0,...,0,0,0,0,1,0,0,0,,
40073404,58306,52,2000-06-04 19:51:59,2000-06-11 18:08:46,"ZZSUBST ABUSE STAR I,II,II",56ec12dd-131a-4e22-a0b1-d6026f6ca121,6,0,0,0,...,0,0,0,0,1,0,0,0,,
40073405,58306,52,2000-06-02 19:22:06,2000-06-04 19:43:46,"ZZSUBST ABUSE STAR I,II,II",b24abd9c-f525-442b-92eb-b712b2c626a8,6,0,0,0,...,0,0,0,0,1,0,0,0,,


In [18]:
inpatient_specialty_merged_df['ICU_days'] = inpatient_specialty_merged_df['ICU_days'].astype(pd.Int32Dtype())
inpatient_specialty_merged_df['stepdown_days'] = inpatient_specialty_merged_df['stepdown_days'].astype(pd.Int32Dtype())

In [19]:
inpatient_specialty_merged_df.dtypes

Internalpatientid                int32
Age at specialty                  int8
Specialty start date    datetime64[ns]
Specialty end date      datetime64[ns]
Specialty                       object
Encounter ID                    object
counts                           int64
genMed                            int8
hospice                           int8
homeCare                          int8
homelessRecovery                  int8
rehab                             int8
snf                               int8
psych                             int8
obs                               int8
drug                              int8
stepdown                          int8
icu                               int8
other                             int8
loc_genMed                        int8
loc_hospice                       int8
loc_homeCare                      int8
loc_homelessRecovery              int8
loc_rehab                         int8
loc_snf                           int8
loc_psych                

In [20]:
# Save the Dask DataFrame as Parquet
inpatient_specialty_merged_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/inpatient_specialty.parquet')