<a href="https://colab.research.google.com/github/victormurcia/VCHAMPS/blob/main/VCHAMPS_Gauging_Patient_Sickness_During_Hospitalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How Sick Is Patient During Hospitalization?

In [1]:
#General utilities
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm  # Import tqdm for the progress bar
import glob,shutil,os,warnings,math,time,sys,re
from typing import List
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from datetime import timedelta

#For performing UTC normalization on datetime columns based on the STATE column
import pytz

#For Slider viz
import ipywidgets as widgets
from IPython.display import display, clear_output,HTML

#Enable data to be extracted and downloaded from my Google Drive
from google.colab import drive, files
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Specify the path to the desired directory
directory_path = r'/content/drive/MyDrive/VCHAMPS - Train Cleaned'

# Change the current working directory to the desired directory
os.chdir(directory_path)

# Verify the current working directory
cwd = os.getcwd()

print(f"Current working directory: {cwd}")

Current working directory: /content/drive/MyDrive/VCHAMPS - Train Cleaned


#

In [3]:
lab_measurements_df = dd.read_parquet('/content/drive/MyDrive/VCHAMPS - Train Cleaned-Mapped/lab_results/*.parquet')
lab_measurements_df = lab_measurements_df.compute()
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,75e105d9-27db-4639-bfd9-99214f43e737
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,4222f6a7-8023-40a5-ad10-58e043a47822
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,b3668b28-653c-5488-bb46-f5dd14ba103a
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,f29cb959-d880-5e6f-ba16-0d0bec5af545
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,bdb1bb2d-215a-502b-b2b8-3b2cd2d51939
...,...,...,...,...,...,...,...,...,...,...,...
99995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,ba9695d3-87e4-4838-924c-ae58cf2d98f3
99996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,ff397add-59d8-40f1-9637-11fd44c53fd1
99997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,8379cfaa-4537-4cf2-ba56-9dfbe0f55abe
99998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,d29d15f0-8e2a-4bd2-9157-688d12f9f69e


In [4]:
lab_measurements_df = lab_measurements_df.reset_index(drop=True)

# Heart BNP Up

In [5]:
bnp_df = lab_measurements_df[lab_measurements_df['concept'].isin(['bnp','ntprobnp'])]
bnp_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID
192509,5293,85,2012-04-05 16:35:10,224.000000,blood*,"zBNP, TOTAL (DC 1-12)",bnp,pg/mL,0.0,100.0,b91deb89-b069-4df8-b9c8-614c6d21ab42
192510,100490,77,2003-09-16 01:58:17,139.000000,blood*,"zBNP, TOTAL (DC 1-12)",bnp,pg/mL,0.0,100.0,8c08acdd-9ac2-4836-b6db-3108ef4e4ca8
192511,100490,79,2005-08-30 14:07:41,242.000000,blood*,"zBNP, TOTAL (DC 1-12)",bnp,pg/mL,0.0,100.0,6ef8f96c-635c-4a3e-8265-01260e11e8b6
192512,5293,85,2012-04-24 23:29:58,104.000000,blood*,"zBNP, TOTAL (DC 1-12)",bnp,pg/mL,0.0,100.0,4eb8c478-68e6-43dc-9bef-48a0f20683ea
192513,102655,65,2006-04-02 01:46:43,130.000000,blood*,"zBNP, TOTAL (DC 1-12)",bnp,pg/mL,0.0,100.0,0e65c2aa-5580-4494-a4c4-6a0f5b0e41d0
...,...,...,...,...,...,...,...,...,...,...,...
20632959,50375,73,2008-10-03 11:53:20,393.000000,plasma,BNP (BRAIN NATRIURETIC PEPTIDE),bnp,pg/mL,0.0,100.0,caf23f98-d68e-4878-86a7-c5f7a30786f8
20632960,50375,75,2010-11-08 06:02:00,628.000000,plasma,BNP (BRAIN NATRIURETIC PEPTIDE),bnp,pg/mL,0.0,100.0,808156ab-6547-4fa1-bfab-793726ed8a61
20632961,47998,61,2016-03-31 11:16:13,126.668404,plasma,BNP (BRAIN NATRIURETIC PEPTIDE),bnp,pg/mL,0.0,100.0,785384b4-9689-5c3f-ae76-41d2e593c06b
20632962,48355,73,2012-10-02 11:41:06,510.000000,plasma,BNP (BRAIN NATRIURETIC PEPTIDE),bnp,pg/mL,0.0,100.0,5b8263a3-ee20-45c2-b744-507583e04b93


In [6]:
bnp_df['heart_bnp_up'] = np.where(
    (bnp_df['Result numeric'] > 400) |
    ((bnp_df['Result numeric'] > 450) & (bnp_df['Age at lab test'] <= 50)) |
    ((bnp_df['Result numeric'] > 900) & (bnp_df['Age at lab test'] > 50) & (bnp_df['Age at lab test'] <= 75)) |
    ((bnp_df['Result numeric'] > 1800) & (bnp_df['Age at lab test'] > 75)),
    1,  # Value if the condition is true
    0   # Value if the condition is false
)
bnp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bnp_df['heart_bnp_up'] = np.where(


Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up
192509,5293,85,2012-04-05 16:35:10,224.000000,blood*,"zBNP, TOTAL (DC 1-12)",bnp,pg/mL,0.0,100.0,b91deb89-b069-4df8-b9c8-614c6d21ab42,0
192510,100490,77,2003-09-16 01:58:17,139.000000,blood*,"zBNP, TOTAL (DC 1-12)",bnp,pg/mL,0.0,100.0,8c08acdd-9ac2-4836-b6db-3108ef4e4ca8,0
192511,100490,79,2005-08-30 14:07:41,242.000000,blood*,"zBNP, TOTAL (DC 1-12)",bnp,pg/mL,0.0,100.0,6ef8f96c-635c-4a3e-8265-01260e11e8b6,0
192512,5293,85,2012-04-24 23:29:58,104.000000,blood*,"zBNP, TOTAL (DC 1-12)",bnp,pg/mL,0.0,100.0,4eb8c478-68e6-43dc-9bef-48a0f20683ea,0
192513,102655,65,2006-04-02 01:46:43,130.000000,blood*,"zBNP, TOTAL (DC 1-12)",bnp,pg/mL,0.0,100.0,0e65c2aa-5580-4494-a4c4-6a0f5b0e41d0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
20632959,50375,73,2008-10-03 11:53:20,393.000000,plasma,BNP (BRAIN NATRIURETIC PEPTIDE),bnp,pg/mL,0.0,100.0,caf23f98-d68e-4878-86a7-c5f7a30786f8,0
20632960,50375,75,2010-11-08 06:02:00,628.000000,plasma,BNP (BRAIN NATRIURETIC PEPTIDE),bnp,pg/mL,0.0,100.0,808156ab-6547-4fa1-bfab-793726ed8a61,1
20632961,47998,61,2016-03-31 11:16:13,126.668404,plasma,BNP (BRAIN NATRIURETIC PEPTIDE),bnp,pg/mL,0.0,100.0,785384b4-9689-5c3f-ae76-41d2e593c06b,0
20632962,48355,73,2012-10-02 11:41:06,510.000000,plasma,BNP (BRAIN NATRIURETIC PEPTIDE),bnp,pg/mL,0.0,100.0,5b8263a3-ee20-45c2-b744-507583e04b93,1


Merge with labs

In [7]:
columns_to_merge = ['Internalpatientid', 'Lab test date','concept','heart_bnp_up']
merged_df = lab_measurements_df.merge(bnp_df[columns_to_merge], on=['Internalpatientid', 'Lab test date', 'concept'], how='outer')
merged_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,75e105d9-27db-4639-bfd9-99214f43e737,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,4222f6a7-8023-40a5-ad10-58e043a47822,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,b3668b28-653c-5488-bb46-f5dd14ba103a,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,f29cb959-d880-5e6f-ba16-0d0bec5af545,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,bdb1bb2d-215a-502b-b2b8-3b2cd2d51939,
...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,ba9695d3-87e4-4838-924c-ae58cf2d98f3,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,ff397add-59d8-40f1-9637-11fd44c53fd1,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,8379cfaa-4537-4cf2-ba56-9dfbe0f55abe,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,d29d15f0-8e2a-4bd2-9157-688d12f9f69e,


In [8]:
lab_measurements_df = merged_df
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,75e105d9-27db-4639-bfd9-99214f43e737,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,4222f6a7-8023-40a5-ad10-58e043a47822,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,b3668b28-653c-5488-bb46-f5dd14ba103a,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,f29cb959-d880-5e6f-ba16-0d0bec5af545,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,bdb1bb2d-215a-502b-b2b8-3b2cd2d51939,
...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,ba9695d3-87e4-4838-924c-ae58cf2d98f3,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,ff397add-59d8-40f1-9637-11fd44c53fd1,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,8379cfaa-4537-4cf2-ba56-9dfbe0f55abe,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,d29d15f0-8e2a-4bd2-9157-688d12f9f69e,


In [9]:
lab_measurements_df['heart_bnp_up'].value_counts()

0.0    58781
1.0    40672
Name: heart_bnp_up, dtype: int64

# Heart Trop High

In [10]:
trop_df = lab_measurements_df[lab_measurements_df['concept'].isin(['tropi', 'tropt', 'trophs'])]
trop_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up
24565,9382,85,2022-07-20 14:07:30,0.000000,blood,POC TROPONIN,tropi,ng/mL,0.000,0.08,cdafc6f0-a4f9-54ca-9420-55f70d5c47e6,
24566,9382,85,2022-08-17 00:17:07,0.000000,blood,POC TROPONIN,tropi,ng/mL,0.000,0.08,ba0f8f38-40ba-5172-92d8-ae5512e5aeeb,
24567,24261,71,2020-05-22 21:50:12,0.020688,blood,POC TROPONIN,tropi,ng/mL,0.000,0.08,6622b953-8ea1-529d-aac9-458114625dee,
24568,22578,80,2019-09-21 08:06:14,0.009667,blood,POC TROPONIN,tropi,ng/mL,0.000,0.08,a759a612-a144-5c99-a554-71f69f33c213,
24569,161952,68,2011-10-18 22:39:36,0.426903,blood,POC TROPONIN,tropi,ng/mL,0.000,1.10,aa564069-59be-4170-890d-f4b81bd5f6c3,
...,...,...,...,...,...,...,...,...,...,...,...,...
20182594,48586,91,2023-03-05 13:57:57,0.082759,plasma,TROPONIN-I(ULTRA),tropi,ng/mL,0.006,0.06,e8748e9b-0590-545f-b85c-db86f7ac63f0,
20182595,48168,75,2018-12-17 21:33:24,0.006961,plasma,TROPONIN-I(ULTRA),tropi,ng/mL,0.006,0.06,5057a40b-9e46-52d5-a814-3a59a36e24f7,
20182596,48586,91,2023-03-02 02:54:49,0.091072,plasma,TROPONIN-I(ULTRA),tropi,ng/mL,0.006,0.06,1b912a6b-2e6d-51a5-b2e5-d412806080ca,
20182597,48773,59,2014-05-05 15:57:26,0.056996,plasma,TROPONIN-I(ULTRA),tropi,ng/mL,0.006,0.06,6a2ef4ee-7ef7-4043-9a0c-bc17259543d3,


In [11]:
tropi_df  = trop_df[trop_df['concept'] == 'tropi']
tropt_df  = trop_df[trop_df['concept'] == 'tropt']
trophs_df = trop_df[trop_df['concept'] == 'trophs']

tropi_grouped_df = tropi_df.groupby('Encounter ID')['Result numeric'].max().reset_index()
tropi_grouped_df.rename(columns={'Result numeric': 'tropI_highest_value'}, inplace=True)

tropt_grouped_df = tropt_df.groupby('Encounter ID')['Result numeric'].max().reset_index()
tropt_grouped_df.rename(columns={'Result numeric': 'tropT_highest_value'}, inplace=True)

trops_grouped_df = trophs_df.groupby('Encounter ID')['Result numeric'].max().reset_index()
trops_grouped_df.rename(columns={'Result numeric': 'tropS_highest_value'}, inplace=True)

In [12]:
# Merge the tropI DataFrame back into the original DataFrame based on 'Encounter ID'
lab_measurements_df = pd.merge(lab_measurements_df, tropi_grouped_df, on='Encounter ID', how='left')
lab_measurements_df = pd.merge(lab_measurements_df, tropt_grouped_df, on='Encounter ID', how='left')
lab_measurements_df = pd.merge(lab_measurements_df, trops_grouped_df, on='Encounter ID', how='left')

In [13]:
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,75e105d9-27db-4639-bfd9-99214f43e737,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,4222f6a7-8023-40a5-ad10-58e043a47822,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,b3668b28-653c-5488-bb46-f5dd14ba103a,,,,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,f29cb959-d880-5e6f-ba16-0d0bec5af545,,,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,bdb1bb2d-215a-502b-b2b8-3b2cd2d51939,,0.058185,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,ba9695d3-87e4-4838-924c-ae58cf2d98f3,,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,ff397add-59d8-40f1-9637-11fd44c53fd1,,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,8379cfaa-4537-4cf2-ba56-9dfbe0f55abe,,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,d29d15f0-8e2a-4bd2-9157-688d12f9f69e,,,,


# Get max value and of creatinine per inpatient encounter

In [14]:
lab_measurements_df['concept'].value_counts()

k                5251943
cr               3664256
hct              3045252
wbc              2706831
ast              1703198
tbili            1289608
bicarb            948079
alt               681131
na                300100
hgb               297329
ldh               256731
a1c               136728
bnp                89287
lactate            75386
tropi              58454
inr                46750
gfr                34521
ferritin           22779
crp                18393
ddimer             16726
esr                16722
tropt              11647
hscrp              10484
ntprobnp           10166
pco2                6437
trophs               936
ph                    71
methadone_lvl         55
cocaine_lvl            0
Name: concept, dtype: int64

In [15]:
cr_df = lab_measurements_df[lab_measurements_df['concept'].isin(['cr'])]
cr_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value
25312,22510,67,2017-06-25 13:28:38,0.799602,blood,_CREATININE (I-STAT),cr,mg/dL,0.6,1.3,7c8b47d8-2226-4b39-9294-a5f08c8d2c83,,,,
25313,93290,71,2002-10-24 09:01:11,1.000000,blood,_CREATININE (I-STAT),cr,mg/dL,0.6,1.4,684de247-b98f-4a13-a641-040476c0d640,,,,
25314,93290,73,2004-04-27 10:43:51,1.000000,blood,_CREATININE (I-STAT),cr,mg/dL,0.6,1.4,f6c9f85a-791d-41c0-ba65-1aea26fe974b,,,,
25315,26490,46,2001-06-02 20:49:34,1.201626,blood,_CREATININE (I-STAT),cr,mg/dL,0.6,1.4,ba702176-da06-4311-96c5-7be0f180afec,,,,
25316,26490,45,2000-11-21 22:25:01,0.383530,blood,_CREATININE (I-STAT),cr,mg/dL,0.6,1.4,c41bc405-696a-46cd-b38a-2f420a8b6d33,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20647475,48693,84,1999-12-01 00:44:56,2.331936,blood*,zCREATININE - (DC 1-12),cr,mg/dL,0.8,1.5,601b1957-bd2e-5e48-8025-fa427c336f16,,,,
20647476,48693,84,1999-11-26 00:39:55,1.957715,blood*,zCREATININE - (DC 1-12),cr,mg/dL,0.8,1.5,601b1957-bd2e-5e48-8025-fa427c336f16,,,,
20647477,48693,84,1999-11-28 00:43:11,1.877020,blood*,zCREATININE - (DC 1-12),cr,mg/dL,0.8,1.5,601b1957-bd2e-5e48-8025-fa427c336f16,,,,
20647478,48693,85,2000-11-04 17:59:22,3.000000,blood*,zCREATININE - (DC 1-12),cr,mg/dL,0.8,1.5,139fe685-6ca8-4c45-b0c2-989713261748,,,,


In [16]:
highest_cr_per_encounter = cr_df.groupby('Encounter ID')['Result numeric'].max().reset_index()
highest_cr_per_encounter.rename(columns={'Result numeric': 'highest_creatinine_value'}, inplace=True)
highest_cr_per_encounter

Unnamed: 0,Encounter ID,highest_creatinine_value
0,0000102c-7d98-59f8-9aaf-6eda9b8506df,1.539236
1,00001410-68db-4940-9d25-6b2551d3c8c9,0.679358
2,000026a9-1a9c-4b2a-b949-01b342fcd514,1.272626
3,00002fb2-9439-45fe-bb4d-73e142230367,0.854518
4,000033a1-4df6-4385-bc5a-adc02becf6ba,0.963217
...,...,...
2657566,fffff565-8a3f-4e26-8d12-a08a1c426aa1,1.214803
2657567,fffff5c3-55d8-4865-b78e-de3e78585619,1.774298
2657568,fffff8b7-9776-496e-8344-e303630360d8,3.633510
2657569,fffffc48-b17e-59f9-8409-5355dab6445f,1.062079


In [17]:
lab_measurements_df = pd.merge(lab_measurements_df, highest_cr_per_encounter, on='Encounter ID', how='left')
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value,highest_creatinine_value
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,75e105d9-27db-4639-bfd9-99214f43e737,,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,4222f6a7-8023-40a5-ad10-58e043a47822,,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,b3668b28-653c-5488-bb46-f5dd14ba103a,,,,,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,f29cb959-d880-5e6f-ba16-0d0bec5af545,,,,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,bdb1bb2d-215a-502b-b2b8-3b2cd2d51939,,0.058185,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,ba9695d3-87e4-4838-924c-ae58cf2d98f3,,,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,ff397add-59d8-40f1-9637-11fd44c53fd1,,,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,8379cfaa-4537-4cf2-ba56-9dfbe0f55abe,,,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,d29d15f0-8e2a-4bd2-9157-688d12f9f69e,,,,,


# Get average creatinine over duration of encounter

In [18]:
# Step 2: Sort the DataFrame by 'Encounter ID' and 'Lab test date'
cr_df.sort_values(by=['Encounter ID', 'Lab test date'], inplace=True)
# Step 3 and 4: Calculate duration and cumulative sum of creatinine within each encounter
cr_df['duration'] = cr_df.groupby('Encounter ID')['Lab test date'].diff().dt.total_seconds()
cr_df['cumulative_creatinine'] = cr_df.groupby('Encounter ID')['Result numeric'].cumsum()
# Step 5: Calculate average creatinine for each inpatient encounter
cr_df['average_creatinine'] = cr_df['cumulative_creatinine'] / cr_df['duration']
cr_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cr_df.sort_values(by=['Encounter ID', 'Lab test date'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cr_df['duration'] = cr_df.groupby('Encounter ID')['Lab test date'].diff().dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cr_df['cumulative_creatinine'] = cr_df.groupby('Encounter ID')['Result numeric'].cumsum()
A value i

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value,duration,cumulative_creatinine,average_creatinine
10022940,72646,85,2017-02-27 22:18:50,1.539236,plasma,CREATININE (SERUM/PLASMA),cr,MG/DL,0.64,1.27,0000102c-7d98-59f8-9aaf-6eda9b8506df,,,,,,1.539236,
8189813,47780,55,2014-02-24 14:39:37,0.679358,plasma,CREATININE,cr,mg/dl,0.70,1.20,00001410-68db-4940-9d25-6b2551d3c8c9,,,,,,0.679358,
6144292,122080,55,2008-04-09 01:07:18,1.272626,serum,CREATININE,cr,mg/dl,0.70,1.40,000026a9-1a9c-4b2a-b949-01b342fcd514,,,,,,1.272626,
6649016,133225,72,2022-02-28 10:48:51,0.854518,plasma,CREATININE,cr,mg/dl,0.67,1.17,00002fb2-9439-45fe-bb4d-73e142230367,,,,,,0.854518,
5634479,60231,61,2012-11-01 15:35:47,0.963217,plasma,CREATININE,cr,mg/dl,0.50,1.10,000033a1-4df6-4385-bc5a-adc02becf6ba,,,,,,0.963217,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5874424,67213,79,2013-08-11 00:09:58,1.214803,serum,CREATININE,cr,mg/dl,0.50,1.20,fffff565-8a3f-4e26-8d12-a08a1c426aa1,,,,,,1.214803,
5252087,5216,79,2016-07-09 04:54:59,1.774298,plasma,CREATININE,cr,mg/dl,0.60,1.30,fffff5c3-55d8-4865-b78e-de3e78585619,,,,,,1.774298,
5425596,57405,65,2000-09-18 06:39:14,3.633510,plasma,CREATININE,cr,mg/dl,0.70,1.50,fffff8b7-9776-496e-8344-e303630360d8,,,,,,3.633510,
5367111,10372,86,2002-07-16 14:10:52,1.062079,serum,CREATININE,cr,mg/dl,0.70,1.50,fffffc48-b17e-59f9-8409-5355dab6445f,,,,,,1.062079,


In [19]:
# Drop the intermediate columns 'duration' and 'cumulative_creatinine'
cr_df.drop(['duration', 'cumulative_creatinine'], axis=1, inplace=True)
# Step: Merge the 'average_creatinine' column back into the original DataFrame based on 'Encounter ID'
# Create a dictionary to map 'Encounter ID' to 'average_creatinine' values
average_creatinine_dict = cr_df[['Encounter ID', 'average_creatinine']].drop_duplicates().set_index('Encounter ID').to_dict()['average_creatinine']
# Map the 'average_creatinine' values to the original DataFrame using the 'Encounter ID' as the index
lab_measurements_df['average_creatinine'] = lab_measurements_df['Encounter ID'].map(average_creatinine_dict)
lab_measurements_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cr_df.drop(['duration', 'cumulative_creatinine'], axis=1, inplace=True)


Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value,highest_creatinine_value,average_creatinine
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,75e105d9-27db-4639-bfd9-99214f43e737,,,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,4222f6a7-8023-40a5-ad10-58e043a47822,,,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,b3668b28-653c-5488-bb46-f5dd14ba103a,,,,,,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,f29cb959-d880-5e6f-ba16-0d0bec5af545,,,,,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,bdb1bb2d-215a-502b-b2b8-3b2cd2d51939,,0.058185,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,ba9695d3-87e4-4838-924c-ae58cf2d98f3,,,,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,ff397add-59d8-40f1-9637-11fd44c53fd1,,,,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,8379cfaa-4537-4cf2-ba56-9dfbe0f55abe,,,,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,d29d15f0-8e2a-4bd2-9157-688d12f9f69e,,,,,,


In [20]:
lab_measurements_df.rename(columns={'highest_creatinine_value': 'renal_cr_high', 'average_creatinine': 'renal_cr_avg'}, inplace=True)

In [21]:
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value,renal_cr_high,renal_cr_avg
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,75e105d9-27db-4639-bfd9-99214f43e737,,,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,4222f6a7-8023-40a5-ad10-58e043a47822,,,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,b3668b28-653c-5488-bb46-f5dd14ba103a,,,,,,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,f29cb959-d880-5e6f-ba16-0d0bec5af545,,,,,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,bdb1bb2d-215a-502b-b2b8-3b2cd2d51939,,0.058185,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,ba9695d3-87e4-4838-924c-ae58cf2d98f3,,,,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,ff397add-59d8-40f1-9637-11fd44c53fd1,,,,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,8379cfaa-4537-4cf2-ba56-9dfbe0f55abe,,,,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,d29d15f0-8e2a-4bd2-9157-688d12f9f69e,,,,,,


# Max Potassium

In [22]:
k_df = lab_measurements_df[lab_measurements_df['concept'].isin(['k'])]
k_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value,renal_cr_high,renal_cr_avg
25628,58846,53,2001-07-07 21:26:12,4.000000,plasma,SPR-POTASSIUM,k,MMOL/L,3.8,5.1,5c186bd8-7fa2-425e-8fdb-8653b93ae90c,,,,,,
25629,58846,54,2002-02-02 22:20:09,4.376191,plasma,SPR-POTASSIUM,k,MMOL/L,3.6,5.1,e96ef528-9f88-4bc3-a46d-635929acd8ff,,,,,,
25630,58846,54,2002-07-26 19:49:55,4.425276,plasma,SPR-POTASSIUM,k,MMOL/L,3.6,5.1,7628bf69-d3fc-4db6-9e98-ea73ebaa703d,,,,,,
25631,58846,55,2003-06-01 13:56:49,4.560742,plasma,SPR-POTASSIUM,k,MMOL/L,3.6,5.1,a6c3166e-1747-4e8f-ae76-a548365aff74,,,,,,
25632,58846,54,2001-12-13 23:37:55,4.875731,plasma,SPR-POTASSIUM,k,MMOL/L,3.6,5.1,8d1b23ed-1c5f-4bde-ab5a-63d46c91ca6b,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20652847,50379,71,2021-02-08 09:44:08,6.057990,arterial bld,POTASSIUM (POC),k,mEq/L,3.5,5.5,a25f9741-b5a7-501a-9803-592cec35bc8b,,,,,5.504596,0.003535
20652848,50379,71,2021-02-09 01:13:54,6.000000,arterial bld,POTASSIUM (POC),k,mEq/L,3.5,5.5,a25f9741-b5a7-501a-9803-592cec35bc8b,,,,,5.504596,0.003535
20652849,50379,71,2021-02-13 06:10:06,4.282011,arterial bld,POTASSIUM (POC),k,mEq/L,3.5,5.5,a25f9741-b5a7-501a-9803-592cec35bc8b,,,,,5.504596,0.003535
20652850,5156,71,2015-09-01 15:09:42,4.364627,arterial bld,POTASSIUM (POC),k,mEq/L,3.5,5.5,21895df9-4aa4-42e6-b3d6-d51763b0c730,,,,,,


In [23]:
highest_k_per_encounter = k_df.groupby('Encounter ID')['Result numeric'].max().reset_index()
highest_k_per_encounter.rename(columns={'Result numeric': 'highest_potassium_value'}, inplace=True)
highest_k_per_encounter

Unnamed: 0,Encounter ID,highest_potassium_value
0,0000089b-2578-40c9-98d8-1abda49aedb4,3.695482
1,0000092a-bc8d-407d-921d-ded0f7aa6fa6,4.000000
2,0000102c-7d98-59f8-9aaf-6eda9b8506df,4.217926
3,000010f4-322d-425a-ba58-5fc03775c240,4.295234
4,00001227-8687-4ef9-966d-ed90f71f3bfa,4.474889
...,...,...
3765264,fffff4dc-e0c2-40cd-a3f1-40fa6b269648,5.064329
3765265,fffff7d0-28f4-4c9c-9841-6373f24b8621,3.288768
3765266,fffffc48-b17e-59f9-8409-5355dab6445f,4.295346
3765267,fffffd49-3e0a-5bcf-96cb-03d3418df509,4.351041


In [24]:
lab_measurements_df = pd.merge(lab_measurements_df, highest_k_per_encounter, on='Encounter ID', how='left')
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,75e105d9-27db-4639-bfd9-99214f43e737,,,,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,4222f6a7-8023-40a5-ad10-58e043a47822,,,,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,b3668b28-653c-5488-bb46-f5dd14ba103a,,,,,,,4.467428
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,f29cb959-d880-5e6f-ba16-0d0bec5af545,,,,,,,4.352516
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,bdb1bb2d-215a-502b-b2b8-3b2cd2d51939,,0.058185,,,,,4.810952
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,ba9695d3-87e4-4838-924c-ae58cf2d98f3,,,,,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,ff397add-59d8-40f1-9637-11fd44c53fd1,,,,,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,8379cfaa-4537-4cf2-ba56-9dfbe0f55abe,,,,,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,d29d15f0-8e2a-4bd2-9157-688d12f9f69e,,,,,,,


# Average Potassium

In [25]:
# Step 2: Sort the DataFrame by 'Encounter ID' and 'Lab test date'
k_df.sort_values(by=['Encounter ID', 'Lab test date'], inplace=True)
# Step 3 and 4: Calculate duration and cumulative sum of creatinine within each encounter
k_df['duration'] = k_df.groupby('Encounter ID')['Lab test date'].diff().dt.total_seconds()
k_df['cumulative_potassium'] = k_df.groupby('Encounter ID')['Result numeric'].cumsum()
# Step 5: Calculate average creatinine for each inpatient encounter
k_df['average_potassium'] = k_df['cumulative_potassium'] / k_df['duration']
#cr_df
# Drop the intermediate columns 'duration' and 'cumulative_creatinine'
k_df.drop(['duration', 'cumulative_potassium'], axis=1, inplace=True)
# Step: Merge the 'average_creatinine' column back into the original DataFrame based on 'Encounter ID'
# Create a dictionary to map 'Encounter ID' to 'average_creatinine' values
average_potassium_dict = k_df[['Encounter ID', 'average_potassium']].drop_duplicates().set_index('Encounter ID').to_dict()['average_potassium']
# Map the 'average_creatinine' values to the original DataFrame using the 'Encounter ID' as the index
lab_measurements_df['average_potassium'] = lab_measurements_df['Encounter ID'].map(average_potassium_dict)
lab_measurements_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_df.sort_values(by=['Encounter ID', 'Lab test date'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_df['duration'] = k_df.groupby('Encounter ID')['Lab test date'].diff().dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_df['cumulative_potassium'] = k_df.groupby('Encounter ID')['Result numeric'].cumsum()
A value is tryi

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,75e105d9-27db-4639-bfd9-99214f43e737,,,,,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,4222f6a7-8023-40a5-ad10-58e043a47822,,,,,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,b3668b28-653c-5488-bb46-f5dd14ba103a,,,,,,,4.467428,0.000027
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,f29cb959-d880-5e6f-ba16-0d0bec5af545,,,,,,,4.352516,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,bdb1bb2d-215a-502b-b2b8-3b2cd2d51939,,0.058185,,,,,4.810952,0.000145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,ba9695d3-87e4-4838-924c-ae58cf2d98f3,,,,,,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,ff397add-59d8-40f1-9637-11fd44c53fd1,,,,,,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,8379cfaa-4537-4cf2-ba56-9dfbe0f55abe,,,,,,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,d29d15f0-8e2a-4bd2-9157-688d12f9f69e,,,,,,,,


# Peak WBC during encounter

In [26]:
wbc_df = lab_measurements_df[lab_measurements_df['concept'].isin(['wbc'])]
wbc_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium
25168,23273,82,1996-07-11 17:33:11,6.668240,blood,D-WBC (DCed 31320),wbc,x10 3,4.3,11.0,1ce657c1-69ba-4ec4-afe6-fbef571e62f7,,,,,,,,
25169,23273,83,1997-01-10 14:25:55,10.089370,blood,D-WBC (DCed 31320),wbc,x10 3,4.3,11.0,cc4a792e-65fc-4a6a-8ee1-b5cd588c2c95,,,,,,,,
25170,26051,84,2001-09-22 12:11:15,3.712764,blood,D-WBC (DCed 31320),wbc,x10 3,4.3,11.0,84ee8db1-f6cf-4d07-8062-cb63f98c2ccb,,,,,,,,
25171,161621,78,1999-01-03 15:08:37,6.797455,blood,D-WBC (DCed 31320),wbc,x10 3,4.3,11.0,dce8dc0f-83e9-446e-a027-50d5c74e2401,,,,,,,,
25172,27515,64,2000-01-14 12:07:47,2.793629,blood,D-WBC (DCed 31320),wbc,x10 3,4.3,11.0,c1f87e68-e5bf-4dcb-96df-97a916b23689,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19755101,51217,66,2017-04-28 07:48:32,4.777242,blood,WBC (5/2/2019),wbc,K/cmm,4.8,10.8,0f52fdac-a4cf-4e09-b8da-ab71c5fa79ff,,,,,,,,
19755102,51393,82,2012-07-27 10:41:25,8.768856,blood,WBC (5/2/2019),wbc,K/cmm,4.8,10.8,54fb8f7c-7f9f-5171-b511-92ceb2cfcd63,,,,,,,4.209313,0.000233
19755103,51393,82,2012-07-30 10:28:34,11.595727,blood,WBC (5/2/2019),wbc,K/cmm,4.8,10.8,54fb8f7c-7f9f-5171-b511-92ceb2cfcd63,,,,,,,4.209313,0.000233
19755104,51601,56,2002-04-27 03:15:42,7.167561,blood,WBC (5/2/2019),wbc,K/cmm,4.8,10.8,c4f1b210-e596-5824-9351-4239ef462323,,,,,,,3.929503,0.000152


In [27]:
highest_wbc_per_encounter = wbc_df.groupby('Encounter ID')['Result numeric'].max().reset_index()
highest_wbc_per_encounter.rename(columns={'Result numeric': 'highest_wbc_value'}, inplace=True)
highest_wbc_per_encounter

Unnamed: 0,Encounter ID,highest_wbc_value
0,00001667-0c04-4ee8-b2b5-1f1ab9c7cfb1,5.471573
1,00001d90-c69f-4e5d-98ae-c178fd5f7879,5.208589
2,00001f33-63fa-4f7d-9d31-1ceb30b9a1d4,7.995730
3,00001f63-21b6-4277-9ae8-0c705b248105,6.270786
4,00002147-3569-490e-99d6-39a7da915b38,7.074051
...,...,...
1959880,ffffdae5-14a7-4877-989c-a79f2b07c082,7.323278
1959881,ffffe666-93bc-4ada-a8b0-2178bc0471f1,8.498686
1959882,ffffe679-0357-4db6-9618-858f0cc27b81,18.000000
1959883,fffff98e-dd33-46f4-919d-20324486a561,7.659671


In [28]:
lab_measurements_df = pd.merge(lab_measurements_df, highest_wbc_per_encounter, on='Encounter ID', how='left')
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,Encounter ID,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium,highest_wbc_value
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,75e105d9-27db-4639-bfd9-99214f43e737,,,,,,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,4222f6a7-8023-40a5-ad10-58e043a47822,,,,,,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,b3668b28-653c-5488-bb46-f5dd14ba103a,,,,,,,4.467428,0.000027,4.240895
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,f29cb959-d880-5e6f-ba16-0d0bec5af545,,,,,,,4.352516,,4.543383
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,bdb1bb2d-215a-502b-b2b8-3b2cd2d51939,,0.058185,,,,,4.810952,0.000145,5.377059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,ba9695d3-87e4-4838-924c-ae58cf2d98f3,,,,,,,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,ff397add-59d8-40f1-9637-11fd44c53fd1,,,,,,,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,8379cfaa-4537-4cf2-ba56-9dfbe0f55abe,,,,,,,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,d29d15f0-8e2a-4bd2-9157-688d12f9f69e,,,,,,,,,


# Average WBC over encounter

In [29]:
# Step 2: Sort the DataFrame by 'Encounter ID' and 'Lab test date'
wbc_df.sort_values(by=['Encounter ID', 'Lab test date'], inplace=True)
# Step 3 and 4: Calculate duration and cumulative sum of creatinine within each encounter
wbc_df['duration'] = wbc_df.groupby('Encounter ID')['Lab test date'].diff().dt.total_seconds()
wbc_df['cumulative_wbc'] = wbc_df.groupby('Encounter ID')['Result numeric'].cumsum()
# Step 5: Calculate average creatinine for each inpatient encounter
wbc_df['average_wbc'] = wbc_df['cumulative_wbc'] / wbc_df['duration']
#cr_df
# Drop the intermediate columns 'duration' and 'cumulative_creatinine'
wbc_df.drop(['duration', 'cumulative_wbc'], axis=1, inplace=True)
# Step: Merge the 'average_creatinine' column back into the original DataFrame based on 'Encounter ID'
# Create a dictionary to map 'Encounter ID' to 'average_creatinine' values
average_wbc_dict = wbc_df[['Encounter ID', 'average_wbc']].drop_duplicates().set_index('Encounter ID').to_dict()['average_wbc']
# Map the 'average_creatinine' values to the original DataFrame using the 'Encounter ID' as the index
lab_measurements_df['average_wbc'] = lab_measurements_df['Encounter ID'].map(average_wbc_dict)
lab_measurements_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wbc_df.sort_values(by=['Encounter ID', 'Lab test date'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wbc_df['duration'] = wbc_df.groupby('Encounter ID')['Lab test date'].diff().dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wbc_df['cumulative_wbc'] = wbc_df.groupby('Encounter ID')['Result numeric'].cumsum()
A value is 

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium,highest_wbc_value,average_wbc
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,,,,4.467428,0.000027,4.240895,0.000010
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,,,,4.352516,,4.543383,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,0.058185,,,,,4.810952,0.000145,5.377059,0.000169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,,,,,,,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,,,,,,,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,,,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,,,,,


# Heme HGB Low

In [30]:
hgb_df = lab_measurements_df[lab_measurements_df['concept'].isin(['hgb'])]
hgb_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,heart_bnp_up,tropI_highest_value,tropT_highest_value,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium,highest_wbc_value,average_wbc
191441,100633,77,1997-11-10 22:24:46,8.441779,blood,ZZHGB (BU/CN<3/20/00),hgb,g/dL,12.0,16.0,...,,,,,,,,,,
191442,100633,77,1997-11-03 02:03:33,8.000000,blood,ZZHGB (BU/CN<3/20/00),hgb,g/dL,12.0,16.0,...,,,,,,,,,,
191443,100633,77,1997-11-09 13:27:00,9.000000,blood,ZZHGB (BU/CN<3/20/00),hgb,g/dL,12.0,16.0,...,,,,,,,,,,
191444,100633,77,1997-11-09 21:49:22,9.464111,blood,ZZHGB (BU/CN<3/20/00),hgb,g/dL,12.0,16.0,...,,,,,,,,,,
191445,100633,77,1997-11-12 22:13:23,8.282685,blood,ZZHGB (BU/CN<3/20/00),hgb,g/dL,12.0,16.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635249,44703,84,2002-02-21 11:30:30,11.000000,blood,ZZHEMOGLOBIN (SY<8/00),hgb,g/dL,13.5,18.0,...,,,,,,,5.642783,0.000508,,
20635250,44703,84,2002-03-25 14:15:06,12.000000,blood,ZZHEMOGLOBIN (SY<8/00),hgb,g/dL,13.5,18.0,...,,,,,,,5.289778,0.000573,,
20635251,4045,64,1999-06-01 09:26:30,16.688404,blood,ZZHEMOGLOBIN (SY<8/00),hgb,g/dL,13.5,18.0,...,,,,,,,,,,
20635252,45140,39,1998-08-30 18:32:34,17.284005,blood,ZZHEMOGLOBIN (SY<8/00),hgb,g/dL,13.5,18.0,...,,,,,,,,,,


In [31]:
lowest_hgb_per_encounter = hgb_df.groupby('Encounter ID')['Result numeric'].min().reset_index()
lowest_hgb_per_encounter.rename(columns={'Result numeric': 'lowest_hgb_value'}, inplace=True)
lowest_hgb_per_encounter

Unnamed: 0,Encounter ID,lowest_hgb_value
0,00002d50-52d6-493e-8e3c-1beb8b96cbf9,11.331547
1,00002f05-b261-42e3-ae4d-9dbce6db2c0c,11.494384
2,00005f2f-2ede-4fc2-9d28-38dc2190ffe6,15.615898
3,0000763f-c7d3-412f-a68d-0ceb91910d8e,10.055508
4,0000a3ac-05fd-5703-8011-cc0afeab322c,12.688575
...,...,...
227652,ffff4039-689b-4fad-9b89-fb35df725d48,16.347548
227653,ffff76e5-9449-4032-a545-1785b7944de2,17.000000
227654,ffffe077-aeff-46d6-8742-ba6d2e7ccc06,10.539744
227655,ffffe0e5-2d00-4196-aa42-7e2977e62808,13.000000


In [32]:
lab_measurements_df = pd.merge(lab_measurements_df, lowest_hgb_per_encounter, on='Encounter ID', how='left')
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,tropI_highest_value,tropT_highest_value,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium,highest_wbc_value,average_wbc,lowest_hgb_value
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,,,4.467428,0.000027,4.240895,0.000010,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,,,4.352516,,4.543383,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,0.058185,,,,,4.810952,0.000145,5.377059,0.000169,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,,,,,,,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,,,,,,,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,,,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,,,,,


# Heme HGB Average

In [33]:
# Step 2: Sort the DataFrame by 'Encounter ID' and 'Lab test date'
hgb_df.sort_values(by=['Encounter ID', 'Lab test date'], inplace=True)
# Step 3 and 4: Calculate duration and cumulative sum of creatinine within each encounter
hgb_df['duration'] = hgb_df.groupby('Encounter ID')['Lab test date'].diff().dt.total_seconds()
hgb_df['cumulative_hgb'] = hgb_df.groupby('Encounter ID')['Result numeric'].cumsum()
# Step 5: Calculate average creatinine for each inpatient encounter
hgb_df['average_hgb'] = hgb_df['cumulative_hgb'] / hgb_df['duration']
#cr_df
# Drop the intermediate columns 'duration' and 'cumulative_creatinine'
hgb_df.drop(['duration', 'cumulative_hgb'], axis=1, inplace=True)
# Step: Merge the 'average_creatinine' column back into the original DataFrame based on 'Encounter ID'
# Create a dictionary to map 'Encounter ID' to 'average_creatinine' values
average_hgb_dict = hgb_df[['Encounter ID', 'average_hgb']].drop_duplicates().set_index('Encounter ID').to_dict()['average_hgb']
# Map the 'average_creatinine' values to the original DataFrame using the 'Encounter ID' as the index
lab_measurements_df['average_hgb'] = lab_measurements_df['Encounter ID'].map(average_hgb_dict)
lab_measurements_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hgb_df.sort_values(by=['Encounter ID', 'Lab test date'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hgb_df['duration'] = hgb_df.groupby('Encounter ID')['Lab test date'].diff().dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hgb_df['cumulative_hgb'] = hgb_df.groupby('Encounter ID')['Result numeric'].cumsum()
A value is 

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,tropT_highest_value,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium,highest_wbc_value,average_wbc,lowest_hgb_value,average_hgb
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,,4.467428,0.000027,4.240895,0.000010,,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,,4.352516,,4.543383,,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,,4.810952,0.000145,5.377059,0.000169,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,,,,,,,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,,,,,,,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,,,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,,,,,


# Heme IDA

In [34]:
ferritin_df = lab_measurements_df[lab_measurements_df['concept'].isin(['ferritin'])]
ferritin_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,tropT_highest_value,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium,highest_wbc_value,average_wbc,lowest_hgb_value,average_hgb
182390,10080,66,1997-05-09 11:12:42,168.018907,serum,Ferritin (Old),ferritin,ng/ml,16.4,293.9,...,,,1.346952,0.000031,6.0,0.000188,29.3777,0.000094,,
182391,100801,88,2001-07-12 19:34:32,45.738654,serum,Ferritin (Old),ferritin,ng/ml,27.0,300.0,...,,,,,,,,,,
182392,106230,68,2002-12-17 00:20:12,62.272371,serum,Ferritin (Old),ferritin,ng/ml,27.0,300.0,...,,,,,,,,,,
182393,106230,69,2003-08-23 08:15:03,59.844890,serum,Ferritin (Old),ferritin,ng/ml,27.0,300.0,...,,,,,,,,,,
182394,55459,68,1999-11-12 10:47:51,165.279012,serum,Ferritin (Old),ferritin,ng/ml,27.0,300.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20248507,32706,84,2018-04-27 18:39:41,65.000000,serum,FERRITIN(LUF),ferritin,ng/mL,23.9,336.2,...,,,,,,,,,,
20248508,32706,85,2019-05-14 08:34:00,81.000000,serum,FERRITIN(LUF),ferritin,ng/mL,23.9,336.2,...,,,,,,,,,,
20248509,37218,61,2016-07-17 21:05:27,596.663861,serum,FERRITIN(LUF),ferritin,ng/mL,23.9,336.2,...,,,,,,,,,,
20248510,51529,47,2017-12-05 19:42:21,111.019529,serum,FERRITIN(LUF),ferritin,ng/mL,23.9,336.2,...,,,,,,,,,,


In [35]:
ferritin_df = lab_measurements_df[lab_measurements_df['concept'].isin(['ferritin'])]
ferritin_df['heme_ida'] = ferritin_df['Result numeric'].apply(lambda x: 1 if x < 100 else 0)
heme_ida_mapping = ferritin_df.set_index('Encounter ID')['heme_ida'].to_dict()
lab_measurements_df['heme_ida'] = lab_measurements_df['Encounter ID'].map(heme_ida_mapping)
lab_measurements_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ferritin_df['heme_ida'] = ferritin_df['Result numeric'].apply(lambda x: 1 if x < 100 else 0)


Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium,highest_wbc_value,average_wbc,lowest_hgb_value,average_hgb,heme_ida
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,4.467428,0.000027,4.240895,0.000010,,,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,4.352516,,4.543383,,,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,4.810952,0.000145,5.377059,0.000169,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,,,,,,,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,,,,,,,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,,,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,,,,,


# id_inflamed_up

In [36]:
crp_df = lab_measurements_df[lab_measurements_df['concept'].isin(['crp'])]
crp_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium,highest_wbc_value,average_wbc,lowest_hgb_value,average_hgb,heme_ida
25647,152042,72,2004-01-16 11:20:03,10.719028,serum,zC-REACTIVE PROTEIN (ENDED 4-06),crp,mg/L,0.0,3.00,...,,,,,,,,,,
201396,100938,67,2008-05-26 17:07:47,48.186102,serum,C REACTIVE PROTEIN(CRPH)(WR dc'd 9/11),crp,mg/L,0.0,7.43,...,,,,,,,,,,
201397,100938,67,2008-08-11 16:41:50,17.326803,serum,C REACTIVE PROTEIN(CRPH)(WR dc'd 9/11),crp,mg/L,0.0,7.43,...,,,,,,,,,,
201398,101041,71,2008-10-09 10:10:24,2.852826,serum,C REACTIVE PROTEIN(CRPH)(WR dc'd 9/11),crp,mg/L,0.0,7.43,...,,,,,,,,,,
201399,100938,67,2008-04-20 19:46:53,2.187278,serum,C REACTIVE PROTEIN(CRPH)(WR dc'd 9/11),crp,mg/L,0.0,7.43,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20581482,47196,82,2015-05-17 20:53:50,92.019464,plasma,C REACTIVE PROTEIN (Dc'ed 7-12-21),crp,mg/L,0.0,9.99,...,,1.000000,0.000026,4.000000,0.000116,,,,,
20581483,50144,59,2018-06-04 17:21:19,8.468939,plasma,C REACTIVE PROTEIN (Dc'ed 7-12-21),crp,mg/L,0.0,9.99,...,,1.145837,,4.000000,,,,,,
20581484,50452,65,2017-02-15 12:45:17,97.407885,plasma,C REACTIVE PROTEIN (Dc'ed 7-12-21),crp,mg/L,0.0,9.99,...,,1.346192,0.000681,4.684849,0.002721,,,,,
20581485,50452,67,2018-10-29 05:13:31,22.323927,plasma,C REACTIVE PROTEIN (Dc'ed 7-12-21),crp,mg/L,0.0,9.99,...,,2.120415,0.000219,4.000000,0.000407,,,,,


In [37]:
esr_df = lab_measurements_df[lab_measurements_df['concept'].isin(['esr'])]
esr_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium,highest_wbc_value,average_wbc,lowest_hgb_value,average_hgb,heme_ida
298213,97101,89,2007-11-11 20:05:40,48.0,blood,ESR to 10/17/2017,esr,mm/hr,0.0,9.0,...,,,,3.763406,0.000073,8.53616,0.000116,,,
298214,33716,79,2004-07-31 02:04:45,98.0,blood,ESR to 10/17/2017,esr,mm/hr,0.0,9.0,...,,,,,,,,,,
298215,3157,74,1999-05-17 08:49:09,74.0,blood,ESR to 10/17/2017,esr,mm/hr,0.0,9.0,...,,,,4.432064,,9.45779,0.000094,,,
298216,31447,75,1998-06-12 08:29:33,26.0,blood,ESR to 10/17/2017,esr,mm/hr,0.0,9.0,...,,,,,,,,,,
298217,96220,82,2008-08-24 15:45:45,43.0,blood,ESR to 10/17/2017,esr,mm/hr,0.0,9.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20637648,49098,69,1999-11-02 07:12:17,20.0,blood,WESTERGREN ESR (MODIFIED)(~dc),esr,mm/hour,0.0,20.0,...,,,,,,,,,,
20637649,49098,71,2001-08-09 10:54:19,22.0,blood,WESTERGREN ESR (MODIFIED)(~dc),esr,mm/hour,0.0,20.0,...,,,,,,,,,,
20637650,49098,75,2006-02-16 16:12:58,9.0,blood,WESTERGREN ESR (MODIFIED)(~dc),esr,mm/hour,0.0,20.0,...,,,,,,,,,,
20637651,49098,76,2006-11-01 06:26:07,16.0,blood,WESTERGREN ESR (MODIFIED)(~dc),esr,mm/hour,0.0,20.0,...,,,,,,,,,,


In [38]:
hscrp_df = lab_measurements_df[lab_measurements_df['concept'].isin(['hscrp'])]
hscrp_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium,highest_wbc_value,average_wbc,lowest_hgb_value,average_hgb,heme_ida
182589,56232,74,2007-04-16 07:44:30,1.650196,serum,zzCRP HIGH SENSITIVE,hscrp,mg/L,0.00,3.00,...,,,,,,,,,,
182590,56848,63,2008-02-02 15:10:11,2.500933,serum,zzCRP HIGH SENSITIVE,hscrp,mg/L,0.00,3.00,...,,,,,,,,,,
182591,130838,68,2007-03-14 19:28:22,4.377257,serum,zzCRP HIGH SENSITIVE,hscrp,mg/L,0.00,3.00,...,,,,,,,,,,
182592,135142,78,2009-03-21 23:45:19,0.390793,serum,zzCRP HIGH SENSITIVE,hscrp,mg/L,0.00,3.00,...,,,,,,,,,,
182593,143655,83,2006-10-20 13:21:11,11.936924,serum,zzCRP HIGH SENSITIVE,hscrp,mg/L,0.00,3.00,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20633722,40943,86,2021-12-24 16:09:11,1.826628,serum,"CRP, high sensitivity",hscrp,mg/dL,0.02,0.70,...,,1.000000,0.000031,4.312752,0.000134,4.506734,0.000134,,,
20633723,41539,55,2015-11-29 07:08:21,0.177180,serum,"CRP, high sensitivity",hscrp,mg/dL,0.02,0.70,...,,1.190161,,,,7.427474,,,,
20633724,51677,53,2020-03-18 02:29:49,0.069911,serum,"CRP, high sensitivity",hscrp,mg/dL,0.02,0.70,...,,,,,,,,,,
20633725,5140,91,2009-02-05 04:18:15,0.620439,serum,"CRP, high sensitivity",hscrp,mg/dL,0.02,0.75,...,,,,,,,,,,


In [None]:
lab_measurements_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/lab_results.parquet')

In [9]:
lab_measurements_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/lab_results.parquet')
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,tropS_highest_value,renal_cr_high,renal_cr_avg,highest_potassium_value,average_potassium,highest_wbc_value,average_wbc,lowest_hgb_value,average_hgb,heme_ida
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,4.467428,0.000027,4.240895,0.000010,,,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,4.352516,,4.543383,,,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,4.810952,0.000145,5.377059,0.000169,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,,,,,,,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,,,,,,,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,,,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,,,,,


In [10]:
# Calculate the upper limits for each measurement type (CRP, hsCRP, ESR)
lab_measurements_df['CRP_ULN'] = lab_measurements_df['range_max'] * 5
lab_measurements_df['hsCRP_ULN'] = lab_measurements_df['range_max'] * 5
lab_measurements_df['ESR_ULN'] = lab_measurements_df['range_max'] * 5

# Update the id_inflamed_up column based on conditions
lab_measurements_df['id_inflamed_up'] = np.where(
    (lab_measurements_df['concept'] == 'crp') & (lab_measurements_df['Result numeric'] > lab_measurements_df['CRP_ULN']), 1,
    np.where(
        (lab_measurements_df['concept'] == 'hscrp') & (lab_measurements_df['Result numeric'] > lab_measurements_df['hsCRP_ULN']),
        1,
        np.where(
            (lab_measurements_df['concept'] == 'esr') & (lab_measurements_df['Result numeric'] > lab_measurements_df['ESR_ULN']),
            1,
            0
        )
    )
)

lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,average_potassium,highest_wbc_value,average_wbc,lowest_hgb_value,average_hgb,heme_ida,CRP_ULN,hsCRP_ULN,ESR_ULN,id_inflamed_up
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,,725.0,725.0,725.0,0
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,,725.0,725.0,725.0,0
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,0.000027,4.240895,0.000010,,,,740.0,740.0,740.0,0
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,4.543383,,,,,740.0,740.0,740.0,0
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,0.000145,5.377059,0.000169,,,,740.0,740.0,740.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,,,,,,170.0,170.0,170.0,0
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,,,,,,185.0,185.0,185.0,0
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,,210.0,210.0,210.0,0
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,,210.0,210.0,210.0,0


In [11]:
lab_measurements_df['id_inflamed_up'].value_counts()

0    20687773
1       12227
Name: id_inflamed_up, dtype: int64

In [41]:
lab_measurements_df.columns

Index(['Internalpatientid', 'Age at lab test', 'Lab test date',
       'Result numeric', 'Specimen source', 'desc', 'concept', 'unit',
       'range_min', 'range_max', 'Encounter ID', 'heart_bnp_up',
       'tropI_highest_value', 'tropT_highest_value', 'tropS_highest_value',
       'renal_cr_high', 'renal_cr_avg', 'highest_potassium_value',
       'average_potassium', 'highest_wbc_value', 'average_wbc',
       'lowest_hgb_value', 'average_hgb', 'heme_ida', 'id_inflamed_up_x',
       'id_inflamed_up_y', 'id_inflamed_up'],
      dtype='object')

In [44]:
# Drop duplicates based on all columns
lab_measurements_df.drop_duplicates(inplace=True)
lab_measurements_df

# id_lactate_high

In [12]:
lactate_df = lab_measurements_df[lab_measurements_df['concept'].isin(['lactate'])]
highest_lactate_per_encounter = lactate_df.groupby('Encounter ID')['Result numeric'].max().reset_index()
highest_lactate_per_encounter.rename(columns={'Result numeric': 'highest_lactate_value'}, inplace=True)
lab_measurements_df = pd.merge(lab_measurements_df, highest_lactate_per_encounter, on='Encounter ID', how='left')
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,highest_wbc_value,average_wbc,lowest_hgb_value,average_hgb,heme_ida,CRP_ULN,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,725.0,725.0,725.0,0,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,,725.0,725.0,725.0,0,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,4.240895,0.000010,,,,740.0,740.0,740.0,0,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,4.543383,,,,,740.0,740.0,740.0,0,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,5.377059,0.000169,,,,740.0,740.0,740.0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,,,,,170.0,170.0,170.0,0,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,,,,,185.0,185.0,185.0,0,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,210.0,210.0,210.0,0,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,,210.0,210.0,210.0,0,


# Renal ph low

In [13]:
ph_df = lab_measurements_df[lab_measurements_df['concept'].isin(['ph'])]
highest_ph_per_encounter = ph_df.groupby('Encounter ID')['Result numeric'].min().reset_index()
highest_ph_per_encounter.rename(columns={'Result numeric': 'renal_ph_low'}, inplace=True)
lab_measurements_df = pd.merge(lab_measurements_df, highest_ph_per_encounter, on='Encounter ID', how='left')
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,average_wbc,lowest_hgb_value,average_hgb,heme_ida,CRP_ULN,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,725.0,725.0,725.0,0,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,,725.0,725.0,725.0,0,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,0.000010,,,,740.0,740.0,740.0,0,,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,,740.0,740.0,740.0,0,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,0.000169,,,,740.0,740.0,740.0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,,,,170.0,170.0,170.0,0,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,,,,185.0,185.0,185.0,0,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,210.0,210.0,210.0,0,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,,210.0,210.0,210.0,0,,


# Renal bicarbonate avg

In [14]:
bicarb_df = lab_measurements_df[lab_measurements_df['concept'].isin(['bicarb'])]
# Step 2: Sort the DataFrame by 'Encounter ID' and 'Lab test date'
bicarb_df.sort_values(by=['Encounter ID', 'Lab test date'], inplace=True)
# Step 3 and 4: Calculate duration and cumulative sum of creatinine within each encounter
bicarb_df['duration'] = bicarb_df.groupby('Encounter ID')['Lab test date'].diff().dt.total_seconds()
bicarb_df['cumulative_bicarb'] = bicarb_df.groupby('Encounter ID')['Result numeric'].cumsum()
# Step 5: Calculate average creatinine for each inpatient encounter
bicarb_df['average_bicarb'] = bicarb_df['cumulative_bicarb'] / bicarb_df['duration']
# Drop the intermediate columns 'duration' and 'cumulative_creatinine'
bicarb_df.drop(['duration', 'cumulative_bicarb'], axis=1, inplace=True)
# Step: Merge the 'average_creatinine' column back into the original DataFrame based on 'Encounter ID'
# Create a dictionary to map 'Encounter ID' to 'average_creatinine' values
average_bicarb_dict = bicarb_df[['Encounter ID', 'average_bicarb']].drop_duplicates().set_index('Encounter ID').to_dict()['average_bicarb']
# Map the 'average_creatinine' values to the original DataFrame using the 'Encounter ID' as the index
lab_measurements_df['average_bicarb'] = lab_measurements_df['Encounter ID'].map(average_bicarb_dict)
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,lowest_hgb_value,average_hgb,heme_ida,CRP_ULN,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,725.0,725.0,725.0,0,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,,725.0,725.0,725.0,0,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,740.0,740.0,740.0,0,,,0.000178
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,740.0,740.0,740.0,0,,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,,740.0,740.0,740.0,0,,,0.000749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20699995,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,,,170.0,170.0,170.0,0,,,
20699996,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,,,185.0,185.0,185.0,0,,,
20699997,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,210.0,210.0,210.0,0,,,
20699998,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,,210.0,210.0,210.0,0,,,


# drug stimulant use

In [18]:
dsu_df = lab_measurements_df[lab_measurements_df['concept'].isin(['cocaine_lvl','methadone_lvl'])]
# Define a function to check if either 'cocaine_lvl' or 'methadone_lvl' is positive (greater than 0)
def check_dsu_result(row):
    if pd.isna(row['Result numeric']) or row['Result numeric'] > 0:
        return 1
    else:
        return 0

# Apply the function to create the 'dsu_result' column
dsu_df['drug_stimulant_use'] = dsu_df.apply(check_dsu_result, axis=1)

# Merge the 'dsu_result' column back into the original lab_measurements_df
lab_measurements_df = pd.merge(lab_measurements_df, dsu_df[['Internalpatientid', 'Encounter ID', 'drug_stimulant_use']],
                               on=['Internalpatientid', 'Encounter ID'], how='left')

# Replace NaN values with zero in the 'Result numeric' column
lab_measurements_df['drug_stimulant_use'].fillna(0, inplace=True)

lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,average_hgb,heme_ida,CRP_ULN,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,725.0,725.0,725.0,0,,,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,,725.0,725.0,725.0,0,,,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,740.0,740.0,740.0,0,,,0.000178,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,740.0,740.0,740.0,0,,,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,,740.0,740.0,740.0,0,,,0.000749,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20700149,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,,170.0,170.0,170.0,0,,,,
20700150,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,,185.0,185.0,185.0,0,,,,
20700151,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,210.0,210.0,210.0,0,,,,
20700152,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,,210.0,210.0,210.0,0,,,,


# Average DDimer

In [20]:
ddimer_df = lab_measurements_df[lab_measurements_df['concept'].isin(['ddimer'])]
# Step 2: Sort the DataFrame by 'Encounter ID' and 'Lab test date'
ddimer_df.sort_values(by=['Encounter ID', 'Lab test date'], inplace=True)
# Step 3 and 4: Calculate duration and cumulative sum of creatinine within each encounter
ddimer_df['duration'] = ddimer_df.groupby('Encounter ID')['Lab test date'].diff().dt.total_seconds()
ddimer_df['cumulative_ddimer'] = ddimer_df.groupby('Encounter ID')['Result numeric'].cumsum()
# Step 5: Calculate average creatinine for each inpatient encounter
ddimer_df['average_ddimer'] = ddimer_df['cumulative_ddimer'] / ddimer_df['duration']
# Drop the intermediate columns 'duration' and 'cumulative_creatinine'
ddimer_df.drop(['duration', 'cumulative_ddimer'], axis=1, inplace=True)
# Step: Merge the 'average_creatinine' column back into the original DataFrame based on 'Encounter ID'
# Create a dictionary to map 'Encounter ID' to 'average_creatinine' values
average_ddimer_dict = ddimer_df[['Encounter ID', 'average_ddimer']].drop_duplicates().set_index('Encounter ID').to_dict()['average_ddimer']
# Map the 'average_creatinine' values to the original DataFrame using the 'Encounter ID' as the index
lab_measurements_df['average_ddimer'] = lab_measurements_df['Encounter ID'].map(average_ddimer_dict)
lab_measurements_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ddimer_df.sort_values(by=['Encounter ID', 'Lab test date'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ddimer_df['duration'] = ddimer_df.groupby('Encounter ID')['Lab test date'].diff().dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ddimer_df['cumulative_ddimer'] = ddimer_df.groupby('Encounter ID')['Result numeric'].cu

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,heme_ida,CRP_ULN,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,725.0,725.0,725.0,0,,,,0.0,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,725.0,725.0,725.0,0,,,,0.0,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,740.0,740.0,740.0,0,,,0.000178,0.0,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,740.0,740.0,740.0,0,,,,0.0,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,740.0,740.0,740.0,0,,,0.000749,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20700149,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,170.0,170.0,170.0,0,,,,0.0,
20700150,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,185.0,185.0,185.0,0,,,,0.0,
20700151,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,210.0,210.0,210.0,0,,,,0.0,
20700152,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,210.0,210.0,210.0,0,,,,0.0,


In [21]:
lab_measurements_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/lab_results.parquet')

In [3]:
lab_measurements_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/lab_results.parquet')
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,heme_ida,CRP_ULN,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,725.0,725.0,725.0,0,,,,0.0,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,,725.0,725.0,725.0,0,,,,0.0,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,740.0,740.0,740.0,0,,,0.000178,0.0,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,740.0,740.0,740.0,0,,,,0.0,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,,740.0,740.0,740.0,0,,,0.000749,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20700149,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,170.0,170.0,170.0,0,,,,0.0,
20700150,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,185.0,185.0,185.0,0,,,,0.0,
20700151,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,210.0,210.0,210.0,0,,,,0.0,
20700152,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,210.0,210.0,210.0,0,,,,0.0,


# liver_heptaocellular_product

In [4]:
lhp_df = lab_measurements_df[lab_measurements_df['concept'].isin(['ast','alt'])]
lhp_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,heme_ida,CRP_ULN,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer
25612,105768,72,2020-09-14 03:01:38,17.0,serum,SGOT(DLS),ast,IU/L,0.0,40.0,...,,200.0,200.0,200.0,0,,,,0.0,
25613,105768,73,2021-02-10 13:12:22,21.0,serum,SGOT(DLS),ast,IU/L,0.0,40.0,...,,200.0,200.0,200.0,0,,,,0.0,
25614,106240,60,2020-01-25 12:08:55,19.0,serum,SGOT(DLS),ast,IU/L,0.0,40.0,...,,200.0,200.0,200.0,0,,,,0.0,
25615,60843,69,2020-11-21 07:45:19,29.0,serum,SGOT(DLS),ast,IU/L,15.0,37.0,...,,185.0,185.0,185.0,0,,,,0.0,
25616,60843,69,2021-01-21 04:16:00,28.0,serum,SGOT(DLS),ast,IU/L,15.0,37.0,...,,185.0,185.0,185.0,0,,,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20700149,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,,170.0,170.0,170.0,0,,,,0.0,
20700150,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,,185.0,185.0,185.0,0,,,,0.0,
20700151,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,210.0,210.0,210.0,0,,,,0.0,
20700152,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,,210.0,210.0,210.0,0,,,,0.0,


In [5]:
# Create a new column 'AST_ALT_present' to indicate if AST or ALT is present for each encounter
lhp_df['AST_ALT_present'] = lhp_df['concept'].map({'AST': 1, 'ALT': 1})

# Calculate the product of AST and ALT for each encounter
lhp_df['liver_heptaocellular_product'] = lhp_df.groupby(['Internalpatientid', 'Encounter ID'])['Result numeric'].transform(pd.Series.prod)

# Drop the intermediate columns (optional)
lhp_df.drop(['concept', 'Result numeric'], axis=1, inplace=True)

# Fill NaN values (if an encounter has neither AST nor ALT) with 1188
lhp_df['liver_heptaocellular_product'].fillna(1188, inplace=True)

lhp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lhp_df['AST_ALT_present'] = lhp_df['concept'].map({'AST': 1, 'ALT': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lhp_df['liver_heptaocellular_product'] = lhp_df.groupby(['Internalpatientid', 'Encounter ID'])['Result numeric'].transform(pd.Series.prod)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lhp_df.drop(['concept', 'Result numeric'], axis=1, 

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Specimen source,desc,unit,range_min,range_max,Encounter ID,heart_bnp_up,...,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer,AST_ALT_present,liver_heptaocellular_product
25612,105768,72,2020-09-14 03:01:38,serum,SGOT(DLS),IU/L,0.0,40.0,7745dfe4-507d-4341-8020-b5961540cf16,,...,200.0,200.0,0,,,,0.0,,,17.0
25613,105768,73,2021-02-10 13:12:22,serum,SGOT(DLS),IU/L,0.0,40.0,2469de76-f1c0-5c18-ad64-eafd780f1577,,...,200.0,200.0,0,,,,0.0,,,546.0
25614,106240,60,2020-01-25 12:08:55,serum,SGOT(DLS),IU/L,0.0,40.0,a0b65919-1eda-5a2a-9597-a0356bac2221,,...,200.0,200.0,0,,,,0.0,,,418.0
25615,60843,69,2020-11-21 07:45:19,serum,SGOT(DLS),IU/L,15.0,37.0,93bc7001-5aed-56db-9011-b6e5c8f06c92,,...,185.0,185.0,0,,,,0.0,,,696.0
25616,60843,69,2021-01-21 04:16:00,serum,SGOT(DLS),IU/L,15.0,37.0,c5eb6647-67d4-51bb-b24a-4681006b3c4e,,...,185.0,185.0,0,,,,0.0,,,588.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20700149,142057,83,2022-02-05 08:22:24,plasma,SGOT(AST),U/L,5.0,34.0,ba9695d3-87e4-4838-924c-ae58cf2d98f3,,...,170.0,170.0,0,,,,0.0,,,17.0
20700150,31715,74,1999-12-27 10:23:10,serum,SGOT(AST),U/L,13.0,37.0,ff397add-59d8-40f1-9637-11fd44c53fd1,,...,185.0,185.0,0,,,,0.0,,,22.0
20700151,141306,68,2005-11-30 16:15:56,serum,SGOT(AST),U/L,8.0,42.0,8379cfaa-4537-4cf2-ba56-9dfbe0f55abe,,...,210.0,210.0,0,,,,0.0,,,21.0
20700152,141306,82,2020-03-06 13:42:17,serum,SGOT(AST),U/L,8.0,42.0,d29d15f0-8e2a-4bd2-9157-688d12f9f69e,,...,210.0,210.0,0,,,,0.0,,,21.0


In [7]:
# Find the maximum product of AST and ALT for each encounter
max_lhp_df = lhp_df.groupby(['Internalpatientid', 'Encounter ID'])['liver_heptaocellular_product'].max().reset_index()

# Merge the maximum products back into the original dataframe
lab_measurements_df = pd.merge(lab_measurements_df, max_lhp_df, on=['Internalpatientid', 'Encounter ID'], how='left')

lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,CRP_ULN,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer,liver_heptaocellular_product
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,725.0,725.0,725.0,0,,,,0.0,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,725.0,725.0,725.0,0,,,,0.0,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,740.0,740.0,0,,,0.000178,0.0,,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,740.0,740.0,0,,,,0.0,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,740.0,740.0,0,,,0.000749,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20700149,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,170.0,170.0,170.0,0,,,,0.0,,17.0
20700150,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,185.0,185.0,185.0,0,,,,0.0,,22.0
20700151,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,210.0,210.0,210.0,0,,,,0.0,,21.0
20700152,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,210.0,210.0,210.0,0,,,,0.0,,21.0


In [9]:
lab_measurements_df['liver_heptaocellular_product'].fillna(1188, inplace=True)
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,CRP_ULN,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer,liver_heptaocellular_product
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,725.0,725.0,725.0,0,,,,0.0,,1188.0
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,725.0,725.0,725.0,0,,,,0.0,,1188.0
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,740.0,740.0,0,,,0.000178,0.0,,1188.0
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,740.0,740.0,0,,,,0.0,,1188.0
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,740.0,740.0,0,,,0.000749,0.0,,1188.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20700149,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,170.0,170.0,170.0,0,,,,0.0,,17.0
20700150,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,185.0,185.0,185.0,0,,,,0.0,,22.0
20700151,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,210.0,210.0,210.0,0,,,,0.0,,21.0
20700152,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,210.0,210.0,210.0,0,,,,0.0,,21.0


In [10]:
lab_measurements_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/lab_results.parquet')

# Merging Lab Results with inpatient admissions

In [3]:
lab_measurements_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/lab_results.parquet')
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,CRP_ULN,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer,liver_heptaocellular_product
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,725.0,725.0,725.0,0,,,,0.0,,1188.0
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,725.0,725.0,725.0,0,,,,0.0,,1188.0
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,740.0,740.0,0,,,0.000178,0.0,,1188.0
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,740.0,740.0,0,,,,0.0,,1188.0
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,740.0,740.0,0,,,0.000749,0.0,,1188.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20700149,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,170.0,170.0,170.0,0,,,,0.0,,17.0
20700150,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,185.0,185.0,185.0,0,,,,0.0,,22.0
20700151,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,210.0,210.0,210.0,0,,,,0.0,,21.0
20700152,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,210.0,210.0,210.0,0,,,,0.0,,21.0


In [4]:
inpatient_admissions_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/inpatient_admissions.parquet')
inpatient_admissions_df

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,Outpatientreferralflag,...,Smallest Time Passed,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV
0,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,,,,,,0,0,0,0,0
1,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,,,,,,0,0,0,0,0
2,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,63.0,0.0,1.0,1.0,1.0,0,0,0,0,0
3,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,63.0,0.0,1.0,1.0,1.0,0,0,0,0,0
4,100001,85,2010-11-10 04:32:39,2010-11-19 08:49:45,SURGERY,SURGERY,SUBSTANCE ABUSE RES TRMT PROG,ORTHOPEDIC,0,0,...,63.0,0.0,1.0,1.0,1.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001358,99975,89,2018-01-05 00:59:05,2018-01-06 23:18:39,NON-COUNT,NON-COUNT,PODIATRY,MEDICAL OBSERVATION,0,0,...,12.0,1.0,1.0,1.0,1.0,0,0,0,0,0
1001359,99986,73,2019-06-26 19:47:10,2019-06-29 21:18:00,MEDICINE,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,0.0,1.0,1.0,1.0,1.0,0,0,0,0,0
1001360,99986,73,2019-06-26 19:47:10,2019-06-29 21:18:00,MEDICINE,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,0.0,1.0,1.0,1.0,1.0,0,0,0,0,0
1001361,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,28.0,1.0,1.0,1.0,1.0,0,0,0,0,0


In [5]:
inpatient_admissions_df.columns

Index(['Internalpatientid', 'Age at admission', 'Admission date',
       'Discharge date', 'Admitting unit service', 'Discharging unit service',
       'Admitting specialty', 'Discharging specialty',
       'mortality_inhosp_allcause', 'Outpatientreferralflag',
       'Agentorangeflag', 'CV diagnosis', 'diagnosis', 'code', 'cc Status_CC',
       'cc Status_MCC', 'cc Status_NCC',
       'Discharge disposition_Death with autopsy',
       'Discharge disposition_Death without autopsy',
       'Discharge disposition_Irregular',
       'Discharge disposition_NBC or while ASIH',
       'Discharge disposition_Regular', 'Discharge disposition_Transfer',
       'Encounter ID', 'rehosp_allcause', 'Smallest Time Passed',
       'readmit_allcause_30d', 'readmit_allcause_90d', 'readmit_allcause_180d',
       'readmit_allcause_365d', 'readmit_CV_30d', 'readmit_CV_90d',
       'readmit_CV_180d', 'readmit_CV_365d', 'mortality_inhosp_CV'],
      dtype='object')

In [5]:
# Merge the two dataframes on the 'Encounter ID' column
merged_df = pd.merge(inpatient_admissions_df, lab_measurements_df, on=['Internalpatientid','Encounter ID'], how='left')
merged_df

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,Outpatientreferralflag,...,CRP_ULN,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer,liver_heptaocellular_product
0,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,,,,,,,,,,
1,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,,,,,,,,,,
2,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,260.0,260.0,260.0,0.0,,,,0.0,,1188.0
3,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,6.0,6.0,6.0,0.0,,,,0.0,,1188.0
4,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,25.5,25.5,25.5,0.0,,,,0.0,,1188.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13125487,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,25.0,25.0,25.0,0.0,,,,0.0,,29.0
13125488,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,25.0,25.0,25.0,0.0,,,,0.0,,29.0
13125489,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,6.0,6.0,6.0,0.0,,,,0.0,,29.0
13125490,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,50.0,50.0,50.0,0.0,,,,0.0,,29.0


# discharge creatinine

In [6]:
from datetime import timedelta

In [14]:
merged_df.columns

Index(['Internalpatientid', 'Age at admission', 'Admission date',
       'Discharge date', 'Admitting unit service', 'Discharging unit service',
       'Admitting specialty', 'Discharging specialty',
       'mortality_inhosp_allcause', 'Outpatientreferralflag',
       'Agentorangeflag', 'CV diagnosis', 'diagnosis', 'code', 'cc Status_CC',
       'cc Status_MCC', 'cc Status_NCC',
       'Discharge disposition_Death with autopsy',
       'Discharge disposition_Death without autopsy',
       'Discharge disposition_Irregular',
       'Discharge disposition_NBC or while ASIH',
       'Discharge disposition_Regular', 'Discharge disposition_Transfer',
       'Encounter ID', 'rehosp_allcause', 'Smallest Time Passed',
       'readmit_allcause_30d', 'readmit_allcause_90d', 'readmit_allcause_180d',
       'readmit_allcause_365d', 'readmit_CV_30d', 'readmit_CV_90d',
       'readmit_CV_180d', 'readmit_CV_365d', 'mortality_inhosp_CV',
       'Age at lab test', 'Lab test date', 'Result numeric', 'Spe

In [7]:
# Step 1: Filter rows with 'desc' entry of 'cr'
filtered_df = merged_df[merged_df['concept'] == 'cr']

# Step 2: Find the latest 'Discharge date' for each encounter
latest_discharge_dates = filtered_df.groupby('Encounter ID')['Discharge date'].max().reset_index()

# Step 3: Find the latest 'Lab test date' up to 24 hours after 'Discharge date' for each encounter
def find_latest_lab_test(df):
    max_lab_date = df['Lab test date'].max()
    max_discharge_date = df['Discharge date'].max()
    if max_lab_date <= max_discharge_date + timedelta(hours=24):
        return df.loc[df['Lab test date'] == max_lab_date, 'Result numeric'].iloc[0]
    return None

discharge_creatinine = filtered_df.groupby('Encounter ID').apply(find_latest_lab_test).reset_index(name='discharge_creatinine')

# Merge the 'discharge_creatinine' column back into the original merged_df
merged_df = pd.merge(merged_df, discharge_creatinine, on='Encounter ID', how='left')

merged_df

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,Outpatientreferralflag,...,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer,liver_heptaocellular_product,discharge_creatinine
0,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,,,,,,,,,,
1,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,,,,,,,,,,
2,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,260.0,260.0,0.0,,,,0.0,,1188.0,0.952981
3,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,6.0,6.0,0.0,,,,0.0,,1188.0,0.952981
4,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,25.5,25.5,0.0,,,,0.0,,1188.0,0.952981
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13125487,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,25.0,25.0,0.0,,,,0.0,,29.0,0.887137
13125488,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,25.0,25.0,0.0,,,,0.0,,29.0,0.887137
13125489,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,6.0,6.0,0.0,,,,0.0,,29.0,0.887137
13125490,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,50.0,50.0,0.0,,,,0.0,,29.0,0.887137


# Discharge lactate

In [8]:
# Step 1: Filter rows with 'desc' entry of 'cr'
filtered_df = merged_df[merged_df['concept'] == 'lactate']

# Step 2: Find the latest 'Discharge date' for each encounter
latest_discharge_dates = filtered_df.groupby('Encounter ID')['Discharge date'].max().reset_index()

# Step 3: Find the latest 'Lab test date' up to 24 hours after 'Discharge date' for each encounter
def find_latest_lab_test(df):
    max_lab_date = df['Lab test date'].max()
    max_discharge_date = df['Discharge date'].max()
    if max_lab_date <= max_discharge_date + timedelta(hours=24):
        return df.loc[df['Lab test date'] == max_lab_date, 'Result numeric'].iloc[0]
    return None

discharge_creatinine = filtered_df.groupby('Encounter ID').apply(find_latest_lab_test).reset_index(name='discharge_lactate')

# Merge the 'discharge_creatinine' column back into the original merged_df
merged_df = pd.merge(merged_df, discharge_creatinine, on='Encounter ID', how='left')

merged_df

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,Outpatientreferralflag,...,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer,liver_heptaocellular_product,discharge_creatinine,discharge_lactate
0,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,,,,,,,,,,
1,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,,,,,,,,,,
2,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,260.0,0.0,,,,0.0,,1188.0,0.952981,
3,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,6.0,0.0,,,,0.0,,1188.0,0.952981,
4,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,25.5,0.0,,,,0.0,,1188.0,0.952981,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13125487,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,25.0,0.0,,,,0.0,,29.0,0.887137,
13125488,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,25.0,0.0,,,,0.0,,29.0,0.887137,
13125489,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,6.0,0.0,,,,0.0,,29.0,0.887137,
13125490,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,50.0,0.0,,,,0.0,,29.0,0.887137,


In [9]:
# Create a mapping dictionary from the 'discharge_creatinine' dataframe
discharge_creatinine_mapping = merged_df.set_index('Encounter ID')['discharge_creatinine'].to_dict()

# Add the 'discharge_creatinine' column to the 'lab_measurements_df' based on 'Encounter ID'
lab_measurements_df['discharge_creatinine'] = lab_measurements_df['Encounter ID'].map(discharge_creatinine_mapping)
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,hsCRP_ULN,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer,liver_heptaocellular_product,discharge_creatinine
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,725.0,725.0,0,,,,0.0,,1188.0,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,725.0,725.0,0,,,,0.0,,1188.0,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,740.0,0,,,0.000178,0.0,,1188.0,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,740.0,0,,,,0.0,,1188.0,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,740.0,0,,,0.000749,0.0,,1188.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20700149,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,170.0,170.0,0,,,,0.0,,17.0,
20700150,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,185.0,185.0,0,,,,0.0,,22.0,
20700151,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,210.0,210.0,0,,,,0.0,,21.0,
20700152,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,210.0,210.0,0,,,,0.0,,21.0,


In [11]:
# Create a mapping dictionary from the 'discharge_creatinine' dataframe
discharge_lactate_mapping = merged_df.set_index('Encounter ID')['discharge_lactate'].to_dict()

# Add the 'discharge_creatinine' column to the 'lab_measurements_df' based on 'Encounter ID'
lab_measurements_df['discharge_lactate'] = lab_measurements_df['Encounter ID'].map(discharge_lactate_mapping)
lab_measurements_df

Unnamed: 0,Internalpatientid,Age at lab test,Lab test date,Result numeric,Specimen source,desc,concept,unit,range_min,range_max,...,ESR_ULN,id_inflamed_up,highest_lactate_value,renal_ph_low,average_bicarb,drug_stimulant_use,average_ddimer,liver_heptaocellular_product,discharge_creatinine,discharge_lactate
0,23511,66,2013-05-13 19:58:45,143.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,725.0,0,,,,0.0,,1188.0,,
1,23511,66,2013-06-21 17:21:08,144.0,serum,ZSODIUM,na,mmol/L,136.0,145.0,...,725.0,0,,,,0.0,,1188.0,,
2,23256,51,2001-06-24 23:17:28,137.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,0,,,0.000178,0.0,,1188.0,,
3,23256,56,2006-05-20 02:08:17,149.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,0,,,,0.0,,1188.0,,
4,23256,60,2010-04-08 10:26:51,140.0,serum,ZSODIUM,na,mmol/L,136.0,148.0,...,740.0,0,,,0.000749,0.0,,1188.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20700149,142057,83,2022-02-05 08:22:24,17.0,plasma,SGOT(AST),ast,U/L,5.0,34.0,...,170.0,0,,,,0.0,,17.0,,
20700150,31715,74,1999-12-27 10:23:10,22.0,serum,SGOT(AST),ast,U/L,13.0,37.0,...,185.0,0,,,,0.0,,22.0,,
20700151,141306,68,2005-11-30 16:15:56,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,210.0,0,,,,0.0,,21.0,,
20700152,141306,82,2020-03-06 13:42:17,21.0,serum,SGOT(AST),ast,U/L,8.0,42.0,...,210.0,0,,,,0.0,,21.0,,


In [12]:
lab_measurements_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/lab_results.parquet')

# Length of stay

In [16]:
inpatient_admissions_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/inpatient_admissions.parquet')
inpatient_admissions_df

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,Outpatientreferralflag,...,Smallest Time Passed,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV
0,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,,,,,,0,0,0,0,0
1,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,,,,,,0,0,0,0,0
2,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,63.0,0.0,1.0,1.0,1.0,0,0,0,0,0
3,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,63.0,0.0,1.0,1.0,1.0,0,0,0,0,0
4,100001,85,2010-11-10 04:32:39,2010-11-19 08:49:45,SURGERY,SURGERY,SUBSTANCE ABUSE RES TRMT PROG,ORTHOPEDIC,0,0,...,63.0,0.0,1.0,1.0,1.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001358,99975,89,2018-01-05 00:59:05,2018-01-06 23:18:39,NON-COUNT,NON-COUNT,PODIATRY,MEDICAL OBSERVATION,0,0,...,12.0,1.0,1.0,1.0,1.0,0,0,0,0,0
1001359,99986,73,2019-06-26 19:47:10,2019-06-29 21:18:00,MEDICINE,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,0.0,1.0,1.0,1.0,1.0,0,0,0,0,0
1001360,99986,73,2019-06-26 19:47:10,2019-06-29 21:18:00,MEDICINE,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,0.0,1.0,1.0,1.0,1.0,0,0,0,0,0
1001361,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,28.0,1.0,1.0,1.0,1.0,0,0,0,0,0


In [17]:
inpatient_admissions_df['length_of_stay'] = (inpatient_admissions_df['Discharge date'] - inpatient_admissions_df['Admission date']).dt.days
inpatient_admissions_df

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,Outpatientreferralflag,...,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,length_of_stay
0,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,,,,,0,0,0,0,0,0
1,10,66,2015-11-28 17:41:09,2015-11-29 01:43:14,NON-COUNT,NON-COUNT,DRUG DEPENDENCE TRMT UNIT,MEDICAL OBSERVATION,0,0,...,,,,,0,0,0,0,0,0
2,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,0.0,1.0,1.0,1.0,0,0,0,0,0,2
3,100001,84,2009-10-01 21:19:50,2009-10-04 16:51:33,MEDICINE,MEDICINE,PSYCHIATRIC MENTALLY INFIRM,GENERAL(ACUTE MEDICINE),0,1,...,0.0,1.0,1.0,1.0,0,0,0,0,0,2
4,100001,85,2010-11-10 04:32:39,2010-11-19 08:49:45,SURGERY,SURGERY,SUBSTANCE ABUSE RES TRMT PROG,ORTHOPEDIC,0,0,...,0.0,1.0,1.0,1.0,0,0,0,0,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1001358,99975,89,2018-01-05 00:59:05,2018-01-06 23:18:39,NON-COUNT,NON-COUNT,PODIATRY,MEDICAL OBSERVATION,0,0,...,1.0,1.0,1.0,1.0,0,0,0,0,0,1
1001359,99986,73,2019-06-26 19:47:10,2019-06-29 21:18:00,MEDICINE,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,1.0,1.0,1.0,1.0,0,0,0,0,0,3
1001360,99986,73,2019-06-26 19:47:10,2019-06-29 21:18:00,MEDICINE,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,1.0,1.0,1.0,1.0,0,0,0,0,0,3
1001361,99994,83,2016-08-13 20:09:52,2016-08-15 16:43:07,SURGERY,MEDICINE,INTERMEDIATE MEDICINE,GENERAL(ACUTE MEDICINE),0,1,...,1.0,1.0,1.0,1.0,0,0,0,0,0,1


# pre_hosp_any

In [18]:
# Step 1: Sort the dataframe by 'Internalpatientid' and 'Encounter ID' to ensure proper order
inpatient_admissions_df = inpatient_admissions_df.sort_values(by=['Internalpatientid', 'Encounter ID'])

# Step 2: Calculate the count of hospitalizations prior to the target hospitalization
inpatient_admissions_df['pre_hosp_any'] = inpatient_admissions_df.groupby('Internalpatientid').cumcount()

inpatient_admissions_df

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,Outpatientreferralflag,...,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,length_of_stay,pre_hosp_any
32196,1,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,1,...,0.0,0.0,1.0,0,0,0,0,0,1,0
32197,1,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,1,...,0.0,0.0,1.0,0,0,0,0,0,1,1
45845,1,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,1,...,0.0,0.0,1.0,0,0,0,0,0,3,2
45846,1,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,1,...,0.0,0.0,1.0,0,0,0,0,0,3,3
45847,1,72,2016-07-16 15:18:16,2016-07-22 21:36:24,SURGERY,SURGERY,SPINAL CORD INJURY,UROLOGY,0,1,...,0.0,0.0,1.0,0,0,0,0,0,6,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435812,169062,73,2004-08-11 21:52:41,2004-08-14 23:42:41,MEDICINE,MEDICINE,"PULMONARY, TUBERCULOSIS",Not specified (no value),0,1,...,1.0,1.0,1.0,0,0,0,0,0,3,15
910894,169062,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,0,...,1.0,1.0,1.0,0,0,0,0,0,1,16
910895,169062,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,0,...,1.0,1.0,1.0,0,0,0,0,0,1,17
432541,169064,82,2008-08-19 03:12:47,2008-08-26 07:34:10,SURGERY,SURGERY,NH SHORT STAY SKILLED NURSING,GENERAL SURGERY,0,0,...,,,,0,0,0,0,0,7,0


# pre_hosp_cv

In [20]:
# Step 2: Calculate the count of hospitalizations with a CV-associated cause prior to the target hospitalization
inpatient_admissions_df['pre_hosp_cv'] = inpatient_admissions_df.groupby('Internalpatientid')['CV diagnosis'].cumsum() - inpatient_admissions_df['CV diagnosis']
inpatient_admissions_df

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,Outpatientreferralflag,...,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,length_of_stay,pre_hosp_any,pre_hosp_cv
32196,1,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,1,...,0.0,1.0,0,0,0,0,0,1,0,0
32197,1,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,1,...,0.0,1.0,0,0,0,0,0,1,1,0
45845,1,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,1,...,0.0,1.0,0,0,0,0,0,3,2,0
45846,1,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,1,...,0.0,1.0,0,0,0,0,0,3,3,0
45847,1,72,2016-07-16 15:18:16,2016-07-22 21:36:24,SURGERY,SURGERY,SPINAL CORD INJURY,UROLOGY,0,1,...,0.0,1.0,0,0,0,0,0,6,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435812,169062,73,2004-08-11 21:52:41,2004-08-14 23:42:41,MEDICINE,MEDICINE,"PULMONARY, TUBERCULOSIS",Not specified (no value),0,1,...,1.0,1.0,0,0,0,0,0,3,15,0
910894,169062,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,0,...,1.0,1.0,0,0,0,0,0,1,16,0
910895,169062,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,0,...,1.0,1.0,0,0,0,0,0,1,17,0
432541,169064,82,2008-08-19 03:12:47,2008-08-26 07:34:10,SURGERY,SURGERY,NH SHORT STAY SKILLED NURSING,GENERAL SURGERY,0,0,...,,,0,0,0,0,0,7,0,0


In [22]:
inpatient_admissions_df.dtypes

Internalpatientid                                       int32
Age at admission                                         int8
Admission date                                 datetime64[ns]
Discharge date                                 datetime64[ns]
Admitting unit service                                 object
Discharging unit service                               object
Admitting specialty                                    object
Discharging specialty                                  object
mortality_inhosp_allcause                                int8
Outpatientreferralflag                                   int8
Agentorangeflag                                          int8
CV diagnosis                                             int8
diagnosis                                              object
code                                                   object
cc Status_CC                                            uint8
cc Status_MCC                                           uint8
cc Statu

In [26]:
# List of columns to convert to int8
columns_to_convert = ['rehosp_allcause','readmit_allcause_30d', 'readmit_allcause_90d', 'readmit_allcause_180d', 'readmit_allcause_365d',
                      'readmit_CV_30d', 'readmit_CV_90d', 'readmit_CV_180d', 'readmit_CV_365d',
                      'mortality_inhosp_CV']

# Convert specified columns to int8 data type
inpatient_admissions_df[columns_to_convert] = inpatient_admissions_df[columns_to_convert].fillna(-1).replace([np.inf, -np.inf], np.nan).astype('Int8')
inpatient_admissions_df

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,Outpatientreferralflag,...,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,length_of_stay,pre_hosp_any,pre_hosp_cv
32196,1,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,1,0,0
32197,1,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,1,1,0
45845,1,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,3,2,0
45846,1,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,3,3,0
45847,1,72,2016-07-16 15:18:16,2016-07-22 21:36:24,SURGERY,SURGERY,SPINAL CORD INJURY,UROLOGY,0,1,...,0,1,0,0,0,0,0,6,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435812,169062,73,2004-08-11 21:52:41,2004-08-14 23:42:41,MEDICINE,MEDICINE,"PULMONARY, TUBERCULOSIS",Not specified (no value),0,1,...,1,1,0,0,0,0,0,3,15,0
910894,169062,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,0,...,1,1,0,0,0,0,0,1,16,0
910895,169062,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,0,...,1,1,0,0,0,0,0,1,17,0
432541,169064,82,2008-08-19 03:12:47,2008-08-26 07:34:10,SURGERY,SURGERY,NH SHORT STAY SKILLED NURSING,GENERAL SURGERY,0,0,...,-1,-1,0,0,0,0,0,7,0,0


In [27]:
inpatient_admissions_df.dtypes

Internalpatientid                                       int32
Age at admission                                         int8
Admission date                                 datetime64[ns]
Discharge date                                 datetime64[ns]
Admitting unit service                                 object
Discharging unit service                               object
Admitting specialty                                    object
Discharging specialty                                  object
mortality_inhosp_allcause                                int8
Outpatientreferralflag                                   int8
Agentorangeflag                                          int8
CV diagnosis                                             int8
diagnosis                                              object
code                                                   object
cc Status_CC                                            uint8
cc Status_MCC                                           uint8
cc Statu

In [28]:
# List of columns to convert to int8
columns_to_convert = ['pre_hosp_cv','pre_hosp_any']

# Convert specified columns to int8 data type
inpatient_admissions_df[columns_to_convert] = inpatient_admissions_df[columns_to_convert].fillna(-1).replace([np.inf, -np.inf], np.nan).astype('Int32')
inpatient_admissions_df

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,Outpatientreferralflag,...,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,length_of_stay,pre_hosp_any,pre_hosp_cv
32196,1,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,1,0,0
32197,1,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,1,1,0
45845,1,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,3,2,0
45846,1,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,3,3,0
45847,1,72,2016-07-16 15:18:16,2016-07-22 21:36:24,SURGERY,SURGERY,SPINAL CORD INJURY,UROLOGY,0,1,...,0,1,0,0,0,0,0,6,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435812,169062,73,2004-08-11 21:52:41,2004-08-14 23:42:41,MEDICINE,MEDICINE,"PULMONARY, TUBERCULOSIS",Not specified (no value),0,1,...,1,1,0,0,0,0,0,3,15,0
910894,169062,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,0,...,1,1,0,0,0,0,0,1,16,0
910895,169062,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,0,...,1,1,0,0,0,0,0,1,17,0
432541,169064,82,2008-08-19 03:12:47,2008-08-26 07:34:10,SURGERY,SURGERY,NH SHORT STAY SKILLED NURSING,GENERAL SURGERY,0,0,...,-1,-1,0,0,0,0,0,7,0,0


In [29]:
inpatient_admissions_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/inpatient_admissions.parquet')

In [36]:
inpatient_admissions_df = pd.read_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/inpatient_admissions.parquet')
inpatient_admissions_df

Unnamed: 0,Internalpatientid,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,Outpatientreferralflag,...,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,length_of_stay,pre_hosp_any,pre_hosp_cv
32196,1,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,1,0,0
32197,1,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,1,1,0
45845,1,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,3,2,0
45846,1,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,1,...,0,1,0,0,0,0,0,3,3,0
45847,1,72,2016-07-16 15:18:16,2016-07-22 21:36:24,SURGERY,SURGERY,SPINAL CORD INJURY,UROLOGY,0,1,...,0,1,0,0,0,0,0,6,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435812,169062,73,2004-08-11 21:52:41,2004-08-14 23:42:41,MEDICINE,MEDICINE,"PULMONARY, TUBERCULOSIS",Not specified (no value),0,1,...,1,1,0,0,0,0,0,3,15,0
910894,169062,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,0,...,1,1,0,0,0,0,0,1,16,0
910895,169062,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,0,...,1,1,0,0,0,0,0,1,17,0
432541,169064,82,2008-08-19 03:12:47,2008-08-26 07:34:10,SURGERY,SURGERY,NH SHORT STAY SKILLED NURSING,GENERAL SURGERY,0,0,...,-1,-1,0,0,0,0,0,7,0,0


In [37]:
# Get the column names
cols = inpatient_admissions_df.columns.tolist()

# Move 'Encounter ID' to the second position
cols.insert(1, cols.pop(cols.index('Encounter ID')))

# Reorder the DataFrame with the updated column order
inpatient_admissions_df = inpatient_admissions_df[cols]
inpatient_admissions_df

Unnamed: 0,Internalpatientid,Encounter ID,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,...,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d,mortality_inhosp_CV,length_of_stay,pre_hosp_any,pre_hosp_cv
32196,1,0ce62d49-ea64-5941-b70c-89b4a1c4cc8a,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,...,0,1,0,0,0,0,0,1,0,0
32197,1,0ce62d49-ea64-5941-b70c-89b4a1c4cc8a,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,...,0,1,0,0,0,0,0,1,1,0
45845,1,52d5e3bc-aced-53a4-b8a4-4a458e55601f,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,...,0,1,0,0,0,0,0,3,2,0
45846,1,52d5e3bc-aced-53a4-b8a4-4a458e55601f,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,...,0,1,0,0,0,0,0,3,3,0
45847,1,aee47e87-cab8-5ca7-9947-21cb2daf476b,72,2016-07-16 15:18:16,2016-07-22 21:36:24,SURGERY,SURGERY,SPINAL CORD INJURY,UROLOGY,0,...,0,1,0,0,0,0,0,6,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435812,169062,e6460fae-0e3b-5fda-a016-f36cdd654819,73,2004-08-11 21:52:41,2004-08-14 23:42:41,MEDICINE,MEDICINE,"PULMONARY, TUBERCULOSIS",Not specified (no value),0,...,1,1,0,0,0,0,0,3,15,0
910894,169062,f6956488-7e4c-5967-a95f-208aae167c25,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,...,1,1,0,0,0,0,0,1,16,0
910895,169062,f6956488-7e4c-5967-a95f-208aae167c25,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,...,1,1,0,0,0,0,0,1,17,0
432541,169064,0c52ac5b-8578-50a3-9f26-fbdf68dad075,82,2008-08-19 03:12:47,2008-08-26 07:34:10,SURGERY,SURGERY,NH SHORT STAY SKILLED NURSING,GENERAL SURGERY,0,...,-1,-1,0,0,0,0,0,7,0,0


In [38]:
# Specify the column names to move to the end
cols_to_move = ['readmit_allcause_30d', 'readmit_allcause_90d', 'readmit_allcause_180d',
                'readmit_allcause_365d', 'readmit_CV_30d', 'readmit_CV_90d',
                'readmit_CV_180d', 'readmit_CV_365d']

# Remove the specified columns from the list
for col in cols_to_move:
    cols.remove(col)

# Append the specified columns to the end of the list
cols.extend(cols_to_move)

# Reorder the DataFrame with the updated column order
inpatient_admissions_df = inpatient_admissions_df[cols]
inpatient_admissions_df

Unnamed: 0,Internalpatientid,Encounter ID,Age at admission,Admission date,Discharge date,Admitting unit service,Discharging unit service,Admitting specialty,Discharging specialty,mortality_inhosp_allcause,...,pre_hosp_any,pre_hosp_cv,readmit_allcause_30d,readmit_allcause_90d,readmit_allcause_180d,readmit_allcause_365d,readmit_CV_30d,readmit_CV_90d,readmit_CV_180d,readmit_CV_365d
32196,1,0ce62d49-ea64-5941-b70c-89b4a1c4cc8a,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,...,0,0,0,0,0,1,0,0,0,0
32197,1,0ce62d49-ea64-5941-b70c-89b4a1c4cc8a,79,2022-12-31 05:41:51,2023-01-01 23:06:30,MEDICINE,MEDICINE,DERMATOLOGY,GENERAL(ACUTE MEDICINE),0,...,1,0,0,0,0,1,0,0,0,0
45845,1,52d5e3bc-aced-53a4-b8a4-4a458e55601f,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,...,2,0,0,0,0,1,0,0,0,0
45846,1,52d5e3bc-aced-53a4-b8a4-4a458e55601f,68,2012-08-10 23:27:47,2012-08-14 20:57:06,MEDICINE,MEDICINE,PERIPHERAL VASCULAR,GENERAL(ACUTE MEDICINE),0,...,3,0,0,0,0,1,0,0,0,0
45847,1,aee47e87-cab8-5ca7-9947-21cb2daf476b,72,2016-07-16 15:18:16,2016-07-22 21:36:24,SURGERY,SURGERY,SPINAL CORD INJURY,UROLOGY,0,...,4,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435812,169062,e6460fae-0e3b-5fda-a016-f36cdd654819,73,2004-08-11 21:52:41,2004-08-14 23:42:41,MEDICINE,MEDICINE,"PULMONARY, TUBERCULOSIS",Not specified (no value),0,...,15,0,1,1,1,1,0,0,0,0
910894,169062,f6956488-7e4c-5967-a95f-208aae167c25,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,...,16,0,1,1,1,1,0,0,0,0
910895,169062,f6956488-7e4c-5967-a95f-208aae167c25,73,2004-08-10 14:07:29,2004-08-11 21:53:50,NON-COUNT,NON-COUNT,SURGICAL STEPDOWN,Not specified (no value),0,...,17,0,1,1,1,1,0,0,0,0
432541,169064,0c52ac5b-8578-50a3-9f26-fbdf68dad075,82,2008-08-19 03:12:47,2008-08-26 07:34:10,SURGERY,SURGERY,NH SHORT STAY SKILLED NURSING,GENERAL SURGERY,0,...,0,0,-1,-1,-1,-1,0,0,0,0


In [39]:
inpatient_admissions_df.columns

Index(['Internalpatientid', 'Encounter ID', 'Age at admission',
       'Admission date', 'Discharge date', 'Admitting unit service',
       'Discharging unit service', 'Admitting specialty',
       'Discharging specialty', 'mortality_inhosp_allcause',
       'Outpatientreferralflag', 'Agentorangeflag', 'CV diagnosis',
       'diagnosis', 'code', 'cc Status_CC', 'cc Status_MCC', 'cc Status_NCC',
       'Discharge disposition_Death with autopsy',
       'Discharge disposition_Death without autopsy',
       'Discharge disposition_Irregular',
       'Discharge disposition_NBC or while ASIH',
       'Discharge disposition_Regular', 'Discharge disposition_Transfer',
       'rehosp_allcause', 'Smallest Time Passed', 'mortality_inhosp_CV',
       'length_of_stay', 'pre_hosp_any', 'pre_hosp_cv', 'readmit_allcause_30d',
       'readmit_allcause_90d', 'readmit_allcause_180d',
       'readmit_allcause_365d', 'readmit_CV_30d', 'readmit_CV_90d',
       'readmit_CV_180d', 'readmit_CV_365d'],
      dt

In [40]:
inpatient_admissions_df.to_parquet('/content/drive/MyDrive/VCHAMPS - Final Train Data/inpatient_admissions.parquet')