In [54]:
import pandas as pd
import numpy as np
from datetime import datetime
# Allow up to 500 rows to be shown before pandas truncates a displayed frame.
pd.set_option("display.max_rows", 500)

In [None]:
# 01: static patient data
# Builds `static_patient` with exactly the columns subject_id, gender and
# anchor_age, where `gender` is categorical.
#
# Only these columns survive the cell, so only they are read from disk.
# `dod` used to be parsed to datetime and then dropped by the column
# selection; that dead conversion is removed (as a consequence, a missing or
# malformed `dod` column no longer makes this cell fail).
patient_cols = ['subject_id', 'gender', 'anchor_age']

# Load data, fix the column order, and set column data types in one pass.
# `usecols` returns columns in file order, hence the explicit `.loc` re-select.
static_patient = (
    pd.read_csv("data/raw/patients.csv.gz", usecols=patient_cols)
    .loc[:, patient_cols]
    .astype({"gender": "category"})
)

print("01: STATIC PATIENT DATA PROCESSED!")


In [59]:
# 02: static admission data
# Loads the raw admissions table, types its categorical and timestamp columns,
# drops the columns not needed downstream, and adds `readmission_30d`: True
# when the same subject's next admission starts within the readmission window
# after this admission's discharge.

# Readmission window, in whole days after discharge. The output column name
# `readmission_30d` is hard-coded below, so keep the two in sync.
READMISSION_WINDOW_DAYS = 30

# Load data
static_admission = pd.read_csv("data/raw/admissions.csv.gz")

# Set column data types
# categorical columns
cate_cols = ['admission_type', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status', 'race']
# timestamp columns
date_cols = ['admittime', 'dischtime', 'deathtime']

static_admission = static_admission.astype({col: 'category' for col in cate_cols})

for col in date_cols:
    static_admission[col] = pd.to_datetime(static_admission[col], format="%Y-%m-%d %H:%M:%S")

# Select columns needed
drop_cols = ['edregtime', 'edouttime', 'hospital_expire_flag']
static_admission = static_admission.drop(columns=drop_cols)

# Define readmission within READMISSION_WINDOW_DAYS (30) days of discharge
# Sort each subject's admissions chronologically so that shift(-1) within the
# subject group yields the start time of that subject's next admission.
static_admission = static_admission.sort_values(by=['subject_id', 'admittime'])
next_admittime = static_admission.groupby('subject_id')['admittime'].shift(-1)

# `.dt.days` floors the gap to whole days, so e.g. a gap of 30 days 23 hours
# counts as 30 and is still flagged.
# A subject's last admission has no successor: its gap is NaT -> NaN, and the
# comparison below evaluates to False for it (i.e. "not readmitted").
# NOTE(review): a next admission that starts before this discharge gives a
# negative gap and is also flagged True -- confirm that is intended.
days_to_readmission = (next_admittime - static_admission['dischtime']).dt.days
static_admission['readmission_30d'] = days_to_readmission <= READMISSION_WINDOW_DAYS

print("02: STATIC ADMISSION DATA PROCESSED!")