In [1]:

import psycopg2
import pandas as pd
import numpy as np
import os
import dask.dataframe as dd
import re
from datetime import datetime, timedelta, time

# scaling 
from sklearn.preprocessing import StandardScaler

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
import shap 

# dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, Sampler, BatchSampler, SequentialSampler
from torch.cuda.amp import GradScaler, autocast

# model 
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import lightgbm as lgb
import optuna
optuna.logging.set_verbosity(optuna.logging.ERROR)

# wandb
import wandb

# evaluation
from sklearn.metrics import log_loss, roc_auc_score, roc_curve, average_precision_score, accuracy_score, precision_recall_curve
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc as calculate_auc  
from sklearn.model_selection import StratifiedKFold

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# PostgreSQL 서버 연결 정보

host = '192.168.0.76'  # 호스트 주소
database = 'eicu'  # 데이터베이스 이름
user = 'ykjeong'  # 사용자 이름
password = 'mdhi1234!'  # 비밀번호

# PostgreSQL 서버에 연결
conn = psycopg2.connect(
    dbname=database,
    user=user,
    password=password,
    host=host
)

## 데이터 추출

### patients

In [3]:
query = '''
SELECT
    patientunitstayid AS stay_id,
    gender,
    age,
    ethnicity AS race,
    unitType AS CU_type,
    unitAdmitTime24 AS intime, 
    unitDischargeTime24 AS death_or_dischtime,
    unitDischargeStatus AS dead_in_hosp
FROM
    eicu.patient
WHERE -- 1일 이상 60일 미만 입원한 19세 이상의 환자
    unitDischargeOffset / 1440.0 >= 1 
    AND unitDischargeOffset / 1440.0 < 60;
'''
patients = pd.read_sql_query(query, conn)

  patients = pd.read_sql_query(query, conn)


In [4]:
patients.rename(columns={
    'cu_type':'CU type', 
    'dead_in_hosp':'dead in hosp',
    'death_or_dischtime':'death or dischtime'
}, inplace=True)

In [5]:
# 'age' 컬럼 핸들링 
# '> 89'를 91로 처리
patients.loc[patients['age'] == '> 89', 'age'] = 91

# 결측치를 평균으로 대체
patients['age'] = pd.to_numeric(patients['age'], errors='coerce')
patients['age'] = patients['age'].fillna(patients['age'].mean())
patients['age'] = patients['age'].astype(int)

# 18세 이상만 추출 
patients = patients[patients['age'] > 18]

In [6]:
def categorize_careunit(careunit):
    if 'MICU' in careunit or'Med-Surg ICU' in careunit:
        return 'MICU'
    elif 'SICU' in careunit:
        return 'SICU'
    elif 'Neuro ICU' in careunit:
        return 'Neuro ICU'
    elif 'Cardiac ICU' in careunit or 'CSICU' in careunit:
        return 'CCU'
    elif 'CVICU' in careunit:
        return 'CVICU'
    else:
        return 'other'

patients['CU type'] = patients['CU type'].apply(categorize_careunit)
patients['CU type'].unique()

array(['MICU', 'other', 'Neuro ICU', 'SICU', 'CCU'], dtype=object)

In [7]:
# 인종 분류 및 결측치 수정 

def categorize_race(race):
    if 'Caucasian' in race:
        return 'white'
    elif 'African American' in race:
        return 'black'
    elif 'Hispanic' in race:
        return 'hispanic/latino'
    elif 'ASIAN' in race:
        return 'asian'
    else:
        return 'other'

patients['race'] = patients['race'].apply(categorize_race)
patients['race'].unique()

array(['white', 'other', 'black', 'hispanic/latino'], dtype=object)

In [8]:
# dead in hosp 컬럼의 값을 변경하고 비율 확인

patients.loc[:, 'dead in hosp'] = patients['dead in hosp'].map(lambda x: 1 if x == 'Expired' else 0)
patients['dead in hosp'].value_counts(normalize=True)

dead in hosp
0    0.947866
1    0.052134
Name: proportion, dtype: float64

In [9]:
# stay_id 분리

patients_pos_samp = patients[patients['dead in hosp'] == 1]

# 'dead in hosp'가 0인 행들에서 50%만 랜덤으로 추출
patients_neg_samp = patients[patients['dead in hosp'] == 0].sample(frac=0.2, random_state=42)

patients = pd.concat([patients_neg_samp, patients_pos_samp])
patients['dead in hosp'].value_counts(normalize=True)

dead in hosp
0    0.784312
1    0.215688
Name: proportion, dtype: float64

In [9]:
patients.to_parquet('eICU_patients.parquet')

### lab

In [11]:
query = '''
SELECT 
    lab.patientunitstayid AS stay_id,
    lab.labname AS label,
    lab.labresulttext AS value,
    lab.labTypeID AS labtype, 
    lab.labmeasurenamesystem AS valueoum, 
    lab.labresultoffset / 60 AS time
FROM
    eicu.lab lab
WHERE(
        lab.labname LIKE '%AST%' OR
        lab.labname LIKE '%ALT%' OR
        lab.labname LIKE '%albumin%' OR
        lab.labname LIKE '%bilirubin%' OR
        lab.labname LIKE '%BUN%' OR
        lab.labname LIKE '%chloride%' OR
        lab.labname LIKE '%CRP%' OR
        lab.labname LIKE '%glucose%' OR
        lab.labname LIKE '%Hgb%' OR
        lab.labname LIKE '%respiratory%' OR
        lab.labname LIKE '%platelet%' OR
        lab.labname LIKE '%potassium%' OR
        lab.labname LIKE '%Temperature%' OR
        lab.labname LIKE '%urinary sodium%' OR
        lab.labname LIKE '%urinary creatinine%' OR
        lab.labname LIKE '%WBC x 1000%' OR
        lab.labname LIKE '%PT%' AND 
        lab.labmeasurenamesystem LIKE '%sec%' 
    )
    AND lab.patientunitstayid IN (
        SELECT
            patient.patientunitstayid
        FROM
            eicu.patient patient
        WHERE
            patient.unitDischargeOffset / 1440.0 >= 1 
            AND patient.unitDischargeOffset / 1440.0 < 60
    );
'''
lab = pd.read_sql_query(query, conn)
lab = lab[lab['stay_id'].isin(patients['stay_id'])]

  lab = pd.read_sql_query(query, conn)


In [12]:
# lab['label']의 값이 'glucose - CSF', 'prealbumin', 'direct bilirubin', 'CRP-hs'인 행 삭제 

values_to_remove = ['glucose - CSF', 'prealbumin', 'direct bilirubin', 'CRP-hs', 'PTT']
lab = lab[~lab['label'].isin(values_to_remove)]

In [13]:
lab['label'] = lab['label'].replace({
    r' \(SGPT\)': '',
    r' \(SGOT\)': '',
    r'urinary ': '',
    r' x 1000': '',
    r'bedside ': '',
    r'total ': ''
}, regex=True)

In [14]:
# value 열의 모든 특수문자를 제거하고 수치형으로 변환

lab['value'] = lab['value'].str.replace(r'[^\w\s]', '', regex=True)
lab['value'] = pd.to_numeric(lab['value'], errors='coerce')

In [15]:
# vital sign 이상치 조정 

ranges = {           
    'ALT':(0, 5000), 
    'AST':(0, 10000), 
    'albumin':(0, 5.5), 
    'BUN':(0, 150), 
    'bilirubin':(0, 50),
    'CRP':(0, 300), 
    'chloride':(70, 135),
    'creatinine':(0, 15),
    'glucose':(0, 600),
    'Hgb':(0, 25),
    'Temperature': (32, 41), 
    'potassium':(2, 9),
    'sodium':(105, 170),
    'platelets':(0, 1000),
    'PT':(0, 8), 
    'WBC':(0, 90)}

# 범위를 벗어난 값들을 NaN으로 대체
def replace_out_of_range_with_nan(row):
    label = row['label']
    value = row['value']
    
    if label in ranges:
        lower, upper = ranges[label]
        
        # 값이 범위를 벗어나면 NaN으로 대체
        if value < lower or value > upper:
            return np.nan
    return value

lab['value'] = lab.apply(replace_out_of_range_with_nan, axis=1)

# NaN 값을 각 label에 대한 평균값으로 대체
lab['value'] = lab.groupby('label')['value'].transform(lambda x: x.fillna(x.mean()))
lab.head()

Unnamed: 0,stay_id,label,value,labtype,valueoum,time
0,141168,PT,,3.0,sec,18
1,141168,creatinine,11.171875,4.0,mg/dL,28
2,141168,BUN,26.0,1.0,mg/dL,8
3,141168,sodium,127.74056,4.0,mmol/L,28
4,141168,PT,,3.0,sec,3


In [16]:
lab.to_parquet('eICU_lab.parquet')

### Vital aperiodic: sbp, dbp

In [17]:
patients = pd.read_parquet('eICU_patients.parquet')

In [18]:
query = '''
SELECT
    vitala.patientunitstayid AS stay_id,
    vitala.observationoffset / 60 AS time, 
    vitala.noninvasivesystolic AS sbp,
    vitala.noninvasivediastolic AS dbp
FROM
    eicu.vitalaPeriodic vitala
WHERE
    vitala.patientUnitStayID IN (
        SELECT
            patient.patientunitstayid
        FROM
            eicu.patient patient
        WHERE
            patient.unitDischargeOffset / 1440.0 >= 1 
            AND patient.unitDischargeOffset / 1440.0 < 60
    );
'''
vitalaPeriodic = pd.read_sql_query(query, conn)
vitalaPeriodic = vitalaPeriodic[vitalaPeriodic['stay_id'].isin(patients['stay_id'])]

  vitalaPeriodic = pd.read_sql_query(query, conn)


In [None]:
vitalap = pd.melt(vitalaPeriodic, id_vars=['stay_id', 'time'], value_vars=['sbp', 'dbp'],
                    var_name='label', value_name='value')
vitalap = vitalap[['stay_id', 'label', 'value', 'time']]

In [None]:
# vital sign 이상치 조정 

# 기준 설정
ranges = {
    'sbp': (40, 230),
    'dbp': (20, 130)
}


# 범위를 벗어난 값들을 NaN으로 대체
def replace_out_of_range_with_nan(row):
    label = row['label']
    value = row['value']
    
    if label in ranges:
        lower, upper = ranges[label]
        
        # 값이 범위를 벗어나면 NaN으로 대체
        if value < lower or value > upper:
            return np.nan
    return value

vitalap['value'] = vitalap.apply(replace_out_of_range_with_nan, axis=1)
vitalap['value'] = vitalap.groupby('label')['value'].transform(lambda x: x.fillna(x.mean()))

In [None]:
vitalap.to_parquet('eICU_vitalap.parquet')

### Nurse charting : Respiratory Rate, Heart rate, sbp, dbp, temperature, SpO2

In [None]:
query = '''
SELECT 
    chart.patientunitstayid AS stay_id,
    chart.nursingChartCellTypeValName AS label,
    chart.nursingChartValue AS value,
    chart.nursingChartEntryOffset / 60 AS time
FROM
    eicu.nurseCharting chart
WHERE (
        chart.nursingChartCellTypeValName LIKE '%Non-Invasive BP Diastolic%' OR
        chart.nursingChartCellTypeVallabel LIKE '%Temperature (%' OR
        chart.nursingChartCellTypeValName LIKE '%Saturation%' 
    )
    AND chart.patientunitstayid IN (
        SELECT
            patient.patientunitstayid
        FROM
            eicu.patient patient
        WHERE
            patient.unitDischargeOffset / 1440.0 >= 1 
            AND patient.unitDischargeOffset / 1440.0 < 60
    );
'''
chart = pd.read_sql_query(query, conn)
chart = chart[chart['stay_id'].isin(patients['stay_id'])]

In [None]:
chart['label'] = chart['label'].replace({'Non-Invasive BP Systolic':'sbp',
                                           'Non-Invasive BP Diastolic':'dbp',
                                           'O2 Saturation':'SpO2'})

In [None]:
# 화씨 온도를 섭씨 온도로 변환
fahrenheit_mask = chart['label'] == 'Temperature (F)'
celcius_mask = chart['label'] == 'Temperature (C)'
chart.loc[fahrenheit_mask, 'value'] = (chart.loc[fahrenheit_mask, 'value'] - 32) * 5.0 / 9.0

# label 수정
chart.loc[fahrenheit_mask, 'label'] = 'Temperature'
chart.loc[celcius_mask, 'label'] = 'Temperature'

In [None]:
# vital sign 이상치 조정 

chart['value'].replace('', np.nan, inplace=True)
chart['value'] = chart['value'].astype(float)

# 기준 설정
ranges = {
    'Respiratory Rate': (5, 50),
    'Heart Rate': (10, 190),
    'sbp': (40, 230),
    'dbp': (20, 130),
    'Temperature': (32, 41), 
    'SpO2': (68, 100)
}
# 각 label에 대한 평균값 계산
mean_values = chart.groupby('label')['value'].mean()

# 범위를 벗어난 값들을 NaN으로 대체
def replace_out_of_range_with_nan(row):
    label = row['label']
    value = row['value']
    
    if label in ranges:
        lower, upper = ranges[label]
        
        # 값이 범위를 벗어나면 NaN으로 대체
        if value < lower or value > upper:
            return np.nan
    return value

chart['value'] = chart.apply(replace_out_of_range_with_nan, axis=1)
chart['value'] = chart.groupby('label')['value'].transform(lambda x: x.fillna(x.mean()))
chart.head()

### physical Exam: GCS 

In [None]:
query = '''
SELECT 
    exam.patientUnitStayID AS stay_id,
    exam.physicalexampath AS label,
    exam.physicalExamOffset / 60 AS time,
    exam.physicalExamText AS value
FROM
    eicu.physicalExam exam
WHERE
    exam.physicalexampath LIKE '%Eyes Score%' OR
    exam.physicalexampath LIKE '%Verbal Score%' OR
    exam.physicalexampath LIKE '%Motor Score%';
'''
exam = pd.read_sql_query(query, conn)
exam = exam[exam['stay_id'].isin(patients['stay_id'])]

In [None]:
exam['label'] = exam['label'].replace(r'notes/Progress Notes/Physical Exam/Physical Exam/Neurologic/GCS/', '', regex=True)
exam['label'] = exam['label'].str.replace(r'[^a-zA-Z\s]', '', regex=True)
exam['value'] = pd.to_numeric(exam['value'], errors='coerce')

In [None]:
# vital sign 이상치 조정 
    
ranges = {           
    'Eyes Score':(1, 4),
    'Verbal Score':(1, 5),
    'Motor Score':(1, 6)}

# 각 label에 대한 평균값 계산
mean_values = exam.groupby('label')['value'].mean()

# 범위를 벗어난 값들을 NaN으로 대체
def replace_out_of_range_with_nan(row):
    label = row['label']
    value = row['value']
    
    if label in ranges:
        lower, upper = ranges[label]
        
        # 값이 범위를 벗어나면 NaN으로 대체
        if value < lower or value > upper:
            return np.nan
    return value

exam['value'] = exam.apply(replace_out_of_range_with_nan, axis=1)
exam['value'] = exam.groupby('label')['value'].transform(lambda x: x.fillna(x.mean()))
exam.head()

In [None]:
exam.to_parquet('eICU_exam.parquet')

## concat, pivot and merge 

In [None]:
df = pd.concat([chart_1, chart_2, lab, exam, vitalap])

df.drop(columns='valueoum', axis=1, inplace=True)
df.to_parquet('eICU_df.parquet')

In [2]:
'''chart_1 = pd.read_parquet('eICU_chart_1.parquet')
chart_2 = pd.read_parquet('eICU_chart_2.parquet')
lab = pd.read_parquet('eICU_lab.parquet')
exam = pd.read_parquet('eICU_exam.parquet')
vitalap = pd.read_parquet('eICU_vitalap.parquet')'''
patients = pd.read_parquet('eICU_patients.parquet')
df = pd.read_parquet('eICU_df.parquet')

In [8]:
set(df['stay_id'].unique()) - set(patients['stay_id'].unique())

set()

In [None]:
df = df[df['time'] >= 0]

In [3]:
df_chart = df.merge(patients, on='stay_id', how='inner') 
df = df_chart.pivot_table(index=['stay_id', 'time'], columns='label', values='value', aggfunc='first')
df.reset_index(inplace=True)

MemoryError: Unable to allocate 921. MiB for an array with shape (120725664,) and data type int64

In [8]:
df['dead in hosp'].value_counts(normalize=True)

dead in hosp
0    0.755387
1    0.244613
Name: proportion, dtype: float64

## cleansing and fillna
- 결측치 처리 
- 컬럼명/순서 수정
- 날짜 컬럼 삭제

In [2]:
df = pd.read_parquet('eICU_df.parquet')

In [3]:
# 각 컬럼의 결측치 비율 계산
    
def missing_value_ratio(df):
    missing_ratio = df.isna().mean() * 100
    
    # 결측치 비율을 오름차순으로 정렬하여 출력)
    print(missing_ratio.sort_values(ascending=True))

missing_value_ratio(df)

stay_id                0.000000
label                  0.000000
time                   0.000000
age                    0.000000
gender                 0.000000
intime                 0.000000
CU type                0.000000
race                   0.000000
dead in hosp           0.000000
death or dischtime     0.000000
value                  0.132233
valueoum              94.815052
labtype               94.815052
dtype: float64


In [5]:
# stay_id별로 결측치 처리 

def nan_fill(df):
    # 1. forward fill
    # 일부 컬럼은 제외 
    columns_to_exclude = ['stay_id', 'gender', 'age', 'race', 'CU type', 'intime', 'death or dischtime', 'dead in hosp', 'time']
    selected_columns = df.columns.difference(columns_to_exclude)
    df_ffill = df.copy()
    df_ffill[selected_columns] = df.groupby('stay_id')[selected_columns].apply(lambda group: group.ffill()).reset_index(drop=True)
    
    # 2. median fill
    overall_median_values = df_ffill[selected_columns].median()
    df_ffill[selected_columns] = df_ffill[selected_columns].fillna(overall_median_values)

    return df_ffill

df_fillna = nan_fill(df)

TypeError: Cannot convert [['Heart Rate' 'Heart Rate' 'Heart Rate' ... 'dbp' 'dbp' 'dbp']] to numeric

In [None]:
df_fillna['dead in hosp'].value_counts(normalize=True)

- 퇴실 또는 사망 직전 1시간 이내의 측정값이 있는지를 확인하기 위해, 'last time' 컬럼을 생성하였음.


In [None]:
df_fillna['intime'] = pd.to_datetime(df_fillna['intime'])
df_fillna['charttime'] = df_fillna['intime'] + pd.to_timedelta(df_fillna['time'], unit='h')

# 'charttime'에서 시간만 추출하여 새로운 컬럼에 저장
df_fillna['charttime_time'] = df_fillna['charttime'].dt.time

def time_str_to_timedelta(time_str):
    if isinstance(time_str, str):
        time_obj = datetime.strptime(time_str, '%H:%M:%S').time()
    elif isinstance(time_str, time):
        time_obj = time_str
    else:
        return pd.NaT  # time_str이 문자열이나 time 객체가 아니면 NaT 반환
    
    return timedelta(hours=time_obj.hour, minutes=time_obj.minute, seconds=time_obj.second)

# 'death or dischtime'과 'charttime_time' 컬럼을 timedelta로 변환
df_fillna['death_or_dischtime_time'] = df_fillna['death or dischtime'].apply(time_str_to_timedelta)
df_fillna['charttime_time'] = df_fillna['charttime_time'].apply(time_str_to_timedelta)
df_fillna['time_difference'] = df_fillna['death_or_dischtime_time'] - df_fillna['charttime_time']

# 'hours' 열을 추출
df_fillna['last time'] = df_fillna['time_difference'].apply(lambda x: x.total_seconds() / 3600 if pd.notnull(x) else pd.NaT)
df_fillna.drop(columns=['death_or_dischtime_time', 'charttime_time', 'time_difference'], axis=1, inplace=True)

In [None]:
df_fillna = pd.read_parquet('eICU_df_fillna.parquet')

In [None]:
# neg/pos 비율 확인 

df_fillna['dead in hosp'].value_counts(normalize=True)

In [None]:
# stay_id별로 가장 최근 행만 먼저 추출
df_fillna_max_time = df_fillna.loc[df_fillna.groupby('stay_id')['time'].idxmax()]

# 해당 행 중에서 hours 컬럼의 값이 0 이상 1 미만인 경우만 필터링
filtered_df_fillna = df_fillna_max_time[(df_fillna_max_time['hours'] >= 0) & (df_fillna_max_time['hours'] < 1)]
filtered_df_fillna_list = filtered_df_fillna['stay_id'].unique()
df_fillna_filtered = df_fillna[df_fillna['stay_id'].isin(filtered_df_fillna_list)]
df_fillna_filtered.drop(columns=['hours', 'time'], inplace=True)

In [None]:
# neg/pos 비율 확인 

df_fillna_filtered['dead in hosp'].value_counts(normalize=True)

In [None]:
def resample_group(group):
    # 'charttime'을 datetime 인덱스로 설정
    group = group.set_index('charttime')
    
    # float형 데이터만 선택하여 평균 계산
    numeric_cols = group.select_dtypes(include=['float']).columns
    group_resampled = group[numeric_cols].resample('h').mean()
    
    # int, object형 데이터는 해당 stay_id의 전후값으로 계산
    non_numeric_cols = group.select_dtypes(exclude=['float']).columns
    group_non_numeric_resampled = group[non_numeric_cols].resample('h').ffill().bfill()
    
    # 리샘플링된 numeric 데이터와 non-numeric 데이터를 병합
    group_resampled = group_resampled.join(group_non_numeric_resampled)
    
    # stay_id 다시 설정
    group_resampled['stay_id'] = group['stay_id'].iloc[0]
    
    return group_resampled.reset_index()

def resampling(df):
    resampled = df.groupby('stay_id').apply(resample_group).reset_index(drop=True)
    resampled.sort_values(by=['stay_id', 'charttime'], ascending=True, inplace=True)
    return resampled

# 리샘플링 함수 호출
df_filtered_resample = resampling(df_fillna_filtered)
df_filtered_resample.head()

In [None]:
# stay_id별 데이터 개수 계산
group_counts = df_filtered_resample.groupby('stay_id').size()
valid_stay_ids = group_counts[group_counts >= 24].index

# 원래 데이터프레임에서 24개 이상의 데이터를 가지는 stay_id 그룹만 선택
df_fillna_24 = df_filtered_resample[df_filtered_resample['stay_id'].isin(valid_stay_ids)]
df_fillna_24.sort_values(by=['stay_id', 'charttime'], inplace=True)

In [None]:
# 모든 stay_id group이 1시간 간격의 데이터를 가지고 있는지 확인

def check_hourly_intervals(df):
    groups_with_problems = []

    for stay_id, group in df.groupby('stay_id'):
        group = group.sort_values('charttime')
        time_diffs = group['charttime'].diff().dropna()
        
        if not all(time_diffs == pd.Timedelta(hours=1)):
            groups_with_problems.append(stay_id)
    
    return groups_with_problems

len(check_hourly_intervals(df_fillna_24)) == 0

In [None]:
# neg/pos 비율 확인 

df_fillna_24['dead in hosp'].value_counts(normalize=True)

In [None]:
# pos와 neg 분리 

df_pos = df_fillna_24[df_fillna_24['dead in hosp'] == 1]
df_neg = df_fillna_24[df_fillna_24['dead in hosp'] == 0]

In [None]:
# positive의 경우 각 stay_id별로 최신 시간을 기준으로 이전 24개의 행만 남김

def keep_last_24_hours(df):
    # 최신 charttime을 기준으로 정렬
    df_sorted = df.sort_values(by='charttime', ascending=True)
    df_kept = df_sorted.groupby('stay_id').tail(24)
    df_kept.sort_values(by=['stay_id', 'charttime'], ascending=True)
    
    return df_kept

df_pos_24 = keep_last_24_hours(df_pos)

In [None]:
# negative의 경우 각 stay_id에서 일부 시점을 무작위로 선택해 해당 시점 전후의 값만 사용

def keep_random_24_hours(df):
    np.random.seed(42)  # 시드 고정
    
    def select_random_window(group):
        group_sorted = group.sort_values(by='charttime', ascending=False)
        random_index = np.random.randint(0, len(group_sorted))

        start_index = max(0, random_index - 11)
        end_index = min(len(group_sorted), random_index + 12 + 1)
        selected_window = group_sorted.iloc[start_index:end_index]

        if len(selected_window) < 24:
            if start_index == 0:
                additional_rows = group_sorted.iloc[end_index:end_index + (24 - len(selected_window))]
            else:
                additional_rows = group_sorted.iloc[max(0, start_index - (24 - len(selected_window))):start_index]
            selected_window = pd.concat([selected_window, additional_rows]).drop_duplicates().head(24)

        return selected_window
    
    # 각 stay_id 그룹에 대해 select_random_window 함수를 적용
    df_kept = df.groupby('stay_id').apply(select_random_window).reset_index(drop=True)
    df_kept = df_kept.sort_values(by=['stay_id','charttime'], ascending=True)
    
    return df_kept

df_neg_24 = keep_random_24_hours(df_neg)

In [None]:
# neg 비율을 전체의 10%로 언더샘플링 > neg:pos 비율을 4:1로 맞춤

neg_undersampling = df_neg_24['stay_id'].drop_duplicates().sample(frac=0.1, random_state=42)
df_neg_24_under = df_neg_24[df_neg_24['stay_id'].isin(neg_undersampling)]

df_24 = pd.concat([df_pos_24, df_neg_24_under])
df_24.sort_values(by=['stay_id', 'charttime'], ascending=True, inplace=True)
df_24['dead in hosp'].value_counts(normalize=True)

df_24_fillna = nan_fill(df_24)
df_24_fillna['dead in hosp'].value_counts(normalize=True)

In [None]:
# datetime 변수 핸들링
    
def datetime_remove(df):
    df_no_dates = df.drop(columns=['intime', 'death or dischtime'])
    df_no_dates = df_no_dates.sort_values(by=['stay_id', 'charttime'], ascending=True)
    return df_no_dates

df_24 = datetime_remove(df_24_fillna)

### 원핫인코딩

In [None]:
df_24 = df_24[df_24['gender'] != '']

In [None]:
# 범주형 열 목록
categorical_cols = ['race', 'gender', 'CU type']

# 범주형 변수 원핫인코딩
df_encoded = pd.get_dummies(df_24, columns=categorical_cols)
df_encoded['dead in hosp'].value_counts(normalize=True)

### 데이터 개형 수정
- 더미 데이터 추가
- 컬럼 순서 재정렬

In [None]:
df_encoded['CU type_CVICU'] = False
df_encoded['PT'] = 4.0
df_encoded = df_encoded[df_encoded['CU type_Neuro ICU'] == False]

In [None]:
df_encoded = df_encoded[['charttime', 'stay_id', 'ALT', 'AST', 'albumin', 'BUN', 'CRP',
       'chloride', 'creatinine', 'Eyes Score', 'Motor Score',
       'Verbal Score', 'glucose', 'Heart Rate', 'Hgb',
       'platelets', 'potassium', 'PT', 'Respiratory Rate',
       'sodium', 'SpO2', 'Temperature', 'WBC', 'dbp', 'sbp', 'age', 
       'race_asian', 'race_black', 'race_hispanic/latino',
       'race_other', 'race_white', 'gender_Female', 'gender_Male', 'CU type_CCU',
       'CU type_CVICU', 'CU type_MICU', 'CU type_SICU',
       'dead in hosp']]

#### X, y split, 정규화

In [None]:
# vital sign 이상치 조정 

# 기준 설정
ranges = {
    'Respiratory Rate': (5, 50),
    'Heart Rate': (10, 190),
    'sbp': (40, 230),
    'dbp': (20, 130),
    'Temperature': (32, 41), 
    'SpO2': (68, 100),    
    'GCS Eye Opening':(1, 4),
    'GCS Verbal Response':(1, 5),
    'GCS Motor Response':(1, 6),
        
    'ALT':(0, 5000), 
    'AST':(0, 10000), 
    'Albumin':(0, 5.5), 
    'BUN':(0, 150), 
    'Bilirubin':(0, 50),
    'CRP':(0, 300), 
    'Chloride':(70, 135),
    'Creatinine':(0, 15),
    'Glucose':(0, 600),
    'Hemoglobin':(0, 25),
    'Potassium':(2, 9),
    'Sodium':(105, 170),
    'Platelet':(0, 1000),
    'Prothrombin time':(0, 8), 
    'WBC':(0, 90), 
}
# 각 열에 대해 이상치를 NaN으로 대체
for col, (lower, upper) in ranges.items():
    if col in df_encoded.columns:
        df_encoded[col] = np.where((df_encoded[col] < lower) | (df_encoded[col] > upper), np.nan, df_encoded[col])

# NaN 값을 각 열에 대한 평균값으로 대체
for col in ranges.keys():
    if col in df_encoded.columns:
        df_encoded[col] = df_encoded[col].fillna(df_encoded[col].mean())

df_encoded.head()

In [None]:
y_val = df_encoded['dead in hosp']
X_val = df_encoded.drop(columns=['dead in hosp'], inplace=False)

In [None]:
X_val.shape, y_val.shape

In [None]:
def scaling(df):
    # 'charttime'을 인덱스로 설정
    df.set_index('charttime', inplace=True)
    df.drop(columns=['stay_id'], inplace=True)
    
    # 정규화
    scaler = StandardScaler() # MinMaxScaler / StandardScaler
    df_scaled = scaler.fit_transform(df)
    
    return df_scaled

X_val_scaled = scaling(X_val)

In [None]:
def class_ratio(df):
    # 클래스별 개수 계산
    class_counts = df.value_counts()

    # 전체 데이터 개수
    total_count = len(df)

    # 클래스별 비율 계산
    class_ratios = class_counts / total_count

    print("클래스 0의 비율:", class_ratios[0])
    print("클래스 1의 비율:", class_ratios[1])

# 비율 유지되는지 확인 

class_ratio(df_encoded['dead in hosp'])
class_ratio(y_val)

In [None]:
num_features = X_val_scaled.shape[1]
num_features

In [None]:
df_encoded.to_parquet('eICU_valid_dataset.parquet')