# ICU Mortality Model - Feature Engineering

This notebook loads the ICU cohort and creates an hourly wide dataset for the first 24 hours of each ICU stay.

## Objective
- Load ICU cohort from 01_cohort.ipynb
- Use pyCLIF to extract features from CLIF tables
- Create hourly wide dataset for the first 24 hours
- Filter to encounters with complete 24-hour data
- Save features for modeling

## Feature Sources
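- **Vitals**: heart rate, MAP, respiratory rate, SpO2, temperature, weight, height
- **Labs**: the lab_category values listed in `category_filters` below
- **Patient Assessments**: GCS_total, RASS
- **Respiratory Support**: mode category, device category, FiO2 (the two categories are one-hot encoded); PEEP and other ventilator settings are not included in the current filter
- **Medications**: continuous vasoactives and sedatives listed in `category_filters` below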

## Setup and Configuration

In [None]:
import sys
import os
sys.path.append(os.path.join('..', 'src'))

import pandas as pd
import numpy as np
from pyclif import CLIF
from pyclif.utils.wide_dataset import convert_wide_to_hourly
import json
import warnings
# Silence every warning category for the rest of the session so the notebook
# output stays readable.
# NOTE(review): this also hides pandas deprecation/dtype warnings — consider
# narrowing the filter while debugging.
warnings.filterwarnings('ignore')

# Banner printed once so the cell output shows the notebook has started.
print("=== ICU Mortality Model - Feature Engineering ===")
print("Setting up environment...")

In [None]:
def load_config(config_path="config_demo.json"):
    """Load the notebook configuration from a JSON file.

    Args:
        config_path: Path to the JSON configuration file. Defaults to
            ``config_demo.json`` in the current working directory, which is
            the file this notebook reads when called without arguments.

    Returns:
        The parsed JSON content of the file.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
    """
    # Open directly instead of checking for existence first: one file-system
    # access, and no gap between the check and the read.
    try:
        with open(config_path, 'r', encoding='utf-8') as file:
            config = json.load(file)
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"Configuration file not found at {config_path}. "
            "Please create it based on the config_template."
        ) from err

    # Report the file that was actually read, not a hard-coded name.
    print(f"✅ Loaded configuration from {config_path}")
    return config

# Load configuration once; later cells read the module-level `config` directly
config = load_config()
# Echo the settings used below so the run is traceable from the cell output.
# Each lookup raises KeyError if the corresponding key is absent from the file.
print(f"Site: {config['site']}")
print(f"Data path: {config['clif2_path']}")
print(f"File type: {config['filetype']}")

In [None]:
# Build the pyCLIF client from the loaded configuration.
# The timezone is fixed to US/Eastern here rather than read from the config.
clif_settings = {
    'data_dir': config['clif2_path'],
    'filetype': config['filetype'],
    'timezone': "US/Eastern",
}
clif = CLIF(**clif_settings)

print("✅ pyCLIF initialized successfully")

## Load ICU Cohort

In [None]:
# Load the ICU cohort written by 01_cohort.ipynb
cohort_path = os.path.join('output', 'intermitted', 'icu_cohort.csv')

# Fail early with a pointer to the upstream notebook when the file is absent.
if not os.path.exists(cohort_path):
    raise FileNotFoundError(f"Cohort file not found at {cohort_path}. Please run 01_cohort.ipynb first.")

# Read the identifier as text. Letting pandas infer the type would turn IDs
# into integers (dropping leading zeros) or floats (when any value is
# missing), and the later astype(str) could then no longer match the IDs
# held in the CLIF tables.
cohort_df = pd.read_csv(cohort_path, dtype={'hospitalization_id': str})

# CSV stores timestamps as text; convert the window columns back to datetimes
datetime_cols = ['start_dttm', 'hour_24_start_dttm', 'hour_24_end_dttm']
for col in datetime_cols:
    cohort_df[col] = pd.to_datetime(cohort_df[col])

print(f"✅ Loaded ICU cohort: {len(cohort_df)} hospitalizations")
# NOTE(review): the mean is only a mortality rate if `disposition` is a 0/1
# death indicator — confirm against 01_cohort.ipynb.
print(f"Mortality rate: {cohort_df['disposition'].mean():.3f}")
print(f"Time range: {cohort_df['start_dttm'].min()} to {cohort_df['start_dttm'].max()}")

# Display sample
print("\nSample cohort records:")
print(cohort_df.head())

## Feature Extraction Configuration

In [None]:
# Feature extraction setup: collect the cohort's hospitalization IDs
print("Configuring feature extraction...")

# Cast to text first, then de-duplicate; unique() keeps first-seen order
id_strings = cohort_df['hospitalization_id'].astype(str)
cohort_ids = id_strings.unique().tolist()
print(f"Extracting features for {len(cohort_ids)} hospitalizations")

# Categories requested from each CLIF table, keyed by table name.
# The order inside each list is kept exactly as originally written.
category_filters = {
    # Common vital signs
    'vitals': [
        'heart_rate', 'map', 'respiratory_rate', 'spo2', 'temp_c',
        'weight_kg', 'height_cm',
    ],
    # Common lab values
    'labs': [
        'albumin', 'alkaline_phosphatase', 'alt', 'ast',
        'basophils_percent', 'basophils_absolute', 'bicarbonate',
        'bilirubin_total', 'bilirubin_conjugated', 'bilirubin_unconjugated',
        'bun',
        'calcium_total', 'calcium_ionized', 'chloride', 'creatinine', 'crp',
        'eosinophils_percent', 'eosinophils_absolute', 'esr', 'ferritin',
        'glucose_fingerstick', 'glucose_serum', 'hemoglobin', 'phosphate',
        'inr', 'lactate', 'ldh',
        'lymphocytes_percent', 'lymphocytes_absolute', 'magnesium',
        'monocytes_percent', 'monocytes_absolute',
        'neutrophils_percent', 'neutrophils_absolute',
        'pco2_arterial', 'po2_arterial', 'pco2_venous',
        'ph_arterial', 'ph_venous',
        'platelet_count', 'potassium', 'procalcitonin',
        'pt', 'ptt',
        'so2_arterial', 'so2_mixed_venous', 'so2_central_venous',
        'sodium',
        'total_protein', 'troponin_i', 'troponin_t', 'wbc',
    ],
    # Neurological assessments
    'patient_assessments': ['gcs_total', 'rass'],
    # Vasoactives and sedatives
    'medication_admin_continuous': [
        'norepinephrine', 'epinephrine', 'phenylephrine', 'angiotensin',
        'vasopressin', 'dopamine', 'dobutamine', 'milrinone',
        'isoproterenol',
        'propofol', 'dexmedetomidine', 'ketamine', 'midazolam', 'fentanyl',
        'hydromorphone', 'morphine', 'remifentanil', 'pentobarbital',
        'lorazepam',
    ],
    # Respiratory support columns
    'respiratory_support': ['mode_category', 'device_category', 'fio2'],
}

# Summarise the configuration: one count line per table, followed by either
# the full list or, for longer lists, a preview of the first five entries.
print("Feature extraction configuration:")
for table, categories in category_filters.items():
    print(f"  {table}: {len(categories)} categories")
    if len(categories) > 5:
        print(f"    {categories[:5]}...")
    else:
        print(f"    {categories}")

## Create Wide Dataset Using pyCLIF

In [None]:
# Build the event-level wide dataset for the cohort with pyCLIF
print("Creating wide dataset using pyCLIF...")

# Tables joined into the wide dataset (same names as the category_filters keys)
wide_tables = [
    'vitals',
    'labs',
    'patient_assessments',
    'medication_admin_continuous',
    'respiratory_support',
]
wide_df = clif.create_wide_dataset(
    hospitalization_ids=cohort_ids,
    optional_tables=wide_tables,
    category_filters=category_filters,
    save_to_data_location=False,  # Keep in memory for processing
)


In [None]:
# Dump the event-level wide table to the working directory as CSV, without
# the index column.
csv_dump_path = "wide_df.csv"
wide_df.to_csv(csv_dump_path, index=False)

## Filter to 24-Hour Window

In [None]:
# Restrict the wide dataset to each hospitalization's 24-hour window
print("Filtering to 24-hour windows...")

# The IDs were handed to pyCLIF as strings, so cast the cohort key the same
# way before merging (assumes wide_df keeps them as strings — TODO confirm).
cohort_df['hospitalization_id'] = cohort_df['hospitalization_id'].astype(str)

# Attach each hospitalization's window bounds and outcome to its events;
# the inner join drops events whose hospitalization is not in the cohort.
window_cols = ['hospitalization_id', 'hour_24_start_dttm', 'hour_24_end_dttm', 'disposition']
wide_df_filtered = pd.merge(
    wide_df,
    cohort_df[window_cols],
    on='hospitalization_id',
    how='inner'
)

print(f"After merge with cohort: {len(wide_df_filtered)} records")

# Keep events inside the window; both bounds are inclusive.
# NOTE(review): event_time and the window columns must be both tz-aware or
# both tz-naive for this comparison to work — confirm against the cohort CSV.
in_window = (
    (wide_df_filtered['event_time'] >= wide_df_filtered['hour_24_start_dttm'])
    & (wide_df_filtered['event_time'] <= wide_df_filtered['hour_24_end_dttm'])
)
wide_df_filtered = wide_df_filtered[in_window].reset_index(drop=True)

n_hospitalizations = wide_df_filtered['hospitalization_id'].nunique()
print(f"✅ Filtered to 24-hour windows: {len(wide_df_filtered)} records")
print(f"Hospitalizations with data: {n_hospitalizations}")

# Sanity check: re-evaluate the window condition on the filtered frame
print("\nTime window validation:")
still_in_window = (
    (wide_df_filtered['event_time'] >= wide_df_filtered['hour_24_start_dttm'])
    & (wide_df_filtered['event_time'] <= wide_df_filtered['hour_24_end_dttm'])
)
print(f"All events within window: {still_in_window.all()}")

# An empty result has zero hospitalizations; report 0.0 instead of raising
# ZeroDivisionError.
avg_records = len(wide_df_filtered) / n_hospitalizations if n_hospitalizations else 0.0
print(f"Average records per hospitalization: {avg_records:.1f}")

In [None]:
# Show the column names available after the 24-hour filter
wide_df_filtered.columns.tolist()

In [None]:
# Columns summarised per hour. Every statistic below is applied to the same
# set, so it is defined once instead of being repeated per statistic.
# NOTE(review): many columns requested in category_filters (e.g. heart_rate)
# are not listed here — confirm how convert_wide_to_hourly treats columns
# that appear in no aggregation list.
hourly_feature_columns = [
    'eosinophils_absolute',
    'glucose_fingerstick',
    'lymphocytes_absolute',
    'monocytes_absolute',
    'neutrophils_absolute',
    'procalcitonin',
    'troponin_i',
    'wbc',
    'gcs_total',
    'rass',
    'angiotensin',
    'isoproterenol',
    'ketamine',
    'remifentanil',
    'pentobarbital',
    'lorazepam',
    'fio2',
]

# Each statistic gets its own copy of the list so the entries stay
# independent, as they were when written out literally.
aggregation_config = {
    **{
        stat: list(hourly_feature_columns)
        for stat in ('max', 'min', 'mean', 'median', 'boolean')
    },
    'one_hot_encode': ['mode_category', 'device_category'],
}

# Aggregate only the events inside each hospitalization's 24-hour window.
# This call previously received the unfiltered `wide_df`, so the 24-hour
# filter had no effect on the hourly dataset. The columns added by the
# cohort merge are dropped again so the input has the same columns as
# `wide_df`.
hourly_input = wide_df_filtered.drop(
    columns=['hour_24_start_dttm', 'hour_24_end_dttm', 'disposition'],
    errors='ignore',
)
hourly_df = convert_wide_to_hourly(hourly_input, aggregation_config)



In [None]:
# Display the hourly dataset as the cell output
hourly_df

In [None]:
# Show the column names produced by the hourly aggregation
hourly_df.columns.tolist()

In [None]:
# Make sure the intermediate output folder is present before anything is
# written to it; an already existing folder is left as is.
output_dir = os.path.join('output', 'intermitted')
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
# Persist the event-level wide dataset (the full `wide_df`, not the
# 24-hour subset) without the index.
by_event_path = os.path.join(output_dir, 'by_event_wide_df.parquet')
wide_df.to_parquet(by_event_path, index=False)

In [None]:
# Persist the hourly dataset without the index
by_hourly_path = os.path.join(output_dir, 'by_hourly_wide_df.parquet')
hourly_df.to_parquet(by_hourly_path, index=False)