# ICU Mortality Model - Feature Engineering

This notebook loads the ICU cohort and creates an hourly wide dataset covering the first 24 hours of each ICU stay.

## Objective
- Load ICU cohort from 01_cohort.ipynb
- Use pyCLIF to extract features from CLIF tables
- Create hourly wide dataset for the first 24 hours
- Filter to encounters with complete 24-hour data
- Save features for modeling

## Feature Sources
- **Vitals**: heart_rate, map, respiratory_rate, spo2, temp_c, weight_kg, height_cm
- **Labs**: 52 common lab_category values (listed in `category_filters` below)
- **Patient Assessments**: GCS_total, RASS
- **Respiratory Support**: mode_category and device_category (one-hot encoded), FiO2
- **Medications**: Continuous vasoactives and sedatives (19 medication categories)

## Setup and Configuration

In [None]:
import sys
import os
sys.path.append(os.path.join('..', 'src'))

import pandas as pd
import numpy as np
from pyclif import CLIF
from pyclif.utils.wide_dataset import convert_wide_to_hourly
import json
import warnings
# NOTE(review): this silences every warning for the whole kernel session,
# including pandas deprecation/dtype warnings — confirm that is intended.
warnings.filterwarnings('ignore')

# Intermediate artefacts shared between notebooks are written here; create the
# folder up front so later `to_parquet` calls do not fail on a missing path.
output_dir = os.path.join('..', 'output', 'intermitted')
os.makedirs(output_dir, exist_ok=True)

print(
    "=== ICU Mortality Model - Feature Engineering ===",
    "Setting up environment...",
    sep="\n",
)

In [13]:
def load_config(config_path="config_demo.json"):
    """Load the notebook configuration from a JSON file.

    Args:
        config_path: Path to the JSON configuration file. Defaults to
            ``"config_demo.json"``, resolved against the current working
            directory (the same file the notebook has always read).

    Returns:
        The parsed JSON content (a dict for the configs used in this notebook).

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
    """
    # Fail fast with a message that names the file actually being looked for.
    if not os.path.exists(config_path):
        raise FileNotFoundError(
            f"Configuration file not found at {config_path}. "
            "Please create it based on the config_template."
        )

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(config_path, 'r', encoding='utf-8') as file:
        config = json.load(file)
    print(f"✅ Loaded configuration from {config_path}")

    return config

# Read the configuration once and echo the settings the rest of the notebook uses.
config = load_config()
for label, key in (("Site", 'site'), ("Data path", 'clif2_path'), ("File type", 'filetype')):
    print(f"{label}: {config[key]}")

✅ Loaded configuration from config.json
Site: MIMIC
Data path: /Users/sudo_sage/Documents/work/mimic_demo
File type: parquet


In [14]:
# Build the pyCLIF client from the loaded configuration.
# NOTE(review): the timezone is hard-coded rather than read from `config` —
# confirm this is correct for sites outside US/Eastern.
clif_kwargs = {
    "data_dir": config['clif2_path'],
    "filetype": config['filetype'],
    "timezone": "US/Eastern",
}
clif = CLIF(**clif_kwargs)

print("✅ pyCLIF initialized successfully")

CLIF Object Initialized.
✅ pyCLIF initialized successfully


## Load ICU Cohort

In [None]:
# Read the ICU cohort written by 01_cohort.ipynb
cohort_path = os.path.join('..', 'output', 'intermitted', 'icu_cohort.csv')

# Stop immediately when the upstream notebook has not produced the cohort yet.
if not os.path.exists(cohort_path):
    raise FileNotFoundError(f"Cohort file not found at {cohort_path}. Please run 01_cohort.ipynb first.")

cohort_df = pd.read_csv(cohort_path)

# CSV round-trips timestamps as text, so parse the window columns back to datetimes.
datetime_cols = ['start_dttm', 'hour_24_start_dttm', 'hour_24_end_dttm']
cohort_df = cohort_df.assign(
    **{name: pd.to_datetime(cohort_df[name]) for name in datetime_cols}
)

# Quick summary; the mean of `disposition` is reported as the mortality rate.
print(f"✅ Loaded ICU cohort: {len(cohort_df)} hospitalizations")
print(f"Mortality rate: {cohort_df['disposition'].mean():.3f}")
print(f"Time range: {cohort_df['start_dttm'].min()} to {cohort_df['start_dttm'].max()}")

# Display sample
print("\nSample cohort records:")
print(cohort_df.head())

## Feature Extraction Configuration

In [16]:
# Decide which hospitalizations and which categories per CLIF table are pulled
# into the wide dataset.
print("Configuring feature extraction...")

# Cohort IDs are cast to strings and de-duplicated before being handed to pyCLIF.
cohort_ids = (
    cohort_df['hospitalization_id']
    .astype(str)
    .unique()
    .tolist()
)
print(f"Extracting features for {len(cohort_ids)} hospitalizations")

# Table name -> categories/columns to extract from that table.
category_filters = {
    # Common vital signs
    'vitals': [
        'heart_rate',
        'map',
        'respiratory_rate',
        'spo2',
        'temp_c',
        'weight_kg',
        'height_cm',
    ],
    # Common lab values
    'labs': [
        "albumin",
        "alkaline_phosphatase",
        "alt",
        "ast",
        "basophils_percent",
        "basophils_absolute",
        "bicarbonate",
        "bilirubin_total",
        "bilirubin_conjugated",
        "bilirubin_unconjugated",
        "bun",
        "calcium_total",
        "calcium_ionized",
        "chloride",
        "creatinine",
        "crp",
        "eosinophils_percent",
        "eosinophils_absolute",
        "esr",
        "ferritin",
        "glucose_fingerstick",
        "glucose_serum",
        "hemoglobin",
        "phosphate",
        "inr",
        "lactate",
        "ldh",
        "lymphocytes_percent",
        "lymphocytes_absolute",
        "magnesium",
        "monocytes_percent",
        "monocytes_absolute",
        "neutrophils_percent",
        "neutrophils_absolute",
        "pco2_arterial",
        "po2_arterial",
        "pco2_venous",
        "ph_arterial",
        "ph_venous",
        "platelet_count",
        "potassium",
        "procalcitonin",
        "pt",
        "ptt",
        "so2_arterial",
        "so2_mixed_venous",
        "so2_central_venous",
        "sodium",
        "total_protein",
        "troponin_i",
        "troponin_t",
        "wbc",
    ],
    # Neurological assessments
    'patient_assessments': [
        'gcs_total',
        'rass',
    ],
    # Continuous vasoactives and sedatives
    'medication_admin_continuous': [
        "norepinephrine",
        "epinephrine",
        "phenylephrine",
        "angiotensin",
        "vasopressin",
        "dopamine",
        "dobutamine",
        "milrinone",
        "isoproterenol",
        "propofol",
        "dexmedetomidine",
        "ketamine",
        "midazolam",
        "fentanyl",
        "hydromorphone",
        "morphine",
        "remifentanil",
        "pentobarbital",
        "lorazepam",
    ],
    # Respiratory support fields used as features (only these three are requested)
    'respiratory_support': [
        'mode_category',
        'device_category',
        'fio2',
    ],
}

# Summarise the selection; long lists are truncated to their first five entries.
print("Feature extraction configuration:")
for table, categories in category_filters.items():
    print(f"  {table}: {len(categories)} categories")
    if len(categories) > 5:
        print(f"    {categories[:5]}...")
    else:
        print(f"    {categories}")

Configuring feature extraction...
Extracting features for 89 hospitalizations
Feature extraction configuration:
  vitals: 7 categories
    ['heart_rate', 'map', 'respiratory_rate', 'spo2', 'temp_c']...
  labs: 52 categories
    ['albumin', 'alkaline_phosphatase', 'alt', 'ast', 'basophils_percent']...
  patient_assessments: 2 categories
    ['gcs_total', 'rass']
  medication_admin_continuous: 19 categories
    ['norepinephrine', 'epinephrine', 'phenylephrine', 'angiotensin', 'vasopressin']...
  respiratory_support: 3 categories
    ['mode_category', 'device_category', 'fio2']


## Create Wide Dataset Using pyCLIF

In [17]:
# Build the event-level wide table for every cohort hospitalization.
print("Creating wide dataset using pyCLIF...")

# Tables joined onto the base tables, in the order they are requested.
wide_tables = [
    'vitals',
    'labs',
    'patient_assessments',
    'medication_admin_continuous',
    'respiratory_support',
]

wide_df = clif.create_wide_dataset(
    hospitalization_ids=cohort_ids,
    optional_tables=wide_tables,
    category_filters=category_filters,
    # Keep the result in memory for the processing steps that follow.
    save_to_data_location=False,
)


Creating wide dataset using pyCLIF...
Auto-loading required base table: patient
Loading clif_patient.parquet
Data loaded successfully from clif_patient.parquet
Validation completed with 2 error(s). See `errors` attribute.
Auto-loading required base table: hospitalization
Loading clif_hospitalization.parquet
Data loaded successfully from clif_hospitalization.parquet
Validation completed successfully.
Auto-loading required base table: adt
Loading clif_adt.parquet
Data loaded successfully from clif_adt.parquet
Validation completed with 4 error(s). See `errors` attribute.
Auto-loading optional table: vitals
Loading clif_vitals.parquet
Data loaded successfully from clif_vitals.parquet
Validation completed with 5 error(s).
  - 5 range validation error(s)
See `errors` and `range_validation_errors` attributes for details.
Auto-loading optional table: labs
Loading clif_labs.parquet
Data loaded successfully from clif_labs.parquet
Validation completed with 24 error(s).
  - 8 schema validation err

In [18]:
# Columns that receive every per-hour statistic listed below.
# NOTE(review): per the function's own log output, any column NOT listed here
# (e.g. heart_rate, map, spo2 and most labs) falls back to 'first' with a '_c'
# suffix — confirm that is the intended treatment for those features.
summary_columns = [
    "eosinophils_absolute",
    "glucose_fingerstick",
    "lymphocytes_absolute",
    "monocytes_absolute",
    "neutrophils_absolute",
    "procalcitonin",
    "troponin_i",
    "wbc",
    "gcs_total",
    "rass",
    "angiotensin",
    "isoproterenol",
    "ketamine",
    "remifentanil",
    "pentobarbital",
    "lorazepam",
    "fio2",
]

# Each aggregation method gets its own copy of the column list, so the five
# entries stay independent objects exactly as when they were written out by hand.
aggregation_config = {
    method: list(summary_columns)
    for method in ("max", "min", "mean", "median", "boolean")
}
# Categorical respiratory fields are expanded into indicator columns instead.
aggregation_config["one_hot_encode"] = ["mode_category", "device_category"]

# Collapse the event-level wide table into hourly rows.
hourly_df = convert_wide_to_hourly(wide_df, aggregation_config)

Starting hourly aggregation of wide dataset...
Calculating nth_hour starting from 0 based on first event...
Processing 27380 records into hourly buckets...
The following columns are not mentioned in aggregation_config, defaulting to 'first' with '_c' postfix:
  - hospitalization_joined_id
  - admission_dttm
  - discharge_dttm
  - age_at_admission
  - admission_type_name
  - admission_type_category
  - discharge_name
  - discharge_category
  - zipcode_nine_digit
  - zipcode_five_digit
  - census_block_code
  - census_block_group_code
  - census_tract
  - state_code
  - county_code
  - race_name
  - race_category
  - ethnicity_name
  - ethnicity_category
  - sex_name
  - sex_category
  - birth_date
  - death_dttm
  - language_name
  - language_category
  - hospital_id
  - in_dttm
  - out_dttm
  - location_name
  - location_category
  - location_type
  - heart_rate
  - height_cm
  - map
  - respiratory_rate
  - spo2
  - temp_c
  - weight_kg
  - albumin
  - alkaline_phosphatase
  - alt
  -

Aggregating data by hour: 100%|██████████| 10982/10982 [00:43<00:00, 253.26group/s]


Hourly aggregation complete: 10982 hourly records from 27380 original records
Columns in hourly dataset: 222


In [19]:
# Inspect the column names produced by the hourly aggregation.
list(hourly_df.columns)

['hospitalization_id',
 'event_time_hour',
 'nth_hour',
 'hour_bucket',
 'patient_id',
 'day_number',
 'eosinophils_absolute_max',
 'glucose_fingerstick_max',
 'lymphocytes_absolute_max',
 'monocytes_absolute_max',
 'neutrophils_absolute_max',
 'procalcitonin_max',
 'troponin_i_max',
 'wbc_max',
 'gcs_total_max',
 'rass_max',
 'angiotensin_max',
 'isoproterenol_max',
 'ketamine_max',
 'remifentanil_max',
 'pentobarbital_max',
 'lorazepam_max',
 'fio2_max',
 'eosinophils_absolute_min',
 'glucose_fingerstick_min',
 'lymphocytes_absolute_min',
 'monocytes_absolute_min',
 'neutrophils_absolute_min',
 'procalcitonin_min',
 'troponin_i_min',
 'wbc_min',
 'gcs_total_min',
 'rass_min',
 'angiotensin_min',
 'isoproterenol_min',
 'ketamine_min',
 'remifentanil_min',
 'pentobarbital_min',
 'lorazepam_min',
 'fio2_min',
 'eosinophils_absolute_mean',
 'glucose_fingerstick_mean',
 'lymphocytes_absolute_mean',
 'monocytes_absolute_mean',
 'neutrophils_absolute_mean',
 'procalcitonin_mean',
 'troponin

In [20]:
# Restrict the event-level wide table to each stay's 24-hour window.
print("Filtering to 24-hour windows for event wide data...: Shape:", wide_df.shape)

# The merge key must have the same dtype on both sides; the cohort IDs are cast
# to str here (presumably matching the IDs in wide_df — verify). This cast
# persists on cohort_df for later cells.
cohort_df['hospitalization_id'] = cohort_df['hospitalization_id'].astype(str)

# Attach each hospitalization's window bounds and outcome to its events.
window_cols = ['hospitalization_id', 'hour_24_start_dttm', 'hour_24_end_dttm', 'disposition']
wide_df_filtered = wide_df.merge(
    cohort_df[window_cols],
    on='hospitalization_id',
    how='inner',
)

print(f"After merge with cohort: {len(wide_df_filtered)} records")

# Keep events whose timestamp lies inside the window (both bounds inclusive).
after_start = wide_df_filtered['event_time'] >= wide_df_filtered['hour_24_start_dttm']
before_end = wide_df_filtered['event_time'] <= wide_df_filtered['hour_24_end_dttm']
wide_df_filtered = wide_df_filtered[after_start & before_end].reset_index(drop=True)

print(f"✅ Filtered to 24-hour windows: {len(wide_df_filtered)} records")
print(f"Hospitalizations with data: {wide_df_filtered['hospitalization_id'].nunique()}")

# Show time window validation
# NOTE(review): this re-applies the same condition to rows that already passed
# it, so it can only report True; it does not independently validate the filter.
print("\nTime window validation:")
still_in_window = (
    (wide_df_filtered['event_time'] >= wide_df_filtered['hour_24_start_dttm'])
    & (wide_df_filtered['event_time'] <= wide_df_filtered['hour_24_end_dttm'])
)
print(f"All events within window: {still_in_window.all()}")
print(f"Average records per hospitalization: {len(wide_df_filtered) / wide_df_filtered['hospitalization_id'].nunique():.1f}")
print('Shape: after filtering:', wide_df_filtered.shape)

# Persist the event-level table for downstream notebooks.
wide_df_filtered.to_parquet(os.path.join(output_dir, 'by_event_wide_df.parquet'), index=False)

Filtering to 24-hour windows for event wide data...: Shape: (27380, 141)
After merge with cohort: 27380 records
✅ Filtered to 24-hour windows: 6769 records
Hospitalizations with data: 89

Time window validation:
All events within window: True
Average records per hospitalization: 76.1
Shape: after filtering: (6769, 144)


In [21]:
# Restrict the hourly table to each stay's 24-hour window.
print("\nFiltering hourly dataset to 24-hour windows...| Shape:",hourly_df.shape)

# Attach each hospitalization's window bounds and outcome to its hourly rows.
# This relies on cohort_df['hospitalization_id'] already being str (cast in the
# event-level filtering cell that runs before this one).
window_cols = ['hospitalization_id', 'hour_24_start_dttm', 'hour_24_end_dttm', 'disposition']
hourly_df_filtered = hourly_df.merge(
    cohort_df[window_cols],
    on='hospitalization_id',
    how='inner',
)

print(f"After merge with cohort: {len(hourly_df_filtered)} records")

# Keep hourly rows whose bucket timestamp lies inside the window (inclusive).
# NOTE(review): hourly_df was aggregated from the unfiltered wide_df, and the
# bucket timestamp is compared directly to the window bounds. If
# event_time_hour marks the start of an hour bucket, the first partial hour is
# dropped and the last bucket may contain events after hour_24_end_dttm —
# confirm against convert_wide_to_hourly.
after_start = hourly_df_filtered['event_time_hour'] >= hourly_df_filtered['hour_24_start_dttm']
before_end = hourly_df_filtered['event_time_hour'] <= hourly_df_filtered['hour_24_end_dttm']
hourly_df_filtered = hourly_df_filtered[after_start & before_end].reset_index(drop=True)

print(f"✅ Filtered hourly dataset to 24-hour windows: {len(hourly_df_filtered)} records")
print(f"Hospitalizations with data in hourly dataset: {hourly_df_filtered['hospitalization_id'].nunique()}")

# Show time window validation for hourly dataset
# NOTE(review): evaluated on rows that already passed the same condition, so
# this can only report True.
print("\nTime window validation for hourly dataset:")
still_in_window = (
    (hourly_df_filtered['event_time_hour'] >= hourly_df_filtered['hour_24_start_dttm'])
    & (hourly_df_filtered['event_time_hour'] <= hourly_df_filtered['hour_24_end_dttm'])
)
print(f"All events within window: {still_in_window.all()}")
print(f"Average records per hospitalization: {len(hourly_df_filtered) / hourly_df_filtered['hospitalization_id'].nunique():.1f}")

print('Shape:', hourly_df_filtered.shape)

# Persist the hourly table for downstream notebooks.
hourly_df_filtered.to_parquet(os.path.join(output_dir, 'by_hourly_wide_df.parquet'), index=False)


Filtering hourly dataset to 24-hour windows...| Shape: (10982, 222)
After merge with cohort: 10982 records
✅ Filtered hourly dataset to 24-hour windows: 2112 records
Hospitalizations with data in hourly dataset: 89

Time window validation for hourly dataset:
All events within window: True
Average records per hospitalization: 23.7
Shape: (2112, 225)


In [22]:
# Inspect the columns of the filtered hourly table (window bounds and outcome are appended by the merge).
list(hourly_df_filtered.columns)


['hospitalization_id',
 'event_time_hour',
 'nth_hour',
 'hour_bucket',
 'patient_id',
 'day_number',
 'eosinophils_absolute_max',
 'glucose_fingerstick_max',
 'lymphocytes_absolute_max',
 'monocytes_absolute_max',
 'neutrophils_absolute_max',
 'procalcitonin_max',
 'troponin_i_max',
 'wbc_max',
 'gcs_total_max',
 'rass_max',
 'angiotensin_max',
 'isoproterenol_max',
 'ketamine_max',
 'remifentanil_max',
 'pentobarbital_max',
 'lorazepam_max',
 'fio2_max',
 'eosinophils_absolute_min',
 'glucose_fingerstick_min',
 'lymphocytes_absolute_min',
 'monocytes_absolute_min',
 'neutrophils_absolute_min',
 'procalcitonin_min',
 'troponin_i_min',
 'wbc_min',
 'gcs_total_min',
 'rass_min',
 'angiotensin_min',
 'isoproterenol_min',
 'ketamine_min',
 'remifentanil_min',
 'pentobarbital_min',
 'lorazepam_min',
 'fio2_min',
 'eosinophils_absolute_mean',
 'glucose_fingerstick_mean',
 'lymphocytes_absolute_mean',
 'monocytes_absolute_mean',
 'neutrophils_absolute_mean',
 'procalcitonin_mean',
 'troponin