In [1]:
import os
import warnings
import optuna
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    balanced_accuracy_score,
    matthews_corrcoef,
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
    auc
)

warnings.filterwarnings('ignore')


In [2]:
dir_path='/kaggle/input/phems-hackathon-early-sepsis-prediction/'
train_path = '/kaggle/input/phems-hackathon-early-sepsis-prediction/training_data'
test_path = '/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/'

In [3]:
def load_dataset(data_path, data_type='train'):
    """Load all data files for either train or test set"""
    file_map = {
        'sepsis_labels': f"SepsisLabel_{data_type}.csv",
        'devices': f"devices_{data_type}.csv",
        'drugs': f"drugsexposure_{data_type}.csv",
        'lab_measurements': f"measurement_lab_{data_type}.csv",
        'meds_measurements': f"measurement_meds_{data_type}.csv",
        'observations': f"measurement_observation_{data_type}.csv",
        'demographics': f"person_demographics_episode_{data_type}.csv",
        'procedures': f"proceduresoccurrences_{data_type}.csv"
    }
    
    data_dict = {}
    
    for key, filename in file_map.items():
        file_path = os.path.join(data_path, filename)
        try:
            data_dict[key] = pd.read_csv(file_path)
            print(f"Loaded {filename} successfully")
        except FileNotFoundError:
            print(f"Warning: {filename} not found in {data_path}")
            data_dict[key] = None
    
    return data_dict

In [4]:
train_data = load_dataset(train_path, 'train')
test_data = load_dataset(test_path, 'test')

Loaded SepsisLabel_train.csv successfully
Loaded devices_train.csv successfully
Loaded drugsexposure_train.csv successfully
Loaded measurement_lab_train.csv successfully
Loaded measurement_meds_train.csv successfully
Loaded measurement_observation_train.csv successfully
Loaded person_demographics_episode_train.csv successfully
Loaded proceduresoccurrences_train.csv successfully
Loaded SepsisLabel_test.csv successfully
Loaded devices_test.csv successfully
Loaded drugsexposure_test.csv successfully
Loaded measurement_lab_test.csv successfully
Loaded measurement_meds_test.csv successfully
Loaded measurement_observation_test.csv successfully
Loaded person_demographics_episode_test.csv successfully
Loaded proceduresoccurrences_test.csv successfully


In [5]:
train_labels = train_data['sepsis_labels']
train_devices = train_data['devices']
train_drugs = train_data['drugs']
train_lab_measurements = train_data['lab_measurements']
train_meds_measurements = train_data['meds_measurements']
train_observations = train_data['observations']
train_demographics = train_data['demographics']
train_procedures = train_data['procedures']

In [6]:
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

def preprocess_dataset(
    sepsis_labels, devices, drugs, lab_measurements, meds_measurements,
    observations, demographics, procedures, is_train=False, encoders=None
):
    print("Starting data preprocessing...")

    # --------------------------------------------------- Process Sepsis Labels
    sepsis_labels = sepsis_labels.drop_duplicates()
    print("Processing sepsis labels...")
    tqdm.pandas(desc="Processing dates")
    sepsis_labels['day'] = sepsis_labels['measurement_datetime'].progress_apply(lambda x: x[:10] if pd.notna(x) else None)
    sepsis_labels["measurement_datetime"] = pd.to_datetime(sepsis_labels["measurement_datetime"], errors="coerce")
    sepsis_labels = sepsis_labels.sort_values(by=["person_id", "measurement_datetime"])
    sepsis_labels["time_elapsed"] = (
        sepsis_labels.groupby("person_id")["measurement_datetime"]
        .diff()
        .dt.total_seconds()
        / 3600
    ).fillna(0)

    # --------------------------------------------------- Process Demographics
    print("Processing demographics...")
    demographics = demographics.sort_values(by="visit_start_date").drop_duplicates(subset=["person_id"], keep="last")
    df_merged = pd.merge(sepsis_labels, demographics, on="person_id", how="left")    
    df_merged = df_merged.dropna(subset=["day", "birth_datetime"])
    df_merged["birth_datetime"] = pd.to_datetime(df_merged["birth_datetime"], errors="coerce")    
    demographics["visit_start_date"] = pd.to_datetime(demographics["visit_start_date"], errors="coerce")
    
    tqdm.pandas(desc="Calculating ages")
    df_merged["age_in_months"] = df_merged.progress_apply(
        lambda row: calculate_age_in_months(row["day"], row["birth_datetime"]), axis=1
    )
    df_merged.drop(["visit_occurrence_id", "visit_start_date", "birth_datetime"], axis=1, inplace=True, errors='ignore')
    
    # --------------------------------------------------- Process Drugs
    print("Processing drugs...")
    drugs["drug_datetime_hourly"] = pd.to_datetime(drugs["drug_datetime_hourly"], errors="coerce")
    df_drugs_agg = (
        drugs.groupby(["person_id", "drug_datetime_hourly"])
        .agg({
            "drug_concept_id": lambda x: " ".join(sorted(map(str, x))),
            "route_concept_id": lambda x: " ".join(sorted(map(str, x))),
        })
        .reset_index()
        .rename(columns={
            "drug_concept_id": "current_drug_concept_id",
            "route_concept_id": "current_route_concept_id",
        })
    )
    df_merged = pd.merge(
        df_merged,
        df_drugs_agg,
        how="left",
        left_on=["person_id", "measurement_datetime"],
        right_on=["person_id", "drug_datetime_hourly"]
    )
    df_merged.drop(["drug_datetime_hourly"], axis=1, inplace=True, errors='ignore')

    print("Processing drug usage history...")
    groups_sepsis = []
    for pid in tqdm(df_merged['person_id'].unique(), desc="Processing patients"):
        grp_sepsis = df_merged[df_merged['person_id'] == pid]
        grp_drugs = drugs[drugs["person_id"] == pid]
        updated_grp = find_last_drug_usage(grp_sepsis, grp_drugs)
        groups_sepsis.append(updated_grp)

    df_merged = pd.concat(groups_sepsis, axis=0).reset_index(drop=True)
    
    # --------------------------------------------------- Process Medications Measurements
    print("Processing medication measurements...")
    tqdm.pandas(desc="Processing measurement dates")
    meds_measurements["day"] = meds_measurements['measurement_datetime'].progress_apply(
        lambda x: str(x)[:10] if pd.notna(x) else None
    )
    meds_measurements["measurement_datetime"] = pd.to_datetime(meds_measurements["measurement_datetime"], errors="coerce")
    meds_measurements = meds_measurements[meds_measurements["Heart rate"].between(0, 200, inclusive="both")]
    meds_measurements = meds_measurements[meds_measurements["Respiratory rate"].between(0, 40, inclusive="both")]

    df_obs_agg = meds_measurements.groupby(["person_id", "day"]).agg({
        "Body temperature": "max",
        "Respiratory rate": "max",
        "Heart rate": "max",
        "Measurement of oxygen saturation at periphery": "mean"
    }).reset_index()

    df_merged = df_merged.merge(df_obs_agg, on=["person_id", "day"], how="left")

    print("Filling missing values in medication measurements...")
    for col in tqdm(["Body temperature", "Respiratory rate", "Heart rate", "Measurement of oxygen saturation at periphery"],
                    desc="Processing columns"):
        median_val = df_merged[col].median(skipna=True)
        df_merged[col] = df_merged[col].fillna(median_val)
    
    # --------------------------------------------------- Process Lab Measurements
    print("Processing lab measurements...")
    lab_measurements.columns = lab_measurements.columns.str.replace('[', '(', regex=False).str.replace(']', ')', regex=False)
    tqdm.pandas(desc="Processing lab dates")
    lab_measurements["day"] = lab_measurements['measurement_datetime'].progress_apply(
        lambda x: str(x)[:10] if pd.notna(x) else None
    )
    lab_measurements["measurement_datetime"] = pd.to_datetime(lab_measurements["measurement_datetime"], errors="coerce")
    
    columns_to_exclude = ["person_id", "day", "visit_occurence_id", "measurement_datetime"]
    df_lab_agg = (
        lab_measurements.groupby(["person_id", "day"])
              .agg({col: "mean" for col in lab_measurements.columns if col not in columns_to_exclude})
              .reset_index()
    )
    df_merged = df_merged.merge(df_lab_agg, on=["person_id", "day"], how="left")

    print("Filling missing values in lab measurements...")
    numeric_cols_lab = [c for c in lab_measurements.columns if c not in columns_to_exclude]
    for col in tqdm(numeric_cols_lab, desc="Processing columns"):
        median_val = df_merged[col].median(skipna=True)
        df_merged[col] = df_merged[col].fillna(median_val)

    cols_to_drop = ["day", "visit_occurrence_id_x", "visit_occurrence_id_y", 
                    "visit_occurence_id", "Ionised calcium measurement"]
    df_merged.drop(columns=cols_to_drop, inplace=True, errors='ignore')

    # --------------------------------------------------- Encoding Categorical Data
    print("Encoding categorical data...")
    cat_cols = ["gender", "current_drug_concept_id", "current_route_concept_id", 
                "last_drug_concept_id", "last_route_concept_id"]
    if is_train:
        encoders = {}
        for col in tqdm(cat_cols, desc="Encoding columns"):
            df_merged[col] = df_merged[col].astype(str)
            le = LabelEncoder()
            df_merged[col] = le.fit_transform(df_merged[col])
            encoders[col] = le
    else:
        for col in tqdm(cat_cols, desc="Encoding columns"):
            df_merged[col] = df_merged[col].astype(str)
            le = encoders[col]
            df_merged[col] = df_merged[col].where(df_merged[col].isin(le.classes_), le.classes_[0])
            df_merged[col] = le.transform(df_merged[col])

    print("Preprocessing completed!")
    return df_merged, encoders if is_train else df_merged


def find_last_drug_usage(measurements, drugs):
    """
    Find the last drug usage information for each measurement timestamp.
    """
    measurements = measurements.sort_values(by=["person_id", "measurement_datetime"]).copy()
    drugs = drugs.sort_values(by=["person_id", "drug_datetime_hourly"]).copy()

    last_drug_ids = []
    last_route_ids = []

    measurement_group = measurements
    drug_group = drugs
    pointer = 0
    n_drugs = len(drug_group)

    for _, row in measurement_group.iterrows():
        current_time = row["measurement_datetime"]

        while pointer < (n_drugs - 1) and drug_group.iloc[pointer + 1]["drug_datetime_hourly"] <= current_time:
            pointer += 1

        if n_drugs > 0 and drug_group.iloc[pointer]["drug_datetime_hourly"] <= current_time:
            last_drug_ids.append(drug_group.iloc[pointer]["drug_concept_id"])
            last_route_ids.append(drug_group.iloc[pointer]["route_concept_id"])
        else:
            last_drug_ids.append(None)
            last_route_ids.append(None)

    measurements["last_drug_concept_id"] = last_drug_ids
    measurements["last_route_concept_id"] = last_route_ids

    return measurements

def calculate_age_in_months(reference_date, birth_date):
    """
    Calculate age in months from the given reference date and birth date.
    
    Args:
        reference_date (str or datetime): The reference date for age calculation.
        birth_date (str or datetime): The birth date of the individual.

    Returns:
        int: Age in months.
    """
    current_day = pd.to_datetime(reference_date, errors="coerce")
    birth_date = pd.to_datetime(birth_date, errors="coerce")

    if pd.isna(current_day) or pd.isna(birth_date):
        return None

    age_in_months = (current_day.year - birth_date.year) * 12 + (current_day.month - birth_date.month)
    return age_in_months


def find_last_drug_usage(measurements, drugs):
    """
    Find the last drug usage information for each measurement timestamp.

    Args:
        measurements (DataFrame): DataFrame containing measurement data with `person_id` and `measurement_datetime`.
        drugs (DataFrame): DataFrame containing drug usage data with `person_id`, `drug_datetime_hourly`, 
                           `drug_concept_id`, and `route_concept_id`.

    Returns:
        DataFrame: Updated `measurements` DataFrame with `last_drug_concept_id` and `last_route_concept_id`.
    """
    # Ensure data is sorted for efficient pointer traversal
    measurements = measurements.sort_values(by=["person_id", "measurement_datetime"]).copy()
    drugs = drugs.sort_values(by=["person_id", "drug_datetime_hourly"]).copy()

    # Prepare lists for last drug and route IDs
    last_drug_ids = []
    last_route_ids = []

    # Iterate over each group of person_id
    for person_id, measurement_group in measurements.groupby("person_id"):
        drug_group = drugs[drugs["person_id"] == person_id]
        pointer = 0
        n_drugs = len(drug_group)

        for _, row in measurement_group.iterrows():
            current_time = row["measurement_datetime"]

            # Advance the pointer to the most recent drug usage before the current time
            while pointer < (n_drugs - 1) and drug_group.iloc[pointer + 1]["drug_datetime_hourly"] <= current_time:
                pointer += 1

            # Assign the last drug and route if available
            if n_drugs > 0 and drug_group.iloc[pointer]["drug_datetime_hourly"] <= current_time:
                last_drug_ids.append(drug_group.iloc[pointer]["drug_concept_id"])
                last_route_ids.append(drug_group.iloc[pointer]["route_concept_id"])
            else:
                last_drug_ids.append(None)
                last_route_ids.append(None)

    # Append new columns to the measurements DataFrame
    measurements["last_drug_concept_id"] = last_drug_ids
    measurements["last_route_concept_id"] = last_route_ids

    return measurements



train_df, encoders = preprocess_dataset(
    train_labels, train_devices, train_drugs, train_lab_measurements, 
    train_meds_measurements, train_observations, train_demographics, train_procedures, 
    is_train=True
)

train_df = train_df.sort_values(['person_id', 'measurement_datetime'])
features_to_drop = ['person_id', 'measurement_datetime']
X = train_df.drop(features_to_drop + ['SepsisLabel'], axis=1)
y = train_df['SepsisLabel']
groups = train_df['person_id']

test_labels = test_data['sepsis_labels']
test_devices = test_data['devices']
test_drugs = test_data['drugs']
test_lab_measurements = test_data['lab_measurements']
test_meds_measurements = test_data['meds_measurements']
test_observations = test_data['observations']
test_demographics = test_data['demographics']
test_procedures = test_data['procedures']


test_df = preprocess_dataset(
    test_labels, test_devices, test_drugs, test_lab_measurements, 
    test_meds_measurements, test_observations, test_demographics, test_procedures, is_train=False, encoders=encoders
)

test_df = test_df[0]
test_features = test_df.copy()
test_features = test_features.drop(features_to_drop, axis=1, errors='ignore')




Starting data preprocessing...
Processing sepsis labels...


Processing dates:   0%|          | 0/331639 [00:00<?, ?it/s]

Processing demographics...


Calculating ages:   0%|          | 0/331624 [00:00<?, ?it/s]

Processing drugs...
Processing drug usage history...


Processing patients:   0%|          | 0/2640 [00:00<?, ?it/s]

Processing medication measurements...


Processing measurement dates:   0%|          | 0/257749 [00:00<?, ?it/s]

Filling missing values in medication measurements...


Processing columns:   0%|          | 0/4 [00:00<?, ?it/s]

Processing lab measurements...


Processing lab dates:   0%|          | 0/69307 [00:00<?, ?it/s]

Filling missing values in lab measurements...


Processing columns:   0%|          | 0/38 [00:00<?, ?it/s]

Encoding categorical data...


Encoding columns:   0%|          | 0/5 [00:00<?, ?it/s]

Preprocessing completed!
Starting data preprocessing...
Processing sepsis labels...


Processing dates:   0%|          | 0/130483 [00:00<?, ?it/s]

Processing demographics...


Calculating ages:   0%|          | 0/130483 [00:00<?, ?it/s]

Processing drugs...
Processing drug usage history...


Processing patients:   0%|          | 0/1138 [00:00<?, ?it/s]

Processing medication measurements...


Processing measurement dates:   0%|          | 0/104398 [00:00<?, ?it/s]

Filling missing values in medication measurements...


Processing columns:   0%|          | 0/4 [00:00<?, ?it/s]

Processing lab measurements...


Processing lab dates:   0%|          | 0/26970 [00:00<?, ?it/s]

Filling missing values in lab measurements...


Processing columns:   0%|          | 0/38 [00:00<?, ?it/s]

Encoding categorical data...


Encoding columns:   0%|          | 0/5 [00:00<?, ?it/s]

Preprocessing completed!


# mutltimodel ensemble

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score, f1_score
import numpy as np
import pandas as pd

# Function to calculate evaluation metrics
def calculate_metrics(y_true, y_pred, y_pred_proba):
    """Calculate evaluation metrics."""
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "F1_Score": f1_score(y_true, y_pred),
        "AUC": roc_auc_score(y_true, y_pred_proba),
        "PR_AUC": average_precision_score(y_true, y_pred_proba)
    }

# Define parameters for RandomForest
random_forest_params = {
    'n_estimators': 500,          # Number of trees in the forest
    'max_depth': 5,              # Maximum depth of the tree
    'min_samples_split': 2,       # Minimum number of samples required to split an internal node
    'min_samples_leaf': 1,        # Minimum number of samples required to be at a leaf node
    'max_features': 'sqrt',       # Number of features to consider when looking for the best split
    'bootstrap': True,            # Whether bootstrap samples are used when building trees
    'random_state': 42,           # Seed for reproducibility
    'verbose': 1                  # Controls verbosity of the output
}

# Initialize the RandomForest model
rf_model = RandomForestClassifier(**random_forest_params)

# Set the threshold for predictions
THRESHOLD = 0.6100

# Train the RandomForest model on the full dataset
print("Training RandomForest on the full dataset...")
rf_model.fit(X, y)

# Predict probabilities and labels
y_pred_proba = rf_model.predict_proba(X)[:, 1]
y_pred = (y_pred_proba >= THRESHOLD).astype(int)

# Calculate metrics
metrics = calculate_metrics(y, y_pred, y_pred_proba)

# Print the results
print("\nResults on the Full Dataset:")
for metric_name, metric_value in metrics.items():
    if metric_name == "PR_AUC":  
        print(f"{metric_name}: {metric_value:.4f} ðŸŒŸ")
    else:
        print(f"{metric_name}: {metric_value:.4f}")

Training RandomForest on the full dataset...


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   10.5s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   42.7s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:  1.6min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.8s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    3.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    7.4s



Results on the Full Dataset:
Accuracy: 0.9796
F1_Score: 0.0343
AUC: 0.9836
PR_AUC: 0.7237 ðŸŒŸ


In [8]:
# Make predictions on the test set for each model
rf_model_pred = rf_model.predict_proba(test_features)[:, 1]

# Create the submission format
test_df['person_id_datetime'] = (
    test_df['person_id'].astype(str) + '_' + 
    test_df['measurement_datetime'].astype(str)
)


pd.DataFrame({
    'person_id_datetime': test_df['person_id_datetime'],
    'SepsisLabel': rf_model_pred
}).to_csv('submission.csv', index=False)



# Check the first few rows of the submission file
df = pd.read_csv('submission.csv')
print(df.head())


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s


            person_id_datetime  SepsisLabel
0  3858662_2019-11-29 01:00:00     0.002812
1  3858662_2019-11-29 03:00:00     0.002305
2  3858662_2019-11-29 05:00:00     0.002305
3  3858662_2019-11-29 06:00:00     0.002812
4  3858662_2019-11-29 07:00:00     0.002812
