<a href="https://colab.research.google.com/github/vandrearczyk/hecktor-euvip2024/blob/main/baseline_prediction_hecktor_dummy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install scikit-survival

import numpy as np
import os
import pandas as pd
from sksurv.datasets import get_x_y
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored
from sklearn.model_selection import train_test_split
from google.colab import files

Collecting scikit-survival
  Downloading scikit_survival-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.0/49.0 kB[0m [31m908.9 kB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn<1.6,>=1.4.0 (from scikit-survival)
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading scikit_survival-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, scikit-survival
  Attempting uninstall: scikit-learn
    Found existing ins

In [None]:
def load_features(folder_path, file_start=""):
    """
    Load all matching CSV files from a folder and concatenate them into a single DataFrame.

    Args:
    folder_path (str): Path to the folder containing CSV files.
    file_start (str): Only files whose name starts with this prefix are loaded.
                      The default "" matches every ".csv" file in the folder.

    Returns:
    pd.DataFrame: Rows of all matching CSV files stacked in alphabetical
                  file-name order, with a fresh 0..n-1 index.

    Raises:
    ValueError: If no file in folder_path starts with file_start and ends with ".csv".
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the combined frame reproducible across machines and runs.
    csv_names = sorted(
        name for name in os.listdir(folder_path)
        if name.startswith(file_start) and name.endswith(".csv")
    )
    if not csv_names:
        # pd.concat([]) would also raise ValueError, but with a message that
        # does not say which folder/prefix came up empty.
        raise ValueError(
            f"No CSV file starting with {file_start!r} found in {folder_path!r}"
        )

    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_names]
    return pd.concat(frames, ignore_index=True)

def preprocess_data(combined_df, prefixes=None):
    """
    Turn the long per-(patient, modality, ROI) feature table into one row per patient.

    The identifier columns 'PatientID', 'Modality' and 'ROI' are always kept
    (looked up by name, so their position in the frame does not matter).
    Every retained feature column is renamed to '<ROI>_<Modality>_<feature>'
    and the table is pivoted so that each such combination becomes one column.

    Args:
    combined_df (pd.DataFrame): Combined DataFrame from multiple CSV files.
                                Must contain 'PatientID', 'Modality' and 'ROI'.
    prefixes (list of str or None): List of prefixes to keep in the DataFrame columns.
                                    If None, all non-identifier columns are retained.

    Returns:
    pd.DataFrame: Pivoted DataFrame with a 'PatientID' column followed by one
                  column per '<ROI>_<Modality>_<feature>' combination. If the
                  same combination occurs more than once for a patient, the
                  values are averaged (pivot_table's default aggregation).

    Raises:
    KeyError: If one of the identifier columns is missing from combined_df.
    """
    id_columns = ['PatientID', 'Modality', 'ROI']
    missing = [col for col in id_columns if col not in combined_df.columns]
    if missing:
        raise KeyError(f"Missing required identifier column(s): {missing}")

    # Select the feature columns; identifier columns are never treated as
    # features, even if one of the prefixes happens to match their name.
    candidates = [col for col in combined_df.columns if col not in id_columns]
    if prefixes is None:
        feature_columns = candidates
    else:
        wanted = tuple(prefixes)
        feature_columns = [col for col in candidates if col.startswith(wanted)]

    # Long format: one row per (patient, modality, ROI, feature)
    melted_df = combined_df.melt(id_vars=id_columns, value_vars=feature_columns, var_name='Feature')

    # Create combined feature names
    melted_df['Combined'] = melted_df['ROI'] + '_' + melted_df['Modality'] + '_' + melted_df['Feature']

    # Wide format: one row per patient, one column per combined feature name
    pivoted_df = melted_df.pivot_table(index='PatientID', columns='Combined', values='value')
    return pivoted_df.reset_index()

In [None]:
# Prompt for the feature CSVs unless a file whose name starts with
# 'features_album' is already present in the working directory.
if not any(entry.startswith('features_album') for entry in os.listdir('.')):
    uploaded = files.upload()
else:
    print('Features already uploaded')

Saving features_album_GBM-UPENN-OS_MR_FLAIR-ET-NET.csv to features_album_GBM-UPENN-OS_MR_FLAIR-ET-NET.csv
Saving features_album_GBM-UPENN-OS_MR_T1c-ET.csv to features_album_GBM-UPENN-OS_MR_T1c-ET.csv
Saving features_album_GBM-UPENN-OS_MR_T1-ET.csv to features_album_GBM-UPENN-OS_MR_T1-ET.csv
Saving features_album_GBM-UPENN-OS_MR_T1-NET.csv to features_album_GBM-UPENN-OS_MR_T1-NET.csv
Saving features_album_GBM-UPENN-OS_MR_FLAIR-ET.csv to features_album_GBM-UPENN-OS_MR_FLAIR-ET.csv
Saving features_album_GBM-UPENN-OS_MR_T1-ET-NET-Edema.csv to features_album_GBM-UPENN-OS_MR_T1-ET-NET-Edema.csv
Saving features_album_GBM-UPENN-OS_MR_FLAIR-Edema.csv to features_album_GBM-UPENN-OS_MR_FLAIR-Edema.csv
Saving features_album_GBM-UPENN-OS_MR_FLAIR-ET-NET-Edema.csv to features_album_GBM-UPENN-OS_MR_FLAIR-ET-NET-Edema.csv
Saving features_album_GBM-UPENN-OS_MR_T1-ET-NET.csv to features_album_GBM-UPENN-OS_MR_T1-ET-NET.csv
Saving features_album_GBM-UPENN-OS_MR_FLAIR-NET.csv to features_album_GBM-UPENN-OS

In [None]:
# Prompt for the survival table unless 'dummy_survival_data.csv' is already
# present in the working directory.
if not os.path.exists('dummy_survival_data.csv'):
    uploaded = files.upload()
else:
    print('Survival data already uploaded')

Saving dummy_survival_data.csv to dummy_survival_data.csv


In [None]:
# Read every 'features_album*.csv' from the working directory into one long table
features_df = load_features('./', file_start='features_album')

# One row per patient, keeping only the 'original_intensity*' feature columns
pivoted_df = preprocess_data(features_df, prefixes=['original_intensity'])

# Survival outcomes, read as-is from the uploaded file
survival_df = pd.read_csv('dummy_survival_data.csv')


In [None]:
# Feature matrix: every pivoted column except the patient identifier,
# with missing values replaced by the mean of their column.
# NOTE(review): the means are computed on all rows, i.e. before the
# train/test split below - confirm this is acceptable for the baseline.
X = pivoted_df.drop(columns=['PatientID'])
X = X.fillna(value=X.mean())

# Structured (event, time) array built from the survival table.
# NOTE(review): rows of X follow pivoted_df while rows of y follow
# survival_df; nothing here matches them by patient - confirm that both
# tables list the same patients in the same order.
y = np.array(
    list(zip(survival_df['SurvivalStatus'], survival_df['SurvivalTime'])),
    dtype=[('event', 'bool'), ('time', 'float')],
)

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the random survival forest on the training rows
model = RandomSurvivalForest(
    n_estimators=100,
    min_samples_split=10,
    min_samples_leaf=15,
    random_state=42,
)
model.fit(X_train, y_train)

# Concordance index on both splits (first element of the returned tuple)
cindex_train = concordance_index_censored(
    y_train['event'], y_train['time'], model.predict(X_train)
)[0]
cindex_test = concordance_index_censored(
    y_test['event'], y_test['time'], model.predict(X_test)
)[0]

print(f'Concordance Index (Train): {cindex_train:.2f}')
print(f'Concordance Index (Test): {cindex_test:.2f}')

Concordance Index (Train): 0.83
Concordance Index (Test): 0.53


In [None]:
# Train a second model on a different feature family (columns starting with
# 'original_glcm') so it can be compared statistically with the first model.
# NOTE: this reassigns pivoted_df; re-run the loading cell before re-running
# the first model's cell, otherwise that cell would pick up these features.
pivoted_df = preprocess_data(features_df, prefixes=['original_glcm'])

# Prepare data for training
X2 = pivoted_df.drop(columns=['PatientID'])
# Impute X2 itself. Filling `X` here (as the earlier version did) replaces the
# GLCM matrix with the first model's feature matrix, which makes model2 an
# exact copy of model and the comparison meaningless.
X2 = X2.fillna(X2.mean())
y = np.array([(status, time) for status, time in zip(survival_df['SurvivalStatus'], survival_df['SurvivalTime'])],
                dtype=[('event', 'bool'), ('time', 'float')])
# Same test_size and random_state as for the first model, so both models are
# trained and evaluated on the same rows.
X_train2, X_test2, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=42)

# Train the model
model2 = RandomSurvivalForest(n_estimators=100, min_samples_split=10, min_samples_leaf=15, random_state=42)
model2.fit(X_train2, y_train)

# Evaluate the model
cindex_train2 = concordance_index_censored(y_train['event'], y_train['time'], model2.predict(X_train2))[0]
cindex_test2 = concordance_index_censored(y_test['event'], y_test['time'], model2.predict(X_test2))[0]

print(f'Concordance Index (Train): {cindex_train2:.2f}')
print(f'Concordance Index (Test): {cindex_test2:.2f}')

Concordance Index (Train): 0.83
Concordance Index (Test): 0.53


In [None]:
# Statistical test between the 2 models
from sklearn.utils import resample

def calculate_cindex(y_true, y_pred):
    """
    Compute the concordance index of predicted risk scores.

    Args:
    y_true: Structured array with an 'event' field and a 'time' field.
    y_pred: Predicted scores, one per entry of y_true.

    Returns:
    The first element of the tuple returned by concordance_index_censored.
    """
    events = y_true['event']
    times = y_true['time']
    result = concordance_index_censored(events, times, y_pred)
    return result[0]

def bootstrap_cindex_difference(X_test1, X_test2, y_test, model1, model2, n_iterations=1000):
    """
    Bootstrap the difference in concordance index between two fitted models.

    In every iteration the test rows are resampled with replacement; the same
    resampled rows are used for both models, so each difference is paired.

    Args:
    X_test1: Test features for model1 (row i describes the same sample as y_test[i]).
    X_test2: Test features for model2, row-aligned with X_test1 and y_test.
    y_test: Structured array with 'event' and 'time' fields.
    model1: Fitted model exposing predict(); scored on X_test1.
    model2: Fitted model exposing predict(); scored on X_test2.
    n_iterations (int): Number of bootstrap resamples.

    Returns:
    np.ndarray: Array of length n_iterations holding
                cindex(model1) - cindex(model2) for each resample.
    """
    differences = []
    for seed in range(n_iterations):
        # A single joint draw keeps both feature matrices and the labels
        # aligned; the iteration number is the seed, so runs are reproducible.
        X1_boot, X2_boot, y_boot = resample(X_test1, X_test2, y_test, random_state=seed)
        cindex1 = calculate_cindex(y_boot, model1.predict(X1_boot))
        # model2 must be scored on its own features (X2_boot), not on model1's.
        cindex2 = calculate_cindex(y_boot, model2.predict(X2_boot))
        differences.append(cindex1 - cindex2)
    return np.array(differences)

# Bootstrap distribution of (C-index of model - C-index of model2) on the test set
differences = bootstrap_cindex_difference(X_test, X_test2, y_test, model, model2)
observed_diff = cindex_test - cindex_test2

# Two-sided percentile-bootstrap p-value for H0 "the two models have the same
# C-index": how much of the bootstrap distribution lies on the far side of 0.
# Comparing the bootstrap differences with observed_diff instead does not test
# anything, because the distribution is centred on observed_diff by construction.
share_not_above_zero = np.mean(differences <= 0)
share_not_below_zero = np.mean(differences >= 0)
p_value = min(1.0, 2 * min(share_not_above_zero, share_not_below_zero))
print(f'p-value: {p_value:.3f}')

p-value: 1.000
