<a href="https://colab.research.google.com/github/vandrearczyk/hecktor-euvip2024/blob/main/baseline_prediction_hecktor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
! pip install scikit-survival

import numpy as np
import os
import pandas as pd
from sksurv.datasets import get_x_y
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.utils import resample
from google.colab import files



In [41]:
def load_features(folder_path, file_start=""):
    """
    Load all CSV files from a specified folder and concatenate them into a single DataFrame.

    Args:
    folder_path (str): Path to the folder containing CSV files.

    Returns:
    pd.DataFrame: Combined DataFrame from all CSV files.
    """
    dfs = []
    for filename in os.listdir(folder_path):
        if filename.startswith(file_start) and filename.endswith(".csv"):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            dfs.append(df)
    combined_df = pd.concat(dfs, ignore_index=True)
    return combined_df

def preprocess_data(combined_df, prefixes=None):
    """
    Preprocess the combined DataFrame by keeping the first three columns and those starting with specified prefixes.
    Then pivot the table to combine 'Modality', 'ROI', and each feature.

    Args:
    combined_df (pd.DataFrame): Combined DataFrame from multiple CSV files.
    prefixes (list of str or None): List of prefixes to keep in the DataFrame columns.
                                    If None, all columns are retained.

    Returns:
    pd.DataFrame: Pivoted DataFrame ready for model training.
    """
    # Keep the first three columns
    first_three_columns = combined_df.iloc[:, :3]

    # If prefixes is None, keep all columns, otherwise filter columns by the specified prefixes
    if prefixes is None:
        filtered_df = combined_df
    else:
        filtered_columns = [col for col in combined_df.columns if any(col.startswith(prefix) for prefix in prefixes)]
        filtered_df = pd.concat([first_three_columns, combined_df[filtered_columns]], axis=1)

    # Melt the filtered DataFrame
    feature_columns = [col for col in filtered_df.columns if col not in first_three_columns.columns]
    melted_df = filtered_df.melt(id_vars=['PatientID', 'Modality', 'ROI'], value_vars=feature_columns, var_name='Feature')

    # Create combined feature names
    melted_df['Combined'] = melted_df['ROI'] + '_' + melted_df['Modality'] + '_' + melted_df['Feature']

    # Pivot the DataFrame
    pivoted_df = melted_df.pivot_table(index='PatientID', columns='Combined', values='value')
    pivoted_df.reset_index(inplace=True)

    print("Number of features: ", pivoted_df.shape[1])

    return pivoted_df

def filter_patients(pivoted_df, survival_df):
    """
    Filter out patients not present in both the pivoted and survival DataFrames
    based on the 'PatientID' column.

    Args:
    pivoted_df (pd.DataFrame): DataFrame containing patient features.
    survival_df (pd.DataFrame): DataFrame containing patient survival data.

    Returns:
    tuple: A tuple containing the filtered pivoted_df and survival_df DataFrames.
    """

    # Identify patients to be deleted from each DataFrame
    deleted_from_survival = set(survival_df['PatientID']) - set(pivoted_df['PatientID'])
    deleted_from_pivoted = set(pivoted_df['PatientID']) - set(survival_df['PatientID'])

    # Print details of deleted patients
    print("Deleted patients from survival_df:", len(deleted_from_survival), deleted_from_survival)
    print("Deleted patients from pivoted_df:", len(deleted_from_pivoted), deleted_from_pivoted)

    # Filter DataFrames to only include matching patients
    pivoted_df = pivoted_df[pivoted_df['PatientID'].isin(survival_df['PatientID'])]
    survival_df = survival_df[survival_df['PatientID'].isin(pivoted_df['PatientID'])]

    # Print the number of remaining patients
    print("Remaining patients:", pivoted_df.shape[0])

    return pivoted_df, survival_df

def bootstrap_cindex(y_test, X_test, model, n_bootstraps=1000, random_state=42):
    # Extract event and time data from y_test (assume y_test is a structured array or 2D array)
    events = y_test['event']  # Assuming the first column is 'event'
    times = y_test['time']   # Assuming the second column is 'time'

    # Store all bootstrapped C-index values
    cindex_values = []

    # Set random seed for reproducibility
    np.random.seed(random_state)

    # Perform bootstrapping
    for i in range(n_bootstraps):
        # Sample with replacement from the test set
        indices = resample(np.arange(len(y_test)), replace=True)
        y_test_bootstrap = y_test[indices]
        X_test_bootstrap = X_test[indices]

        # Predict on the bootstrap sample
        predictions = model.predict(X_test_bootstrap)

        # Calculate C-index
        cindex = concordance_index_censored(
            y_test_bootstrap['event'],  # 'event' column
            y_test_bootstrap['time'],  # 'time' column
            predictions
        )[0]

        # Store the C-index value
        cindex_values.append(cindex)

    # Convert to a numpy array for easy statistical calculations
    cindex_values = np.array(cindex_values)

    # Calculate the mean C-index and confidence intervals
    mean_cindex = np.mean(cindex_values)
    ci_lower = np.percentile(cindex_values, 2.5)
    ci_upper = np.percentile(cindex_values, 97.5)

    return mean_cindex, ci_lower, ci_upper

In [24]:
# Upload features
if any(fn.startswith('features_album') for fn in os.listdir('.')):
  print('Features already uploaded')
else:
  uploaded = files.upload()

Features already uploaded


In [25]:
# Upload survival_data
if os.path.exists('hecktor2022_endpoint_training.csv'):
  print('Survival data already uploaded')
else:
  uploaded = files.upload()

Survival data already uploaded


Number of features:  79


In [62]:
# Load and preprocess the data
features_df = load_features(folder_path='./', file_start="features_album")
pivoted_df = preprocess_data(features_df, prefixes=['original_intensity', 'original_SUV'])
survival_df = pd.read_csv('hecktor2022_endpoint_training.csv')
# Filter out patients if not present in features or survival data
pivoted_df, survival_df = filter_patients(pivoted_df, survival_df)

# Prepare data for training
X = pivoted_df.drop(columns=['PatientID'])
X = X.fillna(X.mean())
y = np.array([(status, time) for status, time in zip(survival_df['Relapse'], survival_df['RFS'])],
                dtype=[('event', 'bool'), ('time', 'float')])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection
selector = SelectKBest(mutual_info_classif, k=10)  # Keep top 10 features
X_train = selector.fit_transform(X_train, y_train['event'])
X_test = selector.transform(X_test)
print("Number of features after selection:", X_test.shape[1])

# Train the model
model = RandomSurvivalForest(n_estimators=50, min_samples_split=20, min_samples_leaf=30, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
cindex_train = concordance_index_censored(y_train['event'], y_train['time'], model.predict(X_train))[0]
cindex_test = concordance_index_censored(y_test['event'], y_test['time'], model.predict(X_test))[0]
print(f'Concordance Index (Train): {cindex_train:.2f}')
print(f'Concordance Index (Test): {cindex_test:.2f}')

# Evaluate with boostrapping
mean_cindex, ci_lower, ci_upper = bootstrap_cindex(y_test, X_test, model)
print(f"Bootstrap C-index: {mean_cindex:.3f} (95% CI: {ci_lower:.3f} - {ci_upper:.3f})")

Number of features:  79
Deleted patients from survival_df: 16 {'HMR-024', 'HMR-016', 'HGJ-074', 'MDA-001', 'CHUV-033', 'MDA-006', 'MDA-003', 'MDA-146', 'HMR-005', 'HMR-034', 'HMR-029', 'MDA-007', 'MDA-005', 'HGJ-073', 'CHUV-035', 'HMR-030'}
Deleted patients from pivoted_df: 34 {'CHUP-025', 'CHUP-048', 'CHUP-041', 'CHUP-043', 'CHUP-027', 'CHUP-018', 'CHUP-062', 'CHUP-007', 'CHUP-069', 'CHUP-016', 'CHUP-055', 'CHUV-010', 'CHUV-007', 'CHUP-024', 'CHUP-012', 'CHUV-005', 'CHUP-049', 'CHUP-015', 'CHUV-003', 'CHUP-070', 'CHUP-033', 'CHUP-040', 'CHUP-039', 'CHUV-002', 'CHUP-050', 'CHUP-071', 'CHUV-011', 'CHUP-023', 'CHUP-005', 'CHUP-019', 'CHUV-008', 'CHUP-060', 'CHUP-004', 'CHUP-003'}
Remaining patients: 472
Concordance Index (Train): 0.81
Concordance Index (Test): 0.53


KeyError: "None of [Index([51, 92, 14, 71, 60, 20, 82, 86, 74, 74, 87, 23,  2, 21, 52,  1, 87, 29,\n       37,  1, 63, 59, 20, 32, 75, 57, 21, 88, 48, 90, 58, 41, 91, 59, 79, 14,\n       61, 61, 46, 61, 50, 54, 63,  2, 50,  6, 20, 72, 38, 17,  3, 88, 59, 13,\n        8, 89, 52,  1, 83, 91, 59, 70, 43,  7, 46, 34, 77, 80, 35, 49,  3,  1,\n        5, 53,  3, 53, 92, 62, 17, 89, 43, 33, 73, 61, 13, 94, 47, 14, 71, 77,\n       86, 61, 39, 84, 79],\n      dtype='int64', name='Combined')] are in the [columns]"

In [59]:
# Train another model e.g. only with clinical info and compare them statistically

# Load and preprocess the data
features_df = load_features(folder_path='./', file_start="features_album")
pivoted_df = preprocess_data(features_df, prefixes=['original_glcm'])
survival_df = pd.read_csv('hecktor2022_endpoint_training.csv')
# Filter out patients if not present in features or survival data
pivoted_df, survival_df = filter_patients(pivoted_df, survival_df)

# Prepare data for training
X2 = pivoted_df.drop(columns=['PatientID'])
X2 = X2.fillna(X2.mean())
y = np.array([(status, time) for status, time in zip(survival_df['Relapse'], survival_df['RFS'])],
                dtype=[('event', 'bool'), ('time', 'float')])
X_train2, X_test2, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=42)

# Feature selection
selector = SelectKBest(mutual_info_classif, k=10)  # Keep top 10 features
X_train2 = selector.fit_transform(X_train2, y_train['event'])
X_test2 = selector.transform(X_test2)
print("Number of features after selection:", X_test2.shape[1])

# Train the model
# model2 = RandomSurvivalForest(n_estimators=100, min_samples_split=10, min_samples_leaf=15, random_state=42)
model2 = RandomSurvivalForest(n_estimators=50, min_samples_split=20, min_samples_leaf=30, random_state=42)
model2.fit(X_train2, y_train)

# Evaluate the model
cindex_train2 = concordance_index_censored(y_train['event'], y_train['time'], model2.predict(X_train2))[0]
cindex_test2 = concordance_index_censored(y_test['event'], y_test['time'], model2.predict(X_test2))[0]
print(f'Concordance Index (Train): {cindex_train2:.2f}')
print(f'Concordance Index (Test): {cindex_test2:.2f}')

# Evaluate with boostrapping
mean_cindex, ci_lower, ci_upper = bootstrap_cindex(y_test, X_test2, model2)
print(f"Bootstrap C-index: {mean_cindex:.3f} (95% CI: {ci_lower:.3f} - {ci_upper:.3f})")

Number of features:  49
Deleted patients from survival_df: 32 {'HMR-024', 'MDA-180', 'MDA-146', 'HMR-034', 'HMR-029', 'MDA-121', 'MDA-005', 'HMR-030', 'MDA-124', 'HGJ-074', 'MDA-001', 'CHUP-029', 'MDA-091', 'MDA-128', 'HGJ-073', 'MDA-003', 'MDA-029', 'MDA-192', 'HMR-016', 'MDA-201', 'MDA-048', 'CHUV-035', 'MDA-006', 'MDA-200', 'MDA-179', 'MDA-166', 'CHUV-033', 'CHUP-032', 'MDA-036', 'MDA-169', 'HMR-005', 'MDA-007'}
Deleted patients from pivoted_df: 34 {'CHUP-025', 'CHUP-048', 'CHUP-041', 'CHUP-043', 'CHUP-027', 'CHUP-018', 'CHUP-062', 'CHUP-007', 'CHUP-069', 'CHUP-016', 'CHUP-055', 'CHUV-010', 'CHUV-007', 'CHUP-024', 'CHUP-012', 'CHUV-005', 'CHUP-049', 'CHUP-015', 'CHUV-003', 'CHUP-070', 'CHUP-033', 'CHUP-040', 'CHUP-039', 'CHUV-002', 'CHUP-050', 'CHUP-071', 'CHUV-011', 'CHUP-023', 'CHUP-005', 'CHUP-019', 'CHUV-008', 'CHUP-060', 'CHUP-004', 'CHUP-003'}
Remaining patients: 456
Number of features after selection: 10
Concordance Index (Train): 0.78
Concordance Index (Test): 0.51
Bootstrap

In [None]:
# Function to calculate C-index
def calculate_cindex(y_true, y_pred):
    return concordance_index_censored(y_true['event'], y_true['time'], y_pred)[0]

# Resample and compute differences
def bootstrap_cindex_difference(X_test1, X_test2, y_test, model1, model2, n_iterations=1000):
    differences = []
    for _ in range(n_iterations):
        X_resampled, y_resampled = resample(X_test1, y_test, random_state=_)
        X_resampled2, y_resampled = resample(X_test2, y_test, random_state=_)
        cindex1 = calculate_cindex(y_resampled, model1.predict(X_resampled))
        cindex2 = calculate_cindex(y_resampled, model2.predict(X_resampled))
        differences.append(cindex1 - cindex2)
    return np.array(differences)

# Compute the differences
differences = bootstrap_cindex_difference(X_test, X_test2, y_test, model, model2)
observed_diff = cindex_test - cindex_test2

# Compute p-value
p_value = np.mean(differences >= observed_diff)
print(f'p-value: {p_value:.3f}')



ValueError: X has 10 features, but RandomSurvivalForest is expecting 36 features as input.