<a href="https://colab.research.google.com/github/vandrearczyk/hecktor-euvip2024/blob/main/baseline_prediction_hecktor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
! pip install scikit-survival

import numpy as np
import os
import pandas as pd
from sksurv.datasets import get_x_y
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from google.colab import files



In [2]:
def load_features(folder_path, file_start=""):
    """
    Load all CSV files from a specified folder and concatenate them into a single DataFrame.

    Args:
    folder_path (str): Path to the folder containing CSV files.

    Returns:
    pd.DataFrame: Combined DataFrame from all CSV files.
    """
    dfs = []
    for filename in os.listdir(folder_path):
        if filename.startswith(file_start) and filename.endswith(".csv"):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            dfs.append(df)
    combined_df = pd.concat(dfs, ignore_index=True)
    return combined_df

def preprocess_data(combined_df, prefixes=None):
    """
    Preprocess the combined DataFrame by keeping the first three columns and those starting with specified prefixes.
    Then pivot the table to combine 'Modality', 'ROI', and each feature.

    Args:
    combined_df (pd.DataFrame): Combined DataFrame from multiple CSV files.
    prefixes (list of str or None): List of prefixes to keep in the DataFrame columns.
                                    If None, all columns are retained.

    Returns:
    pd.DataFrame: Pivoted DataFrame ready for model training.
    """
    # Keep the first three columns
    first_three_columns = combined_df.iloc[:, :3]

    # If prefixes is None, keep all columns, otherwise filter columns by the specified prefixes
    if prefixes is None:
        filtered_df = combined_df
    else:
        filtered_columns = [col for col in combined_df.columns if any(col.startswith(prefix) for prefix in prefixes)]
        filtered_df = pd.concat([first_three_columns, combined_df[filtered_columns]], axis=1)

    # Melt the filtered DataFrame
    feature_columns = [col for col in filtered_df.columns if col not in first_three_columns.columns]
    melted_df = filtered_df.melt(id_vars=['PatientID', 'Modality', 'ROI'], value_vars=feature_columns, var_name='Feature')

    # Create combined feature names
    melted_df['Combined'] = melted_df['ROI'] + '_' + melted_df['Modality'] + '_' + melted_df['Feature']

    # Pivot the DataFrame
    pivoted_df = melted_df.pivot_table(index='PatientID', columns='Combined', values='value')
    pivoted_df.reset_index(inplace=True)

    return pivoted_df

In [3]:
# Upload features
if any(fn.startswith('features_album') for fn in os.listdir('.')):
  print('Features already uploaded')
else:
  uploaded = files.upload()

Saving features_album_HECKTOR_EUVIP_2024_PT- GTVp.csv to features_album_HECKTOR_EUVIP_2024_PT- GTVp.csv
Saving features_album_HECKTOR_EUVIP_2024_PT- GTVn.csv to features_album_HECKTOR_EUVIP_2024_PT- GTVn.csv
Saving features_album_HECKTOR_EUVIP_2024_CT- GTVp.csv to features_album_HECKTOR_EUVIP_2024_CT- GTVp.csv
Saving features_album_HECKTOR_EUVIP_2024_CT- GTVn.csv to features_album_HECKTOR_EUVIP_2024_CT- GTVn.csv


In [4]:
# Upload survival_data
if os.path.exists('hecktor2022_endpoint_training.csv'):
  print('Survival data already uploaded')
else:
  uploaded = files.upload()

Saving hecktor2022_endpoint_training.csv to hecktor2022_endpoint_training.csv


In [26]:
# Load and preprocess the data
features_df = load_features(folder_path='./', file_start="features_album")
pivoted_df = preprocess_data(features_df, prefixes=['original_intensity'])
survival_df = pd.read_csv('hecktor2022_endpoint_training.csv')

In [27]:
# Filter patients if not present in features or survival data
deleted_from_survival = set(survival_df['PatientID']) - set(pivoted_df['PatientID'])
deleted_from_pivoted = set(pivoted_df['PatientID']) - set(survival_df['PatientID'])

print("Deleted patients from survival_df:", len(deleted_from_survival), deleted_from_survival)
print("Deleted patients from pivoted_df:", len(deleted_from_pivoted), deleted_from_pivoted)

# Drop rows
pivoted_df = pivoted_df[pivoted_df['PatientID'].isin(survival_df['PatientID'])]
survival_df = survival_df[survival_df['PatientID'].isin(pivoted_df['PatientID'])]
print("Remaining patients: ", pivoted_df.shape[0])

Deleted patients from survival_df: 16 {'MDA-146', 'MDA-001', 'HGJ-074', 'HMR-030', 'HMR-024', 'MDA-006', 'CHUV-035', 'MDA-003', 'MDA-007', 'CHUV-033', 'HMR-034', 'HMR-029', 'HMR-005', 'HGJ-073', 'HMR-016', 'MDA-005'}
Deleted patients from pivoted_df: 34 {'CHUV-005', 'CHUP-071', 'CHUP-039', 'CHUP-019', 'CHUP-016', 'CHUP-070', 'CHUP-043', 'CHUV-003', 'CHUP-062', 'CHUP-069', 'CHUV-007', 'CHUP-050', 'CHUV-008', 'CHUP-012', 'CHUP-023', 'CHUV-011', 'CHUP-055', 'CHUP-004', 'CHUP-024', 'CHUP-015', 'CHUP-048', 'CHUP-033', 'CHUP-060', 'CHUP-018', 'CHUP-027', 'CHUP-049', 'CHUV-002', 'CHUV-010', 'CHUP-005', 'CHUP-007', 'CHUP-040', 'CHUP-025', 'CHUP-003', 'CHUP-041'}
Remaining patients:  472


In [28]:
# Prepare data for training
X = pivoted_df.drop(columns=['PatientID'])
X = X.fillna(X.mean())
y = np.array([(status, time) for status, time in zip(survival_df['Relapse'], survival_df['RFS'])],
                dtype=[('event', 'bool'), ('time', 'float')])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection
selector = SelectKBest(mutual_info_classif, k=10)  # Keep top 10 features
X_train_selected = selector.fit_transform(X_train, y_train['event'])
X_test_selected = selector.transform(X_test)

# Train the model
# model = RandomSurvivalForest(n_estimators=100, min_samples_split=10, min_samples_leaf=15, random_state=42)
model = RandomSurvivalForest(n_estimators=50, min_samples_split=20, min_samples_leaf=30, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
cindex_train = concordance_index_censored(y_train['event'], y_train['time'], model.predict(X_train))[0]
cindex_test = concordance_index_censored(y_test['event'], y_test['time'], model.predict(X_test))[0]

print(f'Concordance Index (Train): {cindex_train:.2f}')
print(f'Concordance Index (Test): {cindex_test:.2f}')

Concordance Index (Train): 0.80
Concordance Index (Test): 0.55


In [11]:
# Train another model only with clinical info and compare them statistically
pivoted_df = preprocess_data(features_df, prefixes=['original_glcm'])

# Prepare data for training
X2 = pivoted_df.drop(columns=['PatientID'])
X2 = X.fillna(X2.mean())
y = np.array([(status, time) for status, time in zip(survival_df['Relapse'], survival_df['RFS'])],
                dtype=[('event', 'bool'), ('time', 'float')])
X_train2, X_test2, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=42)

# Train the model
model2 = RandomSurvivalForest(n_estimators=100, min_samples_split=10, min_samples_leaf=15, random_state=42)
model2.fit(X_train2, y_train)

# Evaluate the model
cindex_train2 = concordance_index_censored(y_train['event'], y_train['time'], model2.predict(X_train2))[0]
cindex_test2 = concordance_index_censored(y_test['event'], y_test['time'], model2.predict(X_test2))[0]

print(f'Concordance Index (Train): {cindex_train2:.2f}')
print(f'Concordance Index (Test): {cindex_test2:.2f}')

Concordance Index (Train): 0.88
Concordance Index (Test): 0.50


In [None]:
# Statistical test between the 2 models
from sklearn.utils import resample

# Function to calculate C-index
def calculate_cindex(y_true, y_pred):
    return concordance_index_censored(y_true['event'], y_true['time'], y_pred)[0]

# Resample and compute differences
def bootstrap_cindex_difference(X_test1, X_test2, y_test, model1, model2, n_iterations=1000):
    differences = []
    for _ in range(n_iterations):
        X_resampled, y_resampled = resample(X_test1, y_test, random_state=_)
        X_resampled2, y_resampled = resample(X_test2, y_test, random_state=_)
        cindex1 = calculate_cindex(y_resampled, model1.predict(X_resampled))
        cindex2 = calculate_cindex(y_resampled, model2.predict(X_resampled))
        differences.append(cindex1 - cindex2)
    return np.array(differences)

# Compute the differences
differences = bootstrap_cindex_difference(X_test, X_test2, y_test, model, model2)
observed_diff = cindex_test - cindex_test2

# Compute p-value
p_value = np.mean(differences >= observed_diff)
print(f'p-value: {p_value:.3f}')