<a href="https://colab.research.google.com/github/vandrearczyk/hecktor-euvip2024/blob/main/baseline_classification_hecktor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split, permutation_test_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from statsmodels.stats.contingency_tables import mcnemar
from scipy.stats import ttest_rel

from google.colab import files

In [47]:
def load_features(folder_path, file_start=""):
    """
    Load all CSV files from a specified folder, filter by file_start if provided,
    and concatenate them into a single DataFrame.

    Args:
        folder_path (str): Path to the folder containing CSV files.
        file_start (str): Optional starting string for filenames.

    Returns:
        pd.DataFrame: Combined DataFrame from all CSV files.
    """
    dfs = [pd.read_csv(os.path.join(folder_path, filename))
           for filename in os.listdir(folder_path)
           if filename.startswith(file_start) and filename.endswith(".csv")]
    return pd.concat(dfs, ignore_index=True)

def preprocess_data(df):
    """
    Preprocess the DataFrame by retaining the first three columns and pivoting to
    combine 'Modality', 'ROI', and each feature.

    Args:
        df (pd.DataFrame): Input DataFrame.

    Returns:
        pd.DataFrame: Pivoted DataFrame ready for model training.
    """
    first_three_cols = df.iloc[:, :3]
    feature_columns = df.columns.difference(first_three_cols.columns)

    melted_df = df.melt(id_vars=['PatientID', 'Modality', 'ROI'],
                        value_vars=feature_columns, var_name='Feature')
    melted_df['Combined'] = melted_df['ROI'] + '_' + melted_df['Modality'] + '_' + melted_df['Feature']

    pivoted_df = melted_df.pivot_table(index='PatientID', columns='Combined', values='value')
    pivoted_df.reset_index(inplace=True)

    print("Number of features: ", pivoted_df.shape[1])
    return pivoted_df

def filter_patients(df1, df2):
    """
    Filter out patients not present in both DataFrames based on 'PatientID'.

    Args:
        df1 (pd.DataFrame): DataFrame containing patient features.
        df2 (pd.DataFrame): DataFrame containing patient survival data.

    Returns:
        tuple: Filtered DataFrames.
    """
    df1_patients = set(df1['PatientID'])
    df2_patients = set(df2['PatientID'])

    df1 = df1[df1['PatientID'].isin(df2_patients)]
    df2 = df2[df2['PatientID'].isin(df1_patients)]

    print(f"Remaining patients: {df1.shape[0]}")
    return df1, df2

def feature_preprocessing(df, target_column='PatientID'):
    """
    Preprocess features by separating numerical and categorical columns, then applying
    respective transformations (imputation, scaling, and encoding).

    Args:
        df (pd.DataFrame): Input DataFrame.
        target_column (str): Name of the target column.

    Returns:
        pd.DataFrame: Preprocessed feature DataFrame.
    """
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    if target_column in categorical_cols: categorical_cols.remove(target_column)
    if target_column in numerical_cols: numerical_cols.remove(target_column)

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

    df_features = df.drop(columns=[target_column])
    df_preprocessed = preprocessor.fit_transform(df_features)

    return pd.concat([df[[target_column]].reset_index(drop=True),
                      pd.DataFrame(df_preprocessed, columns=preprocessor.get_feature_names_out())], axis=1)

def drop_correlated_features(X_train, X_test, threshold=0.9):
    """
    Remove highly correlated features from training and test sets.

    Args:
        X_train (pd.DataFrame): Training set.
        X_test (pd.DataFrame): Test set.
        threshold (float): Correlation threshold.

    Returns:
        tuple: Updated training and test sets.
    """
    correlation_matrix = X_train.corr().abs()
    upper_triangle = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
    to_drop = [col for col in upper_triangle.columns if any(upper_triangle[col] > threshold)]

    X_train = X_train.drop(columns=to_drop)
    X_test = X_test.drop(columns=to_drop)

    print(f"Number of features after dropping correlated features: {X_train.shape[1]}")
    return X_train, X_test

def extract_components(df):
    """
    Extract and categorize components (which define the available features in a DataFrame) from column names.

    Args:
        df (pd.DataFrame): Input DataFrame.

    Returns:
        dict: Dictionary of categorized components.
    """
    df.columns = df.columns.str.replace(' ', '', regex=False)
    components = {'rois': set(), 'modalities': set(), 'originals': set(), 'families': set(), 'features': set()}

    for col in df.columns.difference(['PatientID', 'Split']):
        roi, modality, original, family, feature = col.split('_')
        components['rois'].add(roi)
        components['modalities'].add(modality)
        components['originals'].add(original)
        components['families'].add(family)
        components['features'].add(feature)

    return {key: sorted(value) for key, value in components.items()}

def select_feature(df, feature_dict=None):
    """
    Filter columns of the DataFrame based on specified components.

    Args:
        df (pd.DataFrame): Input DataFrame.
        feature_dict (dict): Dictionary of components for filtering.

    Returns:
        pd.DataFrame: Filtered DataFrame.
    """
    df.columns = df.columns.str.replace(' ', '', regex=False)
    keep_columns = ['PatientID']
    feature_dict = feature_dict or {}

    for col in df.columns.difference(['PatientID']):
        roi, modality, orig, family, feature = col.split('_')
        if (feature_dict.get('rois') is None or roi in feature_dict['rois']) and \
           (feature_dict.get('modalities') is None or modality in feature_dict['modalities']) and \
           (feature_dict.get('originals') is None or orig in feature_dict['originals']) and \
           (feature_dict.get('families') is None or family in feature_dict['families']) and \
           (feature_dict.get('features') is None or feature in feature_dict['features']):
            keep_columns.append(col)

    return df[keep_columns]

def bootstrap_analysis(X, y, model, n_bootstrap=100, n_resamples=50):
    """
    Perform bootstrap analysis to compute ROC AUC with confidence intervals.

    Args:
        X (pd.DataFrame): Feature set.
        y (pd.Series): Target variable.
        model: Model to evaluate.
        n_bootstrap (int): Number of bootstrap iterations.
        n_resamples (int): Number of resamples for AUC means.

    Returns:
        tuple: Mean AUC, lower and upper confidence intervals.
    """
    boot_roc_aucs = [roc_auc_score(y_resample, model.predict(X_resample))
                     for _ in range(n_bootstrap)
                     for X_resample, y_resample in [resample(X, y, random_state=_)]]

    resampled_means = [np.mean(resample(boot_roc_aucs, random_state=_)) for _ in range(n_resamples)]

    return np.mean(resampled_means), np.percentile(resampled_means, 2.5), np.percentile(resampled_means, 97.5)

def evaluate_model(model, X_train, y_train, X_test, y_test, run_bootstrap=False):
    """
    Evaluate model performance with accuracy and ROC AUC scores.

    Args:
        model: Model to evaluate.
        X_train, X_test (pd.DataFrame): Training and test feature sets.
        y_train, y_test (pd.DataFrame): Training and test target sets.
        run_bootstrap (bool): Whether to run bootstrap analysis.

    Returns:
        None
    """
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    print(f'Accuracy (Train): {accuracy_score(y_train, y_train_pred):.2f}')
    print(f'Accuracy (Test): {accuracy_score(y_test, y_test_pred):.2f}')
    print(f'ROC AUC (Train): {roc_auc_score(y_train, y_train_pred):.2f}')
    print(f'ROC AUC (Test): {roc_auc_score(y_test, y_test_pred):.2f}')

    if run_bootstrap:
        mean_of_means, ci_lower, ci_upper = bootstrap_analysis(X_test, y_test, model)

        # Output bootstrap results
        print(f"Bootstrap ROC AUC (Test): {mean_of_means:.3f} (95% CI: {ci_lower:.3f} - {ci_upper:.3f})")

    # Detailed classification report
    print("Classification Report (Test):\n", classification_report(y_test, y_test_pred))



In [None]:
# Upload features
if any(fn.startswith('features_album') for fn in os.listdir('.')):
  print('Features already uploaded')
else:
  uploaded = files.upload()

Saving features_album_HECKTOR_EUVIP_2024_PT- GTVp.csv to features_album_HECKTOR_EUVIP_2024_PT- GTVp.csv
Saving features_album_HECKTOR_EUVIP_2024_CT- GTVp.csv to features_album_HECKTOR_EUVIP_2024_CT- GTVp.csv
Saving features_album_HECKTOR_EUVIP_2024_PT- GTVn.csv to features_album_HECKTOR_EUVIP_2024_PT- GTVn.csv
Saving features_album_HECKTOR_EUVIP_2024_CT- GTVn.csv to features_album_HECKTOR_EUVIP_2024_CT- GTVn.csv


In [None]:
# Upload hpv outcome
if os.path.exists('hecktor2022_HPV_outcomesBalanced.csv'):
  print('Outcome data already uploaded')
else:
  uploaded = files.upload()

Saving hecktor2022_HPV_outcomesBalanced.csv to hecktor2022_HPV_outcomesBalanced.csv


In [None]:
# Upload patient split
if any(fn.startswith('patient_split') for fn in os.listdir('.')):
  print('Patient split already uploaded')
else:
  uploaded = files.upload()

Saving patient_split.csv to patient_split.csv


In [None]:
# Upload clinical info https://drive.switch.ch/index.php/s/tnnb1y1FSXhdFNV
if any(fn.startswith('hecktor2022_clinicalFeatures') for fn in os.listdir('.')):
  print('Clinical info already uploaded')
else:
  uploaded = files.upload()

Saving hecktor2022_clinicalFeatures.csv to hecktor2022_clinicalFeatures.csv


In [None]:
# Load the data (features, outcomes and train/test split)
features_df = load_features(folder_path='./', file_start="features_album")
outcome_df = pd.read_csv('hecktor2022_HPV_outcomesBalanced.csv')
split_df = pd.read_csv('patient_split.csv')
clinical_df = pd.read_csv('hecktor2022_clinicalFeatures.csv')
print(features_df.shape,outcome_df.shape,split_df.shape, clinical_df.shape)

(1904, 115) (102, 2) (102, 2) (524, 10)


In [None]:
# Preprocess the data
features_df = preprocess_data(features_df)
# Filter out patients if not present in features or outcome data
features_df, outcome_df = filter_patients(features_df, outcome_df)
clinical_df, outcome_df = filter_patients(clinical_df, outcome_df)
# Ensure indices are aligned
features_df = features_df.set_index('PatientID').reindex(split_df['PatientID']).reset_index()
outcome_df = outcome_df.set_index('PatientID').reindex(split_df['PatientID']).reset_index()
clinical_df = clinical_df.set_index('PatientID').reindex(split_df['PatientID']).reset_index()

# Print all available features (rois, modalities, families and feature names)
print("available features:", extract_components(features_df))

Number of features:  221
Remaining patients: 102
Remaining patients: 102
available features: {'rois': ['GTVn', 'GTVp'], 'modalities': ['CT', 'PT'], 'originals': ['original'], 'families': ['SUV', 'glcm', 'glrlm', 'glszm', 'intensity', 'shape'], 'features': ['10Percentile', '90Percentile', 'Autocorrelation', 'ClusterProminence', 'ClusterShade', 'ClusterTendency', 'Contrast', 'Correlation', 'DifferenceAverage', 'DifferenceEntropy', 'DifferenceVariance', 'Elongation', 'Energy', 'Entropy', 'Flatness', 'GrayLevelNonUniformity', 'GrayLevelNonUniformityNormalized', 'GrayLevelVariance', 'HighGrayLevelRunEmphasis', 'HighGrayLevelZoneEmphasis', 'Id', 'Idm', 'Idmn', 'Idn', 'Imc1', 'Imc2', 'InterquartileRange', 'InverseVariance', 'JointAverage', 'JointEnergy', 'JointEntropy', 'Kurtosis', 'LargeAreaEmphasis', 'LargeAreaHighGrayLevelEmphasis', 'LargeAreaLowGrayLevelEmphasis', 'LeastAxisLength', 'LongRunEmphasis', 'LongRunHighGrayLevelEmphasis', 'LongRunLowGrayLevelEmphasis', 'LowGrayLevelRunEmphasis'

In [None]:
# Select specific set of features
features_dict = {
    'rois':['GTVp','GTVn'],
    'modalities':['CT','PT'],
    'originals':None,
    'families':None,
    'features':None
}
features_df1 = select_feature(features_df, features_dict)
# features_df1 = select_feature(features_df, None)
print("Shape of radiomics data:",features_df1.shape)

# Select clinical infos
clinical_features = ['CenterID', 'Gender', 'Age'] #'CenterID', 'Gender', 'Age', 'Weight', 'Tobacco', 'Alcohol', 'Performance status', 'Surgery', 'Chemotherapy'
clinical_features.insert(0,'PatientID')
clinical_df1 = clinical_df[clinical_features]

# Merge with clinical features
features_df1 = pd.merge(features_df1, clinical_df1, on='PatientID', how='inner')
print("Shape of radiomics+clinical data:", features_df1.shape)

Shape of radiomics data: (102, 221)
Shape of radiomics+clinical data: (102, 230)


In [None]:
# Pre-process features (one-hot encoding, scaling, imputation missing values)
features_df1 = feature_preprocessing(features_df1, target_column='PatientID')

# Prepare data for training
X = features_df1.drop(columns=['PatientID'])
print("Number of features:", X.shape[1])
y = outcome_df['Outcome']

# Split the dataset into training and testing sets based on the 'Split' column
X_train = X[split_df['Split'] == 'train']
X_test = X[split_df['Split'] == 'test']
y_train = y[split_df['Split'] == 'train']
y_test = y[split_df['Split'] == 'test']

# Remove correlated features
X_train, X_test = drop_correlated_features(X_train, X_test)

# Feature selection
# selector = SelectKBest(mutual_info_classif, k=20)
# X_train = selector.fit_transform(X_train, y_train)
# X_test = selector.transform(X_test)
# print("Number of features after selection:", X_test.shape[1])

# Define the model
model = RandomForestClassifier(n_estimators=200, min_samples_split=5, min_samples_leaf=10, max_depth=None, random_state=42)
# model = LogisticRegression(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model (recall of the positive class is also known as “sensitivity”; recall of the negative class is “specificity”.)
evaluate_model(model, X_train, y_train, X_test, y_test, run_bootstrap=True)

Number of features: 230
Number of features after dropping correlated features: 78
Accuracy (Train): 0.89
Accuracy (Test): 0.85
ROC AUC (Train): 0.89
ROC AUC (Test): 0.88
Bootstrap ROC AUC (Test): 0.870 (95% CI: 0.861 - 0.886)
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.73      1.00      0.84         8
           1       1.00      0.75      0.86        12

    accuracy                           0.85        20
   macro avg       0.86      0.88      0.85        20
weighted avg       0.89      0.85      0.85        20



Second model for comparison

In [None]:
# Select specific set of features
features_dict = {
    'rois':[],
    'modalities':None,
    'originals':None,
    'families':None,
    'features':None
}
features_df2 = select_feature(features_df, features_dict)
print("Shape of radiomics data:",features_df2.shape)

# Select clinical infos
clinical_features = ['CenterID'] # 'Age', 'Gender', ...
clinical_features.insert(0,'PatientID')
clinical_df2 = clinical_df[clinical_features]

# Merge with clinical features
features_df2 = pd.merge(features_df2, clinical_df2, on='PatientID', how='inner')
print("Shape of radiomics+clinical data:", features_df2.shape)

Shape of radiomics data: (102, 1)
Shape of radiomics+clinical data: (102, 2)


In [None]:
# Pre-process features (one-hot encoding, scaling, imputation missing values)
features_df2 = feature_preprocessing(features_df2, target_column='PatientID')

# Prepare data for training
X = features_df2.drop(columns=['PatientID'])
print("Number of features:", X.shape[1])
y = outcome_df['Outcome']

# Split the dataset into training and testing sets based on the 'Split' column
X_train = X[split_df['Split'] == 'train']
X_test2 = X[split_df['Split'] == 'test']
y_train = y[split_df['Split'] == 'train']
y_test = y[split_df['Split'] == 'test']

# Remove correlated features
# X_train, X_test2 = drop_correlated_features(X_train, X_test2)

# Feature selection
# selector = SelectKBest(mutual_info_classif, k=20)
# X_train = selector.fit_transform(X_train, y_train)
# X_test = selector.transform(X_test)
# print("Number of features after selection:", X_test.shape[1])

# Define the model
model2 = RandomForestClassifier(n_estimators=200, min_samples_split=5, min_samples_leaf=10, max_depth=None, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
evaluate_model(model, X_train, y_train, X_test2, y_test, run_bootstrap=True)

Number of features: 1
Accuracy (Train): 0.76
Accuracy (Test): 0.70
ROC AUC (Train): 0.76
ROC AUC (Test): 0.73
Bootstrap ROC AUC (Test): 0.725 (95% CI: 0.710 - 0.740)
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.58      0.88      0.70         8
           1       0.88      0.58      0.70        12

    accuracy                           0.70        20
   macro avg       0.73      0.73      0.70        20
weighted avg       0.76      0.70      0.70        20

