<a href="https://colab.research.google.com/github/vandrearczyk/hecktor-euvip2024/blob/main/baseline_classification_hecktor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split, permutation_test_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from statsmodels.stats.contingency_tables import mcnemar
from scipy.stats import ttest_rel

from google.colab import files

In [118]:
def load_features(folder_path, file_start=""):
    """
    Load all CSV files from a specified folder and concatenate them into a single DataFrame.

    Args:
    folder_path (str): Path to the folder containing CSV files.

    Returns:
    pd.DataFrame: Combined DataFrame from all CSV files.
    """
    dfs = []
    for filename in os.listdir(folder_path):
        if filename.startswith(file_start) and filename.endswith(".csv"):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            dfs.append(df)
    combined_df = pd.concat(dfs, ignore_index=True)
    return combined_df

def preprocess_data(df):
    """
    Preprocess the DataFrame by keeping the first three columns.
    Then pivot the table to combine 'Modality', 'ROI', and each feature.

    Args:
    df (pd.DataFrame): Combined DataFrame from multiple CSV files.

    Returns:
    pd.DataFrame: Pivoted DataFrame ready for model training.
    """
    # Keep the first three columns
    first_three_columns = df.iloc[:, :3]

    # Keep all columns
    filtered_df = df

    # Melt the filtered DataFrame
    feature_columns = [col for col in filtered_df.columns if col not in first_three_columns.columns]
    melted_df = filtered_df.melt(id_vars=['PatientID', 'Modality', 'ROI'], value_vars=feature_columns, var_name='Feature')

    # Create combined feature names
    melted_df['Combined'] = melted_df['ROI'] + '_' + melted_df['Modality'] + '_' + melted_df['Feature']

    # Pivot the DataFrame
    pivoted_df = melted_df.pivot_table(index='PatientID', columns='Combined', values='value')
    pivoted_df.reset_index(inplace=True)

    print("Number of features: ", pivoted_df.shape[1])

    return pivoted_df

# Could now be removed
def filter_patients(df1, df2):
    """
    Filter out patients not present in both DataFrames df1 and df2
    based on the 'PatientID' column.

    Args:
    df1 (pd.DataFrame): DataFrame containing patient features.
    df2 (pd.DataFrame): DataFrame containing patient survival data.

    Returns:
    tuple: A tuple containing the filtered pivoted_df and df2 DataFrames.
    """

    # Identify patients to be deleted from each DataFrame
    deleted_from_df2 = set(df2['PatientID']) - set(df1['PatientID'])
    deleted_from_df1 = set(df1['PatientID']) - set(df2['PatientID'])

    # Print details of deleted patients
    print("Deleted patients from df2:", len(deleted_from_df2), deleted_from_df2)
    print("Deleted patients from df1:", len(deleted_from_df1), deleted_from_df1)

    # Filter DataFrames to only include matching patients
    df1 = df1[df1['PatientID'].isin(df2['PatientID'])]
    df2 = df2[df2['PatientID'].isin(df1['PatientID'])]

    # Print the number of remaining patients
    print("Remaining patients:", df1.shape[0])

    return df1, df2

def feature_preprocessing(df, target_column='PatientID'):
    # Step 1: Identify categorical and numerical columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    # Remove the target column from the list of features
    if target_column in categorical_cols:
        categorical_cols.remove(target_column)
    if target_column in numerical_cols:
        numerical_cols.remove(target_column)

    # Step 2: Preprocessing for numerical data
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
        ('scaler', StandardScaler())  # Standardize features
    ])

    # Step 3: Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode
    ])

    # Step 4: Combine preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Step 5: Apply preprocessing to the data (excluding the target column)
    df_features = df.drop(columns=[target_column])
    df_preprocessed = preprocessor.fit_transform(df_features)

    # Convert the resulting array back to a DataFrame with appropriate column names
    df_preprocessed = pd.DataFrame(df_preprocessed, columns=preprocessor.get_feature_names_out())

    # Step 6: Concatenate the PatientID column with the processed features
    df_preprocessed_with_id = pd.concat([df[[target_column]].reset_index(drop=True), df_preprocessed], axis=1)

    return df_preprocessed_with_id

def drop_correlated_features(X_train, X_test, threshold=0.9):
    """
    Drops highly correlated features from the training and test sets.

    Parameters:
        X_train (pd.DataFrame): Training feature set.
        X_test (pd.DataFrame): Test feature set.
        threshold (float): Correlation threshold for dropping features. Default is 0.9.

    Returns:
        X_train (pd.DataFrame): Training set with correlated features removed.
        X_test (pd.DataFrame): Test set with correlated features removed.
    """
    # Compute the correlation matrix
    correlation_matrix = X_train.corr().abs()

    # Select the upper triangle of the correlation matrix
    upper_triangle = correlation_matrix.where(
        np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

    # Identify features with correlation greater than the threshold
    to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > threshold)]

    # Drop correlated features from the training and test sets
    X_train = X_train.drop(columns=to_drop)
    X_test = X_test.drop(columns=to_drop)

    print("Number of features after dropping correlated features:", X_train.shape[1])

    return X_train, X_test

def extract_components(df):
    # Clean column names to remove spaces
    df.columns = df.columns.str.replace(' ', '', regex=False)

    # Initialize sets to store unique components
    rois = set()
    modalities = set()
    originals = set()
    families = set()
    features = set()

    # Iterate over the columns and extract components
    for col in df.columns:
        # Skip 'PatientID' and 'Split' columns
        if col in ['PatientID', 'Split']:
            continue

        # Split the column name into components
        col_parts = col.split('_')

        # Add each component to its corresponding set
        rois.add(col_parts[0])
        modalities.add(col_parts[1])
        originals.add(col_parts[2])
        families.add(col_parts[3])
        features.add(col_parts[4])

    # Convert sets to sorted lists
    return {
        'rois': sorted(rois),
        'modalities': sorted(modalities),
        'originals': sorted(originals),
        'families': sorted(families),
        'features': sorted(features)
    }

def select_feature(df, feature_dict=None):
    # Remove all spaces from column names
    df.columns = df.columns.str.replace(' ', '', regex=False)

    # Define the prefix for columns that should be kept (PatientID and Split)
    keep_columns = ['PatientID']

    # If feature_dict is not provided, set it to an empty dictionary
    if feature_dict is None:
        feature_dict = {}

    # Extract filters from the dictionary
    rois = feature_dict.get('rois', None)
    modalities = feature_dict.get('modalities', None)
    originals = feature_dict.get('originals', None)
    families = feature_dict.get('families', None)
    features = feature_dict.get('features', None)

    # Iterate over all columns in the dataframe
    for col in df.columns:
        # Skip the PatientID and Split columns
        if col in keep_columns:
            continue

        # Split the column name into parts
        col_parts = col.split('_')

        # Unpack the parts based on the known pattern
        roi, modality, orig, family, feature = col_parts[0], col_parts[1], col_parts[2], col_parts[3], col_parts[4]

        # Check if this column should be included based on the filters provided
        if ((rois is None or roi in rois) and
            (modalities is None or modality in modalities) and
            (originals is None or orig in originals) and
            (families is None or family in families) and
            (features is None or feature in features)):
            keep_columns.append(col)

    # Return the dataframe with only the selected columns
    return df[keep_columns]

def bootstrap_analysis(X, y, model, n_bootstrap=100, n_resamples=50):
    boot_roc_aucs = []

    # First bootstrap to get 100 AUCs
    for _ in range(n_bootstrap):
        # Resample with replacement
        X_resample, y_resample = resample(X, y, random_state=_)

        # Make predictions on the resampled data
        y_pred_resample = model.predict(X_resample)

        # Calculate AUC
        roc_auc = roc_auc_score(y_resample, y_pred_resample)
        boot_roc_aucs.append(roc_auc)

    # Now, resample these AUCs 50 times and calculate mean AUCs
    resampled_means = []
    for _ in range(n_resamples):
        resample_aucs = resample(boot_roc_aucs, random_state=_)
        resampled_means.append(np.mean(resample_aucs))

    # Calculate the mean of the resampled means and 95% confidence intervals
    mean_of_means = np.mean(resampled_means)
    ci_lower = np.percentile(resampled_means, 2.5)
    ci_upper = np.percentile(resampled_means, 97.5)

    return mean_of_means, ci_lower, ci_upper

def evaluate_model(model, X_train, y_train, X_test, y_test, run_bootstrap=False):
    # Evaluate the model
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    roc_auc_train = roc_auc_score(y_train, y_train_pred)
    roc_auc_test = roc_auc_score(y_test, y_test_pred)

    print(f'Accuracy (Train): {train_accuracy:.2f}')
    print(f'Accuracy (Test): {test_accuracy:.2f}')
    print(f'ROC AUC (Train): {roc_auc_train:.2f}')
    print(f'ROC AUC (Test): {roc_auc_test:.2f}')

    # Perform bootstrap analysis if required
    if run_bootstrap:
        mean_of_means, ci_lower, ci_upper = bootstrap_analysis(X_test, y_test, model)

        # Output bootstrap results
        print(f"Bootstrap ROC AUC (Test): {mean_of_means:.3f} (95% CI: {ci_lower:.3f} - {ci_upper:.3f})")

    # Detailed classification report
    print("Classification Report (Test):\n", classification_report(y_test, y_test_pred))

def compare_models_statistically(model1, model2, X_test1, X_test2, y_test):
    # Get predictions
    y_pred1 = model1.predict(X_test1)
    y_pred2 = model2.predict(X_test2)

    # Calculate accuracy for both models
    accuracy1 = accuracy_score(y_test, y_pred1)
    accuracy2 = accuracy_score(y_test, y_pred2)

    # Paired t-test for comparing model performance (accuracy)
    t_stat, p_value_ttest = ttest_rel(y_pred1, y_pred2)

    # McNemar's test for model comparison on the same dataset
    contingency_table = confusion_matrix(y_pred1 == y_test, y_pred2 == y_test)
    mcnemar_result = mcnemar(contingency_table, exact=False)

    # Print the results
    print(f"Accuracy of model1: {accuracy1:.4f}")
    print(f"Accuracy of model2: {accuracy2:.4f}")
    print("\nPaired t-test results:")
    print(f"t-statistic: {t_stat:.4f}")
    print(f"p-value: {p_value_ttest:.4f}")

    print("\nMcNemar's test results:")
    print(f"statistic: {mcnemar_result.statistic:.4f}")
    print(f"p-value: {mcnemar_result.pvalue:.4f}")

    # Interpretation
    if p_value_ttest < 0.05:
        print("\nPaired t-test: Significant difference between the two models.")
    else:
        print("\nPaired t-test: No significant difference between the two models.")

    if mcnemar_result.pvalue < 0.05:
        print("McNemar's test: Significant difference between the two models.")
    else:
        print("McNemar's test: No significant difference between the two models.")


In [3]:
# Upload features
if any(fn.startswith('features_album') for fn in os.listdir('.')):
  print('Features already uploaded')
else:
  uploaded = files.upload()

Saving features_album_HECKTOR_EUVIP2024_HPV_PT- GTVp.csv to features_album_HECKTOR_EUVIP2024_HPV_PT- GTVp.csv
Saving features_album_HECKTOR_EUVIP2024_HPV_PT- GTVn.csv to features_album_HECKTOR_EUVIP2024_HPV_PT- GTVn.csv
Saving features_album_HECKTOR_EUVIP2024_HPV_CT- GTVp.csv to features_album_HECKTOR_EUVIP2024_HPV_CT- GTVp.csv
Saving features_album_HECKTOR_EUVIP2024_HPV_CT- GTVn.csv to features_album_HECKTOR_EUVIP2024_HPV_CT- GTVn.csv


In [4]:
# Upload hpv outcome
if os.path.exists('hecktor2022_HPV_outcomesBalanced.csv'):
  print('Outcome data already uploaded')
else:
  uploaded = files.upload()

Saving hecktor2022_HPV_outcomesBalanced.csv to hecktor2022_HPV_outcomesBalanced.csv


In [8]:
# Upload patient split
if any(fn.startswith('patient_split') for fn in os.listdir('.')):
  print('Patient split already uploaded')
else:
  uploaded = files.upload()

Saving patient_split.csv to patient_split.csv


In [6]:
# Upload clinical info
if any(fn.startswith('hecktor2022_clinicalFeatures') for fn in os.listdir('.')):
  print('Clinical info already uploaded')
else:
  uploaded = files.upload()

Saving hecktor2022_clinicalFeatures.csv to hecktor2022_clinicalFeatures.csv


In [119]:
# Load the data (features, outcomes and train/test split)
features_df = load_features(folder_path='./', file_start="features_album")
outcome_df = pd.read_csv('hecktor2022_HPV_outcomesBalanced.csv')
split_df = pd.read_csv('patient_split.csv')
clinical_df = pd.read_csv('hecktor2022_clinicalFeatures.csv')
print(features_df.shape,outcome_df.shape,split_df.shape, clinical_df.shape)

(408, 115) (102, 2) (102, 2) (524, 10)


In [120]:
# Preprocess the data
features_df = preprocess_data(features_df)
# Filter out patients if not present in features or outcome data
features_df, outcome_df = filter_patients(features_df, outcome_df)
clinical_df, outcome_df = filter_patients(clinical_df, outcome_df)
# Ensure indices are aligned
features_df = features_df.set_index('PatientID').reindex(split_df['PatientID']).reset_index()
outcome_df = outcome_df.set_index('PatientID').reindex(split_df['PatientID']).reset_index()
clinical_df = clinical_df.set_index('PatientID').reindex(split_df['PatientID']).reset_index()

# Print all available features (rois, modalities, families and feature names)
print("available features:", extract_components(features_df))

Number of features:  359
Deleted patients from df2: 0 set()
Deleted patients from df1: 0 set()
Remaining patients: 102
Deleted patients from df2: 0 set()
Deleted patients from df1: 422 {'HGJ-028', 'HGJ-080', 'MDA-085', 'CHUV-040', 'MDA-178', 'HGJ-088', 'MDA-044', 'MDA-182', 'CHUM-018', 'MDA-140', 'CHUV-039', 'HGJ-035', 'MDA-004', 'CHUM-016', 'CHUP-000', 'CHUM-047', 'MDA-142', 'MDA-043', 'MDA-115', 'CHUM-027', 'CHUM-063', 'CHUV-009', 'CHUM-017', 'CHUV-015', 'MDA-056', 'CHUS-098', 'CHUP-027', 'CHUM-032', 'MDA-037', 'MDA-046', 'CHUV-025', 'MDA-001', 'CHUP-054', 'CHUV-042', 'CHUV-029', 'MDA-196', 'CHUS-019', 'CHUV-041', 'MDA-042', 'CHUM-011', 'CHUS-078', 'CHUS-090', 'MDA-014', 'CHUP-065', 'MDA-010', 'CHUV-021', 'MDA-180', 'MDA-027', 'CHUS-038', 'MDA-134', 'HMR-001', 'MDA-146', 'CHUM-002', 'MDA-109', 'CHUM-054', 'CHUP-060', 'MDA-030', 'CHUS-005', 'MDA-187', 'HMR-005', 'CHUS-060', 'CHUS-039', 'MDA-036', 'CHUS-010', 'CHUV-002', 'HGJ-008', 'MDA-195', 'CHUS-007', 'CHUP-063', 'MDA-177', 'HGJ-046

In [121]:
# Select specific set of features
features_dict = {
    'rois':['GTVp','GTVn'],
    'modalities':['CT','PT'],
    'originals':None,
    'families':None,
    'features':None
}
features_df1 = select_feature(features_df, features_dict)
print("Shape of radiomics data:",features_df1.shape)

# Select clinical infos
clinical_features = ['Age', 'Gender'] # 'Age', 'Gender', ...
clinical_features.insert(0,'PatientID')
clinical_df1 = clinical_df[clinical_features]

# Merge with clinical features
features_df1 = pd.merge(features_df1, clinical_df1, on='PatientID', how='inner')
print("Shape of radiomics+clinical data:", features_df1.shape)

Shape of radiomics data: (102, 359)
Shape of radiomics+clinical data: (102, 361)


In [123]:
# Pre-process features (one-hot encoding, scaling, imputation missing values)
features_df1 = feature_preprocessing(features_df1, target_column='PatientID')

# Prepare data for training
X = features_df1.drop(columns=['PatientID'])
print("Number of features:", X.shape[1])
y = outcome_df['Outcome']

# Split the dataset into training and testing sets based on the 'Split' column
X_train = X[split_df['Split'] == 'train']
X_test = X[split_df['Split'] == 'test']
y_train = y[split_df['Split'] == 'train']
y_test = y[split_df['Split'] == 'test']

# Remove correlated features
X_train, X_test = drop_correlated_features(X_train, X_test)

# Feature selection
# selector = SelectKBest(mutual_info_classif, k=20)
# X_train = selector.fit_transform(X_train, y_train)
# X_test = selector.transform(X_test)
# print("Number of features after selection:", X_test.shape[1])

# Define the model
model = RandomForestClassifier(n_estimators=200, min_samples_split=5, min_samples_leaf=10, max_depth=None, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model (recall of the positive class is also known as “sensitivity”; recall of the negative class is “specificity”.)
evaluate_model(model, X_train, y_train, X_test, y_test, run_bootstrap=True)

Number of features: 361
Accuracy (Train): 0.88
Accuracy (Test): 0.85
ROC AUC (Train): 0.88
ROC AUC (Test): 0.88
Bootstrap ROC AUC (Test): 0.877 (95% CI: 0.865 - 0.890)
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.73      1.00      0.84         8
           1       1.00      0.75      0.86        12

    accuracy                           0.85        20
   macro avg       0.86      0.88      0.85        20
weighted avg       0.89      0.85      0.85        20



Second model for comparison

In [105]:
# Select specific set of features
features_dict = {
    'rois':[],
    'modalities':None,
    'originals':None,
    'families':None,
    'features':None
}
features_df2 = select_feature(features_df, features_dict)
print("Shape of radiomics data:",features_df2.shape)

# Select clinical infos
clinical_features = ['CenterID'] # 'Age', 'Gender', ...
clinical_features.insert(0,'PatientID')
clinical_df2 = clinical_df[clinical_features]

# Merge with clinical features
features_df2 = pd.merge(features_df2, clinical_df2, on='PatientID', how='inner')
print("Shape of radiomics+clinical data:", features_df2.shape)

(102, 1)
(102, 2)


In [116]:
# Pre-process features (one-hot encoding, scaling, imputation missing values)
features_df2 = feature_preprocessing(features_df2, target_column='PatientID')

# Prepare data for training
X = features_df2.drop(columns=['PatientID'])
print("Number of features:", X.shape[1])
y = outcome_df['Outcome']

# Split the dataset into training and testing sets based on the 'Split' column
X_train = X[split_df['Split'] == 'train']
X_test2 = X[split_df['Split'] == 'test']
y_train = y[split_df['Split'] == 'train']
y_test = y[split_df['Split'] == 'test']

# Remove correlated features
# X_train, X_test2 = drop_correlated_features(X_train, X_test2)

# Feature selection
# selector = SelectKBest(mutual_info_classif, k=20)
# X_train = selector.fit_transform(X_train, y_train)
# X_test = selector.transform(X_test)
# print("Number of features after selection:", X_test.shape[1])

# Define the model
model2 = RandomForestClassifier(n_estimators=200, min_samples_split=5, min_samples_leaf=10, max_depth=None, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Evaluate the model
evaluate_model(model, X_train, y_train, X_test2, y_test, run_bootstrap=True)

Number of features: 1
Number of features after dropping correlated features: 1
Accuracy (Train): 0.76
Accuracy (Test): 0.70
ROC AUC (Train): 0.76
ROC AUC (Test): 0.73
Bootstrap ROC AUC (Test): 0.725 (95% CI: 0.710 - 0.740)
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.58      0.88      0.70         8
           1       0.88      0.58      0.70        12

    accuracy                           0.70        20
   macro avg       0.73      0.73      0.70        20
weighted avg       0.76      0.70      0.70        20



In [None]:
# To do:
# Add specificity
# Add models: LogisticRegression, SVC
# permutation test permutation_test_score ?
# Check center-only perf ?
# Remove correlated features (find on train)
# How to share the files and colab

(array([1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1.,
        1., 0., 1.]),
 array([1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
        0., 0., 0.]))