<a href="https://colab.research.google.com/github/vandrearczyk/hecktor-euvip2024/blob/main/baseline_classification_hecktor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.utils import resample
from google.colab import files

In [19]:
def load_features(folder_path, file_start=""):
    """
    Load all CSV files from a specified folder and concatenate them into a single DataFrame.

    Args:
    folder_path (str): Path to the folder containing CSV files.

    Returns:
    pd.DataFrame: Combined DataFrame from all CSV files.
    """
    dfs = []
    for filename in os.listdir(folder_path):
        if filename.startswith(file_start) and filename.endswith(".csv"):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            dfs.append(df)
    combined_df = pd.concat(dfs, ignore_index=True)
    return combined_df

# def preprocess_data(df, prefixes=None):
#     """
#     Preprocess the DataFrame by keeping the first three columns and those starting with specified prefixes.
#     Then pivot the table to combine 'Modality', 'ROI', and each feature.

#     Args:
#     df (pd.DataFrame): Combined DataFrame from multiple CSV files.
#     prefixes (list of str or None): List of prefixes to keep in the DataFrame columns.
#                                     If None, all columns are retained.

#     Returns:
#     pd.DataFrame: Pivoted DataFrame ready for model training.
#     """
#     # Keep the first three columns
#     first_three_columns = df.iloc[:, :3]

#     # If prefixes is None, keep all columns, otherwise filter columns by the specified prefixes
#     if prefixes is None:
#         filtered_df = df
#     else:
#         filtered_columns = [col for col in df.columns if any(col.startswith(prefix) for prefix in prefixes)]
#         filtered_df = pd.concat([first_three_columns, df[filtered_columns]], axis=1)

#     # Melt the filtered DataFrame
#     feature_columns = [col for col in filtered_df.columns if col not in first_three_columns.columns]
#     melted_df = filtered_df.melt(id_vars=['PatientID', 'Modality', 'ROI'], value_vars=feature_columns, var_name='Feature')

#     # Create combined feature names
#     melted_df['Combined'] = melted_df['ROI'] + '_' + melted_df['Modality'] + '_' + melted_df['Feature']

#     # Pivot the DataFrame
#     pivoted_df = melted_df.pivot_table(index='PatientID', columns='Combined', values='value')
#     pivoted_df.reset_index(inplace=True)

#     print("Number of features: ", pivoted_df.shape[1])

#     return pivoted_df

def preprocess_data(df):
    """
    Preprocess the DataFrame by keeping the first three columns.
    Then pivot the table to combine 'Modality', 'ROI', and each feature.

    Args:
    df (pd.DataFrame): Combined DataFrame from multiple CSV files.

    Returns:
    pd.DataFrame: Pivoted DataFrame ready for model training.
    """
    # Keep the first three columns
    first_three_columns = df.iloc[:, :3]

    # Keep all columns
    filtered_df = df

    # Melt the filtered DataFrame
    feature_columns = [col for col in filtered_df.columns if col not in first_three_columns.columns]
    melted_df = filtered_df.melt(id_vars=['PatientID', 'Modality', 'ROI'], value_vars=feature_columns, var_name='Feature')

    # Create combined feature names
    melted_df['Combined'] = melted_df['ROI'] + '_' + melted_df['Modality'] + '_' + melted_df['Feature']

    # Pivot the DataFrame
    pivoted_df = melted_df.pivot_table(index='PatientID', columns='Combined', values='value')
    pivoted_df.reset_index(inplace=True)

    print("Number of features: ", pivoted_df.shape[1])

    return pivoted_df

def filter_patients(df1, df2):
    """
    Filter out patients not present in both DataFrames df1 and df2
    based on the 'PatientID' column.

    Args:
    df1 (pd.DataFrame): DataFrame containing patient features.
    df2 (pd.DataFrame): DataFrame containing patient survival data.

    Returns:
    tuple: A tuple containing the filtered pivoted_df and df2 DataFrames.
    """

    # Identify patients to be deleted from each DataFrame
    deleted_from_df2 = set(df2['PatientID']) - set(df1['PatientID'])
    deleted_from_df1 = set(df1['PatientID']) - set(df2['PatientID'])

    # Print details of deleted patients
    print("Deleted patients from df2:", len(deleted_from_df2), deleted_from_df2)
    print("Deleted patients from df1:", len(deleted_from_df1), deleted_from_df1)

    # Filter DataFrames to only include matching patients
    df1 = df1[df1['PatientID'].isin(df2['PatientID'])]
    df2 = df2[df2['PatientID'].isin(df1['PatientID'])]

    # Print the number of remaining patients
    print("Remaining patients:", df1.shape[0])

    return df1, df2

def bootstrap_analysis(X, y, model, n_bootstrap=1000):
    boot_accuracies = []
    boot_roc_aucs = []

    for _ in range(n_bootstrap):
        # Resample with replacement
        X_resample, y_resample = resample(X, y, random_state=42)

        # Make predictions on the resampled data
        y_pred_resample = model.predict(X_resample)

        # Calculate metrics
        accuracy = accuracy_score(y_resample, y_pred_resample)
        roc_auc = roc_auc_score(y_resample, y_pred_resample)

        boot_accuracies.append(accuracy)
        boot_roc_aucs.append(roc_auc)

    # Calculate the mean and 95% confidence intervals
    accuracy_mean = np.mean(boot_accuracies)
    accuracy_ci_lower = np.percentile(boot_accuracies, 2.5)
    accuracy_ci_upper = np.percentile(boot_accuracies, 97.5)

    roc_auc_mean = np.mean(boot_roc_aucs)
    roc_auc_ci_lower = np.percentile(boot_roc_aucs, 2.5)
    roc_auc_ci_upper = np.percentile(boot_roc_aucs, 97.5)

    return (accuracy_mean, accuracy_ci_lower, accuracy_ci_upper), (roc_auc_mean, roc_auc_ci_lower, roc_auc_ci_upper)

def evaluate_model(model, X_train, y_train, X_test, y_test, run_bootstrap=False):
    # Evaluate the model
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    roc_auc_train = roc_auc_score(y_train, y_train_pred)
    roc_auc_test = roc_auc_score(y_test, y_test_pred)

    print(f'Accuracy (Train): {train_accuracy:.2f}')
    print(f'Accuracy (Test): {test_accuracy:.2f}')
    print(f'ROC AUC (Train): {roc_auc_train:.2f}')
    print(f'ROC AUC (Test): {roc_auc_test:.2f}')

    # Perform bootstrap analysis if required
    if run_bootstrap:
        accuracy_results, roc_auc_results = bootstrap_analysis(X_test, y_test, model)

        # Output bootstrap results
        print(f"Bootstrap Accuracy: {accuracy_results[0]:.3f} (95% CI: {accuracy_results[1]:.3f} - {accuracy_results[2]:.3f})")
        print(f"Bootstrap ROC AUC: {roc_auc_results[0]:.3f} (95% CI: {roc_auc_results[1]:.3f} - {roc_auc_results[2]:.3f})")

    # Detailed classification report
    print("Classification Report (Test):\n", classification_report(y_test, y_test_pred))

def extract_components(df):
    # Clean column names to remove spaces
    df.columns = df.columns.str.replace(' ', '', regex=False)

    # Initialize sets to store unique components
    rois = set()
    modalities = set()
    originals = set()
    families = set()
    features = set()

    # Iterate over the columns and extract components
    for col in df.columns:
        # Skip 'PatientID' and 'Split' columns
        if col in ['PatientID', 'Split']:
            continue

        # Split the column name into components
        col_parts = col.split('_')

        # Add each component to its corresponding set
        rois.add(col_parts[0])
        modalities.add(col_parts[1])
        originals.add(col_parts[2])
        families.add(col_parts[3])
        features.add(col_parts[4])

    # Convert sets to sorted lists
    return {
        'rois': sorted(rois),
        'modalities': sorted(modalities),
        'originals': sorted(originals),
        'families': sorted(families),
        'features': sorted(features)
    }

def select_feature(df, rois=None, modalities=None, original=None, families=None, features=None):

    # Remove all spaces from column names
    df.columns = df.columns.str.replace(' ', '', regex=False)

    # Define the prefix for columns that should be kept (PatientID and Split)
    keep_columns = ['PatientID', 'Split']

    # Iterate over all columns in the dataframe
    for col in df.columns:
        # Skip the PatientID and Split columns
        if col in keep_columns:
            continue

        # Split the column name into parts
        col_parts = col.split('_')

        # Unpack the parts based on the known pattern
        roi, modality, orig, family, feature = col_parts[0], col_parts[1], col_parts[2], col_parts[3], col_parts[4]

        # Check if this column should be included based on the filters provided
        if ((rois is None or roi in rois) and
            (modalities is None or modality in modalities) and
            (original is None or orig in original) and
            (families is None or family in families) and
            (features is None or feature in features)):
            keep_columns.append(col)

    # Return the dataframe with only the selected columns
    return df[keep_columns]

In [5]:
# Upload features
if any(fn.startswith('features_album') for fn in os.listdir('.')):
  print('Features already uploaded')
else:
  uploaded = files.upload()

Saving features_album_HECKTOR_EUVIP_2024_PT- GTVp.csv to features_album_HECKTOR_EUVIP_2024_PT- GTVp.csv
Saving features_album_HECKTOR_EUVIP_2024_CT- GTVp.csv to features_album_HECKTOR_EUVIP_2024_CT- GTVp.csv
Saving features_album_HECKTOR_EUVIP_2024_PT- GTVn.csv to features_album_HECKTOR_EUVIP_2024_PT- GTVn.csv
Saving features_album_HECKTOR_EUVIP_2024_CT- GTVn.csv to features_album_HECKTOR_EUVIP_2024_CT- GTVn.csv


In [6]:
# Upload hpv outcome
if os.path.exists('hpv_outcome.csv'):
  print('Outcome data already uploaded')
else:
  uploaded = files.upload()

Saving hpv_outcome.csv to hpv_outcome.csv


In [7]:
# Upload patient split
if any(fn.startswith('patient_split') for fn in os.listdir('.')):
  print('Patient split already uploaded')
else:
  uploaded = files.upload()

Saving patient_split.csv to patient_split.csv


In [66]:
# Load the data (features, outcomes and train/test split)
features_df = load_features(folder_path='./', file_start="features_album")
outcome_df = pd.read_csv('hpv_outcome.csv')
split_df = pd.read_csv('patient_split.csv')

In [67]:
# Preprocess the data
features_df = preprocess_data(features_df)
# Filter out patients if not present in features or outcome data
features_df, outcome_df = filter_patients(features_df, outcome_df)
# Merge split_df with features_df and outcome_df to ensure indices are aligned
features_df = features_df.merge(split_df[['PatientID', 'Split']], on='PatientID')
outcome_df = outcome_df.merge(split_df[['PatientID', 'Split']], on='PatientID')


Number of features:  221
Deleted patients from df2: 0 set()
Deleted patients from df1: 402 {'CHUS-005', 'CHUV-029', 'MDA-030', 'CHUP-052', 'MDA-070', 'CHUM-006', 'MDA-189', 'MDA-033', 'HGJ-028', 'MDA-094', 'HGJ-013', 'MDA-052', 'MDA-150', 'CHUS-027', 'CHUV-008', 'MDA-188', 'CHUP-041', 'MDA-134', 'CHUV-036', 'CHUV-017', 'MDA-028', 'CHUS-045', 'MDA-092', 'CHUP-060', 'CHUS-030', 'HGJ-016', 'MDA-090', 'MDA-104', 'HMR-004', 'CHUP-004', 'HGJ-087', 'MDA-101', 'CHUS-006', 'MDA-098', 'HGJ-086', 'CHUM-010', 'CHUS-098', 'CHUV-023', 'MDA-023', 'CHUV-028', 'CHUS-004', 'CHUV-031', 'CHUP-074', 'CHUV-038', 'MDA-038', 'MDA-082', 'CHUV-012', 'MDA-048', 'MDA-020', 'CHUS-053', 'CHUV-046', 'MDA-027', 'HGJ-048', 'CHUM-017', 'MDA-131', 'CHUS-049', 'HGJ-038', 'MDA-166', 'CHUS-088', 'HGJ-018', 'CHUS-035', 'MDA-165', 'MDA-111', 'MDA-137', 'CHUS-003', 'HMR-014', 'CHUS-020', 'HGJ-035', 'CHUM-012', 'CHUV-003', 'CHUS-097', 'HGJ-072', 'MDA-046', 'CHUS-066', 'MDA-021', 'HGJ-088', 'CHUS-067', 'MDA-081', 'CHUM-043', 'C

In [82]:
print(features_df.columns)

Index(['PatientID', 'GTVn_CT_original_intensity_10Percentile',
       'GTVn_CT_original_intensity_90Percentile',
       'GTVn_CT_original_intensity_Energy',
       'GTVn_CT_original_intensity_Entropy',
       'GTVn_CT_original_intensity_InterquartileRange',
       'GTVn_CT_original_intensity_Kurtosis',
       'GTVn_CT_original_intensity_Maximum', 'GTVn_CT_original_intensity_Mean',
       'GTVn_CT_original_intensity_MeanAbsoluteDeviation',
       ...
       'GTVp_PT_original_shape_Maximum2DDiameterRow',
       'GTVp_PT_original_shape_Maximum2DDiameterSlice',
       'GTVp_PT_original_shape_Maximum3DDiameter',
       'GTVp_PT_original_shape_MeshVolume',
       'GTVp_PT_original_shape_MinorAxisLength',
       'GTVp_PT_original_shape_Sphericity',
       'GTVp_PT_original_shape_SurfaceArea',
       'GTVp_PT_original_shape_SurfaceVolumeRatio',
       'GTVp_PT_original_shape_VoxelVolume', 'Split'],
      dtype='object', length=222)


In [89]:
# Print all available features (rois, modalities, families and feature names)
print(extract_components(features_df))

# Select specific set of features
rois = ['GTVp']
modalities = ['PT']
originals = ['original']
families = ['intensity','SUV','shape']
features = None
features_df1 = select_feature(features_df, rois, modalities, originals, families, features)
print(features_df1.shape)

{'rois': ['GTVn', 'GTVp'], 'modalities': ['CT', 'PT'], 'originals': ['original'], 'families': ['SUV', 'glcm', 'glrlm', 'glszm', 'intensity', 'shape'], 'features': ['10Percentile', '90Percentile', 'Autocorrelation', 'ClusterProminence', 'ClusterShade', 'ClusterTendency', 'Contrast', 'Correlation', 'DifferenceAverage', 'DifferenceEntropy', 'DifferenceVariance', 'Elongation', 'Energy', 'Entropy', 'Flatness', 'GrayLevelNonUniformity', 'GrayLevelNonUniformityNormalized', 'GrayLevelVariance', 'HighGrayLevelRunEmphasis', 'HighGrayLevelZoneEmphasis', 'Id', 'Idm', 'Idmn', 'Idn', 'Imc1', 'Imc2', 'InterquartileRange', 'InverseVariance', 'JointAverage', 'JointEnergy', 'JointEntropy', 'Kurtosis', 'LargeAreaEmphasis', 'LargeAreaHighGrayLevelEmphasis', 'LargeAreaLowGrayLevelEmphasis', 'LeastAxisLength', 'LongRunEmphasis', 'LongRunHighGrayLevelEmphasis', 'LongRunLowGrayLevelEmphasis', 'LowGrayLevelRunEmphasis', 'LowGrayLevelZoneEmphasis', 'MCC', 'MTV', 'MajorAxisLength', 'Maximum', 'Maximum2DDiameterC

In [90]:
# Prepare data for training
X = features_df1.drop(columns=['PatientID', 'Split'])
X = X.fillna(X.mean())
print("Number of features:", X.shape[1])
y = outcome_df['Outcome']

# Ensure the index of y matches the merged_df index
y = y.loc[features_df1.index]

# Split the dataset into training and testing sets based on the 'Split' column
X_train = X[features_df1['Split'] == 'train']
X_test = X[features_df1['Split'] == 'test']
y_train = y[features_df1['Split'] == 'train']
y_test = y[features_df1['Split'] == 'test']

# Feature selection
# selector = SelectKBest(mutual_info_classif, k=20)
# X_train = selector.fit_transform(X_train, y_train)
# X_test = selector.transform(X_test)
# print("Number of features after selection:", X_test.shape[1])

# Train the model
# model = RandomForestClassifier(n_estimators=100, min_samples_split=10, min_samples_leaf=20, random_state=42)
model = RandomForestClassifier(n_estimators=200, min_samples_split=5, min_samples_leaf=10, max_depth=None, random_state=42)
# model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
evaluate_model(model, X_train, y_train, X_test, y_test, run_bootstrap=False)

Number of features: 35
Accuracy (Train): 0.76
Accuracy (Test): 0.63
ROC AUC (Train): 0.76
ROC AUC (Test): 0.65
Classification Report (Test):
               precision    recall  f1-score   support

           0       0.75      0.55      0.63        11
           1       0.55      0.75      0.63         8

    accuracy                           0.63        19
   macro avg       0.65      0.65      0.63        19
weighted avg       0.66      0.63      0.63        19



In [None]:
# To do: Statistical test between 2 models

(9, 12)