notebook to test and find best hyperparameters for various models

In [67]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, auc

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import RandomizedSearchCV as RSCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [2]:
import os

pid = os.getpid()
print("PID: %i" % pid)

n_cpu = os.cpu_count()   # Number of CPUs assigned to this process
print("Number of CPUs in the system:", n_cpu)

# we won't use all the available cpu's for this script 
n_jobs = n_cpu - 2 # The number of tasks to run in parallel

# Control which CPUs are made available for this script
cpu_arg = ''.join([str(ci) + ',' for ci in list(range(n_jobs))])[:-1]
cmd = 'taskset -cp %s %i' % (cpu_arg, pid)
print("executing command '%s' ..." % cmd)
os.system(cmd)

PID: 20364
Number of CPUs in the system: 16
executing command 'taskset -cp 0,1,2,3,4,5,6,7,8,9,10,11,12,13 20364' ...


1

In [3]:
# path to cleaned data file
trainFilePath = 'data/data_train.pkl'

# Read already prepared and saved train datasets
with open(trainFilePath, 'rb') as f:
    data_train = pickle.load(f)

# Create dummy variables for categorical data
data_train = pd.get_dummies(data_train, columns=['clinical_stage', 'biopsy_gleason_gg', 'pathological_gleason_gg',
                                'pathologic_stage', 'lni', 'surgical_margin_status', 'persistent_psa',
                                'TRYSgrupes', 'PLNDO1'])

In [4]:
"""
Explodes the provided "df" dataset based on provided survival column "time" and
clips the data to be in a range [min_time; max_time] (). A new discrete survival column
will be created with name set as variable "time_discrete". "cum_event" boolean determines
if cumulative event column will be created or no.

clip(lower, upper) function will help us create a new discrete survival time column. 
If we specify lower=1 and upper=200, patients who experienced event earlier than 200th 
month will only have records till their event, on other side, if a patient survived past 
200th month, we will clip this information and will only keep information about him til 200th month.
Another example, if we specify lower=140 and upper=200, and if the person experienced event 
at 100th month, we will create records for him till 140th (lower boundary) month.
"""
def explode_data(df,max_time,time,target_column,min_time=1,
                 time_discrete='survival_time_discrete',cum_event=False):

    target_column_discrete = target_column + '_discrete'

    # We create a new time column and clip the data by provided min and max survival times
    df[time_discrete] = df[time].clip(min_time,max_time).apply(range)

    # Exploding the dataset with the created range value in new time column
    data_exploded = df.explode(time_discrete)
    data_exploded.reset_index(drop=True, inplace=True)

    # New column starts at 0, we'll increase each value by 1
    data_exploded[time_discrete] = pd.to_numeric(data_exploded[time_discrete]) + 1

    # New event column, which will indicate the last event date
    data_exploded[target_column_discrete] = (data_exploded[time_discrete] >= data_exploded[time]) * pd.to_numeric(data_exploded[target_column])
    
    if cum_event == True:
        target_column_cumulative = target_column + '_cumulative'

        # Create new event column with duplicated event values from discrete column
        data_exploded[target_column_cumulative] = data_exploded[target_column_discrete]
        
        # For cumulative events, after end_time we will have NA values, we'll replace those with event indicator
        after_survival_time = data_exploded[time_discrete] > data_exploded[time]
        data_exploded.loc[after_survival_time, target_column_discrete] = -1
        data_exploded[target_column_discrete] = data_exploded[target_column_discrete].replace(-1,np.NaN)
        data_exploded.loc[(after_survival_time & (data_exploded[target_column]==0)), target_column_cumulative] = -1
        data_exploded[target_column_cumulative] = data_exploded[target_column_cumulative].replace(-1,np.NaN)

    return data_exploded


"""
Given an exploded dataset with instant mortality probabilities "event_probability_column"
and "id_column" for grouping (optional), cumulative hazard column will be calculated
"""
def cumulative_hazard(df, event_probability_column, id_column):
    data_copy = df.copy()
    if id_column is not None:
        data_copy = data_copy[ [id_column, event_probability_column] ]
    else:
        data_copy = data_copy[ [event_probability_column] ]
    data_copy['negative_log_prob'] = np.log( 1 - data_copy[event_probability_column] )
    if id_column is not None:
        data_copy['cumulative_hazard'] = 1 - np.exp(data_copy.groupby(id_column)['negative_log_prob'].transform(pd.Series.cumsum))
    else:
        data_copy['cumulative_hazard'] = 1 - np.exp(data_copy['negative_log_prob'].cumsum())
    return data_copy['cumulative_hazard']



def add_predict_probabilities_optimized(df_exploded, target_column, model):
    """
    Given exploded datase. Adds predictend instant mortality probabilities as well as cumulative ones.
    """

    df_exploded_copy = df_exploded.copy()
    df_exploded_copy.drop(['patient_id', target_column + '_cumulative'], axis=1, inplace=True)
    
    # probabilities
    y_pred = model.predict_proba(df_exploded_copy)[:,1]
    df_exploded['mortality_instant_prob'] = y_pred

    # Cumulative hazard for each patient
    df_exploded['cumulative_hazard'] = cumulative_hazard(df_exploded,'mortality_instant_prob','patient_id')
    
    return df_exploded

## Random Forest

### RandomizedSearchCV

#### Cancer specific mortality

In [6]:
target_column = 'cancer_specific_mortality'
max_time = 200

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# Explode the dataset
df_train_copy_exploded = explode_data(data_train.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]

Unnamed: 0,age,psa,biopsy_gleason,pathologic_gleason,bcr,mts,death_from_other_causes,clinical_stage_1,clinical_stage_2,clinical_stage_3,...,surgical_margin_status_0,surgical_margin_status_1,persistent_psa_0,persistent_psa_1,TRYSgrupes_0,TRYSgrupes_1,TRYSgrupes_2,PLNDO1_0,PLNDO1_1,survival_time_discrete
0,73.0,6.36,6.0,7.0,1,0,0,0,0,1,...,0,1,1,0,0,1,0,1,0,1
1,73.0,6.36,6.0,7.0,1,0,0,0,0,1,...,0,1,1,0,0,1,0,1,0,2
2,73.0,6.36,6.0,7.0,1,0,0,0,0,1,...,0,1,1,0,0,1,0,1,0,3
3,73.0,6.36,6.0,7.0,1,0,0,0,0,1,...,0,1,1,0,0,1,0,1,0,4
4,73.0,6.36,6.0,7.0,1,0,0,0,0,1,...,0,1,1,0,0,1,0,1,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250195,68.0,6.00,6.0,8.0,1,0,0,0,1,0,...,0,1,1,0,0,0,1,1,0,196
250196,68.0,6.00,6.0,8.0,1,0,0,0,1,0,...,0,1,1,0,0,0,1,1,0,197
250197,68.0,6.00,6.0,8.0,1,0,0,0,1,0,...,0,1,1,0,0,0,1,1,0,198
250198,68.0,6.00,6.0,8.0,1,0,0,0,1,0,...,0,1,1,0,0,0,1,1,0,199


In [21]:
param_grid = {'n_estimators':np.arange(150,700,50),
              'max_features':np.arange(0.1, 1, 0.1),
              'max_depth': [3, 5, 7, 9],
              'max_samples': [0.3, 0.5, 0.8],
              'min_samples_leaf': [1, 2, 3, 4, 5]}

rf_random = RSCV(RandomForestClassifier(), param_grid, n_iter=15, 
             random_state=0, verbose=2, scoring='roc_auc', n_jobs=n_jobs)
model_rf = rf_random.fit(X_train, y_train)
model_rf_best = model_rf.best_estimator_

Fitting 5 folds for each of 15 candidates, totalling 75 fits


In [22]:
# Best parameters
model_rf.best_params_

{'n_estimators': 200,
 'min_samples_leaf': 5,
 'max_samples': 0.3,
 'max_features': 0.5,
 'max_depth': 3}

### Bayesian Optimization

#### cancer specific mortality

##### only train data

In [73]:
target_column = 'cancer_specific_mortality'
max_time = 216

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# Explode the dataset
df_train_copy_exploded = explode_data(data_train.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]

df_train_copy_exploded_cumulative = explode_data(data_train.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_train_copy_exploded_cumulative = df_train_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [74]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.uniform('max_depth',5,20),
        'max_features': hp.choice('max_features', ['sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.uniform('n_estimators', 100, 500)
    }

def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = int(space['max_depth']),
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = int(space['n_estimators']), 
                                 )
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_train_copy_exploded_pred = add_predict_probabilities_optimized(df_train_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Training data ---
        # Selecting a subset of data based on the months
        select = (df_train_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_train_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_train_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [75]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [14:09<00:00, 10.61s/trial, best loss: -0.9522071189577052]


{'criterion': 1,
 'max_depth': 19.560721062307877,
 'max_features': 2,
 'min_samples_leaf': 0.0008970113318042469,
 'min_samples_split': 0.03442949793468529,
 'n_estimators': 117.17628335911297}

In [29]:
best

{'criterion': 1,
 'max_depth': 200.0,
 'max_features': 3,
 'min_samples_leaf': 0.043813847274908446,
 'min_samples_split': 0.551636834827235,
 'n_estimators': 5}

##### train / validation split

In [40]:
target_column = 'cancer_specific_mortality'
max_time = 216

data_train_copy = data_train.copy()

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# 90/10 split and stratify based on 'overall_mortality'
data_train_hyp, data_val_hyp, y_train_hyp, y_test_hyp = train_test_split(
    data_train_copy,
    data_train_copy[target_columns], test_size=0.1, random_state=2, 
    stratify=data_train_copy[target_column])

data_train_hyp[target_columns] = y_train_hyp
data_val_hyp[target_columns] = y_test_hyp


# Explode the dataset
df_train_copy_exploded = explode_data(data_train_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]


# Explode the dataset
df_val_copy_exploded = explode_data(data_val_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_val = df_val_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_val = df_val_copy_exploded[target_column_discrete]

df_val_copy_exploded_cumulative = explode_data(data_val_hyp.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_val_copy_exploded_cumulative = df_val_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [41]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.uniform('max_depth',5,20),
        'max_features': hp.choice('max_features', ['sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.uniform('n_estimators', 100, 500)
    }

def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = int(space['max_depth']),
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = int(space['n_estimators']), 
                                 )
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_val_copy_exploded_pred = add_predict_probabilities_optimized(df_val_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Validation data ---
        # Selecting a subset of data based on the months
        select = (df_val_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_val_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_val_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [44]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [09:15<00:00,  6.95s/trial, best loss: -0.9026579108637769]


{'criterion': 0,
 'max_depth': 17.937192102303005,
 'max_features': 2,
 'min_samples_leaf': 0.03932576931296611,
 'min_samples_split': 0.025730604369255006,
 'n_estimators': 494.2199824963416}

#### death from other causes

##### only train data

In [62]:
target_column = 'death_from_other_causes'
max_time = 216

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# Explode the dataset
df_train_copy_exploded = explode_data(data_train.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]

df_train_copy_exploded_cumulative = explode_data(data_train.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_train_copy_exploded_cumulative = df_train_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [63]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.uniform('max_depth',5,20),
        'max_features': hp.choice('max_features', ['sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.uniform('n_estimators', 100, 500)
    }

def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = int(space['max_depth']),
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = int(space['n_estimators']), 
                                 )
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_train_copy_exploded_pred = add_predict_probabilities_optimized(df_train_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Training data ---
        # Selecting a subset of data based on the months
        select = (df_train_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_train_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_train_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [65]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [09:08<00:00,  6.86s/trial, best loss: -0.8098503112132167]


{'criterion': 1,
 'max_depth': 9.859569892589294,
 'max_features': 0,
 'min_samples_leaf': 0.004777233684864247,
 'min_samples_split': 0.04259570641528535,
 'n_estimators': 187.7291402134495}

##### train / validation split

In [45]:
target_column = 'death_from_other_causes'
max_time = 216

data_train_copy = data_train.copy()

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# 90/10 split and stratify based on 'overall_mortality'
data_train_hyp, data_val_hyp, y_train_hyp, y_test_hyp = train_test_split(
    data_train_copy,
    data_train_copy[target_columns], test_size=0.1, random_state=2, 
    stratify=data_train_copy[target_column])

data_train_hyp[target_columns] = y_train_hyp
data_val_hyp[target_columns] = y_test_hyp


# Explode the dataset
df_train_copy_exploded = explode_data(data_train_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]


# Explode the dataset
df_val_copy_exploded = explode_data(data_val_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_val = df_val_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_val = df_val_copy_exploded[target_column_discrete]

df_val_copy_exploded_cumulative = explode_data(data_val_hyp.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_val_copy_exploded_cumulative = df_val_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [46]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.uniform('max_depth',5,20),
        'max_features': hp.choice('max_features', ['sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.uniform('n_estimators', 100, 500)
    }

def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = int(space['max_depth']),
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = int(space['n_estimators']), 
                                 )
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_val_copy_exploded_pred = add_predict_probabilities_optimized(df_val_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Validation data ---
        # Selecting a subset of data based on the months
        select = (df_val_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_val_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_val_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [47]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [07:58<00:00,  5.98s/trial, best loss: -0.6062502734573373]


{'criterion': 0,
 'max_depth': 15.99874407715512,
 'max_features': 2,
 'min_samples_leaf': 0.05467350337454149,
 'min_samples_split': 0.5886741613999928,
 'n_estimators': 376.7957625705628}

#### mts

##### only train data

In [66]:
target_column = 'mts'
max_time = 216

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# Explode the dataset
df_train_copy_exploded = explode_data(data_train.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]

df_train_copy_exploded_cumulative = explode_data(data_train.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_train_copy_exploded_cumulative = df_train_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [67]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.uniform('max_depth',5,20),
        'max_features': hp.choice('max_features', ['sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.uniform('n_estimators', 100, 500)
    }

def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = int(space['max_depth']),
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = int(space['n_estimators']), 
                                 )
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_train_copy_exploded_pred = add_predict_probabilities_optimized(df_train_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Training data ---
        # Selecting a subset of data based on the months
        select = (df_train_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_train_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_train_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [68]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [07:33<00:00,  5.67s/trial, best loss: -0.9064147406361287]


{'criterion': 0,
 'max_depth': 6.48444416018883,
 'max_features': 0,
 'min_samples_leaf': 0.01920890664943756,
 'min_samples_split': 0.0859146871809218,
 'n_estimators': 450.3159883151506}

##### train / validation split

In [48]:
target_column = 'mts'
max_time = 216

data_train_copy = data_train.copy()

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# 90/10 split and stratify based on 'overall_mortality'
data_train_hyp, data_val_hyp, y_train_hyp, y_test_hyp = train_test_split(
    data_train_copy,
    data_train_copy[target_columns], test_size=0.1, random_state=2, 
    stratify=data_train_copy[target_column])

data_train_hyp[target_columns] = y_train_hyp
data_val_hyp[target_columns] = y_test_hyp


# Explode the dataset
df_train_copy_exploded = explode_data(data_train_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]


# Explode the dataset
df_val_copy_exploded = explode_data(data_val_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_val = df_val_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_val = df_val_copy_exploded[target_column_discrete]

df_val_copy_exploded_cumulative = explode_data(data_val_hyp.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_val_copy_exploded_cumulative = df_val_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [49]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.uniform('max_depth',5,20),
        'max_features': hp.choice('max_features', ['sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.uniform('n_estimators', 100, 500)
    }

def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = int(space['max_depth']),
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = int(space['n_estimators']), 
                                 )
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_val_copy_exploded_pred = add_predict_probabilities_optimized(df_val_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Validation data ---
        # Selecting a subset of data based on the months
        select = (df_val_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_val_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_val_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [50]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [03:49<00:00,  2.87s/trial, best loss: -0.9636169531441549]


{'criterion': 1,
 'max_depth': 13.577996616687528,
 'max_features': 1,
 'min_samples_leaf': 0.036805117857165615,
 'min_samples_split': 0.3255535986892048,
 'n_estimators': 149.6122350278912}

#### bcr

##### only train data

In [70]:
target_column = 'bcr'
max_time = 216

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# Explode the dataset
df_train_copy_exploded = explode_data(data_train.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]

df_train_copy_exploded_cumulative = explode_data(data_train.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_train_copy_exploded_cumulative = df_train_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [71]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.uniform('max_depth',5,20),
        'max_features': hp.choice('max_features', ['sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.uniform('n_estimators', 100, 500)
    }

def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = int(space['max_depth']),
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = int(space['n_estimators']), 
                                 )
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_train_copy_exploded_pred = add_predict_probabilities_optimized(df_train_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Training data ---
        # Selecting a subset of data based on the months
        select = (df_train_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_train_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_train_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [72]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [09:35<00:00,  7.19s/trial, best loss: -0.9324703081729039]


{'criterion': 0,
 'max_depth': 18.542798032669822,
 'max_features': 0,
 'min_samples_leaf': 0.0011043494935267933,
 'min_samples_split': 0.000671643460114342,
 'n_estimators': 316.6923084008164}

##### train / validation split

In [51]:
target_column = 'bcr'
max_time = 216

data_train_copy = data_train.copy()

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# 90/10 split and stratify based on 'overall_mortality'
data_train_hyp, data_val_hyp, y_train_hyp, y_test_hyp = train_test_split(
    data_train_copy,
    data_train_copy[target_columns], test_size=0.1, random_state=2, 
    stratify=data_train_copy[target_column])

data_train_hyp[target_columns] = y_train_hyp
data_val_hyp[target_columns] = y_test_hyp


# Explode the dataset
df_train_copy_exploded = explode_data(data_train_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]


# Explode the dataset
df_val_copy_exploded = explode_data(data_val_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_val = df_val_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_val = df_val_copy_exploded[target_column_discrete]

df_val_copy_exploded_cumulative = explode_data(data_val_hyp.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_val_copy_exploded_cumulative = df_val_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [52]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.uniform('max_depth',5,20),
        'max_features': hp.choice('max_features', ['sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.uniform('n_estimators', 100, 500)
    }

def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = int(space['max_depth']),
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = int(space['n_estimators']), 
                                 )
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_val_copy_exploded_pred = add_predict_probabilities_optimized(df_val_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Validation data ---
        # Selecting a subset of data based on the months
        select = (df_val_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_val_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_val_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [53]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [03:36<00:00,  2.70s/trial, best loss: -0.8865986500383168]


{'criterion': 1,
 'max_depth': 9.362170539012933,
 'max_features': 1,
 'min_samples_leaf': 0.04889852261499672,
 'min_samples_split': 0.2490075040108427,
 'n_estimators': 160.7620972043769}

### XGBoost

#### cancer specific mortality

##### only train data

In [29]:
target_column = 'cancer_specific_mortality'
max_time = 216

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# Explode the dataset
df_train_copy_exploded = explode_data(data_train.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]

df_train_copy_exploded_cumulative = explode_data(data_train.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_train_copy_exploded_cumulative = df_train_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [30]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),
    'learning_rate': hp.quniform('learning_rate', 0, 0.2, 0.025),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'n_estimators': hp.uniform('n_estimators', 100, 1000),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'reg_alpha': hp.uniform('reg_alpha', 0, 0.1),
    'reg_lamda': hp.uniform('reg_lambda', 0, 10),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0, 1)
}

def objective(space):
    
    model = XGBClassifier(objective="binary:logistic", random_state=0, booster='gbtree', eval_metric='auc',
                      colsample_bytree=space['colsample_bytree'], gamma=space['gamma'], learning_rate=space['learning_rate'],
                      max_depth=int(space['max_depth']), n_estimators=int(space['n_estimators']), subsample=space['subsample'],
                     min_child_weight=space['min_child_weight'], scale_pos_weight=space['scale_pos_weight'], 
                      reg_lambda=space['reg_lamda'], reg_alpha=space['reg_alpha'])
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_train_copy_exploded_pred = add_predict_probabilities_optimized(df_train_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Training data ---
        # Selecting a subset of data based on the months
        select = (df_train_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_train_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_train_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [31]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 40, #80
            trials = trials)
best

100%|██████████| 40/40 [06:28<00:00,  9.72s/trial, best loss: -0.9987603535217533]


{'colsample_bytree': 0.9,
 'gamma': 0.0,
 'learning_rate': 0.2,
 'max_depth': 18.0,
 'min_child_weight': 2.0,
 'n_estimators': 494.0763574289686,
 'reg_alpha': 0.011624533771820083,
 'reg_lambda': 2.3750241072036045,
 'scale_pos_weight': 0.884642762544353,
 'subsample': 1.0}

##### train / validation split

In [54]:
target_column = 'cancer_specific_mortality'
max_time = 216

data_train_copy = data_train.copy()

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# 90/10 split and stratify based on 'overall_mortality'
data_train_hyp, data_val_hyp, y_train_hyp, y_test_hyp = train_test_split(
    data_train_copy,
    data_train_copy[target_columns], test_size=0.1, random_state=2, 
    stratify=data_train_copy[target_column])

data_train_hyp[target_columns] = y_train_hyp
data_val_hyp[target_columns] = y_test_hyp


# Explode the dataset
df_train_copy_exploded = explode_data(data_train_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]


# Explode the dataset
df_val_copy_exploded = explode_data(data_val_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_val = df_val_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_val = df_val_copy_exploded[target_column_discrete]

df_val_copy_exploded_cumulative = explode_data(data_val_hyp.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_val_copy_exploded_cumulative = df_val_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [55]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),
    'learning_rate': hp.quniform('learning_rate', 0, 0.2, 0.025),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'n_estimators': hp.uniform('n_estimators', 100, 1000),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'reg_alpha': hp.uniform('reg_alpha', 0, 0.1),
    'reg_lamda': hp.uniform('reg_lambda', 0, 10),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0, 1)
}

def objective(space):
    
    model = XGBClassifier(objective="binary:logistic", random_state=0, booster='gbtree', eval_metric='auc',
                      colsample_bytree=space['colsample_bytree'], gamma=space['gamma'], learning_rate=space['learning_rate'],
                      max_depth=int(space['max_depth']), n_estimators=int(space['n_estimators']), subsample=space['subsample'],
                     min_child_weight=space['min_child_weight'], scale_pos_weight=space['scale_pos_weight'], 
                      reg_lambda=space['reg_lamda'], reg_alpha=space['reg_alpha'])
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_val_copy_exploded_pred = add_predict_probabilities_optimized(df_val_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Validation data ---
        # Selecting a subset of data based on the months
        select = (df_val_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_val_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_val_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [56]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [08:38<00:00,  6.48s/trial, best loss: -0.8883445945029762]


{'colsample_bytree': 0.9,
 'gamma': 0.4,
 'learning_rate': 0.05,
 'max_depth': 10.0,
 'min_child_weight': 8.0,
 'n_estimators': 691.0814977776121,
 'reg_alpha': 0.036934916140565495,
 'reg_lambda': 4.266113036560311,
 'scale_pos_weight': 0.9921922974444556,
 'subsample': 0.5}

#### death from other causes

##### only train data

In [26]:
target_column = 'death_from_other_causes'
max_time = 216

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# Explode the dataset
df_train_copy_exploded = explode_data(data_train.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]

df_train_copy_exploded_cumulative = explode_data(data_train.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_train_copy_exploded_cumulative = df_train_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [27]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),
    'learning_rate': hp.quniform('learning_rate', 0, 0.2, 0.025),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'n_estimators': hp.uniform('n_estimators', 100, 1000),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'reg_alpha': hp.uniform('reg_alpha', 0, 0.1),
    'reg_lamda': hp.uniform('reg_lambda', 0, 10),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0, 1)
}

def objective(space):
    
    model = XGBClassifier(objective="binary:logistic", random_state=0, booster='gbtree', eval_metric='auc',
                      colsample_bytree=space['colsample_bytree'], gamma=space['gamma'], learning_rate=space['learning_rate'],
                      max_depth=int(space['max_depth']), n_estimators=int(space['n_estimators']), subsample=space['subsample'],
                     min_child_weight=space['min_child_weight'], scale_pos_weight=space['scale_pos_weight'], 
                      reg_lambda=space['reg_lamda'], reg_alpha=space['reg_alpha'])
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_train_copy_exploded_pred = add_predict_probabilities_optimized(df_train_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Training data ---
        # Selecting a subset of data based on the months
        select = (df_train_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_train_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_train_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [28]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 40, #80
            trials = trials)
best

100%|██████████| 40/40 [10:43<00:00, 16.09s/trial, best loss: -0.9992182587944143]


{'colsample_bytree': 0.65,
 'gamma': 0.25,
 'learning_rate': 0.17500000000000002,
 'max_depth': 11.0,
 'min_child_weight': 1.0,
 'n_estimators': 618.9569732240903,
 'reg_alpha': 0.00460840736862405,
 'reg_lambda': 1.5957720013269137,
 'scale_pos_weight': 0.9929006236241394,
 'subsample': 0.7000000000000001}

##### train / validation split

In [57]:
target_column = 'death_from_other_causes'
max_time = 216

data_train_copy = data_train.copy()

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# 90/10 split and stratify based on 'overall_mortality'
data_train_hyp, data_val_hyp, y_train_hyp, y_test_hyp = train_test_split(
    data_train_copy,
    data_train_copy[target_columns], test_size=0.1, random_state=2, 
    stratify=data_train_copy[target_column])

data_train_hyp[target_columns] = y_train_hyp
data_val_hyp[target_columns] = y_test_hyp


# Explode the dataset
df_train_copy_exploded = explode_data(data_train_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]


# Explode the dataset
df_val_copy_exploded = explode_data(data_val_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_val = df_val_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_val = df_val_copy_exploded[target_column_discrete]

df_val_copy_exploded_cumulative = explode_data(data_val_hyp.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_val_copy_exploded_cumulative = df_val_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [58]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),
    'learning_rate': hp.quniform('learning_rate', 0, 0.2, 0.025),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'n_estimators': hp.uniform('n_estimators', 100, 1000),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'reg_alpha': hp.uniform('reg_alpha', 0, 0.1),
    'reg_lamda': hp.uniform('reg_lambda', 0, 10),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0, 1)
}

def objective(space):
    
    model = XGBClassifier(objective="binary:logistic", random_state=0, booster='gbtree', eval_metric='auc',
                      colsample_bytree=space['colsample_bytree'], gamma=space['gamma'], learning_rate=space['learning_rate'],
                      max_depth=int(space['max_depth']), n_estimators=int(space['n_estimators']), subsample=space['subsample'],
                     min_child_weight=space['min_child_weight'], scale_pos_weight=space['scale_pos_weight'], 
                      reg_lambda=space['reg_lamda'], reg_alpha=space['reg_alpha'])
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_val_copy_exploded_pred = add_predict_probabilities_optimized(df_val_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Validation data ---
        # Selecting a subset of data based on the months
        select = (df_val_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_val_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_val_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [59]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [14:56<00:00, 11.21s/trial, best loss: -0.6426574234416307]


{'colsample_bytree': 0.8500000000000001,
 'gamma': 0.1,
 'learning_rate': 0.025,
 'max_depth': 13.0,
 'min_child_weight': 1.0,
 'n_estimators': 729.6590585991698,
 'reg_alpha': 0.030777819762639377,
 'reg_lambda': 7.139170115130897,
 'scale_pos_weight': 0.29417941021561633,
 'subsample': 0.9}

#### mts

##### only train data

In [18]:
target_column = 'mts'
max_time = 216

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# Explode the dataset
df_train_copy_exploded = explode_data(data_train.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]

df_train_copy_exploded_cumulative = explode_data(data_train.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_train_copy_exploded_cumulative = df_train_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [21]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),
    'learning_rate': hp.quniform('learning_rate', 0, 0.2, 0.025),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'n_estimators': hp.uniform('n_estimators', 100, 1000),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'reg_alpha': hp.uniform('reg_alpha', 0, 0.1),
    'reg_lamda': hp.uniform('reg_lambda', 0, 10),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0, 1)
}

def objective(space):
    
    model = XGBClassifier(objective="binary:logistic", random_state=0, booster='gbtree', eval_metric='auc',
                      colsample_bytree=space['colsample_bytree'], gamma=space['gamma'], learning_rate=space['learning_rate'],
                      max_depth=int(space['max_depth']), n_estimators=int(space['n_estimators']), subsample=space['subsample'],
                     min_child_weight=space['min_child_weight'], scale_pos_weight=space['scale_pos_weight'], 
                      reg_lambda=space['reg_lamda'], reg_alpha=space['reg_alpha'])
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_train_copy_exploded_pred = add_predict_probabilities_optimized(df_train_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Training data ---
        # Selecting a subset of data based on the months
        select = (df_train_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_train_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_train_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [22]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 40, #80
            trials = trials)
best

100%|██████████| 40/40 [05:37<00:00,  8.45s/trial, best loss: -0.9946783058491052]


{'colsample_bytree': 0.55,
 'gamma': 0.05,
 'learning_rate': 0.15000000000000002,
 'max_depth': 13.0,
 'min_child_weight': 2.0,
 'n_estimators': 535.373615733675,
 'reg_alpha': 0.012615056264249302,
 'reg_lambda': 0.7584283803105774,
 'scale_pos_weight': 0.9095133867383003,
 'subsample': 0.65}

##### train / validation split

In [60]:
target_column = 'mts'
max_time = 216

data_train_copy = data_train.copy()

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# 90/10 split and stratify based on 'overall_mortality'
data_train_hyp, data_val_hyp, y_train_hyp, y_test_hyp = train_test_split(
    data_train_copy,
    data_train_copy[target_columns], test_size=0.1, random_state=2, 
    stratify=data_train_copy[target_column])

data_train_hyp[target_columns] = y_train_hyp
data_val_hyp[target_columns] = y_test_hyp


# Explode the dataset
df_train_copy_exploded = explode_data(data_train_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]


# Explode the dataset
df_val_copy_exploded = explode_data(data_val_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_val = df_val_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_val = df_val_copy_exploded[target_column_discrete]

df_val_copy_exploded_cumulative = explode_data(data_val_hyp.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_val_copy_exploded_cumulative = df_val_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [61]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),
    'learning_rate': hp.quniform('learning_rate', 0, 0.2, 0.025),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'n_estimators': hp.uniform('n_estimators', 100, 1000),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'reg_alpha': hp.uniform('reg_alpha', 0, 0.1),
    'reg_lamda': hp.uniform('reg_lambda', 0, 10),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0, 1)
}

def objective(space):
    
    model = XGBClassifier(objective="binary:logistic", random_state=0, booster='gbtree', eval_metric='auc',
                      colsample_bytree=space['colsample_bytree'], gamma=space['gamma'], learning_rate=space['learning_rate'],
                      max_depth=int(space['max_depth']), n_estimators=int(space['n_estimators']), subsample=space['subsample'],
                     min_child_weight=space['min_child_weight'], scale_pos_weight=space['scale_pos_weight'], 
                      reg_lambda=space['reg_lamda'], reg_alpha=space['reg_alpha'])
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_val_copy_exploded_pred = add_predict_probabilities_optimized(df_val_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Validation data ---
        # Selecting a subset of data based on the months
        select = (df_val_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_val_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_val_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [62]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [06:19<00:00,  4.74s/trial, best loss: -0.9663154025765888]


{'colsample_bytree': 0.9,
 'gamma': 0.35000000000000003,
 'learning_rate': 0.1,
 'max_depth': 7.0,
 'min_child_weight': 1.0,
 'n_estimators': 666.5875643268608,
 'reg_alpha': 0.07425918532653869,
 'reg_lambda': 7.139570109761258,
 'scale_pos_weight': 0.19101970999175796,
 'subsample': 0.9500000000000001}

#### bcr

##### only train data

In [23]:
target_column = 'bcr'
max_time = 216

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# Explode the dataset
df_train_copy_exploded = explode_data(data_train.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]

df_train_copy_exploded_cumulative = explode_data(data_train.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_train_copy_exploded_cumulative = df_train_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [24]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),
    'learning_rate': hp.quniform('learning_rate', 0, 0.2, 0.025),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'n_estimators': hp.uniform('n_estimators', 100, 1000),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'reg_alpha': hp.uniform('reg_alpha', 0, 0.1),
    'reg_lamda': hp.uniform('reg_lambda', 0, 10),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0, 1)
}

def objective(space):
    
    model = XGBClassifier(objective="binary:logistic", random_state=0, booster='gbtree', eval_metric='auc',
                      colsample_bytree=space['colsample_bytree'], gamma=space['gamma'], learning_rate=space['learning_rate'],
                      max_depth=int(space['max_depth']), n_estimators=int(space['n_estimators']), subsample=space['subsample'],
                     min_child_weight=space['min_child_weight'], scale_pos_weight=space['scale_pos_weight'], 
                      reg_lambda=space['reg_lamda'], reg_alpha=space['reg_alpha'])
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_train_copy_exploded_pred = add_predict_probabilities_optimized(df_train_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Training data ---
        # Selecting a subset of data based on the months
        select = (df_train_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_train_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_train_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [25]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 40, #80
            trials = trials)
best

100%|██████████| 40/40 [06:04<00:00,  9.12s/trial, best loss: -0.9833019511876315]


{'colsample_bytree': 0.7000000000000001,
 'gamma': 0.1,
 'learning_rate': 0.2,
 'max_depth': 19.0,
 'min_child_weight': 1.0,
 'n_estimators': 320.2127746661102,
 'reg_alpha': 0.06919873757682321,
 'reg_lambda': 0.01593110711960384,
 'scale_pos_weight': 0.4838101096249437,
 'subsample': 0.7000000000000001}

##### train / validation split

In [63]:
target_column = 'bcr'
max_time = 216

data_train_copy = data_train.copy()

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

# 90/10 split and stratify based on 'overall_mortality'
data_train_hyp, data_val_hyp, y_train_hyp, y_test_hyp = train_test_split(
    data_train_copy,
    data_train_copy[target_columns], test_size=0.1, random_state=2, 
    stratify=data_train_copy[target_column])

data_train_hyp[target_columns] = y_train_hyp
data_val_hyp[target_columns] = y_test_hyp


# Explode the dataset
df_train_copy_exploded = explode_data(data_train_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]


# Explode the dataset
df_val_copy_exploded = explode_data(data_val_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_val = df_val_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_val = df_val_copy_exploded[target_column_discrete]

df_val_copy_exploded_cumulative = explode_data(data_val_hyp.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_val_copy_exploded_cumulative = df_val_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [64]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),
    'learning_rate': hp.quniform('learning_rate', 0, 0.2, 0.025),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'n_estimators': hp.uniform('n_estimators', 100, 1000),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'reg_alpha': hp.uniform('reg_alpha', 0, 0.1),
    'reg_lamda': hp.uniform('reg_lambda', 0, 10),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0, 1)
}

def objective(space):
    
    model = XGBClassifier(objective="binary:logistic", random_state=0, booster='gbtree', eval_metric='auc',
                      colsample_bytree=space['colsample_bytree'], gamma=space['gamma'], learning_rate=space['learning_rate'],
                      max_depth=int(space['max_depth']), n_estimators=int(space['n_estimators']), subsample=space['subsample'],
                     min_child_weight=space['min_child_weight'], scale_pos_weight=space['scale_pos_weight'], 
                      reg_lambda=space['reg_lamda'], reg_alpha=space['reg_alpha'])
    
    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_val_copy_exploded_pred = add_predict_probabilities_optimized(df_val_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Validation data ---
        # Selecting a subset of data based on the months
        select = (df_val_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_val_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_val_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [65]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [05:54<00:00,  4.44s/trial, best loss: -0.8920886403727195]


{'colsample_bytree': 0.8500000000000001,
 'gamma': 0.7000000000000001,
 'learning_rate': 0.125,
 'max_depth': 2.0,
 'min_child_weight': 10.0,
 'n_estimators': 563.9364756838096,
 'reg_alpha': 0.09983298911625862,
 'reg_lambda': 8.476381530931778,
 'scale_pos_weight': 0.17369127320043823,
 'subsample': 0.75}

### Support Vector Machine

#### cancer specific mortality

In [69]:
target_column = 'cancer_specific_mortality'
max_time = 216

data_train_copy = data_train.copy()

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

X = data_train_copy.drop(target_column, axis=1)
y = data_train_copy[target_column]


# 90/10 split and stratify based on 'overall_mortality'
data_train_hyp, data_val_hyp, y_train_hyp, y_test_hyp = train_test_split(
    data_train_copy,
    data_train_copy[target_columns], test_size=0.1, random_state=2, 
    stratify=data_train_copy[target_column])

data_train_hyp[target_columns] = y_train_hyp
data_val_hyp[target_columns] = y_test_hyp


# Explode the dataset
df_train_copy_exploded = explode_data(data_train_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_train = df_train_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_train = df_train_copy_exploded[target_column_discrete]


# Explode the dataset
df_val_copy_exploded = explode_data(data_val_hyp.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

# Drop targets/features from feature set
x_columns_to_drop = [target_column, target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
x_columns_to_drop.extend(target_columns)
X_val = df_val_copy_exploded.drop(x_columns_to_drop, axis=1)    
y_val = df_val_copy_exploded[target_column_discrete]

df_val_copy_exploded_cumulative = explode_data(data_val_hyp.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)
x_columns_to_drop_exploded_cumulative = [target_column+'_discrete', 'survival_months', 'survival_months_bcr', 'survival_months_mts']
x_columns_to_drop_exploded_cumulative.extend(target_columns)
df_val_copy_exploded_cumulative = df_val_copy_exploded_cumulative.drop(x_columns_to_drop_exploded_cumulative, axis=1)

In [None]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

space = {
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    'gamma': hp.quniform('gamma', 0, 1, 0.05),
    'learning_rate': hp.quniform('learning_rate', 0, 0.2, 0.025),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'n_estimators': hp.uniform('n_estimators', 100, 1000),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
    'reg_alpha': hp.uniform('reg_alpha', 0, 0.1),
    'reg_lamda': hp.uniform('reg_lambda', 0, 10),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 0, 1)
}

def objective(space):
    
    # Create StratifiedKFold object.
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)

    for num, (train_id, valid_id) in enumerate(skf.split(X, y)):
        X_train, X_valid = X.loc[train_id], X.loc[valid_id]
        y_train, y_valid = y.loc[train_id], y.loc[valid_id]
        
        

        model = XGBClassifier(objective="binary:logistic", random_state=0, booster='gbtree', eval_metric='auc',
                      colsample_bytree=space['colsample_bytree'], gamma=space['gamma'], learning_rate=space['learning_rate'],
                      max_depth=int(space['max_depth']), n_estimators=int(space['n_estimators']), subsample=space['subsample'],
                     min_child_weight=space['min_child_weight'], scale_pos_weight=space['scale_pos_weight'], 
                      reg_lambda=space['reg_lamda'], reg_alpha=space['reg_alpha'])

        model.fit(X_train, y_train,
                verbose = False,
                eval_set = [(X_train, y_train), (X_valid, y_valid)],
                eval_metric = 'auc',
                early_stopping_rounds = 250)
        
        #Mean of the predictions
        preds += model.predict(X_test) / 10 # Splits
        
        #Mean of feature importance
        model_fi += model.feature_importances_ / 10 #splits
        
        #Out of Fold predictions
        oof_preds[valid_id] = model.predict(X_valid)
        fold_rmse = np.sqrt(mean_squared_error(y_valid, oof_preds[valid_id]))
        print(f"Fold {num} | RMSE: {fold_rmse}")
        
        mean_rmse += fold_rmse / 10

    # fit the data
    model.fit(X_train, y_train)
    
    # add predicted probabilities
    df_val_copy_exploded_pred = add_predict_probabilities_optimized(df_val_copy_exploded_cumulative.copy(), target_column, model)
    
    # calculate auc
    # AUC for each cumulative slice
    # Months at which we'll check the AUC's
    months = list(range(6, max_time, 6))

    train_auc_stats = []
    for month in months:
        # --- Validation data ---
        # Selecting a subset of data based on the months
        select = (df_val_copy_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_val_copy_exploded_pred[target_column+'_cumulative'])
        sub_dat = df_val_copy_exploded_pred[select]

        # If in the sliced data there's a event, calculate AUC metric,
        # otherwise assign NaN value
        if sub_dat[target_column+'_cumulative'].max() == 1:
            fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
            auc_stat = auc(fpr, tpr)
        else:
            auc_stat = float('NaN')
        train_auc_stats.append(auc_stat)

    auc_mean = np.nanmean(train_auc_stats)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_mean, 'status': STATUS_OK }

In [116]:
target_column = 'cancer_specific_mortality'
max_time = 216

space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.uniform('max_depth',5,20),
        'max_features': hp.choice('max_features', ['sqrt','log2', None]),
        'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
        'min_samples_split' : hp.uniform ('min_samples_split', 0, 1),
        'n_estimators' : hp.uniform('n_estimators', 100, 500)
    }

data_train_copy = data_train.copy().reset_index(drop=True)

# mts and bcr have different survival months columns
match target_column:
    case 'mts':
        time = 'survival_months_mts'
    case 'bcr':
        time = 'survival_months_bcr'
    case _:
        time = 'survival_months'

target_column_discrete = target_column + '_discrete'

# List of columns names which will be dropped from feature set before fitting the model
target_columns = ['cancer_specific_mortality', 'death_from_other_causes', 'bcr', 'mts']

def objective(space):

    # Create StratifiedKFold object.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

    auc_cv = []

    for train_index, val_index in skf.split(data_train_copy, data_train_copy.loc[:,target_column]):
        
        # get train and validation sets
        train = data_train_copy.loc[train_index,:]
        val = data_train_copy.loc[val_index,:]
        
        # explode train set
        train_exploded = explode_data(train.copy(), min_time=1, max_time=max_time, time=time, target_column=target_column)

        # explode validation set with cumulative events
        val_exploded_cumulative = explode_data(val.copy(), max_time=max_time, min_time=max_time, cum_event=True, time=time, target_column=target_column)

        # define model
        model = RandomForestClassifier(criterion = space['criterion'], max_depth = int(space['max_depth']),
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = int(space['n_estimators']), 
                                 )
        
        # columns to drop before fitting
        x_columns_to_drop = [target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts', 'patient_id']
        x_columns_to_drop.extend(target_columns)

        # train X and y
        X_train = train_exploded.drop(x_columns_to_drop, axis=1)
        y_train = train_exploded[target_column_discrete]

        # fit model
        model.fit(X_train, y_train)

        # columns to drop before making predictions
        x_columns_to_drop = [target_column_discrete, 'survival_months', 'survival_months_bcr', 'survival_months_mts']
        x_columns_to_drop.extend(target_columns)

        # add predicted probabilities
        df_val_exploded_pred = add_predict_probabilities_optimized(val_exploded_cumulative.drop(x_columns_to_drop,axis=1), target_column, model)

        # calculate auc
        # AUC for each cumulative slice
        # Months at which we'll check the AUC's
        months = list(range(6, max_time, 6))

        val_auc_stats = []
        for month in months:
            # --- Validation data ---
            # Selecting a subset of data based on the months
            select = (df_val_exploded_pred['survival_time_discrete'] == month) & pd.notna(df_val_exploded_pred[target_column+'_cumulative'])
            sub_dat = df_val_exploded_pred[select]

            # If in the sliced data there's a event, calculate AUC metric,
            # otherwise assign NaN value
            if sub_dat[target_column+'_cumulative'].max() == 1:
                fpr, tpr, thresholds = roc_curve(sub_dat[target_column+'_cumulative'], sub_dat['cumulative_hazard'])
                auc_stat = auc(fpr, tpr)
            else:
                auc_stat = float('NaN')
            val_auc_stats.append(auc_stat)

        auc_mean = np.nanmean(val_auc_stats)

        auc_cv.append(auc_mean)

    auc_cv_mean = np.nanmean(auc_cv)

    # We aim to maximize auc, therefore we return it as a negative value
    return {'loss': -auc_cv_mean, 'status': STATUS_OK }

In [117]:
trials = Trials()
best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 80, #80
            trials = trials)
best

100%|██████████| 80/80 [38:23<00:00, 28.79s/trial, best loss: -0.8941533300609423]


{'criterion': 0,
 'max_depth': 5.008074817422085,
 'max_features': 1,
 'min_samples_leaf': 0.0348778696429207,
 'min_samples_split': 0.10209281579299412,
 'n_estimators': 334.0017140265728}