In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from warnings import filterwarnings
filterwarnings('ignore')
import gc
import os

### MODELS
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

### sklearn utils
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (precision_score, recall_score, confusion_matrix,
                             classification_report, f1_score)
from sklearn.pipeline import Pipeline, make_pipeline

In [None]:
# Project base directory; replace the placeholder with a real location.
root_dir = "/your/path"

# Every evaluation artifact (figures, matrices, result sheets) is written below this directory.
evaluation_dir = f"{root_dir}/evaluations"

# Load the feature table; the TUMOR_TYPE column holds the class label.
dataset = pd.read_excel(f"{root_dir}/data/augmented_file.xlsx")

#dataset = entire_parameters.drop(["T2_Parenchyma", "FLAIR_Parenchyma","T1_Parenchyma", "T1CE_Parenchyma", "DWI_Parenchyma", "ADC_Parenchyma"], axis=1)

# Shorten the full tumor names to two-letter codes; any other value is left untouched.
dataset["TUMOR_TYPE"] = dataset["TUMOR_TYPE"].replace({
    "MEDULLOBLASTOMA": "MB",
    "EPENDYMOMA": "EP",
    "PILOCYTIC ASTROCYTOMA": "PA",
    "GLIOMA": "BG",
})

In [None]:
# Earlier variant kept for reference (restricted the data to the MB / EP classes only):
# for df in datasets:
#     df["TUMOR_TYPE"].replace({"MEDULLOBLASTOMA": "MB", "EPENDYMOMA": "EP", "PILOCYTIC ASTROCYTOMA": "PA", "GLIOMA": "BG"}, inplace=True)
#     df.loc[(df["TUMOR_TYPE"] != 'MB') & (df["TUMOR_TYPE"] != 'EP'), "TUMOR_TYPE"] = np.nan
#     df.dropna(subset=["TUMOR_TYPE"], inplace=True)


# Keep at most the first 25 rows of every tumor type, in file order.
tumor_type = dataset["TUMOR_TYPE"]
filtered_mb = dataset.loc[tumor_type == "MB"].iloc[:25]
filtered_ep = dataset.loc[tumor_type == "EP"].iloc[:25]
filtered_pa = dataset.loc[tumor_type == "PA"].iloc[:25]
filtered_bg = dataset.loc[tumor_type == "BG"].iloc[:25]

# Stack the per-class slices (MB, EP, PA, BG order) into the working dataset.
filtered_dataset = pd.concat(
    [filtered_mb, filtered_ep, filtered_pa, filtered_bg]
)

In [None]:
# Render the per-class subset (up to 25 rows per tumor type) as this cell's output.
filtered_dataset

In [None]:
# Show how many rows were kept for each tumor type.
filtered_dataset["TUMOR_TYPE"].value_counts()

In [None]:
## name dataframes for each case
# The plotting helpers and the evaluation loop read `df.name` / `data.name` to build
# output file names and log headers, so every evaluated DataFrame needs this attribute.
# NOTE(review): `name` is an ad-hoc instance attribute, not pandas metadata; it is
# presumably not carried over to DataFrames derived from this one — confirm before
# adding any transformation between here and the evaluation loop.
filtered_dataset.name = "MB-EP-PA-BG"

# Datasets iterated by the evaluation loop (a single case here).
datasets = [filtered_dataset]

In [None]:
# Build every base classifier with a fixed seed and otherwise default
# hyper-parameters (CatBoost is silenced; the SVM uses a linear kernel).
# Each estimator also receives a `name` attribute: exec_model dispatches on it
# and the plotting helpers use it in output file names.

xgb_estimator = XGBClassifier(random_state=42)
xgb_estimator.name = 'XGBoost'

rf_estimator = RandomForestClassifier(random_state=42)
rf_estimator.name = 'Random Forest'

dt_estimator = DecisionTreeClassifier(random_state=42)
dt_estimator.name = 'Decision Tree'

gbc_estimator = GradientBoostingClassifier(random_state=42)
gbc_estimator.name = 'Gradient Boosting'

cb_estimator = CatBoostClassifier(verbose = False, random_state=42)
cb_estimator.name = 'CatBoost'

ada_estimator = AdaBoostClassifier(random_state=42)
ada_estimator.name = 'AdaBoost'

svm_estimator = SVC(random_state=42, kernel='linear')
svm_estimator.name = 'SVM'

lr_estimator = LogisticRegression(random_state=42)
lr_estimator.name = 'Logistic Regression'

In [None]:
# Logistic regression needs scaled inputs inside the ensemble, so it is wrapped
# in its own min-max scaling pipeline before being handed to the voter.
lr_pipeline = Pipeline(steps=[
    ('scale', MinMaxScaler()),
    ('model', lr_estimator),
])

# Soft-voting ensemble over four of the base learners.
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', xgb_estimator),
        ('rf', rf_estimator),
        ('lr', lr_pipeline),
        ('cb', cb_estimator),
    ],
    voting='soft',
)
voting_clf.name = 'Voting Classifier'

# Evaluation order; the bare logistic regression (not lr_pipeline) is listed
# here because exec_model adds its own scaling step for non-voting models.
models = [
    xgb_estimator,
    rf_estimator,
    dt_estimator,
    gbc_estimator,
    cb_estimator,
    ada_estimator,
    svm_estimator,
    lr_estimator,
    voting_clf,
]

# Seeds used for the repeated train/test splits.
random_states = [1, 42, 123, 1234, 12345]

In [None]:
def create_dir(path, state_dir, results_path, data_name_dir, figures_path, matrices_path):
    """Ensure that every output directory of an evaluation run exists.

    Inputs (all are directory paths):
      path: root directory of the evaluation outputs
      state_dir: directory for one random state
      results_path: directory that receives the result spreadsheets
      data_name_dir: directory for one dataset / random-state combination
      figures_path: directory for feature-importance figures
      matrices_path: directory for confusion-matrix figures

    Directories that already exist are left untouched and missing parent
    directories are created as well. Genuine failures (for example missing
    permissions, or a regular file occupying one of the paths) raise OSError
    here instead of being silently ignored and surfacing later when a figure
    or spreadsheet cannot be written.
    """
    for directory in (path, state_dir, results_path, data_name_dir,
                      figures_path, matrices_path):
        # exist_ok=True keeps repeated calls idempotent.
        os.makedirs(directory, exist_ok=True)

In [None]:
def print_feature_importances(model, df, path):
    """Plot and save the feature importances of a fitted tree-based model.

    Inputs:
      model: fitted estimator exposing `feature_importances_` and a `name` attribute
      df: the dataframe used to train the model; it must contain the
          'TUMOR_TYPE' label column and carry a `name` attribute
      path: directory the plots are saved to

    The bar chart is written twice, as '<df.name>_<model.name>_FeatureImportances.pdf'
    and '.png'. Nothing is returned.
    """
    feature_imp = pd.Series(
        model.feature_importances_,
        index=df.drop('TUMOR_TYPE', axis=1).columns,
    ).sort_values(ascending=False)

    sns.set_style('dark', {'axes.grid': False})
    fig = plt.figure(figsize=(8, 8))
    try:
        with sns.plotting_context(rc={"axes.labelsize": 16, "xtick.labelsize": 14, "ytick.labelsize": 14}):
            sns.barplot(x=feature_imp, y=feature_imp.index, palette="coolwarm", alpha=1)
        print("\n")
        plt.xlabel("Relative scores", fontsize=13, fontweight="bold")
        plt.ylabel("Features", fontsize=13, fontweight="bold")
        plt.tight_layout()

        base_name = os.path.join(path, df.name + '_' + model.name + '_FeatureImportances')
        for extension in ('.pdf', '.png'):
            plt.savefig(base_name + extension, bbox_inches="tight",
                        pad_inches=0.1, transparent=False, dpi=300)
    finally:
        # This function is called once per model and random state; without an
        # explicit close every call would leave an open figure in memory.
        plt.close(fig)

In [None]:

def print_feature_coefficients(model, df, path):
    """Plot and save the feature coefficients of a fitted linear model.

    Intended for the logistic regression and SVM models, and usable for any
    model that has a `coef_` attribute when `feature_importances_` is not
    available.

    Inputs:
      model: fitted estimator exposing `coef_` and a `name` attribute
      df: the dataframe used to train the model; it must contain the
          'TUMOR_TYPE' label column and carry a `name` attribute
      path: directory the plots are saved to

    The chart is written as '<df.name>_<model.name>_impTable.png' and '.pdf'.
    Nothing is returned.

    NOTE(review): only the first row of `coef_` is plotted. For a multi-class
    fit `coef_` presumably holds one row per class (or class pair), so the
    remaining rows are not shown — confirm that this is intended.
    """
    fig = plt.figure(figsize=(12, 8))
    try:
        features = df.drop('TUMOR_TYPE', axis=1).columns
        importances = pd.DataFrame(data={
            'Attribute': features,
            'Importance': model.coef_[0]
        })

        # The builtin `float` replaces the `np.float` alias, which no longer
        # exists in current NumPy releases.
        importances["Importance"] = importances["Importance"].to_numpy().astype(float)
        importances = importances.sort_values(by='Importance', ascending=False)
        positions = range(len(importances))

        sns.set_style('dark', {'axes.grid': False})
        # The bars are drawn once (the original drew the identical bars twice).
        with sns.plotting_context(rc={"axes.labelsize": 16, "xtick.labelsize": 14, "ytick.labelsize": 14}):
            plt.barh(positions, importances["Importance"],
                     color=sns.cubehelix_palette(18, start=.5, rot=-.5), align='center')
        plt.title('Feature Coefficients')
        plt.yticks(positions, importances["Attribute"], fontsize=13)
        plt.xlabel("Relative weight scores", fontsize=13, fontweight="bold")

        # SAVE BOTH PNG AND PDF
        base_name = os.path.join(path, df.name + '_' + model.name + '_impTable')
        for extension in ('.png', '.pdf'):
            plt.savefig(base_name + extension, bbox_inches="tight",
                        pad_inches=0.1, transparent=False, dpi=300)
    finally:
        # Close (not just clear) the figure so repeated calls do not pile up
        # open figures in memory.
        plt.close(fig)

In [None]:
def print_results(y_test, y_pred, df, model, path):
    """Print the classification metrics and save the confusion matrix plot.

    Inputs:
      y_test: the true labels
      y_pred: the predicted labels
      df: the dataframe used to train the model (its `name` attribute is used
          in the output file names)
      model: the trained model (its `name` attribute is used in the output
          file names)
      path: directory the confusion-matrix plots are saved to

    Macro-averaged precision, recall and F1 plus the full classification
    report are printed. The heatmap is written as
    '<df.name>_<model.name>_matrix.png' and '.pdf'. Nothing is returned.
    """
    print('Precision: %.4f' % precision_score(y_test, y_pred, average='macro'))
    print('Recall: %.4f' % recall_score(y_test, y_pred, average='macro'))
    print('F1 Score: %.4f' % f1_score(y_test, y_pred, average='macro'))
    print('\n')
    print(classification_report(y_test, y_pred))
    print('\n')

    # Use one explicit label set for both the matrix and the tick labels, so
    # the axes stay aligned even if a class shows up only in the predictions.
    labels = np.union1d(y_test, y_pred)

    fig = plt.figure(figsize=(11, 8))
    try:
        with sns.plotting_context(rc={"axes.labelsize": 16, "xtick.labelsize": 24, "ytick.labelsize": 24}):
            table = sns.heatmap(confusion_matrix(y_test, y_pred, labels=labels),
                                xticklabels=labels,
                                yticklabels=labels,
                                annot=True, fmt='.0f', cmap='Purples',
                                annot_kws={
                                    'fontsize': 24,
                                    'fontweight': 'bold'})

        table.set_xlabel('\nPredicted Values', fontdict=dict(weight='bold', size=24))
        table.set_ylabel('Actual Values', fontdict=dict(weight='bold', size=24))
        plt.yticks(rotation=0)

        img_path = os.path.join(path, df.name + '_' + model.name + '_matrix.png')
        pdf_path = os.path.join(path, df.name + '_' + model.name + '_matrix.pdf')

        plt.savefig(img_path, bbox_inches="tight",
                    pad_inches=0.1, transparent=False, dpi=300)
        plt.savefig(pdf_path, bbox_inches="tight",
                    pad_inches=0.1, transparent=False, dpi=300)
    finally:
        # Close (not just clear) the figure so repeated calls do not pile up
        # open figures in memory.
        plt.close(fig)

In [None]:
def exec_model(model, df, path, state):
    """Train one model on a stratified split of `df` and report its results.

    Inputs:
      model: estimator carrying a `name` attribute that selects the handling below
      df: dataframe with the features and the 'TUMOR_TYPE' label column; it
          must carry a `name` attribute
      path: root directory for all evaluation outputs
      state: random state used for the train/test split

    Returns:
      importance_df: one-row dataframe (one column per feature) holding the
          model's `feature_importances_`, or zeros for the logistic
          regression, SVM and voting classifier
      scores: list [precision, recall, f1], all macro-averaged
    """
    features = df.drop(["TUMOR_TYPE"], axis=1)
    target = df["TUMOR_TYPE"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target,
        test_size=0.35,
        stratify=target,
        random_state=state,
    )

    # XGBoost is rebuilt from scratch and trained on integer-encoded labels.
    encodes_labels = model.name == 'XGBoost'
    if encodes_labels:
        model = XGBClassifier(random_state=42)
        model.name = 'XGBoost'
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)

    # The voting classifier is fitted as is; every other model is placed
    # behind a standard scaler. Fitting the pipeline fits `model` itself.
    if model.name == 'Voting Classifier':
        fitted = model
    else:
        fitted = make_pipeline(StandardScaler(), model)
    fitted.fit(X_train, y_train)
    y_pred = fitted.predict(X_test)

    if encodes_labels:
        # Map the integer predictions back to the original string labels.
        y_pred = le.inverse_transform(y_pred)

    scores = [
        metric(y_test, y_pred, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    ]

    ################## CREATE FILE PATH IF NOT EXIST #######################
    state_dir = os.path.join(path, f"random_state-{state}")
    data_name_dir = os.path.join(state_dir, f"{df.name}_rs{state}")
    results_path = os.path.join(path, "results")
    figures_path = os.path.join(data_name_dir, "figures")
    matrices_path = os.path.join(data_name_dir, "confusion_matrices")

    create_dir(path, state_dir, results_path, data_name_dir, figures_path, matrices_path)

    ######################################################################
    # PRINT RESULTS
    print_results(y_test, y_pred, df, model, matrices_path)

    n_features = X_train.columns.size
    importance = np.zeros(n_features,)

    if model.name in ('Logistic Regression', 'SVM'):
        # Coefficient-based models: plotted, but reported as zeros.
        print_feature_coefficients(model, df, figures_path)
    elif model.name != 'Voting Classifier':
        print_feature_importances(model, df, figures_path)
        importance = np.array(model.feature_importances_)

    importance_df = pd.DataFrame(
        data=importance.reshape(1, n_features),
        columns=X_train.columns,
    )

    return importance_df, scores

In [None]:
####################################################################################################
# Note: This is the main loop over the random states, datasets and models.
#       It calls the exec_model function to train and evaluate every model.
#       For each (random state, dataset) pair it writes one spreadsheet with the scores and
#       feature importances of all models to "<evaluation_dir>/results/<data.name>_RS<state>.xlsx".
#       The "results" directory itself is created by exec_model (via create_dir).
####################################################################################################

for state in random_states:
    for data in datasets:
        precisions = []
        recalls = []
        f1_scores = []
        # One single-row importance frame per model, combined once after the model loop.
        importance_rows = []

        for model in models:
            print("\n*********************************************************************************" +
                    "\n-------------- " + data.name + "  -  " + model.name + " --------------\n")

            importance_df, scores = exec_model(model, data, evaluation_dir, state)
            precisions.append(scores[0])
            recalls.append(scores[1])
            f1_scores.append(scores[2])
            importance_rows.append(importance_df)

        importances = pd.concat(importance_rows, axis=0).reset_index(drop=True)

        # The short labels must stay in the same order as the `models` list.
        score_table = pd.DataFrame({"Models": ["XGB", "RF", "DT", "GB", "CB", "ADA", "SVM", "LR", "VOTING"],
                                    "Precision": precisions,
                                    "Recall": recalls,
                                    "F1_Score": f1_scores})

        score_table = pd.concat([score_table, importances], axis = 1)

        print(score_table)
        out_path = os.path.join(evaluation_dir, 'results', data.name + f'_RS{state}.xlsx')
        # `to_excel` is called without `encoding`: current pandas releases no
        # longer accept that keyword.
        score_table.to_excel(out_path)
        del score_table, importances

        gc.collect()