### SAVE THE KID

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import scipy.stats as st
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sns
%matplotlib inline

from warnings import filterwarnings
filterwarnings('ignore')
import gc
import os

### MODELS
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier # requires additional library
from xgboost import XGBClassifier

# TRIED BUT NOT USED
#from lightgbm import LGBMClassifier
#from sklearn.neural_network import MLPClassifier
#from sklearn.neighbors import KNeighborsClassifier


### sklearn utils
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (precision_score, recall_score, confusion_matrix,
                             classification_report, accuracy_score, f1_score)
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_curve,auc
from sklearn.preprocessing import LabelEncoder

### Get Dataset

In [None]:
# Project root; the evaluation directory and the dataset path are derived from it.
root_dir = "/your/path/to/this/project"

# Directory that receives all evaluation outputs (figures, matrices, score tables).
evaluation_dir = f"{root_dir}/evaluations"

# Load the full dataset from the Excel workbook into a single dataframe.
data_file = root_dir + "/your/path/to/your/data.xlsx"
df = pd.read_excel(data_file)

In [None]:
# Shorten the full tumour names to the two-letter codes used everywhere below.
# Values that are not listed here are left exactly as they are.
tumor_abbreviations = {
    "MEDULLOBLASTOMA": 'MB',
    "EPENDYMOMA": 'EP',
    "PILOCYTIC ASTROCYTOMA": 'PA',
    "GLIOMA": 'BG',
}
df["TUMOR_TYPE"] = df["TUMOR_TYPE"].replace(tumor_abbreviations)

In [None]:
## One dataframe per classification case: every pair of tumour types,
## plus the full four-class problem.
tumor_col = df["TUMOR_TYPE"]
pfdf_MB_EP = df[tumor_col.isin(['MB', 'EP'])]
pfdf_MB_PA = df[tumor_col.isin(['MB', 'PA'])]
pfdf_MB_BG = df[tumor_col.isin(['MB', 'BG'])]
pfdf_EP_PA = df[tumor_col.isin(['EP', 'PA'])]
pfdf_EP_BG = df[tumor_col.isin(['EP', 'BG'])]
pfdf_PA_BG = df[tumor_col.isin(['PA', 'BG'])]
# The four-class case is the unfiltered dataframe itself (same object, not a copy).
pfdf_MB_EP_PA_BG = df

In [None]:
## Tag every case dataframe with a short name; the helper functions below read
## `df.name` when they build output file names.
## NOTE: pfdf_MB_EP_PA_BG is the same object as `df`, so `df` receives the name too.
for frame, label in (
    (pfdf_MB_EP, 'MB_EP'),
    (pfdf_MB_PA, 'MB_PA'),
    (pfdf_MB_BG, 'MB_BG'),
    (pfdf_EP_PA, 'EP_PA'),
    (pfdf_EP_BG, 'EP_BG'),
    (pfdf_PA_BG, 'PA_BG'),
    (pfdf_MB_EP_PA_BG, 'MB_EP_PA_BG'),
):
    frame.name = label

# Short identifiers for the models; exec_model() maps each one to an estimator.
svm, lsvm, logreg, rf = "SVM", "LSVM", "LR", "RF"
dt, gbc, cbc, xgb = "DT", "GBC", "CBC", "XGB"

In [None]:
# Model identifiers, in the order they are trained and reported.
models = [
    svm,
    lsvm,
    logreg,
    rf,
    dt,
    gbc,
    cbc,
    xgb,
]

# Every classification case to evaluate: the six pairs, then all four classes.
dataset = [
    pfdf_MB_EP,
    pfdf_MB_PA,
    pfdf_MB_BG,
    pfdf_EP_PA,
    pfdf_EP_BG,
    pfdf_PA_BG,
    pfdf_MB_EP_PA_BG,
]

In [None]:
############################################################################
# NOTE: This is a helper function to print the feature coefficients
#       of the logistic regression model.
#
# Inputs:
#   model: the trained model
#   df: the dataframe used to train the model
#   path: the main path to save the plot to
#
#   This function can be used for any model that has a coef_ attribute, 
#   when feature_importances_ is not available.
############################################################################

def print_feature_coefficients(model, df, path):
    """Plot the first row of a fitted model's ``coef_`` and save it as PNG and PDF.

    Inputs:
        model: fitted estimator exposing ``coef_``; its ``name`` attribute is
               used in the output file names.
        df:    dataframe used to train the model; every column except
               'TUMOR_TYPE' is treated as a feature, and ``df.name`` is used
               in the output file names.
        path:  directory the two figure files are written to.

    NOTE: only ``model.coef_[0]`` is plotted, so when ``coef_`` has several
    rows the remaining rows are not shown.
    """
    features = df.drop('TUMOR_TYPE', axis=1).columns
    importances = pd.DataFrame(data={
        'Attribute': features,
        # Builtin float replaces the np.float alias, which was removed in NumPy 1.24.
        'Importance': np.asarray(model.coef_[0], dtype=float),
    })
    importances = importances.sort_values(by='Importance', ascending=False)

    n_features = len(importances)
    positions = range(n_features)

    sns.set_style('dark', {'axes.grid' : False})
    with sns.plotting_context(rc={"axes.labelsize":16, "xtick.labelsize":14, "ytick.labelsize":14}):
        # Draw the bars once, with one palette colour per feature.
        plt.barh(positions, importances["Importance"],
                 color=sns.cubehelix_palette(n_features, start=.5, rot=-.5),
                 align='center')
    plt.title('Feature Coefficients')
    plt.yticks(positions, importances["Attribute"], fontsize = 13)
    plt.xlabel("Relative weight scores", fontsize = 13, fontweight="bold")

    # SAVE BOTH PNG AND PDF
    base_path = os.path.join(path, df.name + '_' + model.name + '_impTable')
    for extension in ('.png', '.pdf'):
        plt.savefig(base_path + extension, bbox_inches="tight",
                    pad_inches=0.1, transparent=False, dpi=300)

    plt.show()

In [None]:
############################################################################################################
# Note: This is a helper function to print the feature importances for traditional ML models.
#
# Inputs:
#   model: the trained model
#   df: the dataframe used to train the model
#   path: the main path to save the plot to
############################################################################################################

def print_feature_importances(model, df, path):
    """Plot a fitted model's ``feature_importances_`` and save it as PDF and PNG.

    Inputs:
        model: fitted estimator exposing ``feature_importances_``; its ``name``
               attribute is used in the output file names.
        df:    dataframe used to train the model; every column except
               'TUMOR_TYPE' is treated as a feature, and ``df.name`` is used
               in the output file names.
        path:  directory the two figure files are written to.
    """
    feature_names = df.drop('TUMOR_TYPE', axis=1).columns
    ranked = pd.Series(model.feature_importances_, index = feature_names)
    ranked = ranked.sort_values(ascending = False)

    sns.set_style('dark', {'axes.grid' : False})
    font_sizes = {"axes.labelsize":16, "xtick.labelsize":14, "ytick.labelsize":14}
    with sns.plotting_context(rc=font_sizes):
        sns.barplot(x = ranked, y = ranked.index, palette = "coolwarm", alpha=1)
    print("\n")
    plt.xlabel("Relative scores", fontsize = 13, fontweight="bold")
    plt.ylabel("Features", fontsize = 13, fontweight="bold")

    # Same file stem for both formats; the PDF is written first, then the PNG.
    file_stem = os.path.join(path, df.name + '_' + model.name + '_FeatureImportances')
    for extension in ('.pdf', '.png'):
        plt.savefig(file_stem + extension, bbox_inches="tight",
                    pad_inches=0.1, transparent=False, dpi=300)
    plt.show()

In [None]:
## This is a helper function to print the results of the model
# Inputs:
#   y_test: the true labels
#   y_pred: the predicted labels
#   df: the dataframe used to train the model
#   model: the trained model
#   path: the main path to save the plot to


def print_results(y_test, y_pred, df, model, path):
    """Print classification metrics and save the confusion matrix as PNG and PDF.

    Inputs:
        y_test: the true labels
        y_pred: the predicted labels
        df:     the dataframe used to train the model (only ``df.name`` is read,
                for the output file names)
        model:  the trained model (only ``model.name`` is read, for the output
                file names)
        path:   directory the two confusion-matrix files are written to
    """
    print('Precision: %.4f' % precision_score(y_test, y_pred, average='macro'))
    print('Recall: %.4f' % recall_score(y_test, y_pred, average='macro'))
    print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))
    print('F1 Score: %.4f' % f1_score(y_test, y_pred, average='macro'))
    print('\n')
    print(classification_report(y_test, y_pred))
    print('\n')

    # The tick labels must cover every class present in either vector. Taking
    # them from y_pred alone gives too few (and misaligned) labels whenever a
    # class is never predicted. The same label order is passed to
    # confusion_matrix so rows/columns and ticks are guaranteed to match.
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
    matrix = confusion_matrix(y_test, y_pred, labels=labels)

    with sns.plotting_context(rc={"axes.labelsize":16, "xtick.labelsize":14, "ytick.labelsize":14}):
        table = sns.heatmap(matrix,
                            xticklabels = labels,
                            yticklabels = labels,
                            annot=True, fmt='.0f', cmap='Purples',
                            annot_kws={
                            'fontsize': 16,
                            'fontweight': 'bold'})

    table.set_xlabel('\nPredicted Values', fontdict=dict(weight='bold', size=14))
    table.set_ylabel('Actual Values', fontdict=dict(weight='bold', size=14))
    plt.yticks(rotation=0)

    img_path = os.path.join(path, df.name + '_' + model.name + '_matrix.png')
    pdf_path = os.path.join(path, df.name + '_' + model.name + '_matrix.pdf')

    plt.savefig(img_path, bbox_inches="tight",
                pad_inches=0.1, transparent=False, dpi=300)
    plt.savefig(pdf_path, bbox_inches="tight",
                pad_inches=0.1, transparent=False, dpi=300)
    plt.show()

In [None]:
# Note: This is a helper function to obtain the results of a trained model.
#      It splits the data into train and test sets, trains the model, and prints the results.

def exec_model(model, df, path, state):
    """Train one model on a stratified split of ``df`` and report its results.

    Inputs:
        model: model identifier, one of "SVM", "LSVM", "LR", "RF", "DT", "GBC",
               "CBC" or "XGB". An already-built estimator carrying a ``name``
               attribute is used as-is.
        df:    dataframe with a 'TUMOR_TYPE' target column; every other column
               is a feature. ``df.name`` is used in output paths.
        path:  root directory for the evaluation outputs.
        state: random state used for the split and for the estimator.

    Returns:
        (importance_df, scores) where ``importance_df`` is a one-row dataframe
        with one column per feature (the model's ``feature_importances_`` for
        RF/GBC/XGB/DT/CBC, zeros for every other model) and ``scores`` is
        ``[precision, recall, accuracy, f1]`` with macro averaging for
        precision, recall and F1.

    Raises:
        ValueError: if ``model`` is a string that is not a known identifier.
    """
    features = df.drop(["TUMOR_TYPE"], axis = 1)
    target = df["TUMOR_TYPE"]
    X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                        test_size = 0.45,
                                                        stratify = target,
                                                        random_state = state)

    ##################################################################################
    # Initialize the model here because we loop over different random states.
    # NOTE: the "name" attribute stores the model identifier on the estimator so
    #       that results can later be saved to files belonging to that model.
    ##################################################################################
    builders = {
        "SVM": lambda: SVC(kernel='linear', random_state=state),
        "LSVM": lambda: LinearSVC(random_state=state),
        "LR": lambda: LogisticRegression(random_state=state),
        "RF": lambda: RandomForestClassifier(random_state=state),
        "DT": lambda: DecisionTreeClassifier(random_state=state),
        "GBC": lambda: GradientBoostingClassifier(random_state=state),
        "CBC": lambda: CatBoostClassifier(verbose = False, random_state=state),
        "XGB": lambda: XGBClassifier(random_state=state),
    }
    if isinstance(model, str):
        if model not in builders:
            raise ValueError(
                f"Unknown model identifier {model!r}; expected one of {sorted(builders)}")
        model_key = model
        model = builders[model_key]()
        model.name = model_key

    # The string labels are encoded to integers before fitting XGB, and the
    # predictions are decoded back afterwards.
    label_encoder = None
    if model.name == 'XGB':
        label_encoder = LabelEncoder()
        y_train = label_encoder.fit_transform(y_train)

    pipe = make_pipeline(StandardScaler(), model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    if label_encoder is not None:
        y_pred = label_encoder.inverse_transform(y_pred)

    scores = [
        precision_score(y_test, y_pred, average='macro'),
        recall_score(y_test, y_pred, average='macro'),
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred, average='macro'),
    ]

    ################## CREATE FILE PATH IF NOT EXIST #######################
    state_dir = os.path.join(path, "random_state-" + f"{state}")
    data_name_dir = os.path.join(state_dir, df.name + f"_rs{state}")
    figures_path = os.path.join(data_name_dir, "figures")
    matrices_path = os.path.join(data_name_dir, "confusion_matrices")

    # makedirs creates every missing parent and tolerates existing directories,
    # while still surfacing real failures (e.g. permissions) instead of hiding
    # them until a later savefig call fails.
    os.makedirs(figures_path, exist_ok=True)
    os.makedirs(matrices_path, exist_ok=True)

    ######################################################################
    # PRINT RESULTS
    print_results(y_test, y_pred, df, model, matrices_path)

    n_features = X_train.columns.size
    # Default row: all zeros, used for models without feature_importances_.
    importance = np.zeros((1, n_features))

    if model.name in ('RF', 'GBC', 'XGB', 'DT', 'CBC'):
        print_feature_importances(model, df, figures_path)
        importance = np.reshape(np.array(model.feature_importances_), (1, n_features))
    elif model.name in ('LR', 'SVM', 'LSVM'):
        # Coefficients are plotted only; the returned row stays at zero.
        print_feature_coefficients(model, df, figures_path)

    importance_df = pd.DataFrame(data=importance, columns=X_train.columns)

    return importance_df, scores

In [None]:
####################################################################################################
# Note: This is the main loop over the random states, datasets and models.
#       It calls the exec_model function to train and evaluate each model.
#       For every (random state, dataset) pair it writes one Excel file with the scores
#       and feature importances of all models to
#       <evaluation_dir>/random_state-<state>/<dataset name>_RS<state>.xlsx
####################################################################################################

random_state_list = [1,42,123,1234,12345]

# Labels written to the "Models" column, keyed by model identifier. Identifiers
# without an entry are written unchanged.
model_labels = {"SVM": "svm", "LSVM": "lsvm", "LR": "logreg", "RF": "rf",
                "DT": "dt", "GBC": "gbc", "CBC": "cbc", "XGB": "xgb"}
score_columns = ["Precision", "Recall", "Accuracy", "F1_Score"]

for state in random_state_list:
    for data in dataset:
        score_rows = []
        importance_frames = []

        for model in models:
            print("\n*********************************************************************************" +
                    "\n-------------- " + data.name + "  -  " + model + " --------------\n")

            importance_df, scores = exec_model(model, data, evaluation_dir, state)
            score_rows.append(scores)
            importance_frames.append(importance_df)

        # One row per model, in the order of `models`.
        score_table = pd.DataFrame(score_rows, columns=score_columns)
        score_table.insert(0, "Models", [model_labels.get(name, name) for name in models])

        # Concatenate once instead of growing an empty frame row by row.
        importances = pd.concat(importance_frames, axis=0).reset_index(drop=True)
        score_table = pd.concat([score_table, importances], axis = 1)

        print(score_table)
        out_path = os.path.join(evaluation_dir, f'random_state-{state}', data.name + f'_RS{state}.xlsx')
        # `encoding` is not passed: pandas 2.0 removed that argument from to_excel.
        score_table.to_excel(out_path)
        del score_table, importances

        gc.collect()