# Import libraries

In [None]:
# Importing the necessary libraries and data
import pandas as pd
import os
import numpy as np
from pathlib import Path
import itertools

import plotly.express as px
import plotly.io as pio
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter


# sklearn
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV, train_test_split,  GridSearchCV, KFold
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils.class_weight import compute_sample_weight
from yellowbrick.classifier import ROCAUC, ClassificationReport, ClassPredictionError, ConfusionMatrix

# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.over_sampling import RandomOverSampler

# machine learning models
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

#interpretml
from interpret import show
from interpret.glassbox._ebm._research import *
from interpret.data import ClassHistogram
from interpret.perf import ROC
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.blackbox import ShapKernel, LimeTabular, MorrisSensitivity

from interpret.provider import InlineProvider
from interpret import set_visualize_provider

set_visualize_provider(InlineProvider())

import warnings
warnings.filterwarnings("ignore")


In [None]:
# plt.rcParams['font.sans-serif'] = ['SimHei']
# # Replace the Chinese fonts with the ones supported by your system.
# plt.rcParams['axes.unicode_minus'] = False  # Used to display the negative sign normally


# Dataset Configuration

In [None]:
# Project root; the relative data and output paths used by this notebook are
# resolved against it once the working directory has been switched.
work_dir = 'E:/Disk E/Grand Blue/Research studies/HDL_multiclass'
os.chdir(work_dir)

# Make sure the output folders for tables and figures are present.
for _out_dir in ('./tables_lipid', './images_lipid/png', './images_lipid/pdf'):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# Load the source table and show how many rows fall under each tertile label.
df_prim = pd.read_excel('data/TertileClass_Gensini_data.xlsx')
print(df_prim['Gensini_tertile_label'].value_counts())

# Drop the columns treated as redundant here: the Gensini score and label
# columns, plus 'HDL-2b' and 'HDL-3'.
df_prim.drop(
    columns=['Gensini_total_Score', 'Gensini_tertile_label', 'HDL-2b', 'HDL-3'],
    inplace=True,
)
df_prim.info()


In [None]:
# Outcome column and the display names of its classes.
# The sequential numerical codes are 0, 1, 2.
target_col = 'Gensini_tertile'
class_names = ['Low', 'Moderate', 'High']

# Work on a copy so that df_prim stays untouched.
df = df_prim.copy()
df.info()

# Class balance of the outcome column.
print('df: ', Counter(df[target_col]))


In [None]:
# df.columns.to_list()

# Model training (function definition)

In [None]:
# Defining multiple functions

def model_tuning(clf, model_name, search_space):
    """
    Split the data, grid-search `clf` on the training part and return the best model.

    The module-level DataFrame `df` is split into features and the
    `target_col` column, then into a stratified 70/30 train/test split
    (random_state=42). `clf` is wrapped in a one-step Pipeline (step name
    'clf') and tuned with GridSearchCV using a shuffled 10-fold KFold and
    macro-averaged F1 as the refit metric.

    Parameters
    ----------
    clf : estimator
        Classifier placed in the 'clf' pipeline step.
    model_name : str
        Name printed to identify the model in the log output.
    search_space : dict or list of dict
        Passed unchanged as `param_grid`; keys must use the 'clf__' prefix.

    Returns
    -------
    tuple
        (clf_best_model, X_train, X_test, y_train, y_test), where
        `clf_best_model` is the 'clf' step of the refit best pipeline.
    """
    X = df.drop(target_col, axis=1)
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y
    )
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
    print('y_train: ', Counter(y_train))
    print('y_test: ', Counter(y_test))

    print('\n')
    print(model_name)

    pipe = Pipeline([('clf', clf)])

    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scoring = {'F1_macro': metrics.make_scorer(metrics.f1_score, average='macro')}

    grid = GridSearchCV(
        pipe,
        param_grid=search_space,
        cv=kfold,
        scoring=scoring,
        refit='F1_macro',
        verbose=1,
        n_jobs=-1
    )

    grid_model = grid.fit(X_train, y_train)

    print('\nThe optimal parameters are：')
    print(grid_model.best_params_)

    # This section prints the fitted estimator itself, so it is labelled as
    # the model rather than repeating the "parameters" heading.
    print('\nThe optimal model is：')
    clf_best_model = grid_model.best_estimator_['clf']
    print(clf_best_model)

    return clf_best_model, X_train, X_test, y_train, y_test


# Function to create directories if they don't exist
def create_directory(path):
    """Create the directory `path`, including missing parents.

    Calling it for a directory that already exists is a no-op. Using
    `exist_ok=True` avoids the gap between a separate existence check and the
    creation call. If `path` exists but is not a directory, `os.makedirs`
    raises FileExistsError instead of silently returning.
    """
    os.makedirs(path, exist_ok=True)


# Function to save classification reports to an Excel file
def save_classification_reports_to_excel(results, file_path):
    """Write every report in `results` to its own sheet of one Excel workbook.

    `results` maps a model name to a report dict. Each report is turned into
    a DataFrame, transposed, and stored on a sheet named after the model.
    The workbook at `file_path` is written with the openpyxl engine.
    """
    with pd.ExcelWriter(file_path, engine='openpyxl') as workbook:
        for sheet_title, report_dict in results.items():
            report_table = pd.DataFrame(report_dict).T
            report_table.to_excel(workbook, sheet_name=sheet_title)

# Function to create and save individual plots
def _save_visualizer_plot(visualizer_cls, file_stem, clf_best_model, X_train, y_train,
                          X_test, y_test, save_path, **visualizer_kwargs):
    """Draw one Yellowbrick visualizer on its own figure and save it as PNG and PDF."""
    fig, ax = plt.subplots()
    visualizer = visualizer_cls(clf_best_model, classes=class_names, ax=ax, **visualizer_kwargs)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.finalize()
    # Blank the title that finalize() put on the axes; the saved figures carry none.
    ax.set_title('')
    for ext in ('png', 'pdf'):
        fig.savefig(f'{save_path}/{ext}/{file_stem}.{ext}', dpi=300)
    plt.close(fig)


def save_individual_plots(clf_best_model, X_train, y_train, X_test, y_test, model_name, save_path):
    """Save ROC AUC, classification report, prediction error and confusion matrix plots.

    Each plot is fitted on the training data, scored on the test data and
    written to `{save_path}/png/` and `{save_path}/pdf/` as
    `<model>_<plot>.<ext>`, where `<model>` is `model_name` lower-cased with
    spaces replaced by underscores.

    Parameters
    ----------
    clf_best_model : estimator
        Model handed to each Yellowbrick visualizer.
    X_train, y_train : training features and labels used by `fit`.
    X_test, y_test : test features and labels used by `score`.
    model_name : str
        Used to build the output file names.
    save_path : str
        Root folder; its `png` and `pdf` subfolders are created if missing.
    """
    # Create the target folders up front so savefig cannot fail on a missing path.
    for ext in ('png', 'pdf'):
        os.makedirs(f'{save_path}/{ext}', exist_ok=True)

    file_prefix = model_name.lower().replace(" ", "_")

    # (visualizer class, file-name suffix, extra constructor arguments)
    plot_specs = (
        (ROCAUC, 'roc_auc', {}),
        (ClassificationReport, 'class_report', {'support': True}),
        (ClassPredictionError, 'pred_error', {}),
        (ConfusionMatrix, 'conf_matrix', {}),
    )
    for visualizer_cls, suffix, extra_kwargs in plot_specs:
        _save_visualizer_plot(
            visualizer_cls,
            f'{file_prefix}_{suffix}',
            clf_best_model,
            X_train,
            y_train,
            X_test,
            y_test,
            save_path,
            **extra_kwargs,
        )

# Function to create combined subplots
def create_combined_subplot(fig_title, subplot_func):
    """Draw one panel per entry of the module-level `models` list in a two-column grid.

    For every `(clf, model_name, search_space)` entry the model is tuned with
    `model_tuning` and the result is drawn by `subplot_func` on its own axes.

    Parameters
    ----------
    fig_title : str
        Title placed above the whole figure.
    subplot_func : callable
        Called as `subplot_func(clf_best_model, X_train, y_train, X_test,
        y_test, ax, model_name)`.

    Returns
    -------
    matplotlib.figure.Figure
        The assembled figure.

    Raises
    ------
    ValueError
        If `models` is empty.
    """
    num_models = len(models)
    if num_models == 0:
        raise ValueError('`models` is empty; there is nothing to plot.')

    rows = (num_models + 1) // 2
    # squeeze=False keeps `axes` two-dimensional even for a single row, so the
    # [row, column] indexing below also works with one or two models.
    fig, axes = plt.subplots(
        rows, 2, figsize=(12, 6 * rows), sharex=False, sharey=False, squeeze=False
    )

    for i, (clf, model_name, search_space) in enumerate(models):
        row, col = divmod(i, 2)
        ax = axes[row, col]
        clf_best_model, X_train, X_test, y_train, y_test = model_tuning(clf, model_name, search_space)
        subplot_func(clf_best_model, X_train, y_train, X_test, y_test, ax, model_name)

    # Hide empty subplots
    if num_models % 2 != 0:
        fig.delaxes(axes[rows - 1, 1])

    fig.tight_layout()
    # y=1.02 puts the title just above the top edge of the figure; saving with
    # bbox_inches='tight' keeps it inside the written file.
    fig.suptitle(fig_title, y=1.02)
    return fig

# Function for ROC AUC
def plot_rocauc(clf_best_model, X_train, y_train, X_test, y_test, ax, model_name):
    """Draw the Yellowbrick ROC AUC plot for `clf_best_model` on `ax`.

    The visualizer is fitted on the training data, scored on the test data,
    finalized, and the axes title is then set to 'ROC AUC - <model_name>'.
    """
    viz = ROCAUC(clf_best_model, ax=ax, classes=class_names)
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    viz.finalize()

    # Set after finalize() so this title is the one left on the axes.
    panel_title = f'ROC AUC - {model_name}'
    ax.set_title(panel_title)

# Function for Classification Report
def plot_class_report(clf_best_model, X_train, y_train, X_test, y_test, ax, model_name):
    """Draw the Yellowbrick classification report (with support) on `ax`.

    The visualizer is fitted on the training data, scored on the test data,
    finalized, and the axes title is then set to
    'Classification Report - <model_name>'.
    """
    viz = ClassificationReport(clf_best_model, ax=ax, classes=class_names, support=True)
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    viz.finalize()

    # Set after finalize() so this title is the one left on the axes.
    panel_title = f'Classification Report - {model_name}'
    ax.set_title(panel_title)

# Function for Prediction Error
def plot_pred_error(clf_best_model, X_train, y_train, X_test, y_test, ax, model_name):
    """Draw the Yellowbrick class prediction error plot on `ax`.

    The visualizer is fitted on the training data, scored on the test data,
    finalized, and the axes title is then set to
    'Prediction Error - <model_name>'.
    """
    viz = ClassPredictionError(clf_best_model, ax=ax, classes=class_names)
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    viz.finalize()

    # Set after finalize() so this title is the one left on the axes.
    panel_title = f'Prediction Error - {model_name}'
    ax.set_title(panel_title)

# Function for Confusion Matrix
def plot_conf_matrix(clf_best_model, X_train, y_train, X_test, y_test, ax, model_name):
    """Draw the Yellowbrick confusion matrix for `clf_best_model` on `ax`.

    The visualizer is fitted on the training data, scored on the test data,
    finalized, and the axes title is then set to
    'Confusion Matrix - <model_name>'.
    """
    viz = ConfusionMatrix(clf_best_model, ax=ax, classes=class_names)
    viz.fit(X_train, y_train)
    viz.score(X_test, y_test)
    viz.finalize()

    # Set after finalize() so this title is the one left on the axes.
    panel_title = f'Confusion Matrix - {model_name}'
    ax.set_title(panel_title)


# New function to extract weighted avg metrics and plot heatmap
def plot_weighted_avg_heatmap(results, save_path):
    """Plot the weighted-average precision, recall and F1-score of every model as a heatmap.

    `results` maps a model name to a classification-report dict; only its
    'weighted avg' entry is read. The heatmap (one row per model) is saved
    to `{save_path}/png/weighted_avg_heatmap.png` and
    `{save_path}/pdf/weighted_avg_heatmap.pdf`, then shown.
    """
    # Heatmap column label -> key inside the 'weighted avg' entry.
    metric_keys = {'Precision': 'precision', 'Recall': 'recall', 'F1-score': 'f1-score'}

    table_rows = []
    for model_name, report in results.items():
        weighted = report['weighted avg']
        row = {'Model': model_name}
        for column_label, report_key in metric_keys.items():
            row[column_label] = weighted[report_key]
        table_rows.append(row)

    df_ave = pd.DataFrame(table_rows, columns=['Model', *metric_keys]).set_index('Model')

    plt.figure(figsize=(10, 6))
    # To force black annotation text, pass annot_kws={"color": "black"}.
    sns.heatmap(df_ave, annot=True, cmap='YlOrRd', fmt='.3f')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    for ext in ('png', 'pdf'):
        plt.savefig(f'{save_path}/{ext}/weighted_avg_heatmap.{ext}', dpi=300)
    plt.show()


# Initial model name reference

In [None]:
# Building the basic model
# log_clf = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42)


# log_clf = LogisticRegression(random_state=42)
# lasso_reg = Lasso(random_state=42)
# elastic_net = ElasticNet(random_state=42)
# random_forest_clf = RandomForestClassifier(random_state=42)
# extra_trees_clf = ExtraTreesClassifier(random_state=42)

# Create Decision Tree classifer object
# dt_clf = DecisionTreeClassifier(random_state=42)

# svm_clf = SVC(probability=True, random_state=42)
# mlp_clf = MLPClassifier(max_iter=10000, random_state=42)


# xgb_clf = XGBClassifier(nthread=-1, random_state=42)

# ebm_clf = ExplainableBoostingClassifier(greedy_ratio=0.5, random_state=42)

# # lightgbm for classification
# lgb_clf = LGBMClassifier(random_state=42)

# # catboost for classification
# catb_clf = CatBoostClassifier(verbose=0, n_estimators=100)
# catb_clf = CatBoostClassifier(random_state=42)

# Model hyperparameter configuration

In [None]:
# Logistic regression tuning grid.
# Every candidate entry is commented out, so the list holds one empty dict.
# The 'clf__' prefix matches the 'clf' step name of the Pipeline built in
# model_tuning, where this list is passed as `param_grid`.
search_space_log = [
    {
        #       'clf__solver' : ['newton-cg', 'lbfgs', 'liblinear'],
        #       'clf__penalty' : ['l2'],
        #       'clf__C' : [0.001,0.01,0.1,1,10,100,1000]
    }
]

# XGBoost tuning grid (search space for the grid search).
# Every candidate entry is commented out, so the list holds one empty dict.
# NOTE: the 'fs__' entries would need an 'fs' pipeline step; the Pipeline
# built in model_tuning only has a 'clf' step.
search_space_xgb = [
    {
        # 'clf__n_estimators': [5000],
        # 'clf__learning_rate': [0.01, 0.1, 0.2, 0.3],
        # 'clf__max_depth': [3, 4,5],
        # 'clf__colsample_bytree': [i / 10.0 for i in range(1, 3)],
        # 'clf__gamma': [i / 10.0 for i in range(3)],
        # 'fs__score_func': [chi2],
        # 'fs__k': [10]
    }
]

# RandomForestClassifier tuning grid.
# Every candidate entry is commented out, so the list holds one empty dict.
search_space_rf = [
    {
    # 'clf__bootstrap': [True],
    # 'clf__max_depth': [80, 90, 100, 110],
    # 'clf__max_features': [2, 3],
    # 'clf__min_samples_leaf': [3, 4, 5],
    # 'clf__min_samples_split': [8, 10, 12],
    # 'clf__n_estimators': [100, 200, 300, 1000]
    }
]



# Explainable Boosting Machine (EBM) tuning grid (search space for the grid search).
# Every candidate entry is commented out, so the list holds one empty dict.
search_space_ebm = [
    {
        # "clf__learning_rate": [0.001, 0.005, 0.01, 0.03],
        # "clf__interactions": [5, 10, 15],
        # "clf__max_interaction_bins": [10, 15, 20],
        # "clf__max_rounds": [5000, 10000, 15000, 20000],
        # "clf__min_samples_leaf": [2, 3, 5],
        # "clf__max_leaves": [3, 5, 10],

    }
]


# LightGBM tuning grid (search space for the grid search).
# Every candidate entry is commented out, so the list holds one empty dict.
# NOTE(review): no entry of the active `models` list visible in this file
# uses this grid - confirm whether it is still needed.
search_space_lgb = [
    {
        # 'clf__n_estimators': [5000],
        # 'clf__learning_rate': [0.1],
        # 'clf__max_depth': range(1,11),
        # 'clf__boosting_type': ['gbdt', 'dart', 'goss']


    }
]


# CatBoost tuning grid (search space for the grid search).
# Every candidate entry is commented out, so the list holds one empty dict.
# NOTE(review): no entry of the active `models` list visible in this file
# uses this grid - confirm whether it is still needed.
search_space_catb = [
    {
        # 'clf__n_estimators': [10, 50, 100, 500, 1000, 5000],
        # 'clf__learning_rate': [0.1],
        # 'clf__max_depth': range(1,11)


    }
]


In [None]:
# # EBM Model tuning
# # Define our search space for grid search
# search_space_ebm = [
#     {   "clf__max_bins": [1024, 4096, 16384, 65536],
#      # "clf__max_interaction_bins": [8, 16, 32, 64, 128, 256],
#      # "clf__outer_bags": [50],
#      # "clf__learning_rate": [0.02, 0.01, 0.005, 0.0025],
#      "clf__greedy_ratio": [0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 4.0],
#      # "clf__cyclic_progress": [0.0, 0.5, 1.0],
#      # "clf__smoothing_rounds": [0, 50, 100, 200, 500, 1000, 2000, 4000],
#      "clf__max_leaves": [3, 4]  
#     }
# ]

# model_tuning(ebm_clf, 'Explainable Boosting Machine', search_space_ebm )

# Model Comparison Type Configuration

In [None]:
# # List of models and their corresponding search spaces
# models = [
#     (LogisticRegression(multi_class='multinomial', solver='lbfgs',penalty= 'l2', random_state=42), 'Logistic Regression', search_space_log),
#     (XGBClassifier(nthread=-1, max_depth=4,  random_state=42), 'XGBoost', search_space_xgb),
#     (ExplainableBoostingClassifier(greedy_ratio=0.5, random_state=42), 'Explainable Boosting Machine', search_space_ebm),
#     (RandomForestClassifier(max_features = 3, random_state=42), 'Random Forest', search_space_rf)
    
# ]

In [None]:
# Models to compare. Each entry is a tuple of
# (estimator with fixed hyperparameters, display name, tuning grid).
# NOTE(review): `multi_class` (LogisticRegression) and `nthread` (XGBClassifier)
# appear to be deprecated in recent library releases - confirm against the
# installed versions.
models = [
    (
        LogisticRegression(
            penalty='l2', C=1, solver='lbfgs', multi_class='multinomial', random_state=42
        ),
        'Logistic Regression',
        search_space_log,
    ),
    (
        XGBClassifier(max_depth=3, nthread=-1, random_state=42),
        'XGBoost',
        search_space_xgb,
    ),
    (
        ExplainableBoostingClassifier(greedy_ratio=4.0, random_state=42),
        'Explainable Boosting Machine',
        search_space_ebm,
    ),
    (
        RandomForestClassifier(max_depth=80, random_state=42),
        'Random Forest',
        search_space_rf,
    ),
]

# Machine learning model training (calling functions)

In [None]:
#--------------------call function --------------------
# Driver: tune every model, save its individual plots and classification
# report, then build the combined comparison figures.

# Set seaborn theme to "darkgrid"
sns.set_theme(style="darkgrid")


# Model name -> classification report dict computed on the test split.
classification_reports = {}
for clf, model_name, search_space in models:
    # Save individual plots for each model
    clf_best_model, X_train, X_test, y_train, y_test = model_tuning(clf, model_name, search_space)
    save_individual_plots(clf_best_model, X_train, y_train, X_test, y_test, model_name, './images_lipid')

    # Generate classification report
    y_pred = clf_best_model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, target_names=class_names)
    classification_reports[model_name] = report

# Save all classification reports to a single Excel file
save_classification_reports_to_excel(classification_reports, './tables_lipid/classification_reports.xlsx')


# Plot and save the heatmap of weighted average metrics
plot_weighted_avg_heatmap(classification_reports, './images_lipid')


# NOTE: each create_combined_subplot call runs model_tuning again for every model.
# The combined figures place their suptitle at y=1.02, i.e. above the top edge
# of the canvas, so they are saved with bbox_inches='tight' to keep the title
# inside the written file.

# Create and save combined ROC AUC subplot
fig_rocauc = create_combined_subplot('Combined ROC AUC', plot_rocauc)
fig_rocauc.savefig('./images_lipid/png/combined_rocauc_lipid.png', dpi=300, bbox_inches='tight')
fig_rocauc.savefig('./images_lipid/pdf/combined_rocauc_lipid.pdf', dpi=300, bbox_inches='tight')

# Create and save combined Classification Report subplot
fig_class_report = create_combined_subplot('Combined Classification Report', plot_class_report)
fig_class_report.savefig('./images_lipid/png/combined_class_report_lipid.png', dpi=300, bbox_inches='tight')
fig_class_report.savefig('./images_lipid/pdf/combined_class_report_lipid.pdf', dpi=300, bbox_inches='tight')

# Create and save combined Prediction Error subplot
fig_pred_error = create_combined_subplot('Combined Prediction Error', plot_pred_error)
fig_pred_error.savefig('./images_lipid/png/combined_pred_error_lipid.png', dpi=300, bbox_inches='tight')
fig_pred_error.savefig('./images_lipid/pdf/combined_pred_error_lipid.pdf', dpi=300, bbox_inches='tight')

# Create and save combined Confusion Matrix subplot
fig_conf_matrix = create_combined_subplot('Combined Confusion Matrix', plot_conf_matrix)
fig_conf_matrix.savefig('./images_lipid/png/combined_conf_matrix_lipid.png', dpi=300, bbox_inches='tight')
fig_conf_matrix.savefig('./images_lipid/pdf/combined_conf_matrix_lipid.pdf', dpi=300, bbox_inches='tight')

plt.show()
