# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.feature_selection import RFECV
import matplotlib.ticker as ticker
import matplotlib.pylab as plt
import warnings
warnings.filterwarnings('ignore')

# Read the input table; the CSV is decoded as GBK.
dt = pd.read_csv('yourdata.csv', encoding='gbk')

# Outcome column and the full list of candidate predictor columns.
target = 'Dead'
predictors = [
    'Gender', 'Age', 'Ann Arbor staging', 'B symptom',
    'Extranodal involvement', 'ECOG', 'IPI', 'Rash',
    'Edema/Serous effusion', 'Hb', 'PLT', 'ALC', 'AEC', 'ALB', 'GLB', 'LDH',
]

seed = 2025
X = dt[predictors]
y = dt[target]

# Hold out 25% of the rows as the test set, stratified on the outcome.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=seed,
)

# Report the size and the positive-label count (sum of the outcome column) of each split.
for template, value in (
    ('Total number of train instance: {}', X_train.shape[0]),
    ('Total number of positive train instance: {}', y_train.sum()),
    ('Total number of test instance: {}', X_test.shape[0]),
    ('Total number of positive test instance: {}', y_test.sum()),
):
    print(template.format(value))

# Count of label 0 divided by count of label 1 in the training split.
# The lookup is by label, so this assumes the outcome is coded 0/1 — TODO confirm.
class_counts = y_train.value_counts()
ratio = float(class_counts[0]) / class_counts[1]
print(ratio)


# Feature subset fed to the model. Despite the name, these columns are not
# passed to CatBoost as categorical features anywhere in the visible code;
# the list is only used to select columns from X_train / X_test.
cat_cols = [
    'Age',
    'B symptom',
    'ECOG',
    'Rash',
    'Edema/Serous effusion',
    'Hb',
    'PLT',
    'ALB',
]

# Hyper-parameters of the CatBoost classifier, grouped by purpose.
catboost_params = dict(
    # Objective and reported metric.
    loss_function="Logloss",
    eval_metric="AUC",
    # Boosting schedule and tree shape.
    iterations=1000,
    learning_rate=0.01,
    depth=4,
    # Regularisation / randomisation.
    colsample_bylevel=0.1,
    l2_leaf_reg=20,
    random_strength=800,
    # Class weighting (1 = positive class is not up-weighted).
    scale_pos_weight=1,
    # Overfitting detector settings.
    # NOTE(review): no eval_set is passed to fit() in the visible code —
    # confirm these two settings have any effect.
    od_type="Iter",
    early_stopping_rounds=800,
    # Reproducibility and logging.
    random_seed=42,
    silent=True,
)

cat_clf = CatBoostClassifier(**catboost_params)

from dhcdatalearn.model_evaluation import EvaModelFusion

# Evaluation helper imported from dhcdatalearn.model_evaluation.
# NOTE(review): what train() fits internally is not visible here — confirm.
eva = EvaModelFusion()
eva.train(X_train[cat_cols], y_train)

# Parallel lists: a display name and the estimator it labels.
names = ['CatBoost']
sampling_methods = [cat_clf]

# Output the evaluation metric results of the optimized CatBoost model,
# first on the training set and then on the test set.
for features, labels in (
    (X_train[cat_cols], y_train),
    (X_test[cat_cols], y_test),
):
    eva.evalu_models(names, sampling_methods, features, labels)


# SHAP
import shap
import matplotlib.pyplot as plt

def shap_plot(X_train, X_test, clf, cols, name, *, y=None, plot_type=None, dpi=1000):
    """Fit ``clf`` and save a SHAP summary plot of the test rows to ``name``.

    The classifier is (re)fitted on ``X_train[cols]``, SHAP values are
    computed for ``X_test[cols]`` with ``shap.TreeExplainer`` and the
    resulting summary plot is written as a PNG file.

    Args:
        X_train: Training feature table; only ``cols`` are used.
        X_test: Test feature table; only ``cols`` are used.
        clf: Tree-based estimator exposing ``fit``; it is refitted in place.
        cols: Column names selecting the model's input features.
        name: Output path of the PNG image.
        y: Training labels aligned with ``X_train``. When omitted, the
            module-level ``y_train`` is used, which is what this function
            always did before the parameter existed.
        plot_type: Forwarded to ``shap.summary_plot``; ``None`` keeps SHAP's
            default plot type.
        dpi: Resolution used both for the figure and for the saved file.
    """
    # Prefer explicitly supplied labels; fall back to the global for
    # backward compatibility with existing five-argument calls.
    labels = y_train if y is None else y

    # NOTE(review): shap.summary_plot auto-sizes the current figure by
    # default, so the figsize given here is presumably overridden — confirm.
    plt.figure(figsize=(25, 20), dpi=dpi)
    clf.fit(X_train[cols], labels)
    explainer = shap.TreeExplainer(clf)
    test_features = X_test[cols]
    shap_values = explainer.shap_values(test_features)
    # show=False keeps the figure open so that it can be saved below.
    shap.summary_plot(shap_values, test_features, plot_type=plot_type, show=False)
    plt.savefig(name, format='png', dpi=dpi, bbox_inches='tight', facecolor='white')

# Save the SHAP summary plot (default plot type) of the CatBoost model on the test set.
shap_plot(
    X_train,
    X_test,
    clf=cat_clf,
    cols=cat_cols,
    name="shap(Catoost).png",
)


def shap_plot(X_train, X_test, clf, cols, name, *, y=None, plot_type="bar", dpi=1000):
    """Fit ``clf`` and save a SHAP bar summary plot of the test rows to ``name``.

    This definition replaces any earlier ``shap_plot`` in the module; it
    differs only in that the summary plot is drawn with ``plot_type="bar"``
    by default.

    Args:
        X_train: Training feature table; only ``cols`` are used.
        X_test: Test feature table; only ``cols`` are used.
        clf: Tree-based estimator exposing ``fit``; it is refitted in place.
        cols: Column names selecting the model's input features.
        name: Output path of the PNG image.
        y: Training labels aligned with ``X_train``. When omitted, the
            module-level ``y_train`` is used, which is what this function
            always did before the parameter existed.
        plot_type: Forwarded to ``shap.summary_plot``; defaults to ``"bar"``.
        dpi: Resolution used both for the figure and for the saved file.
    """
    # Prefer explicitly supplied labels; fall back to the global for
    # backward compatibility with existing five-argument calls.
    labels = y_train if y is None else y

    # NOTE(review): shap.summary_plot auto-sizes the current figure by
    # default, so the figsize given here is presumably overridden — confirm.
    plt.figure(figsize=(25, 20), dpi=dpi)
    clf.fit(X_train[cols], labels)
    explainer = shap.TreeExplainer(clf)
    test_features = X_test[cols]
    shap_values = explainer.shap_values(test_features)
    # show=False keeps the figure open so that it can be saved below.
    shap.summary_plot(shap_values, test_features, plot_type=plot_type, show=False)
    plt.savefig(name, format='png', dpi=dpi, bbox_inches='tight', facecolor='white')

# Save the bar-type SHAP summary plot of the CatBoost model on the test set.
shap_plot(
    X_train,
    X_test,
    clf=cat_clf,
    cols=cat_cols,
    name="shap_bar(Catoost).png",
)


# LIME
import lime
from lime import lime_tabular
from lime import lime_image

# Explain one test-set prediction of the CatBoost model with LIME.
model = cat_clf
# Refit here so this section does not depend on an earlier cell having
# trained the classifier.
model.fit(X_train[cat_cols], y_train)

explainer = lime_tabular.LimeTabularExplainer(
    training_data=X_train[cat_cols].to_numpy(),
    feature_names=list(cat_cols),
    class_names=["Alive", "Dead"],
    mode='classification',
    # LIME draws random perturbations; seeding makes the explanation
    # reproducible between runs.
    random_state=seed,
)

# Position (not index label) of the test row to explain.
idx = 4
# LIME documents data_row as a 1-D numpy array and indexes it with integer
# positions internally, so hand it an ndarray rather than a pandas Series
# whose index consists of column names.
instance = X_test[cat_cols].iloc[idx].to_numpy()
data_test = instance.reshape(1, -1)
prediction = model.predict(data_test)[0]
y_true = y_test.to_numpy()[idx]
print('Sample{} in the test set, model prediction is {}, true class is {}'.format(idx, prediction, y_true))
print("\n")

exp = explainer.explain_instance(
    data_row=instance,
    predict_fn=model.predict_proba,
)

# Label 1 is reported as 'Dead'; any other label as 'Alive'.
cur_y = 'Dead' if y_true == 1 else 'Alive'

print('True outcome：',cur_y)
exp.show_in_notebook(show_table=True)