# **Import Libraries**


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import category_encoders as ce

In [3]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [4]:
import shap
from sklearn.decomposition import PCA

In [5]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.under_sampling import NearMiss

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

In [7]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from skopt import BayesSearchCV

# **Import Data**

In [8]:
# Load the raw BankChurners table.
# NOTE(review): this is an absolute, machine-specific Windows path; point it at
# the local copy of BankChurners.csv before running.
customers_data = pd.read_csv(r"D:\NCKH\CHURN PREDICTION\CCP 03\Code\BankChurners.csv")

In [None]:
customers_data.sample(5)

In [None]:
customers_data.columns

In [11]:
# Columns removed before any analysis: the client identifier and the two
# precomputed Naive Bayes classifier columns shipped with the CSV.
columns_to_drop = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
customers_data.drop(columns=columns_to_drop, inplace=True)

In [12]:
demography = ['Gender','Dependent_count','Education_Level','Marital_Status','Income_Category']


In [13]:
# Small-range numeric features that are drawn together in one boxplot later on.
number = ['Dependent_count','Total_Relationship_Count','Contacts_Count_12_mon',
        'Avg_Utilization_Ratio','Total_Amt_Chng_Q4_Q1','Total_Ct_Chng_Q4_Q1','Months_Inactive_12_mon']
# Every numeric column of the table. Selecting by the 'number' dtype family is
# more robust than testing `dtype != 'O'`, which also lets through any
# non-object, non-numeric dtype (e.g. dedicated string or categorical dtypes).
numeric_cols = customers_data.select_dtypes(include='number').columns.tolist()

# **EDA**

## Demography

In [None]:
# Bar chart of how many customers hold each card category, with the count
# written above every bar.
card_counts = customers_data['Card_Category'].value_counts()

ax = card_counts.plot(kind='bar', figsize=(10, 7), color='#213A57')

plt.title('Distribution of Card Categories')
plt.xlabel('Card Category')
plt.ylabel('Number of Customers')
plt.xticks(rotation=0)
plt.tight_layout()

for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f'{int(bar_height)}',
        (bar_center, bar_height),
        ha='center',
        va='bottom',
        fontsize=10,
    )

plt.show()

In [None]:
# Joint scatter of transaction count against its Q4/Q1 change, coloured by the
# churn label.
sns.jointplot(
    data=customers_data,
    x="Total_Trans_Ct",
    y="Total_Ct_Chng_Q4_Q1",
    hue="Attrition_Flag",
    kind="scatter",
)

# Decorate whichever axes is current once jointplot has returned; the negative
# y offset places the title underneath the plot, like a caption.
current_ax = plt.gca()
current_ax.set_title("Distribution of bank customer churn label", y=-0.2, fontsize=14, fontweight='bold')
current_ax.set_xlabel("Total_Trans_Ct")
current_ax.set_ylabel("Total_Ct_Chng_Q4_Q1")

plt.show()

In [None]:
# One stand-alone bar chart per demographic attribute.
for col in demography:
    level_counts = customers_data[col].value_counts()
    level_counts.plot(kind='bar')
    plt.show()

In [None]:
def barplot_category(customers_data, column, ax):
    """Draw a horizontal bar chart of the value counts of one column.

    Args:
        customers_data: DataFrame holding the column.
        column: Name of the column whose values are counted.
        ax: Axes-like object the bars, title and x label are drawn on.
    """
    counts = customers_data[column].value_counts()
    categories, quantities = counts.index, counts.values
    ax.barh(categories, quantities)
    ax.set_title(f'Barplot of {column}')
    ax.set_xlabel('Quantity')

# 3x2 grid: the demographic attributes fill the panels row by row and the
# card category goes in the bottom-right panel.
fig, ax = plt.subplots(3, 2, figsize=(12, 15))
for col, panel in zip(demography, ax.flat):
    barplot_category(customers_data, col, panel)
barplot_category(customers_data, 'Card_Category', ax[2, 1])
plt.subplots_adjust(hspace=0.5, wspace=0.4)

plt.tight_layout()
plt.show()

## Data distribution



In [None]:
# Horizontal boxplots of the small-range numeric features on one shared axis.
number = ['Dependent_count','Total_Relationship_Count','Contacts_Count_12_mon',
        'Avg_Utilization_Ratio','Total_Amt_Chng_Q4_Q1','Total_Ct_Chng_Q4_Q1','Months_Inactive_12_mon']
plt.figure(figsize=(10, 6))
sns.boxplot(data=customers_data[number], orient='h', palette="Blues")
plt.show()

In [None]:
def boxplot_numerical(customers_data, column, ax):
    """Draw a horizontal boxplot of one column on the given axes.

    Args:
        customers_data: DataFrame holding the column.
        column: Name of the column to plot.
        ax: Axes-like object the boxplot and title are drawn on.
    """
    ax.boxplot(customers_data[column], vert=False)
    ax.set_title(f'Boxplot Chart of {column}')
# One boxplot per numeric column on a two-column grid. The row count is derived
# from the number of columns so the cell does not raise IndexError when
# numeric_cols holds more than 14 entries.
n_grid_cols = 2
n_grid_rows = max(1, (len(numeric_cols) + n_grid_cols - 1) // n_grid_cols)
# Keep the original proportion (20 inches of height for 7 rows).
fig, ax = plt.subplots(n_grid_rows, n_grid_cols,
                       figsize=(12, 20 * n_grid_rows / 7), squeeze=False)
for col, panel in zip(numeric_cols, ax.flat):
    boxplot_numerical(customers_data, col, panel)
# Hide panels that received no column (odd number of numeric columns).
for unused_panel in ax.ravel()[len(numeric_cols):]:
    unused_panel.set_visible(False)
plt.subplots_adjust(hspace=0.5, wspace=0.4)

In [None]:
# Histogram (50 bins) of each large-range monetary / transaction feature,
# titled with its human-readable name.
hist_var = ['Credit_Limit','Total_Revolving_Bal','Avg_Open_To_Buy','Total_Trans_Amt','Total_Trans_Ct']
hist_name = ['Credit Limit', 'Total Revolving Bal', 'Avg Open To Buy', 'Total Trans Amt', 'Total Trans Ct']
for col, title in zip(hist_var, hist_name):
    plt.hist(customers_data[col], bins=50)
    plt.title(title)
    plt.show()

## Distribution of Churn

In [None]:
def plot_churn_rate(customers_data, column, ax):
    """Draw a stacked bar chart of customer counts per category, split by churn flag.

    Args:
        customers_data: DataFrame with `column` and an 'Attrition_Flag' column.
        column: Name of the categorical column placed on the x axis.
        ax: Matplotlib axes to draw on.
    """
    # fill_value=0 keeps integer counts for (category, flag) pairs that never occur.
    churn_data = customers_data.groupby([column, 'Attrition_Flag']).size().unstack(fill_value=0)
    churn_data.plot(kind='bar', stacked=True, ax=ax)
    ax.set_ylabel('Quantity')
    ax.set_title(f'Number of churned customer by {column}')

    # Build the legend from the columns actually plotted rather than assuming a
    # fixed order, so the labels stay correct for both the raw text flags and
    # the 0/1-encoded flags.
    legend_names = {
        'Attrited Customer': 'Churn (1)',
        'Existing Customer': 'No Churn (0)',
        1: 'Churn (1)',
        0: 'No Churn (0)',
    }
    labels = [
        legend_names.get(flag.strip() if isinstance(flag, str) else flag, str(flag))
        for flag in churn_data.columns
    ]
    ax.legend(title='Churn', labels=labels)

# 2x3 grid of churn breakdowns, filled row by row.
fig, ax = plt.subplots(2, 3, figsize=(15, 10))

churn_breakdown_columns = [
    'Gender',
    'Dependent_count',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
]
for breakdown_column, panel in zip(churn_breakdown_columns, ax.flat):
    plot_churn_rate(customers_data, breakdown_column, panel)

plt.tight_layout()
plt.show()

## Correlation

In [94]:
# Numeric-only view of the table, used for the correlation plots below.
customers_data_numerical = customers_data[numeric_cols]

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
corr_matrix = customers_data_numerical.corr()

heatmap_style = dict(annot=True, fmt='.3f', cmap='coolwarm', linewidths=0.5)
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, **heatmap_style)

In [None]:
# Scatter of the Credit_Limit / Avg_Open_To_Buy pair.
credit_ax = sns.scatterplot(data=customers_data_numerical, x='Credit_Limit', y='Avg_Open_To_Buy')
credit_ax.set_xlabel('Credit Limit')
credit_ax.set_ylabel('Average Open to Buy')
credit_ax.set_title('The correlational relationship between Credit_Limit and Avg_Open_To_Buy');

In [None]:
customers_data_numerical.columns

In [None]:
# Scatter of the Total_Trans_Ct / Total_Trans_Amt pair.
trans_ax = sns.scatterplot(data=customers_data_numerical, x='Total_Trans_Ct', y='Total_Trans_Amt')
trans_ax.set_xlabel('Total_Trans_Ct')
trans_ax.set_ylabel('Total_Trans_Amt')
trans_ax.set_title('The correlational relationship between Total_Trans_Ct and Total_Trans_Amt');

# **DATA PREPROCESSING**

In [17]:
dataset = customers_data.copy()

In [None]:
dataset.sample()

## Remove Outliers

In [19]:
# Outlier filter: keep only rows whose absolute z-score is below 3 in every
# numeric column.
# Work on `dataset` (the preprocessing copy) consistently instead of mixing it
# with `customers_data`, and select numeric columns by dtype family rather than
# by `dtype != 'O'`.
numeric_cols_outlier = dataset.select_dtypes(include='number').columns.tolist()
z_scores = np.abs(stats.zscore(dataset[numeric_cols_outlier]))

# .copy() makes the filtered frame independent, so the column assignments in
# the encoding cells do not trigger SettingWithCopyWarning.
dataset_no_outlier = dataset[(z_scores < 3).all(axis=1)].copy()

In [20]:
# Discard the row labels left over from the outlier filter and renumber from 0.
dataset_no_outlier.reset_index(drop=True, inplace=True)

In [None]:
dataset_no_outlier.shape

In [None]:
dataset_no_outlier.sample()

## Encoder

In [None]:
# Encode the target: strip surrounding whitespace, then map the two text labels
# to 1 (attrited) and 0 (existing). Any other value becomes NaN.
churn_codes = {'Attrited Customer': 1, 'Existing Customer': 0}
dataset_no_outlier['Attrition_Flag'] = (
    dataset_no_outlier['Attrition_Flag'].str.strip().map(churn_codes)
)

In [None]:
# Binary-encode gender: 'M' -> 1, 'F' -> 0.
gender_codes = {'M': 1, 'F': 0}
dataset_no_outlier['Gender'] = dataset_no_outlier['Gender'].map(gender_codes)

## Categorical encoding (hashing encoder)

In [25]:
pd.set_option('display.max_columns', None)

In [None]:
# colsx=['Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
customers_data['Income_Category'].value_counts()

In [None]:
# Replace the four multi-level categorical columns with hashed feature columns.
hashed_columns = ['Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']
he = ce.HashingEncoder(cols=hashed_columns)
dataset_no_outlier_hash = he.fit_transform(dataset_no_outlier)
dataset_no_outlier_hash

In [None]:
dataset_no_outlier_hash.shape

## Scale

In [29]:
# Standardise age and tenure (zero mean, unit variance). The scaler is fitted
# on all rows of the frame.
standard_Columns = ['Customer_Age', 'Months_on_book']

scaler = StandardScaler()
scaler.fit(dataset_no_outlier_hash[standard_Columns])
scaled_values = scaler.transform(dataset_no_outlier_hash[standard_Columns])

dataset_no_outlier_hash[standard_Columns] = scaled_values

In [30]:
# Min-max scale the balance / transaction columns. This rebinds `scaler`, so
# the StandardScaler from the previous cell is no longer reachable by name.
scale_columns = ['Total_Revolving_Bal', 'Total_Trans_Amt', 'Total_Trans_Ct','Avg_Open_To_Buy']

scaler = MinMaxScaler()
scaler.fit(dataset_no_outlier_hash[scale_columns])
dataset_no_outlier_hash[scale_columns] = scaler.transform(dataset_no_outlier_hash[scale_columns])

In [None]:
dataset_no_outlier_hash.shape

## Process High Correlation

In [32]:
# Drop Credit_Limit and keep Avg_Open_To_Buy (the pair is plotted against each
# other in the Correlation section).
dataset_no_outlier_hash.drop(columns=['Credit_Limit'], inplace=True)

In [33]:
# Collapse the two transaction columns into a single principal component,
# stored as PCA_Trans.
trans_columns = ['Total_Trans_Ct', 'Total_Trans_Amt']
X = dataset_no_outlier_hash[trans_columns]

pca = PCA(n_components=1)
pca_result = pca.fit_transform(X)

dataset_no_outlier_hash['PCA_Trans'] = pca_result

In [None]:
# Visual check of how the new component tracks the original transaction count.
pca_ax = sns.scatterplot(data=dataset_no_outlier_hash, x='Total_Trans_Ct', y='PCA_Trans')
pca_ax.set_xlabel('Total_Trans_Ct')
pca_ax.set_ylabel('PCA_Trans')
pca_ax.set_title('The correlational relationship between Total_Trans_Ct and PCA_Trans');

In [None]:
# Correlation coefficient between the original count and the component.
trans_ct = dataset_no_outlier_hash['Total_Trans_Ct']
pca_trans = dataset_no_outlier_hash['PCA_Trans']
correlation = trans_ct.corr(pca_trans)
print(f"Hệ số tương quan giữa 'Total_Trans_Ct' và 'PCA_Trans': {correlation}")

In [36]:
# The two raw transaction columns are replaced by the PCA_Trans component
# computed above.
dataset_no_outlier_hash.drop(columns=['Total_Trans_Ct', 'Total_Trans_Amt'], inplace=True)

In [None]:
dataset_no_outlier_hash.shape

In [None]:
dataset_no_outlier_hash.info()

# **Imbalance Data Handling**

In [39]:
# Feature matrix (everything except the target) and target vector.
target_column = 'Attrition_Flag'
Xraw = dataset_no_outlier_hash.drop(columns=[target_column])
yraw = dataset_no_outlier_hash[target_column]

In [40]:
# Shared 10-fold stratified splitter (shuffled, fixed seed), reused by the
# evaluation loops and the hyper-parameter searches below.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# Compare imbalance-handling strategies with a fixed XGBoost classifier under
# stratified 10-fold cross-validation.
sampling_methods = {
    "SMOTE": SMOTE(random_state=42),
    "SMOTEENN": SMOTEENN(random_state=42),
    "SMOTE TomekLink": SMOTETomek(random_state=42),
    "NearMiss": NearMiss()
}

model = XGBClassifier(random_state=42, eval_metric='logloss')

results = {
    method: {"accuracy": [], "precision": [], "recall": [], "f1_score": [], "conf_matrix": []}
    for method in sampling_methods
}

for sampling_name, sampling_method in sampling_methods.items():
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_score_list = []
    conf_matrices = []

    # Split the ORIGINAL data first and resample only the training fold.
    # Resampling before the split lets synthetic or cleaned samples end up in
    # the test folds, which leaks information and inflates every metric.
    for train_index, test_index in kfold.split(Xraw, yraw):
        X_train, y_train = sampling_method.fit_resample(
            Xraw.iloc[train_index, :], yraw.iloc[train_index]
        )
        X_test, y_test = Xraw.iloc[test_index, :], yraw.iloc[test_index]

        # Train the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate metrics
        accuracy_list.append(accuracy_score(y_test, y_pred))
        precision_list.append(precision_score(y_test, y_pred, zero_division=0))
        recall_list.append(recall_score(y_test, y_pred, zero_division=0))
        f1_score_list.append(f1_score(y_test, y_pred, zero_division=0))

        # Store confusion matrix; fixing the labels keeps every matrix 2x2
        conf_matrices.append(confusion_matrix(y_test, y_pred, labels=[0, 1]))

    # Average metrics over folds
    accuracy_avg = np.mean(accuracy_list)
    precision_avg = np.mean(precision_list)
    recall_avg = np.mean(recall_list)
    f1_score_avg = np.mean(f1_score_list)

    # Average confusion matrix
    avg_conf_matrix = np.mean(conf_matrices, axis=0)

    # Store results
    results[sampling_name]["accuracy"].append(accuracy_avg)
    results[sampling_name]["precision"].append(precision_avg)
    results[sampling_name]["recall"].append(recall_avg)
    results[sampling_name]["f1_score"].append(f1_score_avg)
    results[sampling_name]["conf_matrix"].append(avg_conf_matrix)

    # Plot confusion matrix for each method
    plt.figure(figsize=(8, 6))
    sns.heatmap(avg_conf_matrix, annot=True, fmt=".2f", cmap="Blues", cbar=True)
    plt.title(f"Average Confusion Matrix ({sampling_name})")
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.show()

# Create DataFrame with results (one row per sampling method)
results_df1 = pd.DataFrame({
    'Sampling Method': list(results),
    'Accuracy': [scores["accuracy"][0] for scores in results.values()],
    'Precision': [scores["precision"][0] for scores in results.values()],
    'Recall': [scores["recall"][0] for scores in results.values()],
    'F1 Score': [scores["f1_score"][0] for scores in results.values()]
})

# Show the results DataFrame
results_df1

In [None]:
results_1 = results_df1
results_1

In [None]:
# Save the comparison table next to the notebook.
results_1.to_csv('results_1.csv', index=False)

# Offer a browser download only when running on Google Colab; elsewhere the
# import does not exist and the file written above is the result.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; results_1.csv was written to the working directory.")
else:
    files.download('results_1.csv')

In [None]:
# Line chart: one line per sampling method across the four metrics.
plt.figure(figsize=(10, 6))

# Transpose so that metrics become the index (x axis) and methods the columns.
plot_data = results_1.set_index('Sampling Method').T

x_values = plot_data.index

for method, scores in plot_data.items():
    plt.plot(x_values, scores, marker='o', label=method)

plt.title('Comparison of Imbalance Handling Methods')
plt.xlabel('Metric')
plt.ylabel('Metric Value')
plt.xticks(rotation=90, ha='right')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# **Train Model-based**

In [None]:
# Compare baseline classifiers on SMOTEENN-balanced training data under
# stratified 10-fold cross-validation.
models = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, eval_metric='logloss'),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Build the folds once and share them across all models. Only the training
# part of each fold is resampled; the test part keeps the original
# distribution, so no synthetic or cleaned samples leak into the evaluation.
smoteenn = SMOTEENN(random_state=42)
cv_folds = []
for train_index, test_index in kfold.split(Xraw, yraw):
    X_train, y_train = smoteenn.fit_resample(Xraw.iloc[train_index, :], yraw.iloc[train_index])
    cv_folds.append((X_train, y_train, Xraw.iloc[test_index, :], yraw.iloc[test_index]))

results = []
for model_name, model in models.items():
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_score_list = []
    confusion_matrix_sum = np.zeros((2, 2))  # Sum of confusion matrices

    for X_train, y_train, X_test, y_test in cv_folds:
        # Train the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Calculate metrics
        accuracy_list.append(accuracy_score(y_test, y_pred))
        precision_list.append(precision_score(y_test, y_pred, zero_division=0))
        recall_list.append(recall_score(y_test, y_pred, zero_division=0))
        f1_score_list.append(f1_score(y_test, y_pred, zero_division=0))
        # labels=[0, 1] guarantees a 2x2 matrix that can be added to the sum
        confusion_matrix_sum += confusion_matrix(y_test, y_pred, labels=[0, 1])

    # Average metrics over folds
    accuracy_avg = np.mean(accuracy_list)
    precision_avg = np.mean(precision_list)
    recall_avg = np.mean(recall_list)
    f1_score_avg = np.mean(f1_score_list)

    # Average confusion matrix (divide by the actual number of folds)
    confusion_matrix_avg = confusion_matrix_sum / len(cv_folds)

    # Add results to list
    results.append([model_name, accuracy_avg, precision_avg, recall_avg, f1_score_avg, confusion_matrix_avg])

# Create DataFrame with results
results_df2 = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'Confusion Matrix'])

# Plot average confusion matrix for each model
for index, row in results_df2.iterrows():
    plt.figure(figsize=(8, 6))
    sns.heatmap(row['Confusion Matrix'], annot=True, fmt=".2f", cmap="Blues", cbar=True)
    plt.title(f"Average Confusion Matrix ({row['Model']})")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()

# Show the results table (must be the last expression of the cell to render)
results_df2

In [None]:
results_2 = results_df2.drop(columns='Confusion Matrix')
results_2

In [45]:
# results_2.to_csv('results_2.csv')

In [None]:
# Line chart: one line per model across the four metrics.
plt.figure(figsize=(10, 6))

# Transpose so that metrics become the index (x axis) and models the columns.
plot_data = results_2.set_index('Model').T

x_values = plot_data.index

# Use a dedicated loop name: iterating with `model` would overwrite the fitted
# estimator bound to that name by the training cell with a plain string.
for plotted_model, scores in plot_data.items():
    plt.plot(x_values, scores, marker='o', label=plotted_model)

plt.title('Model Performance Comparison')
plt.xlabel('Metric')
plt.ylabel('Metric Value')
plt.xticks(rotation=90, ha='right')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# **Optimization**

In [None]:
from sklearn.metrics import make_scorer
import time

## XGBOOST

In [None]:
# prompt: optimise XGBoost (SMOTEENN on Xraw, yraw, kfold = 10) with Random
# Search, Grid Search and Bayesian Optimization, and output a DataFrame
# comparing the methods

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from xgboost import XGBClassifier
import time
from skopt import BayesSearchCV

# Discrete candidates sampled by the random search.
param_grid_random = {
    'n_estimators': [int(x) for x in np.linspace(start=50, stop=200, num=10)],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'max_depth': [int(x) for x in np.linspace(start=3, stop=10, num=4)],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 3, 5],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

# Small exhaustive grid (2^5 = 32 combinations).
param_grid_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
}

# Ranges for the Bayesian search. skopt infers the dimension type from the
# bounds: an all-integer tuple such as (0, 1) becomes an Integer dimension,
# which would restrict reg_alpha / reg_lambda to the two values 0 and 1.
# Continuous parameters therefore use float bounds.
param_space_bayes = {
    'n_estimators': (50, 200),
    'learning_rate': (0.01, 0.3),
    'max_depth': (3, 10),
    'subsample': (0.6, 0.9),
    'colsample_bytree': (0.6, 0.9),
    'gamma': (0.0, 0.5),
    'min_child_weight': (1, 10),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 1.0)
}


optimization_methods = {
    "Random Search": RandomizedSearchCV(
        estimator=XGBClassifier(random_state=42, eval_metric='logloss'),
        param_distributions=param_grid_random,
        n_iter=50, cv=kfold, scoring=make_scorer(f1_score), random_state=42, n_jobs=-1, verbose=1
    ),

    "Grid Search": GridSearchCV(
        estimator=XGBClassifier(random_state=42, eval_metric='logloss'),
        param_grid=param_grid_grid, cv=kfold, scoring=make_scorer(f1_score), n_jobs=-1, verbose=1
    ),

    # random_state makes the Bayesian search reproducible, like the random search.
    "Bayesian Optimization": BayesSearchCV(
        estimator=XGBClassifier(random_state=42, eval_metric='logloss'),
        search_spaces=param_space_bayes,
        n_iter=50, cv=kfold, scoring=make_scorer(f1_score), random_state=42, n_jobs=-1, verbose=1
    )
}

results = []
# NOTE(review): the searches cross-validate on data that was resampled as a
# whole, so their best scores are optimistic. Wrapping the sampler and the
# estimator in an imblearn Pipeline would resample per training fold, but the
# parameter names would then need a step prefix.
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(Xraw, yraw)

for method_name, method in optimization_methods.items():
    # Time the whole search, including the final refit on the best parameters.
    start_time = time.time()
    method.fit(X_resampled, y_resampled)
    end_time = time.time()
    elapsed_time = end_time - start_time
    results.append([method_name, method.best_score_, method.best_params_, elapsed_time])
    print(f"Method: {method_name}")
    print(f"Best Score: {method.best_score_}")
    print(f"Best Params: {method.best_params_}")
    print(f"Elapsed Time: {elapsed_time} seconds")

results_df3 = pd.DataFrame(results, columns=['Method', 'Best Score', 'Best Params', 'Time'])

In [None]:
results_3 = results_df3

In [None]:
# results_3.to_csv('results_3.csv', index=False)
# from google.colab import files
# files.download('results_3.csv')

In [None]:
# Hard-coded XGBoost parameter sets to evaluate.
# NOTE(review): presumably the best parameters reported by the three searches
# above (random, grid, Bayesian) -- confirm against results_3.
param_list = [
    {'subsample': 0.9, 'reg_lambda': 0, 'reg_alpha': 0.1, 'n_estimators': 150, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.3, 'gamma': 0.1, 'colsample_bytree': 0.6},
    {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8},
    {'colsample_bytree': 0.8648333247991004, 'gamma': 0.032838033791979145, 'learning_rate': 0.29805763097322235, 'max_depth': 6, 'min_child_weight': 2, 'n_estimators': 195, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.808130004667452}
]

# Build the folds once (the resampling does not depend on the parameter set).
# Only the training part of each fold is resampled; evaluating on resampled
# test data would leak synthetic or cleaned samples and inflate the metrics.
smoteENN = SMOTEENN(random_state=42)
cv_folds = []
for train_index, test_index in kfold.split(Xraw, yraw):
    X_train, y_train = smoteENN.fit_resample(Xraw.iloc[train_index, :], yraw.iloc[train_index])
    cv_folds.append((X_train, y_train, Xraw.iloc[test_index, :], yraw.iloc[test_index]))

results = []

for i, params in enumerate(param_list):
    model = XGBClassifier(random_state=42, eval_metric='logloss', **params)
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_score_list = []
    conf_matrices = []

    for X_train, y_train, X_test, y_test in cv_folds:
        # Train the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Collect metrics
        accuracy_list.append(accuracy_score(y_test, y_pred))
        precision_list.append(precision_score(y_test, y_pred, zero_division=0))
        recall_list.append(recall_score(y_test, y_pred, zero_division=0))
        f1_score_list.append(f1_score(y_test, y_pred, zero_division=0))
        # labels=[0, 1] keeps every matrix 2x2 so they can be averaged
        conf_matrices.append(confusion_matrix(y_test, y_pred, labels=[0, 1]))

    # Average metrics
    avg_accuracy = np.mean(accuracy_list)
    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_f1 = np.mean(f1_score_list)
    avg_conf_matrix = np.mean(conf_matrices, axis=0)

    results.append([i+1, avg_accuracy, avg_precision, avg_recall, avg_f1, avg_conf_matrix])

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(avg_conf_matrix, annot=True, fmt=".2f", cmap="Blues", cbar=True)
    plt.title(f"Average Confusion Matrix (Params Set {i+1})")
    plt.xlabel("Predicted label")
    plt.ylabel("True label")
    plt.show()

# Create a DataFrame for results
results_df = pd.DataFrame(results, columns=['Param Set', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'Confusion Matrix'])
results_df


In [None]:
results_4 = results_df.drop(columns=['Confusion Matrix'])
results_4

In [None]:
# results_4.to_csv('results_4.csv', index=False)

# from google.colab import files
# files.download('results_4.csv')

In [None]:
# Line chart: one line per XGBoost parameter set across the four metrics.
plt.figure(figsize=(10, 6))

# Transpose so that metrics become the index (x axis) and sets the columns.
plot_data = results_4.set_index('Param Set').T

x_values = plot_data.index

for param_set, scores in plot_data.items():
    plt.plot(x_values, scores, marker='o', label=f'Param Set {param_set}')

plt.title('Comparison of Parameter Sets')
plt.xlabel('Metric')
plt.ylabel('Metric Value')
plt.xticks(rotation=90, ha='right')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

## KNN

In [None]:
from sklearn.metrics import make_scorer
import time
# Define parameter grids for optimization
param_grid_random_knn = {
    'n_neighbors': [int(x) for x in np.linspace(start=3, stop=50, num=10)],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

param_grid_grid_knn = {
    'n_neighbors': [5, 10, 15],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Integer bounds are intentional here: n_neighbors and p are integer-valued.
param_space_bayes_knn = {
    'n_neighbors': (3, 50),
    'weights': ['uniform', 'distance'],
    'p': (1, 2)
}


optimization_methods_knn = {
    "Random Search": RandomizedSearchCV(
        estimator=KNeighborsClassifier(),
        param_distributions=param_grid_random_knn,
        n_iter=20, cv=kfold, scoring=make_scorer(f1_score), random_state=42, n_jobs=-1, verbose=1
    ),

    "Grid Search": GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=param_grid_grid_knn, cv=kfold, scoring=make_scorer(f1_score), n_jobs=-1, verbose=1
    ),

    # random_state makes the Bayesian search reproducible, like the random search.
    "Bayesian Optimization": BayesSearchCV(
        estimator=KNeighborsClassifier(),
        search_spaces=param_space_bayes_knn,
        n_iter=20, cv=kfold, scoring=make_scorer(f1_score), random_state=42, n_jobs=-1, verbose=1
    )
}

results_knn = []
# NOTE(review): the searches cross-validate on data that was resampled as a
# whole, so their best scores are optimistic; resampling inside each training
# fold (e.g. via an imblearn Pipeline) would avoid that.
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(Xraw, yraw)


for method_name, method in optimization_methods_knn.items():
    # Time the whole search, including the final refit on the best parameters.
    start_time = time.time()
    method.fit(X_resampled, y_resampled)
    end_time = time.time()
    elapsed_time = end_time - start_time
    results_knn.append([method_name, method.best_score_, method.best_params_, elapsed_time])
    print(f"Method: {method_name}")
    print(f"Best Score: {method.best_score_}")
    print(f"Best Params: {method.best_params_}")
    print(f"Elapsed Time: {elapsed_time} seconds")

results_df_knn = pd.DataFrame(results_knn, columns=['Method', 'Best Score', 'Best Params', 'Time'])

In [None]:
results_knn = results_df_knn
results_knn

In [None]:
# results_knn.to_csv('results_knn.csv', index=False)
# from google.colab import files
# files.download('results_knn.csv')

In [None]:
def train_knn_models(Xraw, yraw, param_list):
    """Evaluate KNN parameter sets with stratified 10-fold cross-validation.

    Each training fold is balanced with SMOTEENN; the matching test fold is
    left untouched so that no synthetic or cleaned samples leak into the
    evaluation. The average confusion matrix of every parameter set is plotted.

    Args:
        Xraw: Feature DataFrame (rows are selected positionally with .iloc).
        yraw: Label Series aligned with Xraw, encoded as 0 / 1.
        param_list: List of dicts, each holding keyword arguments for
            KNeighborsClassifier.

    Returns:
        A pandas DataFrame with one row per parameter set and the columns
        'Param Set', 'Accuracy', 'Precision', 'Recall', 'F1-score' and
        'Confusion Matrix' (metrics averaged over the folds).
    """

    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    smote_enn = SMOTEENN(random_state=42)

    # The folds do not depend on the KNN parameters, so build (and resample)
    # them once and reuse them for every parameter set.
    folds = []
    for train_index, test_index in kfold.split(Xraw, yraw):
        X_train, y_train = smote_enn.fit_resample(Xraw.iloc[train_index], yraw.iloc[train_index])
        folds.append((X_train, y_train, Xraw.iloc[test_index], yraw.iloc[test_index]))

    results = []

    for i, params in enumerate(param_list):
        model = KNeighborsClassifier(**params)
        accuracy_list = []
        precision_list = []
        recall_list = []
        f1_score_list = []
        confusion_matrices = []

        for X_train, y_train, X_test, y_test in folds:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            accuracy_list.append(accuracy_score(y_test, y_pred))
            precision_list.append(precision_score(y_test, y_pred, zero_division=0))
            recall_list.append(recall_score(y_test, y_pred, zero_division=0))
            f1_score_list.append(f1_score(y_test, y_pred, zero_division=0))
            # labels=[0, 1] keeps every matrix 2x2 so they can be averaged
            confusion_matrices.append(confusion_matrix(y_test, y_pred, labels=[0, 1]))

        avg_accuracy = np.mean(accuracy_list)
        avg_precision = np.mean(precision_list)
        avg_recall = np.mean(recall_list)
        avg_f1 = np.mean(f1_score_list)
        avg_confusion_matrix = np.mean(confusion_matrices, axis=0)

        results.append([i + 1, avg_accuracy, avg_precision, avg_recall, avg_f1, avg_confusion_matrix])

        # Plot average confusion matrix
        plt.figure(figsize=(6, 5))
        sns.heatmap(avg_confusion_matrix, annot=True, fmt=".2f", cmap="Blues")
        plt.title(f"Average Confusion Matrix for Param Set {i + 1}")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

    results_df = pd.DataFrame(results, columns=['Param Set', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'Confusion Matrix'])
    return results_df

# KNN parameter sets to evaluate on Xraw / yraw.
# NOTE(review): presumably the best parameters reported by the three KNN
# searches above -- confirm against results_knn.
param_list = [
    {'weights': 'uniform', 'p': 1, 'n_neighbors': 8, 'algorithm': 'kd_tree'},
    {'n_neighbors': 5, 'p': 1, 'weights': 'distance'},
    {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
]

results_df = train_knn_models(Xraw, yraw, param_list)

In [None]:
results4_knn = results_df.drop(columns=['Confusion Matrix'])
results4_knn

In [None]:
# results4_knn.to_csv('results4_knn.csv', index=False)
# from google.colab import files
# files.download('results4_knn.csv')

In [None]:
# Line chart: one line per KNN parameter set across the four metrics.
plt.figure(figsize=(10, 6))

# Transpose so that metrics become the index (x axis) and sets the columns.
plot_data = results4_knn.set_index('Param Set').T

x_values = plot_data.index

for param_set, scores in plot_data.items():
    plt.plot(x_values, scores, marker='o', label=f'Param Set {param_set}')

plt.title('Comparison of Parameter Sets for KNN')
plt.xlabel('Metric')
plt.ylabel('Metric Value')
plt.xticks(rotation=45, ha='right')  # Tilt the metric names so they stay readable
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

## Logistic Regression

In [None]:
# Hyper-parameter search for Logistic Regression: random, grid and Bayesian
# search, plus an untuned baseline row for comparison.
param_grid_random = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'] # 'l1' penalty can only be used with these solvers.
}

param_grid_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['liblinear', 'lbfgs']
}

# C spans five orders of magnitude, so sample it on a log scale; a uniform
# prior would almost never propose values below 1.
param_space_bayes = {
    'C': (1e-3, 1e2, 'log-uniform'),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}


optimization_methods = {
    "Random Search": RandomizedSearchCV(
        estimator=LogisticRegression(random_state=42, max_iter=1000), # Increased max_iter
        param_distributions=param_grid_random,
        n_iter=20, cv=kfold, scoring=make_scorer(f1_score), random_state=42, n_jobs=-1, verbose=1
    ),

    "Grid Search": GridSearchCV(
        estimator=LogisticRegression(random_state=42, max_iter=1000), # Increased max_iter
        param_grid=param_grid_grid, cv=kfold, scoring=make_scorer(f1_score), n_jobs=-1, verbose=1
    ),

    # random_state makes the Bayesian search reproducible, like the random search.
    "Bayesian Optimization": BayesSearchCV(
        estimator=LogisticRegression(random_state=42, max_iter=1000), # Increased max_iter
        search_spaces=param_space_bayes,
        n_iter=20, cv=kfold, scoring=make_scorer(f1_score), random_state=42, n_jobs=-1, verbose=1
    )
}

results = []
# NOTE(review): the searches cross-validate on data that was resampled as a
# whole, so their best scores are optimistic; resampling inside each training
# fold (e.g. via an imblearn Pipeline) would avoid that.
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(Xraw, yraw)

for method_name, method in optimization_methods.items():
    # Time the whole search, including the final refit on the best parameters.
    start_time = time.time()
    method.fit(X_resampled, y_resampled)
    end_time = time.time()
    elapsed_time = end_time - start_time
    results.append([method_name, method.best_score_, method.best_params_, elapsed_time])
    print(f"Method: {method_name}")
    print(f"Best Score: {method.best_score_}")
    print(f"Best Params: {method.best_params_}")
    print(f"Elapsed Time: {elapsed_time} seconds")

# No optimization: default hyper-parameters as a baseline.
# The time recorded here covers a single fit on the full resampled data, not
# the cross-validation that produces the score, so it is not directly
# comparable with the search times above.
model = LogisticRegression(random_state=42, max_iter=1000)
start_time = time.time()
model.fit(X_resampled, y_resampled)
end_time = time.time()
elapsed_time = end_time - start_time
no_opt_score = cross_val_score(model, X_resampled, y_resampled, cv = kfold, scoring='f1').mean()

results.append(['No Optimization', no_opt_score, {}, elapsed_time])

# Build the table once, after the baseline row has been added.
results_df = pd.DataFrame(results, columns=['Method', 'Best Score', 'Best Params', 'Time'])
results_df

In [None]:
results3_LR = results_df

In [None]:
# results3_LR.to_csv('results3_LR.csv', index=False)
# from google.colab import files
# files.download('results3_LR.csv')

In [None]:
def train_lr_models(Xraw, yraw, param_list):
    """Evaluate Logistic Regression parameter sets with stratified 10-fold CV.

    The data is balanced once with SMOTEENN and every parameter set is then
    trained and scored on stratified folds of that resampled data. For each
    parameter set the fold-averaged confusion matrix is also shown as a
    heatmap.

    NOTE(review): resampling happens before the folds are created, so the test
    folds contain resampled rows rather than only original observations.

    Args:
        Xraw: Features. The resampled output is sliced with ``.iloc``, so a
            pandas object is expected.
        yraw: Labels, sliced with ``.iloc`` as well.
        param_list: List of dictionaries with keyword arguments for
            ``LogisticRegression`` (``random_state`` and ``max_iter`` are set
            here and must not be repeated).

    Returns:
        DataFrame with one row per parameter set holding its 1-based index,
        the averaged confusion matrix and the mean accuracy, precision,
        recall and F1 score.
    """
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    results = []

    # The sampler is seeded and its input never changes, so resampling once
    # gives every parameter set the same data without repeating the work.
    smote_enn = SMOTEENN(random_state=42)
    X_resampled, y_resampled = smote_enn.fit_resample(Xraw, yraw)

    for i, params in enumerate(param_list):
        model = LogisticRegression(random_state=42, max_iter=1000, **params)
        conf_matrices = []
        accuracies = []
        precisions = []
        recalls = []
        f1_scores = []

        for train_index, test_index in kfold.split(X_resampled, y_resampled):
            X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
            y_train, y_test = y_resampled.iloc[train_index], y_resampled.iloc[test_index]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            conf_matrices.append(confusion_matrix(y_test, y_pred))

            # zero_division=0 matches train_rf_models: a fold without positive
            # predictions scores 0 instead of emitting a warning.
            accuracies.append(accuracy_score(y_test, y_pred))
            precisions.append(precision_score(y_test, y_pred, zero_division=0))
            recalls.append(recall_score(y_test, y_pred, zero_division=0))
            f1_scores.append(f1_score(y_test, y_pred, zero_division=0))

        avg_conf_matrix = np.mean(conf_matrices, axis=0)
        avg_accuracy = np.mean(accuracies)
        avg_precision = np.mean(precisions)
        avg_recall = np.mean(recalls)
        avg_f1_score = np.mean(f1_scores)

        results.append([i + 1, avg_conf_matrix, avg_accuracy, avg_precision, avg_recall, avg_f1_score])

        # Averaged counts are fractional, hence the ".2f" annotation format.
        plt.figure(figsize=(8, 6))
        sns.heatmap(avg_conf_matrix, annot=True, fmt=".2f", cmap="Blues", cbar=True)
        plt.title(f"Average Confusion Matrix (Params Set {i + 1})")
        plt.xlabel("Predicted label")
        plt.ylabel("True label")
        plt.show()

    results_df = pd.DataFrame(results, columns=['Param Set', 'Confusion Matrix', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
    return results_df

# Logistic Regression parameter sets to compare; the last entry is empty and
# therefore uses the estimator defaults.
param_list = [
    dict(solver='saga', penalty='l1', C=100),
    dict(C=10, penalty='l2', solver='lbfgs'),
    dict(C=99.04073752993764, penalty='l1', solver='saga'),
    dict(),
]

# Evaluate every parameter set and print the resulting table.
results_df = train_lr_models(Xraw, yraw, param_list)
print(results_df)


In [None]:
# Scalar metrics only: the per-row confusion matrices are arrays and are
# left out of the table that gets plotted/exported.
results4_LR = results_df.drop('Confusion Matrix', axis=1)
results4_LR

In [None]:
# results4_LR.to_csv('results4_LR.csv', index=False)
# from google.colab import files
# files.download('results4_LR.csv')

In [None]:
# Transpose so metrics run along the x-axis and each parameter set is a line.
plot_data = results4_LR.set_index('Param Set').T
x_values = plot_data.index

plt.figure(figsize=(10, 6))
for param_set, metric_values in plot_data.items():
    plt.plot(x_values, metric_values, marker='o', label=f'Param Set {param_set}')

plt.title('Comparison of Parameter Sets for Logistic Regression')
plt.xlabel('Metric')
plt.ylabel('Metric Value')
# Vertical tick labels so the metric names do not overlap.
plt.xticks(rotation=90, ha='right')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

## Random Forest

In [None]:
# Candidate values sampled by RandomizedSearchCV. The linspace values are
# truncated to integers, exactly as int() would do.
param_grid_random_rf = {
    'n_estimators': np.linspace(start=50, stop=200, num=10).astype(int).tolist(),
    'max_features': ['sqrt', 'log2'],
    'max_depth': np.linspace(10, 110, num=11).astype(int).tolist(),
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

# Exhaustive grid for GridSearchCV (3 * 4 * 3 * 3 combinations).
param_grid_grid_rf = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Search space for BayesSearchCV: tuples are (low, high) bounds, lists are
# categorical choices.
param_space_bayes_rf = {
    'n_estimators': (50, 200),
    'max_depth': (10, 110),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4),
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
}

# The three hyper-parameter search strategies compared for Random Forest.
# All use the shared `kfold` splitter and optimise the F1 score.
optimization_methods_rf = {
    "Random Search": RandomizedSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_distributions=param_grid_random_rf,
        n_iter=20, cv=kfold, scoring=make_scorer(f1_score), random_state=42, n_jobs=-1, verbose=1
    ),
    "Grid Search": GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=param_grid_grid_rf, cv=kfold, scoring=make_scorer(f1_score), n_jobs=-1, verbose=1
    ),
    # random_state is fixed here as well (as in the SVM Bayesian search) so
    # the reported best parameters can be reproduced.
    "Bayesian Optimization": BayesSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        search_spaces=param_space_bayes_rf,
        n_iter=20, cv=kfold, scoring=make_scorer(f1_score), n_jobs=-1, verbose=1, random_state=42
    )
}

# Balance the classes once, then tune on the resampled data.
# NOTE(review): resampling precedes cross-validation, so validation folds
# contain resampled rows.
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(Xraw, yraw)

# One row per search strategy: name, best CV F1, best parameters, wall time.
results = []
for method_name, method in optimization_methods_rf.items():
    start_time = time.time()
    method.fit(X_resampled, y_resampled)
    end_time = time.time()
    elapsed_time = end_time - start_time

    best_score, best_params = method.best_score_, method.best_params_
    results.append([method_name, best_score, best_params, elapsed_time])
    print(
        f"Method: {method_name}\n"
        f"Best Score: {best_score}\n"
        f"Best Params: {best_params}\n"
        f"Elapsed Time: {elapsed_time} seconds"
    )

results_df = pd.DataFrame(results, columns=['Method', 'Best Score', 'Best Params', 'Time'])
results_df

In [None]:
# Keep the Random Forest search summary under its own name (same object, no copy).
results3_RF = results_df

In [None]:
# results3_RF.to_csv('results3_RF.csv', index=False)
# from google.colab import files
# files.download('results3_RF.csv')

In [None]:
# prompt: train a random forest model with SMOTEENN and 10-fold CV, and report
# the 4 metrics and the confusion matrix for the following 3 cases:
# 1.{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}
# 2.{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
# 3.OrderedDict([('bootstrap', False), ('max_depth', 109), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 191)])
def train_rf_models(Xraw, yraw, param_list):
    """Evaluate Random Forest parameter sets with stratified 10-fold CV.

    The data is balanced once with SMOTEENN and every parameter set is then
    trained and scored on stratified folds of that resampled data. For each
    parameter set the fold-averaged confusion matrix is also shown as a
    heatmap.

    NOTE(review): resampling happens before the folds are created, so the test
    folds contain resampled rows rather than only original observations.

    Args:
        Xraw: Features. The resampled output is sliced with ``.iloc``, so a
            pandas object is expected.
        yraw: Labels, sliced with ``.iloc`` as well.
        param_list: List of dictionaries with keyword arguments for
            ``RandomForestClassifier`` (``random_state`` is set here and must
            not be repeated).

    Returns:
        DataFrame with one row per parameter set holding its 1-based index,
        the mean accuracy, precision, recall and F1 score, and the averaged
        confusion matrix.
    """
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    results = []

    # The sampler is seeded and its input never changes, so resampling once
    # gives every parameter set the same data without repeating the work.
    smote_enn = SMOTEENN(random_state=42)
    X_resampled, y_resampled = smote_enn.fit_resample(Xraw, yraw)

    for i, params in enumerate(param_list):
        model = RandomForestClassifier(random_state=42, **params)
        accuracy_list = []
        precision_list = []
        recall_list = []
        f1_score_list = []
        confusion_matrices = []

        for train_index, test_index in kfold.split(X_resampled, y_resampled):
            X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
            y_train, y_test = y_resampled.iloc[train_index], y_resampled.iloc[test_index]

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            accuracy_list.append(accuracy_score(y_test, y_pred))
            precision_list.append(precision_score(y_test, y_pred, zero_division=0))
            recall_list.append(recall_score(y_test, y_pred, zero_division=0))
            f1_score_list.append(f1_score(y_test, y_pred, zero_division=0))
            confusion_matrices.append(confusion_matrix(y_test, y_pred))

        avg_accuracy = np.mean(accuracy_list)
        avg_precision = np.mean(precision_list)
        avg_recall = np.mean(recall_list)
        avg_f1 = np.mean(f1_score_list)
        avg_confusion_matrix = np.mean(confusion_matrices, axis=0)

        results.append([i + 1, avg_accuracy, avg_precision, avg_recall, avg_f1, avg_confusion_matrix])

        # Averaged counts are fractional, hence the ".2f" annotation format.
        plt.figure(figsize=(6, 5))
        sns.heatmap(avg_confusion_matrix, annot=True, fmt=".2f", cmap="Blues")
        plt.title(f"Average Confusion Matrix for Param Set {i + 1}")
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.show()

    results_df = pd.DataFrame(results, columns=['Param Set', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'Confusion Matrix'])
    return results_df

# Random Forest parameter sets to compare (the three cases listed in the
# prompt comment above train_rf_models).
param_list = [
    dict(n_estimators=200, min_samples_split=2, min_samples_leaf=1,
         max_features='sqrt', max_depth=20, bootstrap=False),
    dict(max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200),
    dict(bootstrap=False, max_depth=109, max_features='sqrt',
         min_samples_leaf=1, min_samples_split=2, n_estimators=191),
]

# Evaluate every parameter set on the features/labels and show the table.
results_df = train_rf_models(Xraw, yraw, param_list)
results_df

In [None]:
# Keep the Random Forest evaluation table (including the confusion-matrix
# column) under its own name; same object, no copy.
results4_RF = results_df

In [None]:
# results4_RF.to_csv('results4_RF.csv', index=False)
# from google.colab import files
# files.download('results4_RF.csv')

In [None]:
# Scalar metrics only: drop the array-valued confusion-matrix column.
results4_RF = results_df.drop('Confusion Matrix', axis=1)

# Transpose so metrics run along the x-axis and each parameter set is a line.
plot_data = results4_RF.set_index('Param Set').T
x_values = plot_data.index

plt.figure(figsize=(10, 6))
for param_set, metric_values in plot_data.items():
    plt.plot(x_values, metric_values, marker='o', label=f'Param Set {param_set}')

plt.title('Comparison of Parameter Sets for Random Forest')
plt.xlabel('Metric')
plt.ylabel('Metric Value')
plt.xticks(rotation=90, ha='right')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

## SVM

In [None]:
# Distributions/candidates sampled by RandomizedSearchCV.
param_grid_random_svm = {
    # scipy's uniform(loc, scale) spans [loc, loc + scale], i.e. [0.1, 10.1].
    'C': stats.uniform(0.1, 10),
    'kernel': ['linear', 'rbf', 'poly'],
    # The two string heuristics plus seven log-spaced values from 1e-3 to 1e3.
    'gamma': ['scale', 'auto', *np.logspace(-3, 3, 7)],
    # Only relevant for the poly kernel.
    'degree': [2, 3, 4],
}

# Exhaustive grid for GridSearchCV.
param_grid_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto', 0.01, 1],
}

# Search space for BayesSearchCV: tuples are bounds (optionally with a prior),
# lists are categorical choices.
param_space_bayes_svm = {
    'C': (0.1, 10),
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': (1e-3, 1e3, 'log-uniform'),
    # Only relevant for the poly kernel.
    'degree': (2, 5),
}

# The three hyper-parameter search strategies compared for the SVM. Each one
# gets its own seeded 10-fold stratified splitter and optimises the F1 score.
optimization_methods_svm = {
    "Random Search": RandomizedSearchCV(
        SVC(random_state=42),
        param_grid_random_svm,
        n_iter=20,
        scoring=make_scorer(f1_score),
        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
        verbose=1,
    ),
    "Grid Search": GridSearchCV(
        SVC(random_state=42),
        param_grid_grid_svm,
        scoring=make_scorer(f1_score),
        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
        n_jobs=-1,
        verbose=1,
    ),
    "Bayesian Optimization": BayesSearchCV(
        SVC(random_state=42),
        param_space_bayes_svm,
        n_iter=20,
        scoring=make_scorer(f1_score),
        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
        random_state=42,
        n_jobs=-1,
        verbose=1,
    ),
}

# Splitter kept at module level for later cells; the searches above carry
# their own splitter instances.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Balance the classes once, then tune on the resampled data.
# NOTE(review): resampling precedes cross-validation, so validation folds
# contain resampled rows.
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(Xraw, yraw)

# One row per search strategy: name, best CV F1, best parameters, wall time.
results = []
for method_name, method in optimization_methods_svm.items():
    start_time = time.time()
    method.fit(X_resampled, y_resampled)
    end_time = time.time()
    elapsed_time = end_time - start_time

    best_score, best_params = method.best_score_, method.best_params_
    results.append([method_name, best_score, best_params, elapsed_time])
    print(
        f"Method: {method_name}\n"
        f"Best Score: {best_score}\n"
        f"Best Params: {best_params}\n"
        f"Elapsed Time: {elapsed_time} seconds"
    )

results_df = pd.DataFrame(results, columns=['Method', 'Best Score', 'Best Params', 'Time'])
results3_SVM = results_df

# Export the summary and trigger a browser download (Google Colab only).
results3_SVM.to_csv('results3_SVM.csv', index=False)
from google.colab import files
files.download('results3_SVM.csv')

In [None]:
# NOTE(review): at this point results_df is still the SVM hyper-parameter
# search table (the same object as results3_SVM); no per-parameter-set
# evaluation comparable to train_lr_models / train_rf_models is visible for
# the SVM. Confirm that aliasing the search table here is intended.
results4_SVM = results_df

In [None]:
# Write the table to CSV without the index column and trigger a browser
# download; `google.colab` is only importable inside Google Colab.
results4_SVM.to_csv('results4_SVM.csv', index=False)
from google.colab import files
files.download('results4_SVM.csv')

# **STACKING MODEL**

In [68]:
import warnings
# Silences every warning from here on, which also hides any convergence or
# deprecation warnings raised by the models trained below.
warnings.filterwarnings("ignore")

In [69]:
# NumPy copies of the features/labels: the stacking code below selects rows
# with integer index arrays (e.g. X_res[train_index]).
Xnp = Xraw.to_numpy()
ynp = yraw.to_numpy()

In [70]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Factory for the DNN meta model
def create_dnn_model(input_dim):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Dense(64, ReLU) -> Dense(32, ReLU) -> Dense(1, sigmoid),
    compiled with the Adam optimizer (default settings), binary
    cross-entropy loss and accuracy as the tracked metric.

    Args:
        input_dim: Number of input features, passed to the first layer.

    Returns:
        The compiled Keras ``Sequential`` model (not yet trained).
    """
    layers = [
        # First hidden layer: 64 units, ReLU activation
        Dense(64, input_dim=input_dim, activation='relu'),
        # Second hidden layer: 32 units, ReLU activation
        Dense(32, activation='relu'),
        # Output layer: a single sigmoid unit for the binary target
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layers)

    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    return model


In [None]:
def stacking_model_with_kfold_avg_metrics(X, y, meta_model_type):
    """Cross-validate a two-level stacking ensemble and plot its results.

    The data is resampled with SMOTEENN, out-of-fold class predictions of four
    base models (KNN, Random Forest, Logistic Regression, XGBoost) become the
    meta-features, and the chosen meta model is then cross-validated on those
    meta-features with the same stratified 10-fold splitter configuration.
    The fold-averaged confusion matrix and the mean ROC curve (with a
    one-standard-deviation band) are drawn as a side effect.

    NOTE(review): resampling happens before the folds are created, so the test
    folds contain resampled rows rather than only original observations.

    Args:
        X: Feature matrix. Rows are selected with integer index arrays
            (``X_res[train_index]``), so a NumPy array is expected.
        y: Binary labels, indexed the same way. The per-fold confusion matrix
            is unpacked as a 2x2 matrix.
        meta_model_type: 'Logistic Regression', 'Gradient Boosting' or 'DNN'.

    Returns:
        dict mapping 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC_AUC'
        and 'Specificity' to their mean over the folds.

    Raises:
        ValueError: If ``meta_model_type`` is not one of the supported names.
    """
    # Fail before the expensive base-model training instead of hitting an
    # unbound-variable error inside the meta-model loop.
    supported_meta_models = ('Logistic Regression', 'Gradient Boosting', 'DNN')
    if meta_model_type not in supported_meta_models:
        raise ValueError(
            f"Unsupported meta_model_type {meta_model_type!r}; "
            f"expected one of {supported_meta_models}"
        )

    n_splits = 10

    # Base models
    # NOTE(review): the tuned RF parameter set evaluated earlier in this file
    # also has bootstrap=False, and that LR parameter set was evaluated with
    # max_iter=1000; neither is set here - confirm this is intended.
    base_estimators = [
        ('knn', KNeighborsClassifier(n_neighbors=3, p=1, weights='distance')),
        ('rf', RandomForestClassifier(max_depth=109, max_features='sqrt',
                                       min_samples_leaf=1, min_samples_split=2,
                                       n_estimators=191, random_state=42)),
        ('lr', LogisticRegression(random_state=42,C=99.04073752993764,penalty='l1',solver='saga')),
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                              colsample_bytree=0.8648, gamma=0.0328,
                              learning_rate=0.298, max_depth=6, min_child_weight=2,
                              n_estimators=195, reg_alpha=0, reg_lambda=1,
                              subsample=0.8081, random_state=42))
    ]

    # KFold setup shared by the base models and the meta model
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    base_model_predictions = []
    smoteenn = SMOTEENN(random_state=42)
    X_res, y_res = smoteenn.fit_resample(X, y)

    # Out-of-fold predictions: every row is predicted by a model that did not
    # see it during training.
    for name, model in base_estimators:
        model_predictions = np.zeros_like(y_res)
        for train_index, test_index in kf.split(X_res, y_res):
            X_train, X_test = X_res[train_index], X_res[test_index]
            y_train = y_res[train_index]
            model.fit(X_train, y_train)
            model_predictions[test_index] = model.predict(X_test)
        base_model_predictions.append(model_predictions)

    # Combine predictions as new features (one column per base model)
    X_meta = np.column_stack(base_model_predictions)

    metrics = {'Accuracy': [], 'Precision': [], 'Recall': [], 'F1-Score': [], 'ROC_AUC': [],
               'Specificity': []}
    conf_matrix_sum = np.zeros((2, 2))  # Running sum of per-fold confusion matrices
    tprs = []  # Per-fold TPR interpolated onto mean_fpr
    mean_fpr = np.linspace(0, 1, 100)  # Common FPR grid for averaging ROC curves
    aucs = []

    for train_index, test_index in kf.split(X_meta, y_res):
        X_train_meta, X_test_meta = X_meta[train_index], X_meta[test_index]
        y_train_meta, y_test_meta = y_res[train_index], y_res[test_index]

        if meta_model_type == 'DNN':
            meta_model = create_dnn_model(X_train_meta.shape[1])
            meta_model.fit(X_train_meta, y_train_meta, epochs=10, verbose=0, batch_size=32)
            # Keras returns an (n, 1) probability column; flatten and threshold.
            y_prob = meta_model.predict(X_test_meta).ravel()
            y_pred = (y_prob > 0.5).astype(int)
        else:
            if meta_model_type == 'Logistic Regression':
                meta_model = LogisticRegression(random_state=42, solver='liblinear')
            else:
                meta_model = GradientBoostingClassifier(random_state=42)
            meta_model.fit(X_train_meta, y_train_meta)
            y_pred = meta_model.predict(X_test_meta)
            y_prob = meta_model.predict_proba(X_test_meta)[:, 1]

        # Calculate metrics
        metrics['Accuracy'].append(accuracy_score(y_test_meta, y_pred))
        metrics['Precision'].append(precision_score(y_test_meta, y_pred, zero_division=1))
        metrics['Recall'].append(recall_score(y_test_meta, y_pred, zero_division=1))
        metrics['F1-Score'].append(f1_score(y_test_meta, y_pred))
        metrics['ROC_AUC'].append(roc_auc_score(y_test_meta, y_prob))

        # Aggregate confusion matrix
        conf_matrix = confusion_matrix(y_test_meta, y_pred)
        conf_matrix_sum += conf_matrix

        # Specificity = TN / (TN + FP)
        TN, FP, FN, TP = conf_matrix.ravel()
        specificity = TN / (TN + FP)
        metrics['Specificity'].append(specificity)

        # ROC curve of this fold, interpolated onto the common FPR grid
        fpr_vals, tpr, _ = roc_curve(y_test_meta, y_prob)
        tprs.append(np.interp(mean_fpr, fpr_vals, tpr))
        tprs[-1][0] = 0.0  # Ensure TPR starts at 0
        roc_auc = auc(fpr_vals, tpr)
        aucs.append(roc_auc)

    # Average ROC Curve
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # Ensure TPR ends at 1
    mean_auc = auc(mean_fpr, mean_tpr)
    std_tpr = np.std(tprs, axis=0)

    # Average the summed confusion matrix over the actual number of folds
    conf_matrix_avg = conf_matrix_sum / n_splits

    # Display confusion matrix using seaborn heatmap
    plt.figure(figsize=(7, 5))
    sns.heatmap(conf_matrix_avg, annot=True, fmt='.2f', cmap='Blues',
                xticklabels=[0, 1], yticklabels=[0, 1], cbar=False)
    plt.title("Confusion Matrix")
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

    # Display averaged ROC Curve with a +/- one standard deviation band
    plt.figure()
    plt.plot(mean_fpr, mean_tpr, color='b', label=f'Mean ROC (AUC = {mean_auc:.2f})', lw=2)
    plt.fill_between(mean_fpr, mean_tpr - std_tpr, mean_tpr + std_tpr, color='blue', alpha=0.2)
    plt.plot([0, 1], [0, 1], color='r', linestyle='--', lw=2)
    plt.title("ROC Curve")
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

    return {k: np.mean(v) for k, v in metrics.items()}

# Cross-validate the stacking ensemble once per meta model and collect the
# averaged metrics keyed by meta-model name.
final_results = {}
for meta_model_type in ('Logistic Regression', 'Gradient Boosting', 'DNN'):
    print(f"Đang chạy mô hình Stacking với Meta Model: {meta_model_type}")
    metrics = stacking_model_with_kfold_avg_metrics(Xnp, ynp, meta_model_type)
    final_results[meta_model_type] = metrics
    print(f"\nKết quả cho {meta_model_type}:")
    print("\n".join(f"{metric.capitalize()}: {value:.4f}" for metric, value in metrics.items()))

# Show the final summary as a DataFrame: one row per meta model, one column
# per metric.
results_df = pd.DataFrame.from_dict(final_results, orient='index')
print("\nTổng hợp kết quả cuối cùng:")
results_df

In [None]:
# Display the stacking summary rounded to 4 decimals (the result is not stored).
results_df.round(4)

In [79]:
# Export the stacking summary; the index (meta-model names) is written as the
# first, unnamed column.
results_df.to_csv('StackingModel.csv')

In [80]:
# Reload a stacking summary from a machine-specific absolute path.
# NOTE(review): the plotting cell below indexes this frame by a 'Model'
# column, which the to_csv export above would not create by itself (its index
# is unnamed) - confirm the file on disk has that header.
results_df = pd.read_csv(r"D:\NCKH\CHURN PREDICTION\CCP 03\Result\Results\3 stacking\StackingModel.csv")

In [None]:
# Transpose so metrics run along the x-axis and each model is one line.
plot_data = results_df.set_index('Model').T
x_values = plot_data.index

plt.figure(figsize=(10, 6))
for model, metric_values in plot_data.items():
    plt.plot(x_values, metric_values, marker='o', label=model)

plt.title('Model Performance Comparison')
plt.xlabel('Metric')
plt.ylabel('Metric Value')
plt.xticks(rotation=90, ha='right')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# **Features Analysis**

In [None]:
# Balance the classes before building the stacking features.
# NOTE(review): resampling precedes every split below, so the held-out rows
# are resampled rows rather than only original observations.
smote_enn = SMOTEENN(random_state=42)
X, y = smote_enn.fit_resample(Xnp, ynp)

# Base models
# NOTE(review): the tuned RF parameter set evaluated earlier in this file also
# has bootstrap=False, and that LR parameter set was evaluated with
# max_iter=1000; neither is set here - confirm this is intended.
base_estimators = [
    ('knn', KNeighborsClassifier(n_neighbors=3, p=1, weights='distance')),
    ('rf', RandomForestClassifier(max_depth=109, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=191, random_state=42)),
    ('lr', LogisticRegression(random_state=42, C=99.04073752993764, penalty='l1', solver='saga')),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', colsample_bytree=0.8648, gamma=0.0328, learning_rate=0.298, max_depth=6, min_child_weight=2, n_estimators=195, reg_alpha=0, reg_lambda=1, subsample=0.8081, random_state=42))
]

# StratifiedKFold configuration for the base models
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
base_model_predictions = []

# Collect out-of-fold predictions from each base model
for name, model in base_estimators:
    model_predictions = np.zeros_like(y)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        model_predictions[test_index] = model.predict(X_test)
    base_model_predictions.append(model_predictions)

# Combine the predictions into the meta-feature matrix (one column per model)
X_meta = np.column_stack(base_model_predictions)

# Stratified 80/20 train/test split of the meta-features
X_train_meta, X_test_meta, y_train_meta, y_test_meta = train_test_split(X_meta, y, test_size=0.2, random_state=42, stratify=y)

# Train the DNN meta model with the same schedule that was cross-validated in
# stacking_model_with_kfold_avg_metrics (Keras' default would be one epoch).
meta_model = create_dnn_model(X_train_meta.shape[1])
meta_model.fit(X_train_meta, y_train_meta, epochs=10, batch_size=32)

# Keras returns an (n, 1) column of positive-class probabilities; flatten it
# and threshold once so every metric sees 1-D inputs.
y_prob = meta_model.predict(X_test_meta).ravel()
y_pred = (y_prob > 0.5).astype(int)

# Compute the metrics (ROC AUC uses the probabilities, the rest the labels)
metrics = {
    'Accuracy': accuracy_score(y_test_meta, y_pred),
    'Precision': precision_score(y_test_meta, y_pred, zero_division=1),
    'Recall': recall_score(y_test_meta, y_pred, zero_division=1),
    'F1-Score': f1_score(y_test_meta, y_pred),
    'ROC_AUC': roc_auc_score(y_test_meta, y_prob)
}

# Turn the metrics into a two-column DataFrame
metrics_df = pd.DataFrame(list(metrics.items()), columns=['Metric', 'Value'])

# Show the metrics table
print(metrics_df)

# Confusion matrix of the held-out meta-features
cm = confusion_matrix(y_test_meta, y_pred)

# Create a heatmap for the confusion matrix
plt.figure(figsize=(7, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix of Meta Model")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Display the metrics table of the most recently evaluated model.
metrics_df

In [None]:
# Explain the DNN meta model: the training meta-features are handed to the
# explainer as its second argument (background/masker data), and SHAP values
# are then computed for the held-out meta-features.
explainer = shap.Explainer(meta_model, X_train_meta)
shap_values = explainer(X_test_meta)

In [None]:
# SHAP summary plot of the meta model; each feature is one base model's
# predicted label (the columns of X_meta).
shap.summary_plot(shap_values, X_test_meta)
plt.show()

In [88]:
# smote_enn = SMOTEENN(random_state=42)
# X, y = smote_enn.fit_resample(Xraw, yraw)

# # Base models
# base_estimators = [
#     ('knn', KNeighborsClassifier(n_neighbors=3, p=1, weights='distance')),
#     ('rf', RandomForestClassifier(max_depth=109, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=191, random_state=42)),
#     ('lr', LogisticRegression(random_state=42,C=99.04073752993764,penalty='l1',solver='saga')),
#     ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', colsample_bytree=0.8648, gamma=0.0328, learning_rate=0.298, max_depth=6, min_child_weight=2, n_estimators=195, reg_alpha=0, reg_lambda=1, subsample=0.8081, random_state=42))
# ]

# # KFold setup for base models (used for base model prediction collection)
# skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# base_model_predictions = []

# # Collect predictions from base models using StratifiedKFold
# for name, model in base_estimators:
#     model_predictions = np.zeros_like(y)
#     for train_index, test_index in skf.split(X, y):  # Pass both X and y here
#         X_train, X_test = X.iloc[train_index], X.iloc[test_index]  # Use .iloc for DataFrame slicing
#         y_train, y_test = y[train_index], y[test_index]
#         model.fit(X_train, y_train)
#         model_predictions[test_index] = model.predict(X_test)
#     base_model_predictions.append(model_predictions)

# # Combine predictions as new features
# X_meta = np.column_stack(base_model_predictions)

# # Now use train_test_split for an 80/20 split
# X_train_meta, X_test_meta, y_train_meta, y_test_meta = train_test_split(X_meta, y, test_size=0.2, random_state=42, stratify=y)

# # Train meta model (GradientBoostingClassifier) using the 80/20 split
# meta_model = GradientBoostingClassifier(random_state=42)
# meta_model.fit(X_train_meta, y_train_meta)
# y_pred = meta_model.predict(X_test_meta)
# y_prob = meta_model.predict_proba(X_test_meta)[:, 1]

# metrics = {
#     'Accuracy': accuracy_score(y_test_meta, y_pred),
#     'Precision': precision_score(y_test_meta, y_pred, zero_division=1),
#     'Recall': recall_score(y_test_meta, y_pred, zero_division=1),
#     'F1-Score': f1_score(y_test_meta, y_pred),
#     'ROC_AUC': roc_auc_score(y_test_meta, y_prob)
# }

# # Convert metrics dictionary to DataFrame
# metrics_df = pd.DataFrame(list(metrics.items()), columns=['Metric', 'Value'])

# # Display the DataFrame
# print(metrics_df)

# # Confusion Matrix and Heatmap
# cm = confusion_matrix(y_test_meta, y_pred)

# # Create a heatmap for the confusion matrix
# plt.figure(figsize=(7, 5))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# plt.title("Confusion Matrix of Meta Model")
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.show()

In [89]:
# Export the current metrics table (row index included as the first column).
metrics_df.to_csv('finalModel.csv')

In [None]:
# Waterfall plot for the first explained row of X_test_meta.
shap.plots.waterfall(shap_values[0])

In [None]:
# Bar plot summarising the meta-model SHAP values per meta-feature.
shap.plots.bar(shap_values)

In [None]:
# Beeswarm plot of the meta-model SHAP values over all explained rows.
shap.plots.beeswarm(shap_values)

# LR

In [None]:
# Step 1: balance the classes with SMOTEENN.
# NOTE(review): resampling precedes the split, so the test set below consists
# of resampled rows rather than only original observations.
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(Xraw, yraw)

# Step 2: split the data into 80% training and 20% test
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 3: build the Logistic Regression model with the tuned parameters.
# max_iter=1000 matches the budget used when this parameter set was evaluated
# in train_lr_models; with the default of 100 the saga solver may stop before
# converging, and warnings are globally silenced in this notebook.
LR_model = LogisticRegression(random_state=42,
                              C=99.04073752993764,
                              penalty='l1',
                              solver='saga',
                              max_iter=1000)

# Step 4: train the model
LR_model.fit(X_train, y_train)

# Step 5: predict on the test set
y_pred = LR_model.predict(X_test)

# Step 6: compute the metrics
metrics = {
    "Metric": ["Accuracy", "Precision", "Recall", "F1-Score"],
    "Value": [
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, average='binary'),  # adjust `average` if needed
        recall_score(y_test, y_pred, average='binary'),
        f1_score(y_test, y_pred, average='binary')
    ]
}

# Convert to a DataFrame
metrics_df = pd.DataFrame(metrics)

# Step 7: print the DataFrame
print(metrics_df)

# Draw the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [44]:
# Export the Logistic Regression metrics table (row index included).
metrics_df.to_csv('resultLR.csv')

In [42]:
# Explain the Logistic Regression model: the training split is handed to the
# explainer as its second argument (background/masker data), and SHAP values
# are then computed for the test split.
explainerLR = shap.Explainer(LR_model, X_train)
shap_valuesLR = explainerLR(X_test)

In [None]:
# SHAP summary plot of the Logistic Regression model on the test split.
shap.summary_plot(shap_valuesLR, X_test)
plt.show()

In [None]:
features = ['Total_Trans_Amt','Total_Trans_Ct','PCA_Trans','Total_Relationship_Count','Total_Revolving_Bal']

# Raw SHAP value matrix of the Logistic Regression explanation; it is the same
# for every feature, so it is extracted once.
shap_values_np = shap_valuesLR.values

for feature in features:
    # Skip features that are not columns of the test split
    if feature not in X_test.columns:
        print(f"Feature '{feature}' does not exist in the dataset.")
        continue

    # dependence_plot needs the plain NumPy matrix
    if not isinstance(shap_values_np, np.ndarray):
        print(f"Shapley values are not in a valid NumPy array format for feature '{feature}'.")
        continue

    # dependence_plot otherwise opens and shows its own figure, which left the
    # pre-created figure empty and put the title on a new blank one. Drawing
    # into an explicit axes with show=False keeps size, plot and title together.
    fig, ax = plt.subplots(figsize=(10, 6))
    shap.dependence_plot(feature, shap_values_np, X_test, ax=ax, show=False)
    ax.set_title(f"Shapley Dependence Plot for '{feature}' Feature")
    plt.show()

In [None]:
# Waterfall plot for the first explained row of the test split.
shap.plots.waterfall(shap_valuesLR[0])

In [None]:
# Load SHAP's JavaScript so the interactive force plots below render in the notebook.
shap.initjs()

In [None]:
# Force plot for the first explained row of the test split.
shap.plots.force(shap_valuesLR[0])

In [None]:
# Stacked force plot for the first 100 explained rows.
shap.plots.force(shap_valuesLR[:100])

In [49]:
# Save the force plot of the first 100 explained rows as a standalone HTML file.
shap.save_html('shap_force_plot.html', shap.force_plot(shap_valuesLR[:100]))

In [None]:
# Bar plot for the Logistic Regression explanation. `shap_values` is the
# stacking meta-model explanation, which is already plotted in the Features
# Analysis section; this section explains LR_model, so use shap_valuesLR.
shap.plots.bar(shap_valuesLR)

In [None]:
# Beeswarm plot for the Logistic Regression explanation. `shap_values` is the
# stacking meta-model explanation, which is already plotted in the Features
# Analysis section; this section explains LR_model, so use shap_valuesLR.
shap.plots.beeswarm(shap_valuesLR)

In [None]:
# Table of the fitted Logistic Regression weights, one row per feature plus a
# final 'Intercept' row.
coefficients = LR_model.coef_[0]
# intercept_ is a length-1 array for this binary model; take the scalar so the
# 'Coefficient' column stays numeric instead of holding an array object.
intercept = LR_model.intercept_[0]
features = X_train.columns
coeff_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})
intercept_df = pd.DataFrame({'Feature': ['Intercept'], 'Coefficient': [intercept]})
coeff_df = pd.concat([coeff_df, intercept_df], ignore_index=True)
coeff_df

In [95]:
# Export the coefficient table (row index included as the first column).
coeff_df.to_csv('coefficient.csv')

In [None]:
# Display the coefficient table again.
coeff_df

In [None]:
# (rows, columns) of the feature table before resampling.
Xraw.shape