In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
import pickle

In [3]:
df = pd.read_csv("../data/processed/bank-dataset-processed.csv",index_col=0)

In [4]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,y
0,58,4.0,1.0,3.0,0.0,2143,1.0,0.0,1.0,5,8.0,261,1,-1,0,0.0
1,44,10.0,2.0,2.0,0.0,29,1.0,0.0,1.0,5,8.0,151,1,-1,0,0.0
2,33,2.0,1.0,2.0,0.0,2,1.0,1.0,1.0,5,8.0,76,1,-1,0,0.0
3,47,1.0,1.0,0.0,0.0,1506,1.0,0.0,1.0,5,8.0,92,1,-1,0,0.0
4,33,5.0,2.0,0.0,0.0,1,0.0,0.0,1.0,5,8.0,198,1,-1,0,0.0


In [5]:
df.describe()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,y
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,4.696025,1.167725,2.060516,0.018027,1362.272058,0.555838,0.160226,0.416536,15.806419,5.523014,258.16308,2.763841,40.197828,0.580323,0.116985
std,10.618762,3.662424,0.60823,0.778704,0.133049,3044.765829,0.496878,0.36682,0.609586,8.322476,3.006911,257.527812,3.098021,100.128746,2.303441,0.321406
min,18.0,0.0,0.0,0.0,0.0,-8019.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.0,0.0,0.0
25%,33.0,1.0,1.0,2.0,0.0,72.0,0.0,0.0,0.0,8.0,3.0,103.0,1.0,-1.0,0.0,0.0
50%,39.0,4.0,1.0,2.0,0.0,448.0,1.0,0.0,0.0,16.0,6.0,180.0,2.0,-1.0,0.0,0.0
75%,48.0,8.0,2.0,3.0,0.0,1428.0,1.0,0.0,1.0,21.0,8.0,319.0,3.0,-1.0,0.0,0.0
max,95.0,11.0,2.0,3.0,1.0,102127.0,1.0,1.0,2.0,31.0,11.0,4918.0,63.0,871.0,275.0,1.0


In [6]:
# Features and target
X = df.drop('y', axis=1)
y = df['y']

In [7]:
# Split the data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Feature scaling (important for logistic regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Define model
xgb_model = xgb.XGBClassifier(
    random_state=42,
    eval_metric='logloss',   # Avoid unnecessary warnings
    scale_pos_weight=(len(y_train) - sum(y_train)) / sum(y_train)  # Handle imbalance
)

In [12]:
# Fit the model
xgb_model.fit(X_train_scaled, y_train)

In [13]:
# Predict on test set
y_pred_xgb = xgb_model.predict(X_test_scaled)

In [14]:
# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))

Accuracy: 0.8761472962512441
Confusion Matrix:
 [[7031  921]
 [ 199  892]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.97      0.88      0.93      7952
         1.0       0.49      0.82      0.61      1091

    accuracy                           0.88      9043
   macro avg       0.73      0.85      0.77      9043
weighted avg       0.91      0.88      0.89      9043



# Testing to train model by balancing data using SMOTE


In [22]:
# Applying SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [25]:
# Initialize XGBoost Classifier
xgb_model = xgb.XGBClassifier(scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]),
                               random_state=42, eval_metric='logloss')

In [26]:
# Train XGBoost Model
xgb_model.fit(X_train_smote, y_train_smote)

In [27]:
# Make Predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print results
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.8682959194957426
Confusion Matrix:
[[6934 1018]
 [ 173  918]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.98      0.87      0.92      7952
         1.0       0.47      0.84      0.61      1091

    accuracy                           0.87      9043
   macro avg       0.72      0.86      0.76      9043
weighted avg       0.92      0.87      0.88      9043



In [30]:
# Save the XGBoost model
with open('models/xgboost_model.pkl', 'wb') as file:
    pickle.dump(xgb_model, file)