In [1]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import pickle

In [2]:
# Read the processed dataset; the first CSV column becomes the DataFrame index.
df = pd.read_csv(
    "../data/processed/bank-dataset-additional-processed-skewness.csv",
    index_col=0,
)

In [3]:
# Preview the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

Unnamed: 0,default,housing,loan,y,duration_boxcox,campaign_cbrt,pdays_special,pdays_transformed,had_previous,age_scaled,...,education_high.school,education_illiterate,education_professional.course,education_university.degree,contact_telephone,poutcome_nonexistent,poutcome_success,month_num,month_sin,month_cos
0,False,False,False,False,8.480303,1.0,0,6.907755,0,1.642253,...,False,False,False,False,True,True,False,5,0.5,-0.866025
2,False,True,False,False,8.166177,1.0,0,6.907755,0,-0.196452,...,True,False,False,False,True,True,False,5,0.5,-0.866025
3,False,False,False,False,7.319957,1.0,0,6.907755,0,0.09387,...,False,False,False,False,True,True,False,5,0.5,-0.866025
4,False,False,True,False,8.842255,1.0,0,6.907755,0,1.642253,...,True,False,False,False,True,True,False,5,0.5,-0.866025
6,False,False,False,False,7.152132,1.0,0,6.907755,0,1.932575,...,False,False,True,False,True,True,False,5,0.5,-0.866025


In [4]:
# Separate the target column 'y' from the remaining feature columns.
y = df['y']
X = df.drop(columns=['y'])

In [5]:
# Standardize the feature columns (logistic regression is sensitive to feature scale).
# NOTE(review): the scaler is fitted on the very data being evaluated here; confirm
# that this reproduces the scaling the saved models were trained with.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

# Testing the logistic regression model

In [6]:
# Restore the pickled logistic regression model from disk.
# NOTE: pickle.load can execute code embedded in the file; only load trusted artifacts.
with open('models/logistic_regression_model-skewness.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict a label for every row of the feature matrix X.
y_pred = loaded_model.predict(X)

In [7]:
# Score the logistic regression predictions against the true labels:
# compute every metric first, then print them in a fixed order.
accuracy = accuracy_score(y, y_pred)
conf_matrix = confusion_matrix(y, y_pred)
report = classification_report(y, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)

Accuracy: 0.7603975334557859
Confusion Matrix:
 [[20940  5689]
 [ 1616  2243]]
Classification Report:
               precision    recall  f1-score   support

       False       0.93      0.79      0.85     26629
        True       0.28      0.58      0.38      3859

    accuracy                           0.76     30488
   macro avg       0.61      0.68      0.62     30488
weighted avg       0.85      0.76      0.79     30488



# Testing the random forest model

In [8]:
# Restore the pickled random forest model from disk.
# NOTE: pickle.load can execute code embedded in the file; only load trusted artifacts.
with open('models/random_forest_model-skewness.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict a label for every row of the feature matrix X.
y_pred = loaded_model.predict(X)

In [9]:
# Score the random forest predictions against the true labels:
# compute every metric first, then print them in a fixed order.
accuracy = accuracy_score(y, y_pred)
conf_matrix = confusion_matrix(y, y_pred)
report = classification_report(y, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)

Accuracy: 0.8692600367357649
Confusion Matrix:
 [[25944   685]
 [ 3301   558]]
Classification Report:
               precision    recall  f1-score   support

       False       0.89      0.97      0.93     26629
        True       0.45      0.14      0.22      3859

    accuracy                           0.87     30488
   macro avg       0.67      0.56      0.57     30488
weighted avg       0.83      0.87      0.84     30488



# Testing the XGBoost model

In [10]:
# Restore the pickled XGBoost model from disk.
# NOTE: pickle.load can execute code embedded in the file; only load trusted artifacts.
with open('models/xgboost_model_skewness.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict a label for every row of the feature matrix X.
y_pred = loaded_model.predict(X)

In [11]:
# Score the XGBoost predictions against the true labels:
# compute every metric first, then print them in a fixed order.
accuracy = accuracy_score(y, y_pred)
conf_matrix = confusion_matrix(y, y_pred)
report = classification_report(y, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)

Accuracy: 0.762791918131724
Confusion Matrix:
 [[20472  6157]
 [ 1075  2784]]
Classification Report:
               precision    recall  f1-score   support

       False       0.95      0.77      0.85     26629
        True       0.31      0.72      0.43      3859

    accuracy                           0.76     30488
   macro avg       0.63      0.75      0.64     30488
weighted avg       0.87      0.76      0.80     30488



## Model Performance Summary

| Model               | Accuracy | Precision (True) | Recall (True) | F1 (True) | Notes                                              |
|---------------------|:--------:|:----------------:|:-------------:|:---------:|----------------------------------------------------|
| Logistic Regression | 76.04%   | 0.28             | 0.58          | 0.38      | Moderate recall on "yes"; precision still low      |
| Random Forest       | 86.93%   | 0.45             | 0.14          | 0.22      | Accuracy driven by "no" class; poor "yes" recall   |
| XGBoost             | 76.28%   | 0.31             | 0.72          | 0.43      | Best "yes" recall and F1; accuracy close to LogReg |
