In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
import pickle

In [2]:
df = pd.read_csv("../data/processed/bank-dataset-processed.csv",index_col=0)

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,y
0,58,4.0,1.0,3.0,0.0,2143,1.0,0.0,1.0,5,8.0,261,1,-1,0,0.0
1,44,10.0,2.0,2.0,0.0,29,1.0,0.0,1.0,5,8.0,151,1,-1,0,0.0
2,33,2.0,1.0,2.0,0.0,2,1.0,1.0,1.0,5,8.0,76,1,-1,0,0.0
3,47,1.0,1.0,0.0,0.0,1506,1.0,0.0,1.0,5,8.0,92,1,-1,0,0.0
4,33,5.0,2.0,0.0,0.0,1,0.0,0.0,1.0,5,8.0,198,1,-1,0,0.0


In [4]:
df.describe()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,y
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,4.696025,1.167725,2.060516,0.018027,1362.272058,0.555838,0.160226,0.416536,15.806419,5.523014,258.16308,2.763841,40.197828,0.580323,0.116985
std,10.618762,3.662424,0.60823,0.778704,0.133049,3044.765829,0.496878,0.36682,0.609586,8.322476,3.006911,257.527812,3.098021,100.128746,2.303441,0.321406
min,18.0,0.0,0.0,0.0,0.0,-8019.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.0,0.0,0.0
25%,33.0,1.0,1.0,2.0,0.0,72.0,0.0,0.0,0.0,8.0,3.0,103.0,1.0,-1.0,0.0,0.0
50%,39.0,4.0,1.0,2.0,0.0,448.0,1.0,0.0,0.0,16.0,6.0,180.0,2.0,-1.0,0.0,0.0
75%,48.0,8.0,2.0,3.0,0.0,1428.0,1.0,0.0,1.0,21.0,8.0,319.0,3.0,-1.0,0.0,0.0
max,95.0,11.0,2.0,3.0,1.0,102127.0,1.0,1.0,2.0,31.0,11.0,4918.0,63.0,871.0,275.0,1.0


In [5]:
# Features and target
X = df.drop('y', axis=1)
y = df['y']

In [6]:
# Split the data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Feature scaling (important for logistic regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
model = LogisticRegression(class_weight="balanced",random_state=42)
model.fit(X_train_scaled, y_train)

In [9]:
# Predictions
y_pred = model.predict(X_test_scaled)

In [10]:
# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.800729846289948
Confusion Matrix:
 [[6388 1564]
 [ 238  853]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      0.80      0.88      7952
         1.0       0.35      0.78      0.49      1091

    accuracy                           0.80      9043
   macro avg       0.66      0.79      0.68      9043
weighted avg       0.89      0.80      0.83      9043



In [27]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

In [28]:
# Train Logistic Regression
logreg = LogisticRegression(random_state=42, max_iter=1000)
logreg.fit(X_train_smote, y_train_smote)

In [31]:
# Predict
y_pred_logreg = logreg.predict(X_test_scaled)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred_logreg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))
print("Classification Report:\n", classification_report(y_test, y_pred_logreg))
print("AUC-ROC:", roc_auc_score(y_test, logreg.predict_proba(X_test_scaled)[:,1]))

Accuracy: 0.8013933429171735
Confusion Matrix:
 [[6395 1557]
 [ 239  852]]
Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      0.80      0.88      7952
         1.0       0.35      0.78      0.49      1091

    accuracy                           0.80      9043
   macro avg       0.66      0.79      0.68      9043
weighted avg       0.89      0.80      0.83      9043

AUC-ROC: 0.8650910965333706


In [13]:
# After fitting your model
coefficients = model.coef_[0]  # LogisticRegression stores coefficients in coef_ (inside a list)
feature_names = X.columns

# Create a dataframe with coefficients and odds ratios
odds_ratios = np.exp(coefficients)

summary = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": coefficients,
    "Odds Ratio": odds_ratios
})

print(summary.sort_values(by="Odds Ratio", ascending=False))

      Feature  Coefficient  Odds Ratio
11   duration     1.362228    3.904883
14   previous     0.339747    1.404592
13      pdays     0.249340    1.283178
3   education     0.157926    1.171080
2     marital     0.154371    1.166924
0         age     0.119503    1.126937
5     balance     0.085164    1.088895
1         job     0.064163    1.066267
10      month     0.032844    1.033389
9         day    -0.045811    0.955223
4     default    -0.077404    0.925516
8     contact    -0.267083    0.765610
7        loan    -0.274254    0.760139
12   campaign    -0.434548    0.647557
6     housing    -0.631282    0.531910


Odds ratio > 1 → Positive effect (increases probability of y=1)

Odds ratio < 1 → Negative effect (decreases probability of y=1)

Odds ratio = 1 → No effect

In [33]:
# Example: Save your trained model
with open('models/logistic_regression_model.pkl', 'wb') as file:
    pickle.dump(logreg, file)