In [1]:
import pandas as pd

# Load the bank marketing data; the file uses ';' as its field delimiter.
df = pd.read_csv("bank-full.csv", delimiter=';')

# Report the (rows, columns) shape, then preview the first rows as the cell output.
print(f"Shape: {df.shape}")
df.head()

Shape: (45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [2]:
# Class balance of the target column 'y': absolute counts first ...
print("Target Value Counts:", df['y'].value_counts(), sep="\n")

# ... then the same counts expressed as percentages of all rows.
print("\nTarget Distribution (%):", df['y'].value_counts(normalize=True).mul(100), sep="\n")

Target Value Counts:
y
no     39922
yes     5289
Name: count, dtype: int64

Target Distribution (%):
y
no     88.30152
yes    11.69848
Name: proportion, dtype: float64


In [3]:
# Convert target to binary (1 = 'yes', 0 = 'no').
# The mapping is applied only while the column still holds the raw labels:
# running it a second time would look up 0/1 in the dict, find nothing, and
# silently replace every label with NaN.
if df['y'].isin(['yes', 'no']).all():
    df['y'] = df['y'].map({'yes': 1, 'no': 0})

# Fail loudly if anything other than a clean 0/1 target is left (unexpected
# label in the file, or a column already damaged by an earlier run).
assert df['y'].isin([0, 1]).all(), "target column 'y' must contain only 0/1 after encoding"

df['y'].value_counts()

y
0    39922
1     5289
Name: count, dtype: int64

In [4]:
# Features are every column except the target 'y'; the target stays a Series.
X, y = df.drop(columns=['y']), df['y']

print(f"Feature shape: {X.shape}")
print(f"Target shape: {y.shape}")

Feature shape: (45211, 16)
Target shape: (45211,)


In [5]:
# Partition the feature names by dtype: object columns are treated as
# categorical, every other column as numerical.
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

print("Categorical Columns:", categorical_cols, sep="\n")
print("\nNumerical Columns:", numerical_cols, sep="\n")

Categorical Columns:
Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')

Numerical Columns:
Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'], dtype='object')


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# equal in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (36168, 16)
X_test shape: (9043, 16)


In [7]:
# One-hot encode the categorical features with levels learned from the
# TRAINING split only.
#
# Encoding each split independently lets `drop_first=True` drop whichever level
# happens to sort first *in that split*. If the test split lacked the training
# reference level, a different level would be dropped there and the two design
# matrices would disagree in a way a later column alignment cannot repair.
# Pinning the categories makes both splits produce exactly the same columns,
# with the same reference level dropped.
#
# The levels are sorted so the dropped (first) level matches what get_dummies
# chooses for plain object columns. A test value never seen in training becomes
# NaN under the pinned dtype and is therefore encoded as all zeros for that
# feature, i.e. the same row pattern as the dropped reference level.
category_dtypes = {
    col: pd.CategoricalDtype(sorted(X_train[col].dropna().unique()))
    for col in categorical_cols
}

X_train_encoded = pd.get_dummies(X_train.astype(category_dtypes), columns=categorical_cols, drop_first=True)
X_test_encoded = pd.get_dummies(X_test.astype(category_dtypes), columns=categorical_cols, drop_first=True)

print("Encoded Train Shape:", X_train_encoded.shape)
print("Encoded Test Shape:", X_test_encoded.shape)

Encoded Train Shape: (36168, 42)
Encoded Test Shape: (9043, 42)


In [8]:
# Force the test matrix onto the training matrix's column set: a left join on
# the column axis keeps the training columns, and any column missing from the
# test matrix is created and filled with 0.
X_train_encoded, X_test_encoded = X_train_encoded.align(
    X_test_encoded,
    axis=1,
    join='left',
    fill_value=0,
)

In [9]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training matrix only, so nothing about
# the test rows leaks into the fitted scaler.
scaler = StandardScaler().fit(X_train_encoded)

# Apply the same fitted transform to both splits.
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train_encoded, X_test_encoded)
)

print("Scaling complete.")

Scaling complete.


**Model #1. LOGISTIC REGRESSION**

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix
)

In [12]:
# Build and fit the logistic regression in one step (fit returns the estimator).
# max_iter is raised to 1000 and the seed is fixed at 42.
log_reg = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Hard labels, plus column 1 of predict_proba (the probability of class 1).
y_pred_lr, y_prob_lr = (
    log_reg.predict(X_test_scaled),
    log_reg.predict_proba(X_test_scaled)[:, 1],
)

print("Logistic Regression model trained.")

Logistic Regression model trained.


In [13]:
# Label-based metrics are all scored on the hard test predictions.
lr_accuracy, lr_precision, lr_recall, lr_f1, lr_mcc = (
    score_fn(y_test, y_pred_lr)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
)
# AUC is scored on the class-1 probabilities rather than the hard labels.
lr_auc = roc_auc_score(y_test, y_prob_lr)

print(
    f"Accuracy: {lr_accuracy!s}",
    f"AUC: {lr_auc!s}",
    f"Precision: {lr_precision!s}",
    f"Recall: {lr_recall!s}",
    f"F1 Score: {lr_f1!s}",
    f"MCC: {lr_mcc!s}",
    sep="\n",
)

Accuracy: 0.901249585314608
AUC: 0.9054433347971681
Precision: 0.643979057591623
Recall: 0.3487712665406427
F1 Score: 0.45248313917841815
MCC: 0.42644597941367


In [14]:
# Start a fresh score registry keyed by model name; each entry maps a metric
# name to its value on the test split.
results = {
    "Logistic Regression": dict(
        Accuracy=lr_accuracy,
        AUC=lr_auc,
        Precision=lr_precision,
        Recall=lr_recall,
        F1=lr_f1,
        MCC=lr_mcc,
    )
}

results

{'Logistic Regression': {'Accuracy': 0.901249585314608,
  'AUC': 0.9054433347971681,
  'Precision': 0.643979057591623,
  'Recall': 0.3487712665406427,
  'F1': 0.45248313917841815,
  'MCC': 0.42644597941367}}

**Model #2. DECISION TREE CLASSIFIER**

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Build and fit a decision tree with default settings and a fixed seed
# (fit returns the estimator itself).
dt = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)

# Hard labels, plus column 1 of predict_proba (the probability of class 1).
y_pred_dt, y_prob_dt = (
    dt.predict(X_test_scaled),
    dt.predict_proba(X_test_scaled)[:, 1],
)

print("Decision Tree model trained.")

Decision Tree model trained.


In [17]:
# Label-based metrics are all scored on the hard test predictions.
dt_accuracy, dt_precision, dt_recall, dt_f1, dt_mcc = (
    score_fn(y_test, y_pred_dt)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
)
# AUC is scored on the class-1 probabilities rather than the hard labels.
dt_auc = roc_auc_score(y_test, y_prob_dt)

print(
    f"Accuracy: {dt_accuracy!s}",
    f"AUC: {dt_auc!s}",
    f"Precision: {dt_precision!s}",
    f"Recall: {dt_recall!s}",
    f"F1 Score: {dt_f1!s}",
    f"MCC: {dt_mcc!s}",
    sep="\n",
)

Accuracy: 0.8776954550481035
AUC: 0.713459783407689
Precision: 0.4782608695652174
Recall: 0.499054820415879
F1 Score: 0.4884366327474561
MCC: 0.41913981416680257


In [18]:
# Register the decision tree's test-split scores under its display name.
results["Decision Tree"] = dict(
    Accuracy=dt_accuracy,
    AUC=dt_auc,
    Precision=dt_precision,
    Recall=dt_recall,
    F1=dt_f1,
    MCC=dt_mcc,
)

results

{'Logistic Regression': {'Accuracy': 0.901249585314608,
  'AUC': 0.9054433347971681,
  'Precision': 0.643979057591623,
  'Recall': 0.3487712665406427,
  'F1': 0.45248313917841815,
  'MCC': 0.42644597941367},
 'Decision Tree': {'Accuracy': 0.8776954550481035,
  'AUC': 0.713459783407689,
  'Precision': 0.4782608695652174,
  'Recall': 0.499054820415879,
  'F1': 0.4884366327474561,
  'MCC': 0.41913981416680257}}

**Model #3. K-Nearest Neighbours (KNN)**

In [20]:
from sklearn.neighbors import KNeighborsClassifier

# Build and fit a 5-nearest-neighbours classifier on the scaled training data
# (fit returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Hard labels, plus column 1 of predict_proba (the probability of class 1).
y_pred_knn, y_prob_knn = (
    knn.predict(X_test_scaled),
    knn.predict_proba(X_test_scaled)[:, 1],
)

print("KNN model trained.")

KNN model trained.


In [21]:
# Label-based metrics are all scored on the hard test predictions.
knn_accuracy, knn_precision, knn_recall, knn_f1, knn_mcc = (
    score_fn(y_test, y_pred_knn)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
)
# AUC is scored on the class-1 probabilities rather than the hard labels.
knn_auc = roc_auc_score(y_test, y_prob_knn)

print(
    f"Accuracy: {knn_accuracy!s}",
    f"AUC: {knn_auc!s}",
    f"Precision: {knn_precision!s}",
    f"Recall: {knn_recall!s}",
    f"F1 Score: {knn_f1!s}",
    f"MCC: {knn_mcc!s}",
    sep="\n",
)

Accuracy: 0.8936193741015149
AUC: 0.8083685975476229
Precision: 0.5860215053763441
Recall: 0.30907372400756145
F1 Score: 0.40470297029702973
MCC: 0.3742134073570698


In [22]:
# Register the KNN classifier's test-split scores under its display name.
results["KNN"] = dict(
    Accuracy=knn_accuracy,
    AUC=knn_auc,
    Precision=knn_precision,
    Recall=knn_recall,
    F1=knn_f1,
    MCC=knn_mcc,
)

results

{'Logistic Regression': {'Accuracy': 0.901249585314608,
  'AUC': 0.9054433347971681,
  'Precision': 0.643979057591623,
  'Recall': 0.3487712665406427,
  'F1': 0.45248313917841815,
  'MCC': 0.42644597941367},
 'Decision Tree': {'Accuracy': 0.8776954550481035,
  'AUC': 0.713459783407689,
  'Precision': 0.4782608695652174,
  'Recall': 0.499054820415879,
  'F1': 0.4884366327474561,
  'MCC': 0.41913981416680257},
 'KNN': {'Accuracy': 0.8936193741015149,
  'AUC': 0.8083685975476229,
  'Precision': 0.5860215053763441,
  'Recall': 0.30907372400756145,
  'F1': 0.40470297029702973,
  'MCC': 0.3742134073570698}}

**Model #4. NAIVE BAYES**

In [24]:
from sklearn.naive_bayes import GaussianNB

# Build and fit a Gaussian Naive Bayes classifier with default settings
# (fit returns the estimator itself).
nb = GaussianNB().fit(X_train_scaled, y_train)

# Hard labels, plus column 1 of predict_proba (the probability of class 1).
y_pred_nb, y_prob_nb = (
    nb.predict(X_test_scaled),
    nb.predict_proba(X_test_scaled)[:, 1],
)

print("Naive Bayes model trained.")

Naive Bayes model trained.


In [25]:
# Label-based metrics are all scored on the hard test predictions.
nb_accuracy, nb_precision, nb_recall, nb_f1, nb_mcc = (
    score_fn(y_test, y_pred_nb)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
)
# AUC is scored on the class-1 probabilities rather than the hard labels.
nb_auc = roc_auc_score(y_test, y_prob_nb)

print(
    f"Accuracy: {nb_accuracy!s}",
    f"AUC: {nb_auc!s}",
    f"Precision: {nb_precision!s}",
    f"Recall: {nb_recall!s}",
    f"F1 Score: {nb_f1!s}",
    f"MCC: {nb_mcc!s}",
    sep="\n",
)

Accuracy: 0.8638726086475728
AUC: 0.8088133113481918
Precision: 0.42821576763485475
Recall: 0.4877126654064272
F1 Score: 0.45603181617322136
MCC: 0.3796553346420521


In [26]:
# Register the Naive Bayes classifier's test-split scores under its display name.
results["Naive Bayes"] = dict(
    Accuracy=nb_accuracy,
    AUC=nb_auc,
    Precision=nb_precision,
    Recall=nb_recall,
    F1=nb_f1,
    MCC=nb_mcc,
)

results

{'Logistic Regression': {'Accuracy': 0.901249585314608,
  'AUC': 0.9054433347971681,
  'Precision': 0.643979057591623,
  'Recall': 0.3487712665406427,
  'F1': 0.45248313917841815,
  'MCC': 0.42644597941367},
 'Decision Tree': {'Accuracy': 0.8776954550481035,
  'AUC': 0.713459783407689,
  'Precision': 0.4782608695652174,
  'Recall': 0.499054820415879,
  'F1': 0.4884366327474561,
  'MCC': 0.41913981416680257},
 'KNN': {'Accuracy': 0.8936193741015149,
  'AUC': 0.8083685975476229,
  'Precision': 0.5860215053763441,
  'Recall': 0.30907372400756145,
  'F1': 0.40470297029702973,
  'MCC': 0.3742134073570698},
 'Naive Bayes': {'Accuracy': 0.8638726086475728,
  'AUC': 0.8088133113481918,
  'Precision': 0.42821576763485475,
  'Recall': 0.4877126654064272,
  'F1': 0.45603181617322136,
  'MCC': 0.3796553346420521}}

**Model #5. RANDOM FOREST**

In [28]:
from sklearn.ensemble import RandomForestClassifier

# Build and fit a 100-tree random forest with a fixed seed; n_jobs=-1 asks
# scikit-learn to use all available cores. fit returns the estimator itself.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42).fit(
    X_train_scaled, y_train
)

# Hard labels, plus column 1 of predict_proba (the probability of class 1).
y_pred_rf, y_prob_rf = (
    rf.predict(X_test_scaled),
    rf.predict_proba(X_test_scaled)[:, 1],
)

print("Random Forest trained.")

Random Forest trained.


In [29]:
# Label-based metrics are all scored on the hard test predictions.
rf_accuracy, rf_precision, rf_recall, rf_f1, rf_mcc = (
    score_fn(y_test, y_pred_rf)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
)
# AUC is scored on the class-1 probabilities rather than the hard labels.
rf_auc = roc_auc_score(y_test, y_prob_rf)

print(
    f"Accuracy: {rf_accuracy!s}",
    f"AUC: {rf_auc!s}",
    f"Precision: {rf_precision!s}",
    f"Recall: {rf_recall!s}",
    f"F1 Score: {rf_f1!s}",
    f"MCC: {rf_mcc!s}",
    sep="\n",
)

Accuracy: 0.9044564856795311
AUC: 0.9271974981445598
Precision: 0.655448717948718
Recall: 0.38657844990548207
F1 Score: 0.4863258026159334
MCC: 0.45608037914285554


In [30]:
# Register the random forest's test-split scores under its display name.
results["Random Forest"] = dict(
    Accuracy=rf_accuracy,
    AUC=rf_auc,
    Precision=rf_precision,
    Recall=rf_recall,
    F1=rf_f1,
    MCC=rf_mcc,
)

results

{'Logistic Regression': {'Accuracy': 0.901249585314608,
  'AUC': 0.9054433347971681,
  'Precision': 0.643979057591623,
  'Recall': 0.3487712665406427,
  'F1': 0.45248313917841815,
  'MCC': 0.42644597941367},
 'Decision Tree': {'Accuracy': 0.8776954550481035,
  'AUC': 0.713459783407689,
  'Precision': 0.4782608695652174,
  'Recall': 0.499054820415879,
  'F1': 0.4884366327474561,
  'MCC': 0.41913981416680257},
 'KNN': {'Accuracy': 0.8936193741015149,
  'AUC': 0.8083685975476229,
  'Precision': 0.5860215053763441,
  'Recall': 0.30907372400756145,
  'F1': 0.40470297029702973,
  'MCC': 0.3742134073570698},
 'Naive Bayes': {'Accuracy': 0.8638726086475728,
  'AUC': 0.8088133113481918,
  'Precision': 0.42821576763485475,
  'Recall': 0.4877126654064272,
  'F1': 0.45603181617322136,
  'MCC': 0.3796553346420521},
 'Random Forest': {'Accuracy': 0.9044564856795311,
  'AUC': 0.9271974981445598,
  'Precision': 0.655448717948718,
  'Recall': 0.38657844990548207,
  'F1': 0.4863258026159334,
  'MCC': 0.

**Model #6. XGBoost**

In [32]:
!pip install xgboost



In [33]:
# Smoke test for the installation: the import fails if the package is not
# available to this kernel, in which case the message below is never printed.
import xgboost
print("XGBoost installed successfully.")

XGBoost installed successfully.


In [34]:
from xgboost import XGBClassifier

# Build and fit a gradient-boosted classifier: 100 trees of depth 3, learning
# rate 0.1, log-loss as the evaluation metric, fixed seed. fit returns the
# estimator itself.
xgb = XGBClassifier(
    eval_metric='logloss',
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
).fit(X_train_scaled, y_train)

# Hard labels, plus column 1 of predict_proba (the probability of class 1).
y_pred_xgb, y_prob_xgb = (
    xgb.predict(X_test_scaled),
    xgb.predict_proba(X_test_scaled)[:, 1],
)

print("XGBoost model trained.")

XGBoost model trained.


In [35]:
# Label-based metrics are all scored on the hard test predictions.
xgb_accuracy, xgb_precision, xgb_recall, xgb_f1, xgb_mcc = (
    score_fn(y_test, y_pred_xgb)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef)
)
# AUC is scored on the class-1 probabilities rather than the hard labels.
xgb_auc = roc_auc_score(y_test, y_prob_xgb)

print(
    f"Accuracy: {xgb_accuracy!s}",
    f"AUC: {xgb_auc!s}",
    f"Precision: {xgb_precision!s}",
    f"Recall: {xgb_recall!s}",
    f"F1 Score: {xgb_f1!s}",
    f"MCC: {xgb_mcc!s}",
    sep="\n",
)

Accuracy: 0.9046776512219397
AUC: 0.9240135390908994
Precision: 0.6666666666666666
Recall: 0.3705103969754253
F1 Score: 0.47630619684082626
MCC: 0.45098926384098353


In [36]:
# Register the XGBoost classifier's test-split scores under its display name.
results["XGBoost"] = dict(
    Accuracy=xgb_accuracy,
    AUC=xgb_auc,
    Precision=xgb_precision,
    Recall=xgb_recall,
    F1=xgb_f1,
    MCC=xgb_mcc,
)

results

{'Logistic Regression': {'Accuracy': 0.901249585314608,
  'AUC': 0.9054433347971681,
  'Precision': 0.643979057591623,
  'Recall': 0.3487712665406427,
  'F1': 0.45248313917841815,
  'MCC': 0.42644597941367},
 'Decision Tree': {'Accuracy': 0.8776954550481035,
  'AUC': 0.713459783407689,
  'Precision': 0.4782608695652174,
  'Recall': 0.499054820415879,
  'F1': 0.4884366327474561,
  'MCC': 0.41913981416680257},
 'KNN': {'Accuracy': 0.8936193741015149,
  'AUC': 0.8083685975476229,
  'Precision': 0.5860215053763441,
  'Recall': 0.30907372400756145,
  'F1': 0.40470297029702973,
  'MCC': 0.3742134073570698},
 'Naive Bayes': {'Accuracy': 0.8638726086475728,
  'AUC': 0.8088133113481918,
  'Precision': 0.42821576763485475,
  'Recall': 0.4877126654064272,
  'F1': 0.45603181617322136,
  'MCC': 0.3796553346420521},
 'Random Forest': {'Accuracy': 0.9044564856795311,
  'AUC': 0.9271974981445598,
  'Precision': 0.655448717948718,
  'Recall': 0.38657844990548207,
  'F1': 0.4863258026159334,
  'MCC': 0.

In [37]:
import pandas as pd

# Comparison table: one row per model (outer dict key), one column per metric.
results_df = pd.DataFrame.from_dict(results, orient="index")
results_df

Unnamed: 0,Accuracy,AUC,Precision,Recall,F1,MCC
Logistic Regression,0.90125,0.905443,0.643979,0.348771,0.452483,0.426446
Decision Tree,0.877695,0.71346,0.478261,0.499055,0.488437,0.41914
KNN,0.893619,0.808369,0.586022,0.309074,0.404703,0.374213
Naive Bayes,0.863873,0.808813,0.428216,0.487713,0.456032,0.379655
Random Forest,0.904456,0.927197,0.655449,0.386578,0.486326,0.45608
XGBoost,0.904678,0.924014,0.666667,0.37051,0.476306,0.450989


In [38]:
import joblib
import os

# Folder that receives every serialized artifact.
# The cells further down (size listing, smaller Random Forest) read from and
# write to "model_files", so the artifacts are saved there as well. Writing
# them to "model" left those cells pointing at a folder this notebook never
# creates, which breaks a Restart & Run All.
# NOTE(review): if an external consumer loads from "model/", update it to match.
MODEL_DIR = "model_files"
os.makedirs(MODEL_DIR, exist_ok=True)  # no error when the folder already exists

# Save models, plus the fitted scaler needed to transform inputs the same way
# as during training.
artifacts = {
    "logistic_regression.pkl": log_reg,
    "decision_tree.pkl": dt,
    "knn.pkl": knn,
    "naive_bayes.pkl": nb,
    "random_forest.pkl": rf,
    "xgboost.pkl": xgb,
    "scaler.pkl": scaler,
}
for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(MODEL_DIR, artifact_name))

print("All models saved successfully.")

All models saved successfully.


In [39]:
# Create small test dataset (500 rows) from the held-out split only.
# Sampling from the full `df` would mostly pick rows the models were trained on
# (the training split is 80% of the data), so any score computed on the
# exported file would be optimistic.
# The target is re-attached as the last column 'y'; it holds the 0/1 encoding,
# exactly as `df['y']` does at this point in the notebook.
test_data = X_test.assign(y=y_test).sample(500, random_state=42)

test_data.to_csv("test_data.csv", sep=';', index=False)

print("Test data created successfully.")

Test data created successfully.


In [74]:
import os

# Print the on-disk size of every entry in model_files, in MB (2 decimals).
for artifact_name in os.listdir("model_files"):
    size_mb = os.path.getsize(os.path.join("model_files", artifact_name)) / 1024 ** 2
    print(artifact_name, ":", round(size_mb, 2), "MB")

logistic_regression.pkl : 0.0 MB
decision_tree.pkl : 0.46 MB
knn.pkl : 11.87 MB
naive_bayes.pkl : 0.0 MB
random_forest.pkl : 55.98 MB
xgboost.pkl : 0.12 MB
scaler.pkl : 0.0 MB


# Smaller Random Forest for Deployment (Reduced Size)

In [78]:
# --------------------------------------------
# Smaller Random Forest for Deployment
# --------------------------------------------
# A forest with 30 trees (instead of 100) is fitted on the same scaled training
# data and written over model_files/random_forest.pkl.
# NOTE(review): the Random Forest metrics recorded earlier in this notebook
# belong to the 100-tree `rf`; `rf_small` is not evaluated here, so confirm its
# scores before relying on the saved file.

import joblib
from sklearn.ensemble import RandomForestClassifier

rf_small = RandomForestClassifier(n_estimators=30, n_jobs=-1, random_state=42).fit(
    X_train_scaled, y_train
)
print("Smaller Random Forest trained.")

# Save smaller model
joblib.dump(rf_small, "model_files/random_forest.pkl")
print("Smaller Random Forest saved successfully.")

Smaller Random Forest trained.
Smaller Random Forest saved successfully.


In [80]:
import os

# Re-list the size of every entry in model_files (MB, 2 decimals) to compare
# against the earlier listing.
for artifact in os.listdir("model_files"):
    megabytes = os.path.getsize("model_files/" + artifact) / 1024 ** 2
    print(f"{artifact} : {round(megabytes, 2)} MB")

logistic_regression.pkl : 0.0 MB
decision_tree.pkl : 0.46 MB
knn.pkl : 11.87 MB
naive_bayes.pkl : 0.0 MB
random_forest.pkl : 16.86 MB
xgboost.pkl : 0.12 MB
scaler.pkl : 0.0 MB
