In [1]:
import sys

import numpy as np
import pandas as pd
import sklearn
from sklearn.base import clone
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Record interpreter and library versions next to the results so a reader
# can tell which environment produced the numbers below.
for _library, _version in (
    ("Python", sys.version),
    ("NumPy", np.__version__),
    ("Pandas", pd.__version__),
    ("Scikit-learn", sklearn.__version__),
):
    print(f"{_library} version:", _version)

Python version: 3.10.19 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 16:41:31) [MSC v.1929 64 bit (AMD64)]
NumPy version: 2.2.6
Pandas version: 2.3.3
Scikit-learn version: 1.7.1


In [2]:
DATASET_PATH = "dataset/train.csv"
TARGET_COLUMN = "y"

# The CSV is semicolon-delimited, hence the explicit separator.
df = pd.read_csv(DATASET_PATH, sep=";")

print(f"Dataset shape: {df.shape}")
print("\nColumn names:", df.columns.tolist(), sep="\n")

# Split the frame into the label column and the predictor columns.
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

n_instances, n_features = X.shape

print(f"\nNumber of features: {n_features}")
print(f"Number of instances: {n_instances}")

# Assignment constraint checks
print("\nConstraint checks:")
print(f"Features >= 12: {n_features >= 12}")
print(f"Instances >= 500: {n_instances >= 500}")

# Raw class counts of the target.
print("\nTarget distribution:", y.value_counts(), sep="\n")

Dataset shape: (45211, 17)

Column names:
['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']

Number of features: 16
Number of instances: 45211

Constraint checks:
Features >= 12: True
Instances >= 500: True

Target distribution:
y
no     39922
yes     5289
Name: count, dtype: int64


In [3]:
# Preprocessing - encoding + train-test split

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Identify categorical and numerical columns.
# Numeric columns are selected positively and everything else is treated as
# categorical, so the split does not rely on text columns having exactly the
# "object" dtype (any other non-numeric dtype would otherwise be sent to the
# scaler).
categorical_cols = X.select_dtypes(exclude=["number"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["number"]).columns.tolist()

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

# Preprocessing pipelines:
#  - categorical columns are one-hot encoded; categories unseen during fit
#    are ignored at transform time instead of raising
#  - numerical columns are standardised
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_cols),
        ("num", numerical_transformer, numerical_cols)
    ]
)

# Encode target variable (yes/no -> 1/0)
y_encoded = y.map({"yes": 1, "no": 0})

# Series.map leaves NaN for any label that is not a key of the mapping.
# Fail fast here rather than letting NaN labels reach the stratified split.
if y_encoded.isna().any():
    unexpected_labels = sorted(y[y_encoded.isna()].astype(str).unique())
    raise ValueError(
        f"Unexpected values in target column '{TARGET_COLUMN}': {unexpected_labels}"
    )

# Train-test split: 80/20, stratified on the target so both splits keep the
# same class balance; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

print("\nTrain shape:", X_train.shape)
print("Test shape:", X_test.shape)

print("\nTarget distribution in train:")
print(y_train.value_counts(normalize=True))

print("\nTarget distribution in test:")
print(y_test.value_counts(normalize=True))

Categorical columns: ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
Numerical columns: ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

Train shape: (36168, 16)
Test shape: (9043, 16)

Target distribution in train:
y
0    0.883018
1    0.116982
Name: proportion, dtype: float64

Target distribution in test:
y
0    0.883003
1    0.116997
Name: proportion, dtype: float64


In [4]:
# Logistic Regression model & evaluation

from sklearn.linear_model import LogisticRegression

# Build pipeline.
# A Pipeline without caching fits the step objects it is given rather than
# copies of them, so passing `preprocessor` directly would make this pipeline
# share one ColumnTransformer instance with every other cell that uses it.
# clone() gives this pipeline its own unfitted copy with the same settings.
log_reg_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", LogisticRegression(
            max_iter=1000,
            class_weight="balanced",  # reweight classes; the target is imbalanced
            random_state=42
        ))
    ]
)

# Train model
log_reg_pipeline.fit(X_train, y_train)

# Predictions: hard labels, plus the probability of class 1 for AUC
y_pred = log_reg_pipeline.predict(X_test)
y_proba = log_reg_pipeline.predict_proba(X_test)[:, 1]

# Metrics (AUC is computed from probabilities, the rest from hard labels)
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)

print("Logistic Regression Metrics")
print("-" * 35)
print(f"Accuracy  : {accuracy:.4f}")
print(f"AUC       : {auc:.4f}")
print(f"Precision : {precision:.4f}")
print(f"Recall    : {recall:.4f}")
print(f"F1 Score  : {f1:.4f}")
print(f"MCC       : {mcc:.4f}")

Logistic Regression Metrics
-----------------------------------
Accuracy  : 0.8457
AUC       : 0.9079
Precision : 0.4182
Recall    : 0.8147
F1 Score  : 0.5527
MCC       : 0.5092


In [5]:
# Decision Tree Classifier & evaluation

from sklearn.tree import DecisionTreeClassifier

# A Pipeline without caching fits the step objects it is given rather than
# copies of them; clone() gives this pipeline its own unfitted preprocessor
# instead of sharing the module-level `preprocessor` instance.
dt_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", DecisionTreeClassifier(
            max_depth=10,
            min_samples_split=20,
            class_weight="balanced",  # reweight classes; the target is imbalanced
            random_state=42
        ))
    ]
)

# Train
dt_pipeline.fit(X_train, y_train)

# Predictions: hard labels, plus the probability of class 1 for AUC
y_pred_dt = dt_pipeline.predict(X_test)
y_proba_dt = dt_pipeline.predict_proba(X_test)[:, 1]

# Metrics (AUC is computed from probabilities, the rest from hard labels)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
auc_dt = roc_auc_score(y_test, y_proba_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)
mcc_dt = matthews_corrcoef(y_test, y_pred_dt)

print("Decision Tree Metrics")
print("-" * 35)
print(f"Accuracy  : {accuracy_dt:.4f}")
print(f"AUC       : {auc_dt:.4f}")
print(f"Precision : {precision_dt:.4f}")
print(f"Recall    : {recall_dt:.4f}")
print(f"F1 Score  : {f1_dt:.4f}")
print(f"MCC       : {mcc_dt:.4f}")

Decision Tree Metrics
-----------------------------------
Accuracy  : 0.8285
AUC       : 0.8712
Precision : 0.3917
Recall    : 0.8431
F1 Score  : 0.5349
MCC       : 0.4959


In [6]:
#  k-Nearest Neighbors Classifier & evaluation

from sklearn.neighbors import KNeighborsClassifier

# A Pipeline without caching fits the step objects it is given rather than
# copies of them; clone() gives this pipeline its own unfitted preprocessor
# instead of sharing the module-level `preprocessor` instance.
knn_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", KNeighborsClassifier(
            n_neighbors=7,
            weights="distance",  # closer neighbours get a larger vote
            metric="minkowski"
        ))
    ]
)

# Train
knn_pipeline.fit(X_train, y_train)

# Predictions: hard labels, plus the probability of class 1 for AUC
y_pred_knn = knn_pipeline.predict(X_test)
y_proba_knn = knn_pipeline.predict_proba(X_test)[:, 1]

# Metrics (AUC is computed from probabilities, the rest from hard labels)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
auc_knn = roc_auc_score(y_test, y_proba_knn)
precision_knn = precision_score(y_test, y_pred_knn)
recall_knn = recall_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)
mcc_knn = matthews_corrcoef(y_test, y_pred_knn)

print("k-Nearest Neighbors Metrics")
print("-" * 35)
print(f"Accuracy  : {accuracy_knn:.4f}")
print(f"AUC       : {auc_knn:.4f}")
print(f"Precision : {precision_knn:.4f}")
print(f"Recall    : {recall_knn:.4f}")
print(f"F1 Score  : {f1_knn:.4f}")
print(f"MCC       : {mcc_knn:.4f}")

k-Nearest Neighbors Metrics
-----------------------------------
Accuracy  : 0.8996
AUC       : 0.8513
Precision : 0.6298
Recall    : 0.3440
F1 Score  : 0.4450
MCC       : 0.4169


In [7]:
# Naive Bayes (Gaussian) & evaluation

from sklearn.naive_bayes import GaussianNB

# This model is not wrapped in a Pipeline, so the features are transformed
# explicitly: the preprocessor is fitted on the training split only and then
# applied unchanged to the test split.
X_train_nb = preprocessor.fit_transform(X_train)
X_test_nb = preprocessor.transform(X_test)

# Train (fit returns the fitted estimator itself)
nb_model = GaussianNB().fit(X_train_nb, y_train)

# Predictions: probability of class 1 for AUC, hard labels for the rest
y_proba_nb = nb_model.predict_proba(X_test_nb)[:, 1]
y_pred_nb = nb_model.predict(X_test_nb)

# Metrics
accuracy_nb = accuracy_score(y_test, y_pred_nb)
auc_nb = roc_auc_score(y_test, y_proba_nb)
precision_nb = precision_score(y_test, y_pred_nb)
recall_nb = recall_score(y_test, y_pred_nb)
f1_nb = f1_score(y_test, y_pred_nb)
mcc_nb = matthews_corrcoef(y_test, y_pred_nb)

# Report: labels are left-padded to a common width so the colons line up.
print("Naive Bayes Metrics")
print("-" * 35)
for _metric_label, _metric_value in (
    ("Accuracy", accuracy_nb),
    ("AUC", auc_nb),
    ("Precision", precision_nb),
    ("Recall", recall_nb),
    ("F1 Score", f1_nb),
    ("MCC", mcc_nb),
):
    print(f"{_metric_label:<10}: {_metric_value:.4f}")

Naive Bayes Metrics
-----------------------------------
Accuracy  : 0.8548
AUC       : 0.8101
Precision : 0.4059
Recall    : 0.5198
F1 Score  : 0.4559
MCC       : 0.3774


In [8]:
# Random Forest Classifier & evaluation

from sklearn.ensemble import RandomForestClassifier

# A Pipeline without caching fits the step objects it is given rather than
# copies of them; clone() gives this pipeline its own unfitted preprocessor
# instead of sharing the module-level `preprocessor` instance.
rf_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", RandomForestClassifier(
            n_estimators=200,
            max_depth=15,
            min_samples_split=10,
            class_weight="balanced",  # reweight classes; the target is imbalanced
            random_state=42,
            n_jobs=-1  # use all available cores
        ))
    ]
)

# Train
rf_pipeline.fit(X_train, y_train)

# Predictions: hard labels, plus the probability of class 1 for AUC
y_pred_rf = rf_pipeline.predict(X_test)
y_proba_rf = rf_pipeline.predict_proba(X_test)[:, 1]

# Metrics (AUC is computed from probabilities, the rest from hard labels)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
auc_rf = roc_auc_score(y_test, y_proba_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
mcc_rf = matthews_corrcoef(y_test, y_pred_rf)

print("Random Forest Metrics")
print("-" * 35)
print(f"Accuracy  : {accuracy_rf:.4f}")
print(f"AUC       : {auc_rf:.4f}")
print(f"Precision : {precision_rf:.4f}")
print(f"Recall    : {recall_rf:.4f}")
print(f"F1 Score  : {f1_rf:.4f}")
print(f"MCC       : {mcc_rf:.4f}")

Random Forest Metrics
-----------------------------------
Accuracy  : 0.8742
AUC       : 0.9261
Precision : 0.4772
Recall    : 0.7911
F1 Score  : 0.5953
MCC       : 0.5497


In [9]:
# XGBoost Classifier & evaluation

from xgboost import XGBClassifier

# Count each class once; the ratio of class-0 to class-1 training rows is
# passed as scale_pos_weight to compensate for the imbalanced target.
train_class_counts = y_train.value_counts()

# A Pipeline without caching fits the step objects it is given rather than
# copies of them; clone() gives this pipeline its own unfitted preprocessor
# instead of sharing the module-level `preprocessor` instance.
xgb_pipeline = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", XGBClassifier(
            n_estimators=300,
            max_depth=6,
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            scale_pos_weight=(train_class_counts[0] / train_class_counts[1]),
            eval_metric="logloss",
            random_state=42,
            n_jobs=-1
        ))
    ]
)

# Train
xgb_pipeline.fit(X_train, y_train)

# Predictions: hard labels, plus the probability of class 1 for AUC
y_pred_xgb = xgb_pipeline.predict(X_test)
y_proba_xgb = xgb_pipeline.predict_proba(X_test)[:, 1]

# Metrics (AUC is computed from probabilities, the rest from hard labels)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
auc_xgb = roc_auc_score(y_test, y_proba_xgb)
precision_xgb = precision_score(y_test, y_pred_xgb)
recall_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
mcc_xgb = matthews_corrcoef(y_test, y_pred_xgb)

print("XGBoost Metrics")
print("-" * 35)
print(f"Accuracy  : {accuracy_xgb:.4f}")
print(f"AUC       : {auc_xgb:.4f}")
print(f"Precision : {precision_xgb:.4f}")
print(f"Recall    : {recall_xgb:.4f}")
print(f"F1 Score  : {f1_xgb:.4f}")
print(f"MCC       : {mcc_xgb:.4f}")

XGBoost Metrics
-----------------------------------
Accuracy  : 0.8829
AUC       : 0.9305
Precision : 0.4997
Recall    : 0.8195
F1 Score  : 0.6208
MCC       : 0.5802


In [10]:
# Save trained models for Streamlit app

import os
import joblib

# Create model directory
os.makedirs("model", exist_ok=True)

# Destination path -> object to persist, written in this order.
# Naive Bayes is stored as a bare estimator (it was trained outside a
# Pipeline), so the fitted preprocessor is saved separately for its use.
_artifacts = {
    "model/logistic_regression.pkl": log_reg_pipeline,
    "model/decision_tree.pkl": dt_pipeline,
    "model/knn.pkl": knn_pipeline,
    "model/naive_bayes.pkl": nb_model,
    "model/random_forest.pkl": rf_pipeline,
    "model/xgboost.pkl": xgb_pipeline,
    "model/preprocessor.pkl": preprocessor,
}

for _artifact_path, _artifact in _artifacts.items():
    joblib.dump(_artifact, _artifact_path)

print("All models saved successfully in /model directory")
print(os.listdir("model"))

All models saved successfully in /model directory
['.ipynb_checkpoints', 'decision_tree.pkl', 'knn.pkl', 'logistic_regression.pkl', 'naive_bayes.pkl', 'preprocessor.pkl', 'random_forest.pkl', 'xgboost.pkl']
