In [56]:
import numpy as np
import pandas as pd

from sklearn.base import ClassifierMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline as base_make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier

from matplotlib import pyplot as plt
from tabulate import tabulate

import calendar
from datetime import datetime, timedelta


In [57]:
from pathlib import Path

# --- Feature groups -------------------------------------------------------
# Column names grouped by how they are treated; list order is kept as-is.
CATEGORICAL_FEATURES = [
    "contact", "day_of_week", "default", "education", "housing", "job",
    "loan", "marital", "month", "poutcome", "year",
]
NUMERICAL_FEATURES = [
    "age", "campaign", "pdays", "previous",
    "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed",
]
BINARY_FEATURES = ["y"]

# --- Data locations (relative paths) --------------------------------------
DATA_DIR = Path("data")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
INTERIM_DATA_DIR = DATA_DIR.joinpath("interim")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")

# --- Data file names --------------------------------------------------------
DATA_FILENAME = "bank-additional-full.csv"
APPROACHED_DATA_FILENAME = "approached_data.csv"
NOT_APPROACHED_DATA_FILENAME = "not_approached_data.csv"

# --- Colour hex codes -------------------------------------------------------
HONOLULU_BLUE = "#1F77B4"
IMPERIAL_RED = "#F0534F"
PERSIAN_GREEN = "#27A69A"

In [58]:
# Shared seed: passed as `random_state` to every classifier and to the
# StratifiedKFold splitter in the modelling cells so repeated runs match.
random_state = 42

In [59]:
# Read the processed "not approached" dataset into `df`
not_approached_path = PROCESSED_DATA_DIR / NOT_APPROACHED_DATA_FILENAME
df = pd.read_csv(not_approached_path)

In [60]:
# Split the frame into the feature matrix (every column except "y") and the target "y"
y = df["y"]
X = df.drop(columns=["y"])

print(f"X shape: {X.shape}, y shape: {y.shape}")

# Preview the first feature rows
X.head()

X shape: (39673, 17), y shape: (39673,)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,year
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,1.1,93.994,-36.4,4.857,5191.0,2008
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,1.1,93.994,-36.4,4.857,5191.0,2008
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,1.1,93.994,-36.4,4.857,5191.0,2008
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,1.1,93.994,-36.4,4.857,5191.0,2008
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,1.1,93.994,-36.4,4.857,5191.0,2008


In [62]:
# Determine the split into train and test data: row counts per year.
year_counts = X.groupby(['year']).size().sort_index()
print(year_counts)

# Ratio between 2010 data and the rest
is_2010 = X['year'] == 2010
n_2010 = int(is_2010.sum())
n_other = len(X) - n_2010
print(f"Ratio of 2010 data to the rest: {n_2010 / n_other:.2f}")


year
2008    27655
2009    10685
2010     1333
dtype: int64
Ratio of 2010 data to the rest: 0.03


In [None]:
# Row counts per (year, month) for the train and test splits.
# NOTE(review): X_train / X_test are not assigned in any cell visible in this
# notebook export — confirm the split cell exists and runs before this one.
print(
    "X_train (year, month) value counts:",
    X_train.groupby(['year', 'month']).size().sort_index(),
    sep="\n",
)

print(
    "\nX_test (year, month) value counts:",
    X_test.groupby(['year', 'month']).size().sort_index(),
    sep="\n",
)

# NOTE(review): the label below says "(year, month)" but the grouping is by
# year only — decide which one is intended.
print(
    "\ndf (year, month) value counts:",
    df.groupby(['year']).size().sort_index(),
    sep="\n",
)

X_train (year, month) value counts:
year  month
2008  aug      5175
      dec        10
      jul      6685
      jun      4374
      may      7763
      nov      3581
      oct        67
2009  apr      2391
      mar       278
      may      1414
dtype: int64

X_test (year, month) value counts:
year  month
2009  aug       623
      dec       126
      jul       165
      jun       643
      may      4216
      nov       246
      oct       372
      sep       211
2010  apr       126
      aug       144
      jul       206
      jun       147
      mar       178
      may       128
      nov        84
      oct       122
      sep       198
dtype: int64

df (year, month) value counts:
year
2008    27655
2009    10685
2010     1333
dtype: int64


In [50]:
# Convert weekday abbreviations to numbers (mon=0, ..., sun=6).
# Series.map returns NaN for any value that is not a key of the mapping, so all
# seven days are listed: previously only mon–fri were present, which would have
# silently produced NaN for any 'sat' / 'sun' rows despite the stated encoding.
weekday_map = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5, 'sun': 6}
df["weekday_num"] = df["day_of_week"].map(weekday_map)

# Convert month to proper capitalization (e.g. 'may' → 'May')
df["month"] = df["month"].str.capitalize()

# Write the first 2000 rows (without the index) as a small sample file
# next to the processed data.
df[:2000].to_csv(PROCESSED_DATA_DIR / "not_approached_data_test.csv", index=False)

In [24]:
# Classifiers used in the modelling process.
# Each entry is [display name, estimator instance, parameter grid]; the grid keys
# are prefixed with the lowercase estimator class name followed by "__".
classifiers = [
    [
        "Neural Net",
        MLPClassifier(random_state=random_state, max_iter=1000),
        {"mlpclassifier__max_iter": [200, 500, 1000]},
    ],
    [
        "Random Forest",
        RandomForestClassifier(random_state=random_state),
        {"randomforestclassifier__max_depth": [10, 20, 30]},
    ],
    [
        "AdaBoost",
        AdaBoostClassifier(random_state=random_state),
        {"adaboostclassifier__learning_rate": [1, 0.1, 0.01]},
    ],
    [
        "XGBoost",
        XGBClassifier(random_state=random_state),
        {"xgbclassifier__learning_rate": [1, 0.1, 0.01]},
    ],
]

In [25]:
def make_pipeline(classifier: ClassifierMixin) -> Pipeline:
    """Wrap ``classifier`` in a pipeline that preprocesses the features first.

    The preprocessing step is a ``ColumnTransformer`` that one-hot encodes the
    columns listed in ``CATEGORICAL_FEATURES`` (dense output, with
    ``handle_unknown="ignore"``) and passes every remaining column through a
    ``StandardScaler``.

    Args:
        classifier: The estimator placed as the final pipeline step.

    Returns:
        The pipeline built by scikit-learn's ``make_pipeline`` from the
        preprocessing step followed by ``classifier``.
    """
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    preprocessor = ColumnTransformer(
        [("categorical", one_hot_encoder, CATEGORICAL_FEATURES)],
        remainder=StandardScaler(),
    )
    # A SMOTE(random_state=random_state, sampling_strategy="minority") step
    # between preprocessing and the classifier is currently disabled.
    return base_make_pipeline(preprocessor, classifier)

In [26]:
# Per-classifier array of positive-class probabilities, one slot per training
# row; starts as zeros and is filled fold by fold below.
classifier_predictions = {}
for name, _, _ in classifiers:
    classifier_predictions[name] = np.zeros(len(y_train))

print("Start modelling:")

# The splitter is seeded, so every classifier is evaluated on the same 5 folds
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# For each classifier: fit on four folds, predict the held-out fold, store the result
for name, model, _ in classifiers:
    print(f"- {name}")

    pipeline = make_pipeline(model)
    out_of_fold = classifier_predictions[name]

    for fold_train_index, fold_test_index in folds.split(X_train, y_train):
        # Fit the pipeline on the training part of this fold
        pipeline.fit(
            X_train.iloc[fold_train_index],
            y_train.iloc[fold_train_index],
        )

        # Keep column 1 of predict_proba for the held-out rows, written at
        # those rows' positions
        fold_probabilities = pipeline.predict_proba(X_train.iloc[fold_test_index])
        out_of_fold[fold_test_index] = fold_probabilities[:, 1]

print("✅ Modelling completed.")

Start modelling:
- Neural Net
- Random Forest
- AdaBoost
- XGBoost
✅ Modelling completed.


In [27]:
# Column headers: the metric name followed by one column per classifier
headers = ["Metric", *classifier_predictions]

# (row label, scoring function) pairs, in the order the rows are printed
metrics = [
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
    ("ROC AUC", roc_auc_score),
]

# Build one table row per metric from classifier_predictions
table = []
for metric_name, metric_func in metrics:
    # ROC AUC is computed on the stored probabilities; every other metric on
    # labels obtained by thresholding those probabilities at 0.5
    wants_probabilities = metric_name == "ROC AUC"

    row = [metric_name]
    for scores in classifier_predictions.values():
        y_scored = scores if wants_probabilities else (scores >= 0.5).astype(int)
        row.append(f"{metric_func(y_train, y_scored):.3f}")
    table.append(row)

print(tabulate(table, headers, tablefmt="github"))

| Metric    |   Neural Net |   Random Forest |   AdaBoost |   XGBoost |
|-----------|--------------|-----------------|------------|-----------|
| Accuracy  |        0.654 |           0.688 |      0.69  |     0.699 |
| Precision |        0.718 |           0.715 |      0.706 |     0.741 |
| Recall    |        0.754 |           0.85  |      0.881 |     0.811 |
| F1        |        0.736 |           0.776 |      0.784 |     0.775 |
| ROC AUC   |        0.67  |           0.716 |      0.706 |     0.714 |


In [15]:
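# NOTE: disabled ROC-curve plotting code. It reads classifier_predictions[name]["AUC"][1],
# but classifier_predictions maps each name to a NumPy array of probabilities, so the
# lookup must be reworked (or this cell removed) before re-enabling it.
# fig, (auc_plot, alift_plot) = plt.subplots(1, 2, figsize=(10, 5))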

# for name in classifier_predictions:
#     responses, true_positive_rates, thresholds = classifier_predictions[name]["AUC"][1]
#     auc_plot.plot(
#         responses,
#         true_positive_rates,
#         label=name,
#     )
# auc_plot.plot([0, 1], [0, 1], "k:", label="Random Classifier")
# auc_plot.set_title("ROC Curves")
# auc_plot.set_xlabel("False Positive Rate")
# auc_plot.set_ylabel("True Positive Rate")
# auc_plot.legend()


# plt.tight_layout()
# plt.show()
