In [1]:
import numpy as np
import pandas as pd

from sklearn.base import ClassifierMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline as base_make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from xgboost import XGBClassifier

from matplotlib import pyplot as plt
from tabulate import tabulate

from mads_telemarketing_assignment.config import (
    APPROACHED_DATA_FILENAME,
    CATEGORICAL_FEATURES,
    PROCESSED_DATA_DIR,
)

In [2]:
# Shared seed: passed as `random_state` wherever this notebook needs
# reproducible randomness (data split, CV splitter, classifiers)
random_state = 42

In [3]:
# Load the approached dataset; the directory and file name both come from the
# project config module imported at the top of the notebook
df = pd.read_csv(PROCESSED_DATA_DIR / APPROACHED_DATA_FILENAME)

In [4]:
# Target is the "y" column; every other column is used as a feature
y = df["y"]
X = df.drop(columns="y")

print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (1515, 20), y shape: (1515,)


In [5]:
# Hold out 25% of the rows as a test set; stratifying on y keeps the class
# proportions the same in the training and testing partitions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=random_state
)

In [11]:
# Candidate models for the modelling process. Each entry is a list of
# [display name, estimator instance, parameter grid]; grid keys follow the
# "<pipeline step name>__<parameter>" form.
classifiers = [
    [
        "Neural Net",
        MLPClassifier(max_iter=1000, random_state=random_state),
        {"mlpclassifier__max_iter": [200, 500, 1000]},
    ],
    [
        "Random Forest",
        RandomForestClassifier(random_state=random_state),
        {"randomforestclassifier__max_depth": [10, 20, 30]},
    ],
    [
        "AdaBoost",
        AdaBoostClassifier(random_state=random_state),
        {"adaboostclassifier__learning_rate": [1, 0.1, 0.01]},
    ],
    [
        "XGBoost",
        XGBClassifier(random_state=random_state),
        {"xgbclassifier__learning_rate": [1, 0.1, 0.01]},
    ],
]

In [12]:
def make_pipeline(classifier: ClassifierMixin) -> Pipeline:
    """Wrap ``classifier`` in a pipeline that preprocesses the features first.

    Columns listed in ``CATEGORICAL_FEATURES`` are one-hot encoded (dense
    output, unknown categories ignored); all remaining columns are handed to
    a ``StandardScaler``.

    Args:
        classifier: Estimator to use as the final pipeline step.

    Returns:
        A pipeline made of the column transformer followed by ``classifier``.
    """
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    preprocessor = ColumnTransformer(
        transformers=[("categorical", one_hot, CATEGORICAL_FEATURES)],
        remainder=StandardScaler(),
    )
    # NOTE: a SMOTE step (random_state=random_state,
    # sampling_strategy="minority") used to be sketched between the
    # preprocessing and the classifier; it is currently disabled.
    return base_make_pipeline(preprocessor, classifier)

In [13]:
# Out-of-fold probabilities per classifier, one slot per row of the training
# set (positionally aligned with X_train / y_train). Zeros are placeholders
# that get overwritten fold by fold below.
classifier_predictions = {
    name: np.zeros(len(y_train)) for name, _, _ in classifiers
}

# The splitter is seeded with a fixed integer, so it produces the same folds
# on every call; build the folds once and reuse them for all classifiers
# instead of re-creating the splitter inside the loop.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
fold_indices = list(folds.split(X_train, y_train))

print("Start modelling:")

# Loop through each classifier, cross-validate and store the fold predictions
for name, model, _ in classifiers:
    print(f"- {name}")

    pipeline = make_pipeline(model)

    for fold_train_index, fold_test_index in fold_indices:
        # Positional (iloc) selection of the rows belonging to this fold
        X_fold_train = X_train.iloc[fold_train_index]
        y_fold_train = y_train.iloc[fold_train_index]
        X_fold_test = X_train.iloc[fold_test_index]

        # Fit the pipeline on the training part of the fold
        pipeline.fit(X_fold_train, y_fold_train)

        # Keep column 1 of predict_proba (the second class; presumably the
        # positive label - confirm against the encoding of y) and store it at
        # the positions of the held-out rows
        classifier_predictions[name][fold_test_index] = pipeline.predict_proba(
            X_fold_test
        )[:, 1]

print("✅ Modelling completed.")

Start modelling:
- Neural Net
- Random Forest
- AdaBoost
- XGBoost
✅ Modelling completed.


In [14]:
# Column headers: the metric name followed by one column per classifier
headers = ["Metric"] + list(classifier_predictions.keys())

# Metrics to report, as (display name, scoring function) pairs
metrics = [
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
    ("ROC AUC", roc_auc_score),
]

# Hard class labels derived from the cross-validated probabilities with a 0.5
# cut-off; computed once per classifier instead of once per metric
predicted_labels = {
    name: (scores >= 0.5).astype(int)
    for name, scores in classifier_predictions.items()
}

# Build the table from classifier_predictions: one row per metric
table = []
for metric_name, metric_func in metrics:
    row = [metric_name]
    for name, scores in classifier_predictions.items():
        # ROC AUC is scored on the probabilities themselves; every other
        # metric is scored on the thresholded labels
        if metric_func is roc_auc_score:
            metric_value = metric_func(y_train, scores)
        else:
            metric_value = metric_func(y_train, predicted_labels[name])
        row.append(metric_value)
    table.append(row)

# Hand tabulate the raw floats and let it do the formatting: pre-formatted
# numeric strings are parsed back into numbers by tabulate, which drops
# trailing zeros (e.g. "0.690" is shown as 0.69)
print(tabulate(table, headers, tablefmt="github", floatfmt=".3f"))

| Metric    |   Neural Net |   Random Forest |   AdaBoost |   XGBoost |
|-----------|--------------|-----------------|------------|-----------|
| Accuracy  |        0.654 |           0.69  |      0.69  |     0.699 |
| Precision |        0.718 |           0.717 |      0.706 |     0.741 |
| Recall    |        0.754 |           0.85  |      0.881 |     0.811 |
| F1        |        0.736 |           0.778 |      0.784 |     0.775 |
| ROC AUC   |        0.67  |           0.717 |      0.706 |     0.714 |


In [15]:
# NOTE(review): this disabled plotting cell looks stale. It indexes
# classifier_predictions[name]["AUC"][1], but the modelling cell stores a plain
# NumPy array of probabilities under each classifier name, so it would fail if
# uncommented as-is. `alift_plot` is also created but never drawn on. Either
# rework it against the current data structure or delete the cell.
# fig, (auc_plot, alift_plot) = plt.subplots(1, 2, figsize=(10, 5))

# for name in classifier_predictions:
#     responses, true_positive_rates, thresholds = classifier_predictions[name]["AUC"][1]
#     auc_plot.plot(
#         responses,
#         true_positive_rates,
#         label=name,
#     )
# auc_plot.plot([0, 1], [0, 1], "k:", label="Random Classifier")
# auc_plot.set_title("ROC Curves")
# auc_plot.set_xlabel("False Positive Rate")
# auc_plot.set_ylabel("True Positive Rate")
# auc_plot.legend()


# plt.tight_layout()
# plt.show()
