In [1]:
import time
from typing import Dict, Tuple

import numpy as np
import pandas as pd
import scipy
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

### Feature selection, according to .txt info
1. duration is removed: "should be discarded if the intention is to have a realistic predictive model"
2. pdays is removed: This attribute is highly skewed being that pdays=999 when client has not been previously contacted. For the purpose of this project, this attribute will be discarded. 

Expanding on pdays: For future work, this attribute could be used to create a new feature (contacted (1) VS not contacted (0), or a categorical binning of the pdays values).

In [2]:
NUMERICAL_FEATURES = ["campaign", "previous", "age", "emp.var.rate", "cons.price.idx", "cons.conf.idx", "euribor3m", "nr.employed"]
CATEGORICAL_FEATURES = ["job", "marital", "education", "default", "housing", "poutcome", "loan", "contact", "month", "day_of_week"]

### Data Preprocessing
1. Read CSVs extracting features and targets from each, where the target column is mapped to binary values
2. Convert categorical features to numerical using OneHotEncoding
3. Scale numerical features using StandardScaler, a popular alternative is MinMaxScaler

Future work: With more data analysis (correlation, feature importance, etc.), we can choose features more wisely. Similarly, looking closer into the data distributions, we can scale the numerical features more appropriately (MinMaxScaler VS StandardScaler).

In [3]:
def read_data(path: str) -> pd.DataFrame:
    """Load data & split features VS target"""
    df = pd.read_csv(path, sep=";")
    target = df["y"].map({"yes": 1, "no": 0})
    features = df.drop("y", axis=1)
    return features, target


def transform_features(
    train: pd.DataFrame, test: pd.DataFrame
) -> Tuple[scipy.sparse.csr_matrix, scipy.sparse.csr_matrix]:
    """Transform features into a format suitable for training"""
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder())])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, NUMERICAL_FEATURES),
            ('cat', categorical_transformer, CATEGORICAL_FEATURES),
        ]
    )
    return preprocessor.fit_transform(train), preprocessor.transform(test)

### Model training
1. Choosing three relatively simple models to start with: Logistic Regression, K-Nearest Neighbors, and Random Forest
2. Initializing each with arbitrary hyperparameters, ensuring that they train in a reasonable amount of time
3. Once each model is trained, we can evaluate their performance on the holdout set using classification_report

Further work: We can use GridSearchCV/RandomSearchCV to find the best hyperparameters for each model. Additionally, there is much more model exploration to be done outside of our three initial candidates (e.g. Ensemble methods, Deep Learning, etc.)

In [4]:
def run_training_pipeline(
    x_train: scipy.sparse.csr_matrix, 
    x_test: scipy.sparse.csr_matrix, 
    y_train: np.array, 
    y_test: np.array,
) -> Dict[str, str]:
    """Train models and return classification_report for each"""
    models = {
        'LR': LogisticRegression(),
        'KNN': KNeighborsClassifier(n_neighbors=3),
        'RF': RandomForestClassifier(n_estimators=25)
    }
    performance = {}

    for name, model in models.items():
        strt = time.time()
        print(f"Training {name}...")

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        
        performance[name] = classification_report(y_test, y_pred)
        print(f"{name} traing done in {time.time() - strt:.2f}s")

    return performance

In [5]:
base_dir = "data/bank-additional/"

# pull in data
x_train, y_train = read_data(base_dir + "bank-additional-full.csv")
x_test, y_test = read_data(base_dir + "bank-additional.csv")

# prep for training
x_train, x_test = transform_features(x_train, x_test)
y_train = y_train.values
y_test = y_test.values

# train models
performance = run_training_pipeline(x_train, x_test, y_train, y_test)

Training LR...
LR traing done in 0.07s
Training KNN...
KNN traing done in 5.30s
Training RF...
RF traing done in 4.99s


In [6]:
# Logist regression does well on the majority class (class=0) but not so well on the minority class (class=1)

print(performance['LR'])

              precision    recall  f1-score   support

           0       0.91      0.98      0.95      3668
           1       0.64      0.24      0.35       451

    accuracy                           0.90      4119
   macro avg       0.78      0.61      0.65      4119
weighted avg       0.88      0.90      0.88      4119



In [7]:
# KNN Follows a similar pattern as logistic regression, but performs slightly better on the minority class

print(performance['KNN'])

              precision    recall  f1-score   support

           0       0.94      0.98      0.96      3668
           1       0.76      0.50      0.60       451

    accuracy                           0.93      4119
   macro avg       0.85      0.74      0.78      4119
weighted avg       0.92      0.93      0.92      4119



In [8]:
# Random Forest performs well on both classes, almost surprisingly well!

print(performance['RF'])

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3668
           1       0.98      0.92      0.95       451

    accuracy                           0.99      4119
   macro avg       0.98      0.96      0.97      4119
weighted avg       0.99      0.99      0.99      4119



### Wrap up
As a final note, we see that the random forest performs much better than the other two models. All models performed well on the majority class, but the random forest was the only one to perform well on the minority class as well.

Future work: Along with all other future work mentioned above, we could possibly investigate the use of  or other techniques to balance the classes. Although, it is important to note that the random forest model is already performing well on the minority class so this additional work may not be necessary.