In [275]:
# Interactive
# With "all", every top-level expression in a cell is displayed, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Standard library
from functools import reduce
import warnings
# Silence all warnings for the rest of the session
warnings.filterwarnings("ignore")

# Data
import numpy as np
import pandas as pd
# NOTE(review): this name is rebound to an integer by `seed = 1227` in the global settings cell
from numpy.random import seed

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, FunctionTransformer
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE
# The experimental flag must be imported before HalvingRandomSearchCV can be imported
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import RandomizedSearchCV, HalvingRandomSearchCV, ParameterGrid, StratifiedKFold
from sklearn.metrics import log_loss, f1_score, accuracy_score, make_scorer
import xgboost as xgb
import joblib

# Custom transformers
# Provides ColumnDropperTransformer, cat_feature_engineer and num_feature_engineer used by the preprocessing pipeline
import transformers as tfs

# Record the xgboost version used for this run
print('XGB Version',xgb.__version__)

XGB Version 1.7.2


## Global Settings

In [97]:
# Silence library warnings for the remainder of the session
warnings.filterwarnings("ignore")

# Reproducibility: one integer seed and a RandomState generator built from it
seed = 1227
rs = np.random.RandomState(seed=seed)

# Input locations (the train and test files live in the same directory)
train_path = test_path = '../data/train_test/'

# Output locations for fitted models and for figures
model_path, plot_path = '../outputs/models/', '../outputs/plots/'

# Number of cross-validation folds
folds = 5

# How many of the most important features to visualize
top_num_features = 15

## Data

In [174]:
# Read the feature frames as-is and flatten each target frame into a 1-D numpy array
train_X = pd.read_parquet(train_path + 'train_X.parquet')
train_y = pd.read_parquet(train_path + 'train_y.parquet').to_numpy().ravel()
test_X = pd.read_parquet(test_path + 'test_X.parquet')
test_y = pd.read_parquet(test_path + 'test_y.parquet').to_numpy().ravel()

In [4]:
# Sanity check: dimensions of the training features and length of the training target
train_X.shape, train_y.shape

((32950, 20), (32950,))

In [5]:
# Sanity check: dimensions of the test features and length of the test target
test_X.shape, test_y.shape

((8238, 20), (8238,))

### Random Over Sampling

In [179]:
# Class labels and their frequencies in the training target before oversampling
np.unique(train_y, return_counts=True)

(array(['no', 'yes'], dtype=object), array([29238,  3712]))

In [184]:
# Balance the classes by randomly re-drawing rows of the under-represented class.
# NOTE(review): the whole training set is resampled here, before any cross-validation,
# so copies of one row can later land in both the fitting and the validation part of a
# CV split; consider resampling inside the CV loop instead.
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=rs)
train_X, train_y = oversampler.fit_resample(train_X, train_y)

In [185]:
# Class labels and their frequencies in the training target after oversampling
np.unique(train_y, return_counts=True)

(array(['no', 'yes'], dtype=object), array([29238, 29238]))

In [186]:
# New shapes of the training features and target after oversampling
train_X.shape, train_y.shape

((58476, 20), (58476,))

## Pipeline

We create the preprocessing pipeline below, with the following considerations in mind:

* There are no explicit missing values in either the training or the test data; in addition, xgboost handles missing values by default. However, if we expect future unseen data to contain missing values that need special handling, we may need to write and test additional custom transformers to include in the preprocessing steps.

* Because the learners are trees, it isn't necessary to perform feature scaling or normalization.

In [236]:
# Feature selector: recursive feature elimination driven by a decision tree,
# keeping half of the features and removing 20% of the features per elimination round
feature_selector = RFE(
    DecisionTreeClassifier(random_state=rs),
    n_features_to_select=0.5,
    step=0.2,
    verbose=0,
)

# Preprocessing steps, applied in the order listed
preprocessing_steps = [
    # Remove the `duration` column
    ('drop_duration', tfs.ColumnDropperTransformer(['duration'])),
    # Categorical feature engineering (one hyperparameter: encode_type)
    ('cat_feature_engineer', FunctionTransformer(tfs.cat_feature_engineer, validate=False)),
    # Numerical feature engineering (one hyperparameter: switch)
    ('num_feature_engineer', FunctionTransformer(tfs.num_feature_engineer, validate=False)),
    ('recursive_feature_selection', feature_selector),
]
preprocessor = Pipeline(steps=preprocessing_steps)

# Encoder for the class labels of the target
label_encoder = LabelEncoder()

In [237]:
# Learn the label-to-integer mapping from the training target only, then encode it
train_y = label_encoder.fit_transform(train_y)
# Reuse the mapping learned above for the test target (no refitting)
test_y = label_encoder.transform(test_y)
# Display both encoded arrays
train_y, test_y

(array([0, 0, 1, ..., 1, 1, 1]), array([0, 0, 0, ..., 0, 0, 0]))

In [238]:
# Write preprocessor and label encoder to disk for later use.
# The preprocessor has not been fitted at this point (only its definition is saved);
# the label encoder was already fitted on the training target.
joblib.dump(preprocessor, '../outputs/pipeline/preprocessor.joblib')
joblib.dump(label_encoder, '../outputs/pipeline/label_encoder.joblib')

['../outputs/pipeline/preprocessor.joblib']

['../outputs/pipeline/label_encoder.joblib']

Next, we construct a final modeling pipeline that includes the gradient-boosting classifier, which we can pass into grid search.

In [239]:
# XGBoost parameters held fixed during the hyperparameter search
fixed_params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'verbosity': 1,
    'random_state': rs,
    'n_jobs': -1,
    # Use XGBoost's built-in log-loss metric. A `make_scorer(...)` object has the
    # scorer signature (estimator, X, y) rather than the (y_true, y_pred) metric
    # signature XGBoost expects from a callable `eval_metric`, so it would fail as
    # soon as an evaluation set is supplied.
    'eval_metric': 'logloss'
    }

# Full modeling pipeline: the nested preprocessing pipeline followed by the classifier
modeling_pipe = Pipeline([
    ('preprocessor', preprocessor), # This step is in and of itself a pipeline (nested)
    ('classifier', xgb.XGBClassifier(**fixed_params))
])
# Persist the pipeline definition (it has not been fitted yet)
joblib.dump(modeling_pipe, '../outputs/pipeline/modeling_pipeline.joblib')
modeling_pipe

['../outputs/pipeline/modeling_pipeline.joblib']

Because the pipeline is nested, it helps to list all of its parameters with their full prefixed names, so we can select the hyperparameters to include in the search.

In [240]:
# Alphabetically ordered names of every parameter exposed by the nested pipeline
pipeline_params = modeling_pipe.get_params()
hp = sorted(pipeline_params)
# Number of available parameters
len(hp)

86

In [241]:
# Display the full sorted list of parameter names
hp

['classifier',
 'classifier__base_score',
 'classifier__booster',
 'classifier__callbacks',
 'classifier__colsample_bylevel',
 'classifier__colsample_bynode',
 'classifier__colsample_bytree',
 'classifier__early_stopping_rounds',
 'classifier__enable_categorical',
 'classifier__eval_metric',
 'classifier__feature_types',
 'classifier__gamma',
 'classifier__gpu_id',
 'classifier__grow_policy',
 'classifier__importance_type',
 'classifier__interaction_constraints',
 'classifier__learning_rate',
 'classifier__max_bin',
 'classifier__max_cat_threshold',
 'classifier__max_cat_to_onehot',
 'classifier__max_delta_step',
 'classifier__max_depth',
 'classifier__max_leaves',
 'classifier__min_child_weight',
 'classifier__missing',
 'classifier__monotone_constraints',
 'classifier__n_estimators',
 'classifier__n_jobs',
 'classifier__num_parallel_tree',
 'classifier__objective',
 'classifier__predictor',
 'classifier__random_state',
 'classifier__reg_alpha',
 'classifier__reg_lambda',
 'classifier

## Hyperparameter Tuning

We will first use a randomized halving search with cross-validation (`HalvingRandomSearchCV`) to try to find the best combination of hyperparameters for the gradient boosting classifier.

In [242]:
# Candidate values for each XGBoost hyperparameter, keyed by the pipeline-prefixed name
param_grid = dict([
    # Step-size shrinkage
    ('classifier__learning_rate', [0.0001, 0.001, 0.01, 0.1]),
    # Number of trees
    ('classifier__n_estimators', [50, 100, 150, 200, 250, 300]),
    # Tree size
    ('classifier__max_depth', [3, 5, 7, 9]),
    # Regularization complexity
    ('classifier__gamma', [0, 5]),
    # Column sampling (features)
    ('classifier__colsample_bytree', [0.3, 0.5, 0.7]),
    # Row sampling (training instances)
    ('classifier__subsample', [0.5, 0.75, 1.0]),
])

The total number of models that could be trained is $k$ (the number of cross-validation folds) times the total number of hyperparameter combinations (the product of the number of values for each entry in the grid):

In [243]:
# Upper bound on the number of fits: every grid combination fitted once per CV fold.
# Uses the `folds` constant from the global settings instead of a hard-coded 5.
len(ParameterGrid(param_grid=param_grid)) * folds

8640

The randomized halving search uses successive halving to reduce the training time:

In [244]:
# Randomized successive-halving search over `param_grid`, scored by F1 and refit on the
# best candidate once the search completes.
# NOTE(review): `train_X`/`train_y` were oversampled before this search, so duplicated
# rows can appear on both sides of a CV split and inflate the validation F1; consider
# moving the oversampler into the cross-validated pipeline.
search = HalvingRandomSearchCV(
    estimator=modeling_pipe,
    param_distributions=param_grid,
    cv=StratifiedKFold(n_splits=folds), # Fold count comes from the global settings
    scoring=make_scorer(f1_score, zero_division=0), # Use f1 score
    refit=True, # Get best estimator upon completion
    random_state=rs,
    verbose=1,
    n_jobs=-1 # Use all processors
).fit(train_X, train_y)

n_iterations: 7
n_required_iterations: 7
n_possible_iterations: 8
min_resources_: 20
max_resources_: 58476
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 1728
n_resources: 20
Fitting 5 folds for each of 1728 candidates, totalling 8640 fits




----------
iter: 1
n_candidates: 576
n_resources: 60
Fitting 5 folds for each of 576 candidates, totalling 2880 fits
----------
iter: 2
n_candidates: 192
n_resources: 180
Fitting 5 folds for each of 192 candidates, totalling 960 fits
----------
iter: 3
n_candidates: 64
n_resources: 540
Fitting 5 folds for each of 64 candidates, totalling 320 fits
----------
iter: 4
n_candidates: 22
n_resources: 1620
Fitting 5 folds for each of 22 candidates, totalling 110 fits
----------
iter: 5
n_candidates: 8
n_resources: 4860
Fitting 5 folds for each of 8 candidates, totalling 40 fits
----------
iter: 6
n_candidates: 3
n_resources: 14580
Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [252]:
# Save base model to disk: only the classifier step of the refit best pipeline
# (the preprocessing steps are not included in this file)
joblib.dump(search.best_estimator_['classifier'], model_path + 'baseline_model.joblib.dat')

['../outputs/models/baseline_model.joblib.dat']

In [261]:
# Save the entire fitted search object (not only the best pipeline) to disk
joblib.dump(search, '../outputs/pipeline/baseline_pipeline_fitted.joblib')

['../outputs/pipeline/baseline_pipeline_fitted.joblib']

In [290]:
# Refit winning pipeline from the search
best_pipeline = search.best_estimator_
# Preprocessing sub-pipeline as fitted on the training data
preprocessor_trained = best_pipeline['preprocessor']
# Fitted classifier of the winning pipeline
model = best_pipeline['classifier']
# Fitted RFE step, used to recover the names of the selected features
rfe_trained = preprocessor_trained['recursive_feature_selection']
# Confirm the type of each extracted component
type(model), type(preprocessor_trained), type(rfe_trained)

(xgboost.sklearn.XGBClassifier,
 sklearn.pipeline.Pipeline,
 sklearn.feature_selection._rfe.RFE)

## Feature Importance

In [295]:
# Names of the features reported by the fitted RFE step
rfe_trained.get_feature_names_out()

array(['job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'campaign', 'pdays', 'previous',
       'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx',
       'euribor3m', 'nr_employed', 'job_mean', 'job_std', 'job_last',
       'marital_mean', 'marital_std', 'education_mean', 'education_std',
       'housing_mean', 'housing_std', 'loan_std', 'contact_mean',
       'contact_std', 'month_mean', 'month_std', 'month_last',
       'day_of_week_mean', 'day_of_week_std', 'campaign_mean',
       'campaign_std', 'campaign_max', 'pdays_mean', 'pdays_std',
       'previous_mean', 'previous_std', 'poutcome_std',
       'emp_var_rate_std', 'cons_price_idx_mean', 'cons_price_idx_last',
       'cons_conf_idx_mean', 'cons_conf_idx_std', 'cons_conf_idx_last',
       'euribor3m_mean', 'euribor3m_std', 'euribor3m_min',
       'euribor3m_last', 'nr_employed_std'], dtype=object)