In [1]:
import json
import pathlib

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
df_train = pd.read_csv(pathlib.Path("../data/interim/bank_train.csv"))
df_test = pd.read_csv(pathlib.Path("../data/interim/bank_test.csv"))

In [3]:
list(df_train.columns)

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'target']

In [4]:
df_train.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
target        int64
dtype: object

In [5]:
numeric_features = [
    'age',
    'balance',
    'day',
    'campaign',
    'pdays',
    'previous',
]

In [6]:
categorical_features = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'campaign',
    'poutcome'
]

In [7]:
numeric_transformer_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor_pipe = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_pipe, numeric_features),
        ('cat', categorical_transformer_pipe, categorical_features)])

In [8]:
X_train = df_train.drop('target', axis=1)
y_train = df_train['target']

X_test = df_test.drop('target', axis=1)
y_test = df_test['target']

In [9]:
clf = Pipeline(steps=[
    ('preprocessor', preprocessor_pipe),
    ('classifier', RandomForestClassifier(n_jobs=-1, n_estimators=100))])

clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                             

In [10]:
print("model score: {:.3f}".format(clf.score(X_test, y_test)))

model score: 0.897


In [11]:
print("model accuracy: {:.3f}".format(metrics.accuracy_score(y_test, clf.predict(X_test))))

print("model precision: {:.3f}".format(metrics.precision_score(y_test, clf.predict(X_test))))

print("model recall: {:.3f}".format(metrics.recall_score(y_test, clf.predict(X_test))))

print("model F1: {:.3f}".format(metrics.f1_score(y_test, clf.predict(X_test))))

print("model AuROC: {:.3f}".format(metrics.roc_auc_score(y_test, clf.predict(X_test))))

model accuracy: 0.897
model precision: 0.677
model recall: 0.235
model F1: 0.349
model AuROC: 0.610


In [12]:
print(metrics.classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.91      0.99      0.94      7985
           1       0.68      0.24      0.35      1058

    accuracy                           0.90      9043
   macro avg       0.79      0.61      0.65      9043
weighted avg       0.88      0.90      0.87      9043



In [13]:
param_grid = {
    'classifier__n_estimators': [10, 30, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=4, iid=False, scoring='recall', n_jobs=-1)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                    

In [14]:
print("model accuracy: {:.3f}".format(metrics.accuracy_score(y_test, grid_search.predict(X_test))))

print("model precision: {:.3f}".format(metrics.precision_score(y_test, grid_search.predict(X_test))))

print("model recall: {:.3f}".format(metrics.recall_score(y_test, grid_search.predict(X_test))))

print("model F1: {:.3f}".format(metrics.f1_score(y_test, grid_search.predict(X_test))))

print("model AuROC: {:.3f}".format(metrics.roc_auc_score(y_test, grid_search.predict(X_test))))

model accuracy: 0.896
model precision: 0.658
model recall: 0.227
model F1: 0.337
model AuROC: 0.606


In [15]:
cv_results = {k: v.tolist() if isinstance(v, np.ndarray) else v for k, v in grid_search.cv_results_.items()}
cv_results

{'mean_fit_time': [1.863157331943512, 2.703134298324585, 5.45463752746582],
 'std_fit_time': [0.9150895495762751, 1.0229641444366342, 1.083387159238923],
 'mean_score_time': [1.375464677810669,
  1.5881949067115784,
  0.3638303279876709],
 'std_score_time': [0.7596039491747372,
  0.8408377043163511,
  0.24675047207562786],
 'param_classifier__n_estimators': [10, 30, 100],
 'params': [{'classifier__n_estimators': 10},
  {'classifier__n_estimators': 30},
  {'classifier__n_estimators': 100}],
 'split0_test_score': [0.20793950850661624,
  0.22306238185255198,
  0.20226843100189035],
 'split1_test_score': [0.1994328922495274,
  0.2060491493383743,
  0.20982986767485823],
 'split2_test_score': [0.20226843100189035,
  0.19092627599243855,
  0.20793950850661624],
 'split3_test_score': [0.18353831598864712,
  0.17218543046357615,
  0.18732261116367077],
 'mean_test_score': [0.19829478693667027,
  0.19805580941173523,
  0.20184010458675888],
 'std_test_score': [0.0090534416088834,
  0.0187705147

In [16]:
# json.dumps converts an object into JSON string, while json.dump writes it to a file
print(json.dumps(cv_results, indent=4))

{
    "mean_fit_time": [
        1.863157331943512,
        2.703134298324585,
        5.45463752746582
    ],
    "std_fit_time": [
        0.9150895495762751,
        1.0229641444366342,
        1.083387159238923
    ],
    "mean_score_time": [
        1.375464677810669,
        1.5881949067115784,
        0.3638303279876709
    ],
    "std_score_time": [
        0.7596039491747372,
        0.8408377043163511,
        0.24675047207562786
    ],
    "param_classifier__n_estimators": [
        10,
        30,
        100
    ],
    "params": [
        {
            "classifier__n_estimators": 10
        },
        {
            "classifier__n_estimators": 30
        },
        {
            "classifier__n_estimators": 100
        }
    ],
    "split0_test_score": [
        0.20793950850661624,
        0.22306238185255198,
        0.20226843100189035
    ],
    "split1_test_score": [
        0.1994328922495274,
        0.2060491493383743,
        0.20982986767485823
    ],
    "split2_t