# Prep work

In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_datetime64_any_dtype
from pandas.api.types import CategoricalDtype

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('notebook')
%matplotlib widget

In [2]:
# Load the raw table; the path is relative to the notebook's working directory.
df = pd.read_csv('bank-classification.csv')

# Feature engineering

It might be wise to first take a broad look at the table.

In [3]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,id,birth_date,job,marital,education,default,housing,loan,contact_date,contact,campaign,pdays,previous,poutcome,y
0,1,1952-03-23,housemaid,married,basic.4y,no,no,no,2008-05-12,telephone,1,999,0,nonexistent,no
1,2,1951-03-24,services,married,high.school,unknown,no,no,2008-05-26,telephone,1,999,0,nonexistent,unknown
2,3,1971-05-19,services,married,high.school,no,yes,no,2008-05-05,telephone,1,999,0,nonexistent,no
3,4,1968-01-24,admin.,married,basic.6y,no,no,no,2008-05-19,telephone,1,999,0,nonexistent,unknown
4,5,1952-05-11,services,married,high.school,no,no,yes,2008-05-19,telephone,1,999,0,nonexistent,unknown


In [4]:
# Summarise every column, numeric or not. The `datetime_is_numeric` flag was
# dropped here: it no longer exists in pandas >= 2.0, and at this point no
# column has a datetime dtype yet, so it had no effect on the summary.
df.describe(include='all')

Unnamed: 0,id,birth_date,job,marital,education,default,housing,loan,contact_date,contact,campaign,pdays,previous,poutcome,y
count,41188.0,41188,41188,41188,41188,41188,41188,41188,41188,41188,41188.0,41188.0,41188.0,41188,41188
unique,,13290,12,4,8,3,3,3,552,2,,,,3,3
top,,1977-07-11,admin.,married,university.degree,no,yes,no,2008-05-21,cellular,,,,nonexistent,unknown
freq,,16,10422,24928,12168,32588,21576,33950,457,26144,,,,35563,20389
mean,20594.5,,,,,,,,,,2.567593,962.475454,0.172963,,
std,11890.09578,,,,,,,,,,2.770014,186.910907,0.494901,,
min,1.0,,,,,,,,,,1.0,0.0,0.0,,
25%,10297.75,,,,,,,,,,1.0,999.0,0.0,,
50%,20594.5,,,,,,,,,,2.0,999.0,0.0,,
75%,30891.25,,,,,,,,,,3.0,999.0,0.0,,


Before we get to the more sophisticated analysis:
- `id` is not a feature, and besides *pandas* already stores it;
- we should also specify the types of features for cleaner further processing;

In [45]:
# Give every column an explicit dtype up front so that later cells can pick
# columns by type (datetime / category / numeric).
df = df.astype({
    # pandas >= 2.0 rejects the unit-less 'datetime64' alias, so the unit is
    # spelled out; 'datetime64[ns]' is also what older versions resolved to.
    'birth_date': 'datetime64[ns]',
    'job': 'category',
    'marital': 'category',
    'education': 'category',
    'default': 'category',
    'housing': 'category',
    'loan': 'category',
    'contact_date': 'datetime64[ns]',
    'contact': 'category',
    'campaign': 'int64',
    'pdays': 'int64',
    'previous': 'int64',
    'poutcome': 'category',
    'y': 'category'
})
# `id` is a row identifier, not a feature; the frame's own index already
# identifies each row.
df = df.drop(columns='id')

Let's also split the features and the labels.

In [46]:
# Detach the target column first; what is left of the frame is the feature
# table. Note that `features` is the very same object as `df`, not a copy.
labels = df.pop('y')
features = df

Our goals are:
- to encode the non-numeric features numerically;
- possibly come up with artificial features to augment the model.

## Booleans
`default`, `housing` and `loan` are essentially boolean columns; let us therefore convert them.

In [47]:
bool_cols = ['default', 'housing', 'loan']
# Ordered no/yes dtype; kept because the label encoding further down reuses it.
bool_type = CategoricalDtype(categories=['no', 'yes'],
                             ordered=True)
for col in bool_cols:
    # Compare against 'yes' directly. Going through `.cat.codes` maps every
    # value outside {'no', 'yes'} (such as 'unknown') to the code -1, and -1
    # is truthy, so those rows used to be encoded as True.
    features[col + '_bool'] = (features[col] == 'yes')

## Dates
Let's start with encoding these as days since the minimum value. 

In [48]:
# Every datetime-typed column gets a numeric companion `<name>_days` holding
# the (possibly fractional) number of days since that column's earliest value.
date_cols = [name
             for name, kind in zip(features.columns, features.dtypes)
             if is_datetime64_any_dtype(kind)]
one_day = np.timedelta64(1, 'D')
for col in date_cols:
    earliest = features[col].min()
    days = features[col].sub(earliest).div(one_day)
    features[col + '_days'] = days

Now, let's check the `y` column against these, both as stacked counts and as normalized proportions.

In [49]:
# Only rows with a known outcome are informative for this comparison.
known_mask = labels != 'unknown'
known = features.loc[known_mask]
# Restrict the hue vector to the same rows and drop the now-empty 'unknown'
# level; otherwise the plot depends on implicit index alignment and the
# unused category still gets a legend entry.
known_labels = labels.loc[known_mask].cat.remove_unused_categories()

# squeeze=False keeps `axes` two-dimensional even with a single date column.
fig, axes = plt.subplots(2, len(date_cols), squeeze=False)
for i, col in enumerate(date_cols):
    # Top row: absolute counts per bin, stacked by outcome.
    sns.histplot(x=known[col], hue=known_labels,
                 multiple='stack', ax=axes[0][i])
    # Bottom row: each bin normalised to 1, showing the outcome proportions.
    sns.histplot(x=known[col], hue=known_labels,
                 multiple='fill', ax=axes[1][i])
fig.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

So, there would, in fact, seem to be a number of correlations:
- "middle-aged" people are less likely to subscribe to the term deposit;
- later campaigns were seemingly more effective (though they also contacted fewer people in general).

**For now** we will add squares of the dates, so as to model degree-2 polynomials in linear models. (The cell below is currently commented out, so these squared features are not actually created.)

In [50]:
# for col in date_cols:
#     features[col + '_days_sq'] = features[col+'_days']**2

## `pdays`
If we read the description of the dataset, `pdays==999` means that there was no prior contact - let us create a new feature with that information explicitly. We shall fill the `999`s with the mean of the valid values. **Note to self: could make it a hyperparameter.**

In [51]:
# `pdays == 999` is the dataset's marker for "no prior contact".
NO_PRIOR_CONTACT = 999

contacted = features['pdays'] != NO_PRIOR_CONTACT
features['pcontacted'] = contacted

# Replace the sentinel with the mean over rows that do have a prior contact.
# The column is cast to float first because the mean is fractional and the
# column was int64; the mask is consumed by `where` (keep where True) rather
# than negated with the arithmetic `-` operator.
pdays_fill = features.loc[contacted, 'pdays'].mean()
features['pdays'] = features['pdays'].astype('float64').where(contacted, pdays_fill)

## Categorical data
We could try to extract some more sophisticated features from the categorical ones, _but_, given how small the value sets are, we could just one-hot encode them.

In [52]:
cat_cols = [col for col, dtype in features.dtypes.items()
                if isinstance(dtype, CategoricalDtype)]

# One-hot encode each categorical column (new columns are named
# `<column>_<category>`) and append all of them in a single concat rather than
# re-copying the whole frame once per column. The original categorical
# columns are kept in place.
dummy_frames = [pd.get_dummies(features[col], prefix=col) for col in cat_cols]
features = pd.concat([features, *dummy_frames], axis=1)

And, in the end, we get the following features/columns:

In [53]:
# List every column of the feature table together with its dtype.
features.dtypes

birth_date                       datetime64[ns]
job                                    category
marital                                category
education                              category
default                                category
housing                                category
loan                                   category
contact_date                     datetime64[ns]
contact                                category
campaign                                  int64
pdays                                   float64
previous                                  int64
poutcome                               category
default_bool                               bool
housing_bool                               bool
loan_bool                                  bool
birth_date_days                         float64
contact_date_days                       float64
pcontacted                                 bool
job_admin.                                uint8
job_blue-collar                         

## Train/Test datasets preparation

If `y` is `unknown`, the record belongs to the test set (we may also remove `y` for good measure); otherwise, we split real features and `y` and treat them as train features and train labels.

In [54]:
# Rows whose label is not 'unknown' form the training set; the rest is the
# test set.
known_mask = labels != 'unknown'

train_features = features[known_mask].copy()

# Encode the known labels through the no/yes dtype as integer codes and store
# them as a one-column frame named `y`.
train_labels = labels[known_mask].astype(bool_type).cat.codes.to_frame('y')

test_features = features[~known_mask]

As it stands, the features' datasets may contain a lot of stuff that is unparsable for our models (i.e. non-numeric); we must discard such features.

In [55]:
def extract_num(_df):
    """Return only the numeric columns of ``_df``, cast to float64.

    A column is kept when ``is_numeric_dtype`` accepts its dtype; every other
    column is dropped. Column order and the row index are preserved.
    """
    keep = [name for name, kind in _df.dtypes.items()
            if is_numeric_dtype(kind)]
    return _df[keep].astype('float64')

# Reduce both feature tables to their numeric, float64 columns.
train_features, test_features = (
    extract_num(part) for part in (train_features, test_features)
)

# Model

In [56]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

# NumPy versions of the training data: the feature matrix and a flattened
# label vector.
X = train_features.to_numpy()
y = train_labels.to_numpy().ravel()

def split(train=0.2, random_state=None):
    """Shuffle the labelled data and optionally hold part of it out.

    Parameters
    ----------
    train : float or int, default 0.2
        When below 1 it is forwarded to ``train_test_split`` as
        ``train_size``. Any value >= 1 means "keep everything for training".
    random_state : int or None, default None
        Seed for a reproducible shuffle. ``None`` keeps the previous
        behaviour (NumPy's global random state / an unseeded split).

    Returns
    -------
    X_train, X_test, y_train, y_test
        ``X_test`` and ``y_test`` are ``None`` when ``train >= 1``.
    """
    if train >= 1:
        if random_state is None:
            order = np.random.permutation(len(y))
        else:
            order = np.random.RandomState(random_state).permutation(len(y))
        return X[order], None, y[order], None
    return train_test_split(X, y, train_size=train,
                            random_state=random_state)

## Sklearn

In [57]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, RandomizedSearchCV,\
    GridSearchCV

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifierCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import\
    AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier,\
    StackingClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

import scipy.stats

Let's do a preliminary investigation of some estimators.

In [23]:
# Candidate estimators. Each entry holds an untuned classifier (`clf`) and the
# hyper-parameter search space (`grid`) to sample from when tuning it.
#
# Changes to the search spaces:
# - 'auto' was removed from every `max_features` list: for these classifiers
#   it is an alias of 'sqrt' (so it only duplicated candidates) and newer
#   scikit-learn releases no longer accept it;
# - `min_samples_split` for the extra-trees search now starts at 2, since an
#   integer value of 1 is rejected by scikit-learn.
base = {
#     'svc': {
#         'clf': SVC(kernel='rbf', C=1),
#         'grid': {
#             'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
#             'C': [0.1, 1, 10, 40, 100],
#             'gamma': ['auto', 1, 0.1, 0.01]
#         }
#     },
    'rf': {
        'clf': RandomForestClassifier(),
        'grid': {
            'max_features': ['sqrt', 'log2'],
            'n_estimators': [10, 100, 250, 1000],
        }
    },
    'et': {
        'clf': ExtraTreesClassifier(),
        'grid': {
            'n_estimators': [*range(50, 250+1, 50), 1000],
            'max_features': ['sqrt', 'log2'],
            'min_samples_leaf': [*range(1, 10+1, 2), *range(20, 50+1, 5)],
            'min_samples_split': [*range(2, 10+1, 2), *range(15, 35+1, 5)]
        }
    },
    'gbc': {
        'clf': GradientBoostingClassifier(),
        'grid': {
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'n_estimators': [10, 100, 250, 1000],
            'min_samples_split': [100, 250, 500, 1000],
            'max_features': ['log2', 'sqrt'],
            'max_depth': [*range(5, 8+1)],
            'subsample': [0.5, 0.7, 1.0],
        }
    },
    'ada': {
        'clf': AdaBoostClassifier(),
        'grid': {
            'n_estimators': [10, 50, 100, 500],
            'learning_rate': [0.01, 0.1, 0.5, 1, 2],
            'base_estimator': [DecisionTreeClassifier(max_depth = n)
                                    for n in [*range(1, 16+1)]]    
        }
    },
    'xgb': {
        'clf': XGBClassifier(use_label_encoder=False,
                             eval_metric='logloss'),
        'grid': {
            'n_estimators': [10, 50, 100, 500],
            'learning_rate': [.02, .05, .1],
            'max_depth': [4, 6, 8, 10],
        }
    },
    'bayes': {
        'clf': GaussianNB(),
        'grid': {}
    }
}

# Shared keyword arguments for `cross_validate`: 5 folds, scored by ROC AUC.
cv_args = dict(cv=5, scoring=['roc_auc'])

In [18]:
import pickle

# SECURITY: unpickling executes arbitrary code from the file, so only load a
# 'sklearn-models' file that you produced yourself.
# The `with` block makes sure the file handle is closed after loading.
with open('sklearn-models', 'rb') as model_file:
    estimators = pickle.load(model_file)

In [19]:
# for name, params in estimators.items():
#     X_train, X_test, y_train, y_test = split(0.85)
#     print(name)
#     if params['grid'] != {}:
#         cv = RandomizedSearchCV(params['clf'], params['grid'], n_iter=200,
#                                 scoring='roc_auc', cv=5, n_jobs=-1, verbose=10)
#         result = cv.fit(X_train, y_train)
#         estimators[name]['result'] = result
#         estimators[name]['clf'] = result.best_estimator_

In [73]:
# Stack every loaded estimator under a gradient-boosting meta-learner.
stack_members = [(name, params['clf']) for name, params in estimators.items()]
clf = StackingClassifier(estimators=stack_members,
                         final_estimator=GradientBoostingClassifier(),
                         n_jobs=1)

# split(1) keeps all labelled rows for training; X_test / y_test become None.
X_train, X_test, y_train, y_test = split(1)

# 5-fold cross-validation scored by ROC AUC; the fitted estimators are kept.
stack_cv_options = dict(cv=5, scoring=['roc_auc'], verbose=10,
                        return_estimator=True)
score = cross_validate(clf, X_train, y_train, **stack_cv_options)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................


KeyboardInterrupt: 

In [71]:
# Same stacking set-up, but built from the untuned `base` estimators and
# wrapped in a single-step pipeline.
base_members = [(name, params['clf']) for name, params in base.items()]
stacked_base = StackingClassifier(estimators=base_members,
                                  final_estimator=GradientBoostingClassifier())
clf_base = Pipeline([('stacked', stacked_base)])

# Hold out 10% of the labelled rows; cross-validate on the remaining 90%.
X_train, X_test, y_train, y_test = split(0.9)

base_cv_options = dict(cv=5, scoring=['roc_auc'], verbose=10,
                       return_estimator=True)
score_base = cross_validate(clf_base, X_train, y_train, **base_cv_options)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................
[CV] .................................. , roc_auc=0.754, total= 3.2min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.2min remaining:    0.0s


[CV] .................................. , roc_auc=0.764, total= 3.1min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  6.3min remaining:    0.0s


KeyboardInterrupt: 

In [74]:
# Cross-validate every loaded estimator on its own.
scores = {name: cross_validate(params['clf'], X_train, y_train, **cv_args)
          for name, params in estimators.items()}

# Flatten the per-fold results into long form: one row per
# (estimator, statistic, fold value). A comprehension is used so that no loop
# variable leaks into the notebook namespace; the previous `for ... score in`
# loop overwrote the `score` result of the stacking cell.
stat_names = ['fit_time', 'score_time', 'test_roc_auc']
score_list = [[est_name, stat, val]
              for est_name, est_scores in scores.items()
              for stat in stat_names
              for val in est_scores[stat]]

score_df = pd.DataFrame(data=score_list,
                        columns=['est', 'stat', 'val'])

# One panel per statistic with independent y-axes: timings are in seconds and
# AUC lies in [0, 1], so a shared axis would flatten the AUC boxes.
# The trailing semicolon suppresses the FacetGrid repr in the cell output.
sns.catplot(data=score_df, x='est', y='val', col='stat',
            kind='box', sharey=False);

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<seaborn.axisgrid.FacetGrid at 0x7fa178396d60>

## Tensorflow

In [58]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [68]:
# Upper bound on training epochs (early stopping may end training sooner).
EPOCHS = 100
# Number of examples per batch for both the training and validation datasets.
BATCH_SIZE = 2048

def create_model(data, bias=None):
    """Build and compile a small fully connected binary classifier.

    Parameters
    ----------
    data : array-like
        Only ``data.shape[-1]`` is read, to size the input layer.
    bias : float or None, default None
        Optional constant initial value for the output layer's bias.
        ``None`` selects the usual zero initialisation.

    Returns
    -------
    keras.Sequential
        Compiled with Adam (learning rate 1e-3), binary cross-entropy loss
        and an AUC metric named ``'auc'``.
    """
    # Do not forward ``None`` as the initializer: Keras then falls back to its
    # generic weight default (Glorot uniform) instead of Dense's 'zeros'.
    if bias is None:
        output_bias = 'zeros'
    else:
        output_bias = keras.initializers.Constant(bias)

    model = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=(data.shape[-1],)),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='sigmoid', bias_initializer=output_bias)
    ])

    model.compile(
        # `learning_rate` replaces the deprecated `lr` alias.
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=[keras.metrics.AUC(name='auc')])

    return model

In [69]:
# Partition the training rows by class so each class can be sampled separately.
neg_mask = (y_train == 0)
pos_mask = (y_train == 1)
neg_features, neg_labels = X_train[neg_mask], y_train[neg_mask]
pos_features, pos_labels = X_train[pos_mask], y_train[pos_mask]

def make_ds(features, labels):
    """Wrap (features, labels) in a shuffled, endlessly repeating dataset.

    Elements are individual (feature row, label) pairs; shuffling uses a
    buffer of 100000 elements.
    """
    pairs = tf.data.Dataset.from_tensor_slices((features, labels))
    shuffled = pairs.shuffle(100000)
    return shuffled.repeat()

# One endless dataset per class.
neg_ds, pos_ds = (
    make_ds(class_features, class_labels)
    for class_features, class_labels in ((neg_features, neg_labels),
                                         (pos_features, pos_labels))
)

# Draw from both classes with equal probability, then batch and prefetch.
balanced = tf.data.experimental.\
    sample_from_datasets([neg_ds, pos_ds], weights=[0.5, 0.5])
resampled = balanced.batch(BATCH_SIZE).prefetch(2)

# Class counts in the current training split.
total = np.size(y_train)
pos = np.sum(y_train)
neg = total - pos
# Epoch length: enough batches to draw twice the number of negative examples.
resampled_steps_per_epoch = np.ceil(2.0 * neg / BATCH_SIZE)

In [70]:
model = create_model(X_train)

# Stop once validation AUC has not improved for 10 epochs and roll back to
# the best weights seen.
early_stopping = EarlyStopping(monitor='val_auc',
                               mode='max',
                               patience=10,
                               restore_best_weights=True,
                               verbose=1)

# Held-out split as a cached, batched validation dataset.
val_ds = (
    tf.data.Dataset.from_tensor_slices((X_test, y_test))
    .cache()
    .batch(BATCH_SIZE)
    .prefetch(2)
)

fit_options = dict(
    epochs=EPOCHS,
    steps_per_epoch=resampled_steps_per_epoch,
    callbacks=[early_stopping],
    validation_data=val_ds,
    verbose=1,
)
history = model.fit(resampled, **fit_options)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 00018: early stopping


## Predict values for test set.
Let us now test the chosen estimator.

In [76]:
# NOTE(review): this cell used to set `final_clf = None`, so the
# `predict_proba` call below could only fail with an AttributeError. The
# output file name points at the random forest, so one is fitted here on all
# labelled rows; swap in a tuned estimator if a different one is chosen.
final_clf = RandomForestClassifier().fit(X, y)

pred_df = pd.DataFrame({
    # Recreate the original identifiers: in the raw preview `id` equals the
    # row index plus one.
    'id': test_features.index.to_numpy() + 1,
    # Second column of predict_proba: probability of the class encoded as 1.
    'y': final_clf.predict_proba(test_features.values)[:, 1],
})
pred_df.to_csv('pred.rf.csv', index=False)