# Prep work

In [1]:
import numpy as np
np.random.seed(9001)

I must be honest: I could not, for the life of me, figure out how to make the results _precisely_ reproducible. Perhaps some components (specifically in `sklearn`) draw from a source of randomness other than the one seeded above. The `model` used to generate the csv is precisely the same, though. I very much apologise for this inconvenience. At any rate, the remaining setup follows.

In [1]:
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
sns.set_context('notebook')
%matplotlib widget

I worked on it in Jupyter Lab. In my estimation, `ipympl` (which provides `%matplotlib widget`) is fairly common -- if, however, there are problems with the graphics, I could prepare a Docker container for it (as of writing there was too little time to prepare one, however).

In [2]:
df = pd.read_csv('bank-classification.csv')

# Feature engineering

Let's look at the data first.

In [3]:
df

Unnamed: 0,id,birth_date,job,marital,education,default,housing,loan,contact_date,contact,campaign,pdays,previous,poutcome,y
0,1,1952-03-23,housemaid,married,basic.4y,no,no,no,2008-05-12,telephone,1,999,0,nonexistent,no
1,2,1951-03-24,services,married,high.school,unknown,no,no,2008-05-26,telephone,1,999,0,nonexistent,unknown
2,3,1971-05-19,services,married,high.school,no,yes,no,2008-05-05,telephone,1,999,0,nonexistent,no
3,4,1968-01-24,admin.,married,basic.6y,no,no,no,2008-05-19,telephone,1,999,0,nonexistent,unknown
4,5,1952-05-11,services,married,high.school,no,no,yes,2008-05-19,telephone,1,999,0,nonexistent,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,41184,1938-03-19,retired,married,professional.course,no,yes,no,2010-11-19,cellular,1,999,0,nonexistent,unknown
41184,41185,1964-10-10,blue-collar,married,professional.course,no,no,no,2010-11-12,cellular,1,999,0,nonexistent,unknown
41185,41186,1954-10-06,retired,married,university.degree,no,yes,no,2010-11-12,cellular,2,999,0,nonexistent,no
41186,41187,1967-03-15,technician,married,professional.course,no,no,no,2010-11-26,cellular,1,999,0,nonexistent,unknown


In [4]:
df.describe(include='all')

Unnamed: 0,id,birth_date,job,marital,education,default,housing,loan,contact_date,contact,campaign,pdays,previous,poutcome,y
count,41188.0,41188,41188,41188,41188,41188,41188,41188,41188,41188,41188.0,41188.0,41188.0,41188,41188
unique,,13290,12,4,8,3,3,3,552,2,,,,3,3
top,,1977-07-11,admin.,married,university.degree,no,yes,no,2008-05-21,cellular,,,,nonexistent,unknown
freq,,16,10422,24928,12168,32588,21576,33950,457,26144,,,,35563,20389
mean,20594.5,,,,,,,,,,2.567593,962.475454,0.172963,,
std,11890.09578,,,,,,,,,,2.770014,186.910907,0.494901,,
min,1.0,,,,,,,,,,1.0,0.0,0.0,,
25%,10297.75,,,,,,,,,,1.0,999.0,0.0,,
50%,20594.5,,,,,,,,,,2.0,999.0,0.0,,
75%,30891.25,,,,,,,,,,3.0,999.0,0.0,,


In [5]:
df.dtypes

id               int64
birth_date      object
job             object
marital         object
education       object
default         object
housing         object
loan            object
contact_date    object
contact         object
campaign         int64
pdays            int64
previous         int64
poutcome        object
y               object
dtype: object

First, clearly the types could be set better.

In [6]:
# Column groups; they drive both the dtype cast below and the feature
# engineering further down.
integral = ['id', 'campaign', 'pdays', 'previous']
datelike = ['birth_date', 'contact_date']
categorical = ['job', 'marital', 'education', 'default',
               'housing', 'contact', 'poutcome', 'loan', 'y']

# Target dtype for every column.
# The datetime dtype carries an explicit unit: pandas 2.x rejects the
# unit-less 'datetime64' string in `astype`, while 'datetime64[ns]' is
# accepted by old and new pandas alike.
dtypes = {
    **{col: 'int64' for col in integral},
    **{col: 'datetime64[ns]' for col in datelike},
    **{col: 'category' for col in categorical}
}

df = df.astype(dtypes)

In [7]:
df.dtypes

id                       int64
birth_date      datetime64[ns]
job                   category
marital               category
education             category
default               category
housing               category
loan                  category
contact_date    datetime64[ns]
contact               category
campaign                 int64
pdays                    int64
previous                 int64
poutcome              category
y                     category
dtype: object

We need to:
- preprocess the columns for use in models;
- possibly add and/or remove new features.

Our strategy is essentially as follows:
- `id` is useless;
- `birth_date` and `contact_date` must be converted to numerical values, for example days since the minimum;
- `job`, `marital`, `education`, `default`, `housing`, `loan`, `contact` and `poutcome` are categorical, and since the # of unique values is small, we will one-hot encode them;
- `pdays` has a special "missing" value `999`;
- `y` is the label.

We *could* also add some extra, derived features -- to that end, let us investigate the values more closely, specifically in relation to the value of `y`.

In [8]:
import ipywidgets as widgets
%gui asyncio

# Interactive column explorer: pick a column, press "Render", and the two
# axes created below are redrawn by `render`.

# Every column of the frame is offered, grouped by kind.
cols = [*integral, *datelike, *categorical]
selector = widgets.Select(
    options=cols,
    value=cols[0],
    description='Column: ',
    disabled=False)

render_btn = widgets.Button(
    description='Render',
    disabled=False,
    button_style='success',
    tooltip='Render',
    icon='check')

# Show the controls before the figure is created.
display(selector, render_btn)

# Two stacked axes: `ax_abs` and `ax_rel` are the drawing targets of `render`.
fig, (ax_abs, ax_rel) = plt.subplots(2, 1)

# Only rows whose label is known are plotted.
known_df = df[df['y'] != 'unknown']

def render(col):
    """Redraw both histograms of `col`, coloured by the label `y`.

    `ax_abs` receives the stacked histogram and `ax_rel` the filled
    (normalised) one; each axis is cleared before it is redrawn.
    """
    for axis, stacking in ((ax_abs, 'stack'), (ax_rel, 'fill')):
        axis.clear()
        sns.histplot(data=known_df, x=col, hue='y',
                     multiple=stacking, ax=axis)
    plt.show()

def on_click(_):
    """Button callback: redraw the histograms for the selected column.

    The button label switches to 'Rendering...' while the plot is being
    produced. The label is restored in a `finally` clause so that a failing
    `render` call does not leave the button stuck on 'Rendering...'.
    """
    render_btn.description = 'Rendering...'
    try:
        render(selector.value)
    finally:
        render_btn.description = 'Render'
render_btn.on_click(on_click)

Select(description='Column: ', options=('id', 'campaign', 'pdays', 'previous', 'birth_date', 'contact_date', '…

Button(button_style='success', description='Render', icon='check', style=ButtonStyle(), tooltip='Render')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

So, clearly there _are_ correlations; we cannot, however, use them explicitly (since the model would presumably overfit, and if anything be unrealistic, or at least that's what I got from the tests). The full scope of the feature engineering is as follows:
- for dates, we add the number of days since the minimum date, $x$ (together with $\log(1+x)$ and $\sqrt{x}$), along with day of the week, day, month and year, to cover any possibly occurring periodic patterns;
- for `pdays`, `999` represents the missing value; we shall replace it with the mean, and also add an indicator feature;
- categorical features shall be one-hot encoded;
- `id` will be dropped (there seems to actually be correlation with `y` at the first glance, if we look at the graph above, but the variable obviously cannot be indicative of anything but the order in the dataset).

We will implement it all as a `sklearn` Transformer, for convenience.

In [9]:
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

class DateTransformer(BaseEstimator, TransformerMixin):
    """Expand every datetime column of a DataFrame into numeric features.

    For each input column seven features are produced, in this order:
    days since the column's reference date, ``log(1 + days)``,
    ``sqrt(days)``, day of week, day of month, month and year.

    The reference date of a column is its minimum as seen by ``fit``, so
    ``transform`` maps equal dates to equal values regardless of which rows
    it is handed. ``fit_transform`` on a single frame gives the same values
    as computing the minimum on that frame directly.
    """

    # Suffixes appended to the source column name, in output order.
    _SUFFIXES = ('days', 'days_log', 'days_sqrt', 'day_of_week',
                 'day', 'month', 'year')

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Record each column's minimum and the resulting feature names.

        X is expected to be a DataFrame of datetime columns; y is ignored.
        Returns self.
        """
        self.min_ = {name: X[name].min() for name in X}
        self.names = ['{}_{}'.format(name, suffix)
                      for name in X for suffix in self._SUFFIXES]
        return self

    def transform(self, X, y=None):
        """Return a DataFrame with seven numeric features per column of X."""
        # Minimums learnt by `fit`; empty when `transform` is called on an
        # unfitted instance, in which case the batch minimum is used.
        fitted_min = getattr(self, 'min_', {})

        self.names = []
        parts = []
        for name in X:
            col = X[name]
            origin = fitted_min.get(name, col.min())

            days = (col - origin) / np.timedelta64(1, 'D')
            # Dates earlier than the reference give negative `days`; clip
            # them so that log and sqrt stay defined.
            non_negative = days.clip(lower=0)

            features = [
                days,
                np.log(1 + non_negative),
                np.sqrt(non_negative),
                col.dt.dayofweek,
                col.dt.day,
                col.dt.month,
                col.dt.year,
            ]
            for suffix, series in zip(self._SUFFIXES, features):
                label = '{}_{}'.format(name, suffix)
                self.names.append(label)
                # Rename so that the concatenated frame has unique columns.
                parts.append(series.rename(label))

        return pd.concat(parts, axis=1)

    def get_feature_names(self):
        """Return the output feature names, in column order."""
        return self.names

In [10]:
class PDaysTransformer(BaseEstimator, TransformerMixin):
    """Impute the sentinel value 999 with the column mean and flag it.

    Thin wrapper around a ``SimpleImputer`` configured with
    ``missing_values=999.0``, ``strategy='mean'`` and ``add_indicator=True``.
    It additionally tracks output feature names: ``<col>`` followed by
    ``<col>_missing`` for every input column.
    """

    def __init__(self, **kwargs):
        # Keyword arguments are accepted for interface compatibility and
        # ignored.
        self.base = SimpleImputer(missing_values=999.0,
                                  strategy='mean',
                                  add_indicator=True)

    @staticmethod
    def _feature_names_for(X):
        """Build the ``<col>``, ``<col>_missing`` name list for X's columns."""
        names = []
        for name in X:
            names.append(name)
            names.append('{}_missing'.format(name))
        return names

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped imputer on X and return self.

        `y` is accepted positionally so that ``fit_transform(X, y)``, which
        calls ``fit(X, y)``, works; it is forwarded to the imputer.
        """
        self.base.fit(X, y, **fit_params)
        # Names are available right after fitting, not only after transform.
        self.names = self._feature_names_for(X)
        return self

    def transform(self, X, **fit_params):
        """Return the imputed values together with the indicator output."""
        self.names = self._feature_names_for(X)
        return self.base.transform(X, **fit_params)

    def get_feature_names(self):
        """Return the output feature names, in column order."""
        return self.names

In [59]:
class StandardTransformer(BaseEstimator, TransformerMixin):
    """Full feature-engineering step, returning a named DataFrame.

    Wraps a ``ColumnTransformer`` that
    - one-hot encodes every column in `categorical` except ``'y'``;
    - expands every column in `datelike` with ``DateTransformer``;
    - runs ``PDaysTransformer`` on ``'pdays'``;
    - drops ``'id'``;
    - passes all remaining columns through unchanged.
    """

    def __init__(self):
        self.base = ColumnTransformer([
                *[('{}_cat'.format(name), OneHotEncoder(), [name])
                  for name in categorical if name != 'y'],
                *[('{}_D'.format(name), DateTransformer(), [name])
                  for name in datelike],
                ('pdays', PDaysTransformer(), ['pdays']),
                ('id', 'drop', ['id'])
            ],
            remainder='passthrough')

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped ColumnTransformer and return self.

        `y` is accepted positionally so that ``fit_transform(X, y)``, which
        calls ``fit(X, y)``, works; it is forwarded to the ColumnTransformer.
        """
        self.base.fit(X, y, **fit_params)
        return self

    def transform(self, X, **fit_params):
        """Transform X and return a DataFrame labelled with feature names.

        The result keeps X's index (when X has one), so boolean masks built
        from the original frame still align with the transformed rows.
        """
        Xt = self.base.transform(X, **fit_params)
        # A ColumnTransformer may return a scipy sparse matrix when the
        # stacked output is sparse enough; densify it before building the
        # DataFrame.
        if hasattr(Xt, 'toarray'):
            Xt = Xt.toarray()
        return pd.DataFrame(data=Xt,
                            columns=self.get_feature_names(),
                            index=getattr(X, 'index', None))

    def get_feature_names(self):
        """Return the feature names reported by the ColumnTransformer."""
        return self.base.get_feature_names()

# Model

## Datasets

In [60]:
# Separate the label column from the predictors, then run the predictors
# through the feature-engineering transformer.
labels = df['y'].copy()
features = StandardTransformer().fit_transform(df.drop(columns='y'))

In [61]:
# Rows with a known label form the training set.
train_Ix = (labels != 'unknown')
train_features = features[train_Ix]
train_labels = labels[train_Ix]

from pandas.api.types import CategoricalDtype
# Re-type the labels with exactly two categories so that the category codes
# are 0 for 'no' and 1 for 'yes' (the order given in `categories`).
yn_type = CategoricalDtype(categories=['no', 'yes'], ordered=True)
# One-column DataFrame named 'y' holding the integer codes.
train_labels = pd.DataFrame(train_labels.astype(yn_type).cat.codes,
                            columns=['y'])

# Rows labelled 'unknown' are the ones to predict.
test_Ix = labels == 'unknown'
test_features = features[test_Ix]

Let us also define some utility functions for further splitting.

In [62]:
from sklearn.model_selection import train_test_split

def _choose_Ix(train, n):
    """Randomly partition the positions ``0 .. n-1`` into two index arrays.

    `train` is the fraction of positions assigned to the first array; its
    size is ``ceil(train * n)``. The first array is drawn without
    replacement from NumPy's global random state; the second holds the
    remaining positions in ascending order.
    """
    n_picked = int(np.ceil(n * train))
    picked = np.random.choice(n, size=n_picked, replace=False)
    remaining = np.setdiff1d(np.arange(n), picked)
    return picked, remaining

def split(train, X=train_features, y=train_labels):
    """Randomly split `X` and `y` row-wise into two shuffled parts.

    `train` is the fraction of rows that goes into the first part. Returns
    ``(X_first, X_second, y_first, y_second)``; rows are selected
    positionally with ``.iloc``, and `X` and `y` are indexed with the same
    positions.
    """
    fit_rows, held_rows = _choose_Ix(train, X.shape[0])
    # Shuffle both position arrays in place (first part, then second).
    for rows in (fit_rows, held_rows):
        np.random.shuffle(rows)

    return (X.iloc[fit_rows, :], X.iloc[held_rows, :],
            y.iloc[fit_rows, :], y.iloc[held_rows, :])

## Model № 1 (`sklearn`) - Final model

In [15]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, RandomizedSearchCV,\
    GridSearchCV
import tempfile

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifierCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import\
    AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier,\
    StackingClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

from xgboost import XGBClassifier

Following are the definitions of classifiers and grid parameters; in the previous version a `StackingClassifier` was used, but it was too slow and also yielded worse results (presumably it overfitted?)

In [99]:
# Registry of candidate classifiers. Each entry maps a short name to
#   'clf'  -- an unfitted estimator instance, and
#   'grid' -- a dict of hyperparameter name -> list of candidate values
#             (presumably the search space for the hyperparameter search in
#             the next cell; that cell is currently commented out).
# Commented-out entries are earlier candidates that are no longer active.
base = {
#     'rf': {
#         'clf': RandomForestClassifier(),
#         'grid': {
#             'max_features': ['auto', 'sqrt', 'log2'],
#             'n_estimators': [10, 100, 250, 1000],
#         }
#     },
#     'et': {
#         'clf': ExtraTreesClassifier(),
#         'grid': {
#             'n_estimators': [*range(50, 250+1, 50), 1000],
#             'max_features': ['auto', 'sqrt', 'log2'],
#             'min_samples_leaf': [*range(1, 10+1, 2), *range(20, 50+1, 5)],
#             'min_samples_split': [*range(1, 10+1, 2), *range(15, 35+1, 5)]
#         }
#     },
    # Gradient boosting with default constructor arguments.
    'gbc': {
        'clf': GradientBoostingClassifier(),
        'grid': {
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'n_estimators': [100, 200, 250, 300],
            'max_features': ['auto', 'log2', 'sqrt'],
            'max_depth': [*range(2, 5)],
            'subsample': [0.7, 0.8, 1],
        }
    },
    # AdaBoost; the grid also varies the base tree's depth from 1 to 16.
    'ada': {
        'clf': AdaBoostClassifier(),
        'grid': {
            'n_estimators': [10, 50, 100, 500],
            'learning_rate': [0.01, 0.1, 0.5, 1, 2],
            'base_estimator': [DecisionTreeClassifier(max_depth = n)
                                    for n in [*range(1, 16+1)]]    
        }
    },
    # XGBoost with fixed settings; its grid is empty because every
    # candidate value below is commented out.
    'xgb': {
        'clf': XGBClassifier(use_label_encoder=False,
                             eval_metric='logloss',
                             subsample=0.8),
        'grid': {
#             'n_estimators': [10, 50, 100, 500],
#             'learning_rate': [.02, .05, .1],
#             'max_depth': [4, 6, 8, 10],
        }
    },
#     'bayes': {
#         'clf': GaussianNB(),
#         'grid': {}
#     }
}

This part implements a randomized search (rather than an exhaustive grid search, to save time); the resulting model turned out to be worse than the hand-picked hyperparameters used below, although those hand-picked values were "inspired" by the search results. We optimize `GradientBoostingClassifier`, which seemed to work best.

In [102]:
# # estimators = base
# # clf = Pipeline([
# #     ('stacked', StackingClassifier(
# #         estimators=[(name, params['clf'])
# #                     for name, params in estimators.items()],
# #         final_estimator=GradientBoostingClassifier(),
# #         n_jobs=2))
# # ])
# # clf = AdaBoostClassifier(n_estimators=250)

# search = RandomizedSearchCV(base['gbc']['clf'], base['gbc']['grid'],
#                             scoring='roc_auc', cv=5, n_jobs=-1, verbose=10,
#                             n_iter=100)

# X_train, _, y_train, _ = split(1)
# y_train = np.ravel(y_train)
# result = search.fit(X_train, y_train)

# # score = cross_validate(clf, X_train, y_train,
# #                        cv=5, scoring=['roc_auc'], verbose=10,
# #                        return_estimator=True)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  6

And here we pick the model to run and save output to.

In [74]:
def save_pred_sk(clf, name):
    """Write `clf`'s predictions for `test_features` to ``pred.<name>.csv``.

    The file has two columns: ``id`` (the row's index in `test_features`
    plus one) and ``y`` (the second column of ``clf.predict_proba``). No
    index column is written.
    """
    second_column_proba = clf.predict_proba(test_features)[:, 1]
    submission = pd.DataFrame({
        'id': test_features.index + 1,
        'y': second_column_proba,
    })
    submission.to_csv('pred.{}.csv'.format(name), index=False)

In [114]:
# Alternatives tried earlier:
# best = result.best_estimator_
# best = GradientBoostingClassifier(learning_rate=0.05, max_depth=4,
#                                   max_features='auto',
#                                   subsample=0.7)

# Build the training set here: the only other assignments of `X_train` and
# `y_train` live in commented-out code, so without these two lines this cell
# raises NameError on a fresh kernel. `split(1)` puts every labelled row
# (shuffled) into the first part; the second part is empty and unused.
X_train, _X_unused, y_train, _y_unused = split(1)
y_train = np.ravel(y_train)  # flatten the one-column label frame to 1-D

best = GradientBoostingClassifier(n_estimators=250, subsample=0.8)
save_pred_sk(best.fit(X_train, y_train), 'just')

Beyond this point are unused models, mostly related to Tensorflow and Keras, which I toyed with (but which yielded results inferior to `sklearn`); mostly I wanted to learn how to use Keras if/when the time comes I need it.

## Model № 2: `tf`

In [63]:
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers
# from tensorflow.keras.layers.experimental import preprocessing
# from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

Let's start with the definition of the model.

In [90]:
# EPOCHS = 100
# BATCH_SIZE = 2048
# DROPOUT = 0.5

# def block(x_in, size):
#     x = layers.Dense(size)(x_in)
#     x = layers.BatchNormalization()(x)
#     x = layers.Activation('relu')(x)
#     x = layers.Dropout(DROPOUT)(x)
#     return x

# def create_model(shape):
#     x_in = layers.Input(shape=shape)
# #     x = block(x_in, 512)
#     x = block(x_in, 512)
# #     x = block(x, 512)
# #     x = block(x, 128)
#     x_out = layers.Dense(1, activation='sigmoid')(x)
    
#     model = keras.Model(inputs=x_in, outputs=x_out)    
#     model.compile(
#         optimizer=keras.optimizers.Adam(lr=1e-3),
#         loss=keras.losses.BinaryCrossentropy(),
#         metrics=[keras.metrics.AUC(name='auc')])
    
#     return model

We will want to balance the classes (positive vs. negative labels) in this particular case.

In [91]:
# X_train, X_test, y_train, y_test = split(0.8, train_features, train_labels)

# resample = False

# if resample:
#     neg_features = X_train[y_train == 0]
#     neg_labels = y_train[y_train == 0]
#     pos_features = X_train[y_train == 1]
#     pos_labels = y_train[y_train == 1]

#     def make_ds(features, labels):
#         ds = tf.data.Dataset.from_tensor_slices((features, labels))
#         ds = ds.shuffle(100000).repeat()
#         return ds

#     neg_ds = make_ds(neg_features, neg_labels)
#     pos_ds = make_ds(pos_features, pos_labels)

#     train_ds = tf.data.experimental.\
#         sample_from_datasets([neg_ds, pos_ds], weights=[0.5, 0.5])
#     train_ds = train_ds.batch(BATCH_SIZE).prefetch(2)
    
#     pos = np.sum(y_train)
#     total = np.size(y_train)
#     neg = total - pos
#     resampled_steps_per_epoch = np.ceil(2.0*neg/BATCH_SIZE)
# else:    
#     train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).cache()
#     train_ds = train_ds.batch(BATCH_SIZE).prefetch(2)

# val_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).cache()
# val_ds = val_ds.batch(BATCH_SIZE).prefetch(2)

Here we train the model.

In [92]:
# model = create_model(X_train.shape[1])

# early_stopping = tf.keras.callbacks.EarlyStopping(
#     monitor='val_auc', 
#     verbose=1,
#     patience=10,
#     mode='max',
#     restore_best_weights=True)

# history = model.fit(
#     train_ds,
#     epochs=EPOCHS,
# #     steps_per_epoch=resampled_steps_per_epoch,
# #     callbacks=[early_stopping],
#     validation_data=val_ds,
#     verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Finally, let us save the predictions to a .csv file.

In [89]:
# def save_pred_tf(model, name):
#     pred_df = pd.DataFrame(columns=['id', 'y'])
#     pred_df['id'] = test_features.index+1
#     pred_df['y'] = model.predict(test_features)
#     pred_df.to_csv('pred.{}.csv'.format(name), index=False)

## Model № 3: `tf` *and* `sklearn`

The idea (a perhaps-bad one) is for a Keras model to construct a "latent representation" of the feature space, which we will append to the usual features; the combined features are then fed to the `sklearn` classifier from before.

In [22]:
# from sklearn.model_selection import StratifiedShuffleSplit

# def make_sets():
#     sss = StratifiedShuffleSplit(n_splits=5, test_size=0.25)
#     sss.get_n_splits(train_features, train_labels)

#     for train_Ix, test_Ix in sss.split(train_features, train_labels):
#         X_train = train_features.iloc[train_Ix,:]
#         y_train = train_labels.iloc[train_Ix,:]
#         X_test = train_features.iloc[test_Ix,:]
#         y_test = train_labels.iloc[test_Ix,:]
#         yield X_train, X_test, y_train, y_test

# def make_ds(X, y):
#     ds = tf.data.Dataset.from_tensor_slices((X, y)).cache()
#     return ds.batch(BATCH_SIZE).prefetch(2)
        
# def train_keras_model(X_train, X_test, y_train, y_test):
#     train_ds = make_ds(X_train, y_train)
#     val_ds = make_ds(X_test, y_test)
    
#     model = create_model(X_train.shape[1])
#     history = model.fit(
#         train_ds,
#         epochs=EPOCHS,
#         validation_data=val_ds)
    
#     return model

# def train_sklearn_clf(X_train, y_train):
#     estimators = base
#     clf = Pipeline([
#         ('stacked', StackingClassifier(
#             estimators=[(name, params['clf'])
#                         for name, params in estimators.items()],
#             final_estimator=GradientBoostingClassifier(),
#             n_jobs=2, verbose=1))
#     ])
#     return clf.fit(X_train, np.ravel(y_train))

# def encoder_for(model):
#     return keras.models.Model(model.input, model.layers[-2].output)

# def add_latent(encoder, X):
#     latent = encoder.predict(X)
#     cols = ['L{}'.format(i) for i in range(latent.shape[1])]
#     latent = pd.DataFrame(index=X.index, data=latent, columns=cols)
#     return pd.concat([X, latent], axis=1)

# for i, (X_train, X_test, y_train, y_test) in enumerate(make_sets(), 1):
#     print('CV #{}'.format(i))
    
#     model = train_keras_model(X_train, X_test, y_train, y_test)
#     encoder = encoder_for(model)
    
#     X_train = add_latent(encoder, X_train)
#     X_test = add_latent(encoder, X_test)
    
#     clf = train_sklearn_clf(X_train, y_train)
    
#     X_test_pred = clf.predict_proba(X_test)[:,1]
#     score = roc_auc_score(y_test, X_test_pred)
#     print('roc_auc_score -> {}'.format(score))

CV #1
Epoch 1/100
1/8 [==>...........................] - ETA: 0s - loss: 1.1316 - auc: 0.5404

KeyboardInterrupt: 

In [24]:
# X_train, X_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1)

# model = train_keras_model(X_train, X_test, y_train, y_test)
# model.save('model2')
# encoder = encoder_for(model)

# X_train = add_latent(encoder, X_train)
# clf = train_sklearn_clf(X_train, y_train)

# lat_test_features = add_latent(encoder, test_features)

# name = 'synth'
# pred_df = pd.DataFrame(columns=['id', 'y'])
# pred_df['id'] = test_features.index+1
# pred_df['y'] = clf.predict_proba(lat_test_features)[:,1]
# pred_df.to_csv('pred.{}.csv'.format(name), index=False)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78