# This notebook uses the UCI banknote dataset
- https://archive.ics.uci.edu/ml/datasets/banknote+authentication

In [None]:
import pandas as pd
import matplotlib.pyplot as pyplot
import itertools
from mpl_toolkits import mplot3d
from scipy.stats import zscore
import keras
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

%matplotlib inline

## Data set

In [None]:
# Load the banknote data; all columns but the last are treated as features.
df = pd.read_csv('../dataset/data_banknote_authentication.csv')
feats = list(df)[:-1]

# Drop rows whose feature values repeat an earlier row and renumber the index.
df.drop_duplicates(subset=feats, keep='first', ignore_index=True, inplace=True)

# Split by label so the class balance can be reported.
neg = df.loc[df['class'] == 0]
pos = df.loc[df['class'] == 1]
total = len(df)
balance_msg = "negative: {:.0%}, positive: {:.0%}".format(len(neg) / total, len(pos) / total)
print(balance_msg)
df.describe()

## Neural Net

In [None]:
# Standardize every feature column in place (z-score per column).
for feat in feats:
    df.update({feat: zscore(df[feat])})

# Plot each standardized feature against its row position, one figure per
# feature, each with its own marker colour.
xs = range(len(df))
for feat, marker in zip(feats, ('ro', 'bo', 'go', 'yo')):
    fig, ax = pyplot.subplots()
    ax.plot(xs, df[feat], marker)
    fig.suptitle(feat)

- The distributions of the features overlap from [-2, 2].
- Outliers for curtosis and entropy are preserved.

In [None]:
def simple_model(kernel_regularizer):
    """Build and compile a network made of a single sigmoid unit.

    Args:
        kernel_regularizer: Value forwarded to the Dense layer's
            ``kernel_regularizer`` argument.

    Returns:
        A ``Sequential`` model compiled with binary cross-entropy loss, the
        'SGD' optimizer and the accuracy metric.
    """
    output_layer = Dense(1, activation='sigmoid', kernel_regularizer=kernel_regularizer)
    net = Sequential([output_layer])
    net.compile(optimizer='SGD', loss='binary_crossentropy', metrics=['accuracy'])
    return net

def complex_model(kernel_regularizer):
    """Build and compile a network with one 3-unit ReLU layer and a sigmoid output.

    Args:
        kernel_regularizer: Value forwarded to the ``kernel_regularizer``
            argument of both Dense layers.

    Returns:
        A ``Sequential`` model compiled with binary cross-entropy loss, the
        'SGD' optimizer and the accuracy metric.
    """
    hidden_layer = Dense(3, input_dim=4, activation='relu',
                         kernel_regularizer=kernel_regularizer)
    output_layer = Dense(1, activation='sigmoid', kernel_regularizer=kernel_regularizer)
    net = Sequential([hidden_layer, output_layer])
    net.compile(optimizer='SGD', loss='binary_crossentropy', metrics=['accuracy'])
    return net

#arguments for cross validation
_columns = list(df)
xs = np.asarray(df[_columns[:-1]])  # every column except the last
ys = np.asarray(df[_columns[-1]])   # the last column

# One (builder, grid) pair per model; each grid maps an argument name to the
# list of values to try.
model_params = [
    (simple_model, {
        'epochs': [5, 10],
        'batch_size': [3, 4],
        'verbose': [0],
        'kernel_regularizer': ['l1', 'l2'],
    }),
    (complex_model, {
        'epochs': [10, 15],
        'batch_size': [3, 4],
        'verbose': [0],
        'kernel_regularizer': ['l1', 'l2'],
    }),
]

# Names of the grid keys, split into two groups that are handed to the
# cross-validation routine separately.
train_params = {'epochs', 'verbose', 'batch_size'}
build_model_params = {'kernel_regularizer'}

- The simple model is a vector of weights equal in length to the input vector
- The complex model has two dense layers: a hidden layer of 3 ReLU units (a 3x4 weight matrix over the 4 input features) followed by a single sigmoid output unit
- ```model_params``` defines the model space, where each model's arguments will be expanded via cartesian product

In [None]:
#5 fold cross validate each model in model space, return a data frame of results
def evaluate(model_params, xs, ys, tr_params, bm_params):
    """Cross-validate every model configuration with 5 stratified folds.

    Args:
        model_params: Iterable of ``(build_function, grid)`` pairs. Each grid
            is expanded with ``ParameterGrid`` into individual configurations.
        xs: Feature array, indexed with the integer index arrays produced by
            ``StratifiedKFold.split``.
        ys: Label array aligned with ``xs``.
        tr_params: Names of the grid keys forwarded to ``model.fit``.
        bm_params: Names of the grid keys forwarded to the build function.

    Returns:
        DataFrame with one row per (configuration, fold) and the columns
        'train_loss', 'train_accuracy', 'test_loss', 'test_accuracy',
        'train_params', 'model_params', 'model_name', 'ys_ps' and 'auc'.
        Each 'ys_ps' cell is a ``(fold labels, fold predictions, colour code)``
        triple; the colour code cycles through a fixed list.
    """
    st_kf = StratifiedKFold(n_splits=5)
    row_keys = ['train_loss', 'train_accuracy', 'test_loss', 'test_accuracy', 'train_params',
                'model_params', 'model_name', 'ys_ps', 'auc']
    colors = ['b', 'g', 'c', 'm', 'y', 'k', 'r']
    ci = 0
    results = {k: [] for k in row_keys}
    # Sort the key names once: iterating a set of strings can yield a different
    # order on every interpreter start, which would reorder the parameter
    # tuples stored below from one run to the next.
    tr_names, bm_names = sorted(tr_params), sorted(bm_params)
    for mod_func, params in model_params:
        model_name = mod_func.__name__
        print("Training {}".format(model_name))
        for ps in tqdm(list(ParameterGrid(params))):
            # The argument split depends only on the configuration, not on the
            # fold, so it is computed once per configuration.
            bmps = {k: ps[k] for k in bm_names}
            tps = {k: ps[k] for k in tr_names}
            tps_key, bmps_key = tuple(tps.items()), tuple(bmps.items())
            for tr_ind, tst_ind in st_kf.split(xs, ys):
                tr_x, tr_y = xs[tr_ind], ys[tr_ind]
                tst_x, tst_y = xs[tst_ind], ys[tst_ind]
                # A fresh model per fold so no weights carry over between folds.
                train_model = mod_func(**bmps)
                th = train_model.fit(tr_x, tr_y, **tps)
                # Training metrics are those recorded for the last epoch.
                tr_l, tr_a = th.history['loss'][-1], th.history['accuracy'][-1]
                # NOTE(review): unpacking two values assumes the model was
                # compiled with exactly one metric (accuracy) besides the loss.
                tst_l, tst_a = train_model.evaluate(tst_x, tst_y, verbose=0)
                fold_ys_ps = (tst_y, train_model.predict(tst_x), colors[ci])
                fpr, tpr, _ = metrics.roc_curve(np.array(fold_ys_ps[0]),
                                                np.array(fold_ys_ps[1]))

                results['train_loss'].append(tr_l)
                results['train_accuracy'].append(tr_a)
                results['test_loss'].append(tst_l)
                results['test_accuracy'].append(tst_a)
                results['train_params'].append(tps_key)
                results['model_params'].append(bmps_key)
                results['model_name'].append(model_name)
                results['ys_ps'].append(fold_ys_ps)
                results['auc'].append(metrics.auc(fpr, tpr))
                ci = (ci + 1) % len(colors)
    return pd.DataFrame(results)

#Summarise the per-fold rows of each (model, build params, train params) combination
def cv_results(cv_data):
    """Aggregate per-fold cross-validation rows per model configuration.

    Args:
        cv_data: DataFrame with the columns 'model_name', 'model_params',
            'train_params', 'test_accuracy', 'train_accuracy', 'test_loss',
            'train_loss', 'ys_ps' and 'auc'.

    Returns:
        DataFrame grouped by ('model_name', 'model_params', 'train_params')
        with two-level columns: mean/std/min/max of every metric, the first
        value of each identifying column, and the sum of 'ys_ps'.
    """
    summary_stats = ['mean', 'std', 'min', 'max']
    metric_columns = ('test_accuracy', 'train_accuracy', 'test_loss', 'train_loss')
    id_columns = ('model_name', 'model_params', 'train_params')

    aggregation = {metric: summary_stats for metric in metric_columns}
    # Keep a plain copy of the identifying columns next to the statistics.
    aggregation.update({key: ['first'] for key in id_columns})
    # Summing concatenates the per-fold entries of a group into one value.
    aggregation['ys_ps'] = ['sum']
    aggregation['auc'] = summary_stats

    grouped = cv_data.groupby(['model_name', 'model_params', 'train_params'])
    return grouped.agg(aggregation)

# Run the cross validation, then collapse the folds of each configuration.
cv_data = evaluate(model_params, xs, ys, train_params, build_model_params)
res = cv_results(cv_data)

#output results
accuracy_col = ('test_accuracy', 'mean')
auc_col = ('auc', 'mean')
# Rank the configurations from best to worst on each summary column.
accuracy_ranking = res[[accuracy_col]].sort_values(by=[accuracy_col], ascending=False)
auc_ranking = res[[auc_col]].sort_values(by=[auc_col], ascending=False)

print("CV accuracy results:\n", accuracy_ranking)

print("\nCV auc results:\n", auc_ranking, "\n")

In [None]:
def roc_curve(ys_ps, model_name):
    """Plot one ROC curve per fold on a single figure, with an AUC summary.

    Args:
        ys_ps: Iterable of ``(labels, scores, colour)`` triples, one per fold.
            ``colour`` is passed to ``Axes.plot`` as the format string.
        model_name: Text used as the figure title.

    Returns:
        None. A new figure is created as a side effect. When ``ys_ps`` is
        empty the figure is left blank and untitled.
    """
    rates = []
    for ys, ps, c in ys_ps:
        fpr, tpr, _ = metrics.roc_curve(np.array(ys), np.array(ps))
        rates.append((fpr, tpr, c))

    sk_fig, sk_ax = pyplot.subplots()
    if not rates:
        # No folds: nothing to draw, and the summary statistics are undefined.
        return

    # Fold curves go first: the single legend entry below is attached to the
    # first line drawn on the axes, which must stay the first fold's curve.
    for fpr, tpr, c in rates:
        sk_ax.plot(fpr, tpr, c)

    # Everything below is independent of the individual fold, so it is done
    # once instead of being repeated for every curve.
    sk_ax.plot([0, 1], [0, 1], '--r')  # chance-level diagonal
    sk_ax.set_xlabel('FPR')
    sk_ax.set_ylabel('TPR')
    auc = [metrics.auc(fpr, tpr) for fpr, tpr, _ in rates]
    avg_auc, std_auc, mi, mx = np.average(auc), np.std(auc), np.min(auc), np.max(auc)
    sk_ax.legend(["AVG AUC = {:.03}\nSTD AUC = {:.03}\nMIN AUC = {:.03}\nMAX AUC = {:.03}"\
                  .format(avg_auc, std_auc, mi, mx)], loc='lower right')
    sk_fig.suptitle(model_name)

#generate ROC curves
def all_curves(cv_data):
    """Draw one ROC figure for every row of the aggregated results.

    Args:
        cv_data: DataFrame with the two-level columns ('ys_ps', 'sum'),
            ('model_params', 'first'), ('train_params', 'first') and
            ('model_name', 'first').
    """
    def chunk_triples(flat):
        # Regroup the flat sequence into consecutive runs of three items.
        return [tuple(flat[start:start + 3]) for start in range(0, len(flat), 3)]

    def describe(pairs):
        # Render (name, value) pairs as "name_value" pieces joined by "_".
        return "_".join(["{}_{}".format(name, value) for name, value in pairs])

    for position in range(len(cv_data)):
        row = cv_data.iloc[position]
        folds = chunk_triples(row[('ys_ps', 'sum')])
        param_text = "_".join([describe(row[(col, 'first')])
                               for col in ('model_params', 'train_params')])
        title = "{}_{}".format(row[('model_name', 'first')], param_text)
        roc_curve(folds, title)

# Draw the ROC figures for every aggregated configuration in `res`.
all_curves(res)

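- The complex model may be overfitting the data, as its ROC curves are nearly perfect; note, however, that these curves are computed on the held-out folds, so the train and test metrics should be compared to confirm this.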
- The simple model with 5 epochs appears to not generalize as well as the other models.
- The simple model with 10 epochs looks to be a good choice because it does as well as the complex model and has low variance. Batch size and regularization type appear to make little difference.