## Introduction

In this notebook, we will train and tune a feed-forward neural network classification model implemented in Keras. The goal is to predict which transactions are fraudulent based on a set of numerical features. Since the dataset is highly imbalanced, we will use the area under the precision-recall curve as the main metric to evaluate the quality of the models.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import keras

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import confusion_matrix, precision_recall_curve

import optuna
import joblib

from typing import Dict, Optional, List, Tuple
from numbers import Number

# Global plot appearance: ggplot theme, figures rendered at 150 dpi
plt.style.use("ggplot")
plt.rcParams['figure.dpi'] = 150

## Loading the data

In [None]:
# Location of the input CSV inside the Kaggle environment
DATA_PATH = '/kaggle/input/creditcardfraud/creditcard.csv'

raw_df = pd.read_csv(DATA_PATH)
raw_df.head()

In [None]:
# One row per observation
n_observations = len(raw_df)
print(f'Number of observations: {n_observations}')

In [None]:
# Count the labels: index 0 holds the negative class, index 1 the positive class
class_counts = np.bincount(raw_df['Class'])
neg, pos = class_counts
total = neg + pos

positive_share = 100*pos / total
print(f'Number of positive observations: {pos} ({positive_share:.2f}% of total)')

## Splitting the datasets

In [None]:
# First hold out 20% of the data as the test set, then carve 20% of the
# remainder off as the validation set. Both splits are stratified on the label.
train_val_df, test_df = train_test_split(
    raw_df,
    test_size=0.2,
    random_state=1,
    stratify=raw_df['Class'],
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2,
    random_state=2,
    stratify=train_val_df['Class'],
)

In [None]:
# Detach the label column from each split. `pop` removes 'Class' from the
# frame in place, so the frames no longer contain it afterwards.
y_train, y_val, y_test = (
    split_df.pop('Class').values for split_df in (train_df, val_df, test_df)
)

In [None]:
# Check that stratification kept the positive rate similar across the splits
label_sets = {'training': y_train, 'validation': y_val, 'test': y_test}
for tag, labels in label_sets.items():
    positive_pct = labels.mean()*100
    print(f'Percentage of positive class observations in {tag} set: {positive_pct:.3f}')

## Basic EDA

In [None]:
def filter_greater_than(series:pd.Series,threshold:Number) -> pd.Series:
    '''
    Keep only the entries of `series` that are strictly greater than
    `threshold`. The signature makes it usable with the `.pipe` method.
    '''
    above_threshold = series.gt(threshold)
    return series.loc[above_threshold]


def get_missing_percentage(df:pd.DataFrame) -> pd.Series:
    '''
    Compute, per column of `df`, the percentage of missing entries.
    Only columns with at least one missing entry are returned, sorted
    from most to least missing and rounded to three decimals.
    '''
    n_rows = df.shape[0]
    missing_pct = df.isnull().sum() / n_rows * 100
    ranked = missing_pct.sort_values(ascending=False)
    return filter_greater_than(ranked, threshold=0).round(3)


# Report missing values separately for each split
feature_frames = {'training': train_df, 'validation': val_df, 'test': test_df}
for tag, frame in feature_frames.items():
    print(f'Percentage of missing entries per column in {tag} set (if any):')
    # the extra newline leaves a blank line between the reports
    print(get_missing_percentage(frame), end='\n\n')

In [None]:
# Skewness of every training column; |skew| > 1 is treated as skewed
skew_columns = train_df.skew()

pos_skew = {name: value for name, value in skew_columns.items() if value > 1}
neg_skew = {name: value for name, value in skew_columns.items() if value < -1}

print(f'Number of columns that are positively skewed: {len(pos_skew)}')
print(f'Number of columns that are negatively skewed: {len(neg_skew)}')

In [None]:
# Histograms (log-scaled counts) of the positively skewed columns, one panel each.
n_cols = 3
# Size the grid from the number of columns to plot (ceiling division) instead of
# hard-coding it, so the loop can never index past the last panel. Always keep
# at least one row so `plt.subplots` receives a valid grid.
n_rows = max(1, -(-len(pos_skew) // n_cols))

# squeeze=False keeps `axs` two-dimensional even when there is a single row
fig, axs = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 3*n_rows), squeeze=False)
panels = axs.ravel()
for ax, (col, skew_value) in zip(panels, pos_skew.items()):
    sns.histplot(data=train_df, x=col, bins=20, ax=ax)
    ax.set_yscale('log')
    ax.set_title(f'Skew: {skew_value:.2f}')

# Hide the panels left over when the grid has more cells than columns
for ax in panels[len(pos_skew):]:
    ax.set_visible(False)
fig.tight_layout()

In [None]:
# Histograms (log-scaled counts) of the negatively skewed columns, one panel each.
n_cols = 4
# Size the grid from the number of columns to plot (ceiling division) instead of
# hard-coding it, so the loop can never index past the last panel. Always keep
# at least one row so `plt.subplots` receives a valid grid.
n_rows = max(1, -(-len(neg_skew) // n_cols))

# squeeze=False keeps `axs` two-dimensional even when there is a single row
fig, axs = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 3*n_rows), squeeze=False)
panels = axs.ravel()
for ax, (col, skew_value) in zip(panels, neg_skew.items()):
    sns.histplot(data=train_df, x=col, bins=20, ax=ax)
    ax.set_yscale('log')
    ax.set_title(f'Skew: {skew_value:.2f}')

# Hide the panels left over when the grid has more cells than columns
for ax in panels[len(neg_skew):]:
    ax.set_visible(False)
fig.tight_layout()

For simplicity, we preprocess all the columns through the `QuantileTransformer` in scikit-learn, so that the transformed features are (roughly) normally distributed across the training set.

In [None]:
# Map every feature to an approximately normal distribution. The transformer is
# fitted on the training split only and then applied unchanged to the validation
# and test splits. `random_state` is fixed because the quantiles are estimated on
# a random subsample of the rows, which would otherwise differ between runs.
qt_transform = QuantileTransformer(output_distribution='normal', random_state=0)
X_train = qt_transform.fit_transform(train_df)
X_val = qt_transform.transform(val_df)
X_test = qt_transform.transform(test_df)

## Feed forward neural network

In [None]:
def construct_model(params:Dict,input_dim:int,output_dim:int) -> keras.Model:
    """
    Assemble and compile a feed forward classification network, optionally
    concatenating the raw inputs to the last hidden representation before
    the output layer.

    Arguments
    ---------
    params: dict
        Hyperparameters of the model. Every key is optional:
        'n_hidden' (number of hidden layers, default 3),
        'hsize{i}' (units in hidden layer i, default 32),
        'dropout{i}' (dropout rate after hidden layer i, default 0.05),
        'activation' ('selu' by default; any other value selects the
        batch norm + ReLU variant),
        'include_res_connections' (truthy to concatenate the inputs with
        the hidden output, default 0),
        'learning_rate' (Adam learning rate, default 1e-3).

    input_dim: int
        Number of features

    output_dim: int
        Number of labels

    Returns
    ---------
    model: keras.Model
        A compiled keras model with binary cross-entropy loss and the area
        under the precision-recall curve ('auc_pr') as metric
    """
    depth = params.get('n_hidden', 3)
    layer_ids = range(1, depth + 1)
    widths = [params.get(f'hsize{k}', 32) for k in layer_ids]
    drop_rates = [params.get(f'dropout{k}', 0.05) for k in layer_ids]
    # the activation choice is the same for every hidden layer
    use_selu = params.get('activation', 'selu') == 'selu'

    inputs = keras.Input(shape=(input_dim,))

    hidden = inputs
    for width, rate in zip(widths, drop_rates):
        if use_selu:
            # self-normalizing block: SELU with lecun_normal init and alpha dropout
            hidden = keras.layers.Dense(
                width, activation='selu', kernel_initializer='lecun_normal'
            )(hidden)
            hidden = keras.layers.AlphaDropout(rate)(hidden)
        else:
            # linear layer -> batch norm -> ReLU -> standard dropout
            hidden = keras.layers.Dense(width, activation=None)(hidden)
            hidden = keras.layers.BatchNormalization()(hidden)
            hidden = keras.layers.ReLU()(hidden)
            hidden = keras.layers.Dropout(rate)(hidden)

    if params.get('include_res_connections', 0):
        # feed the raw inputs to the output layer alongside the hidden output
        hidden = keras.layers.concatenate([inputs, hidden])

    final_output = keras.layers.Dense(output_dim, activation='sigmoid')(hidden)
    model = keras.Model(inputs, final_output)

    optimizer = keras.optimizers.Adam(
        learning_rate=params.get('learning_rate', 1e-3),
    )
    pr_auc = keras.metrics.AUC(curve='PR', name='auc_pr')
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=[pr_auc],
    )

    return model


def build_and_fit(
    params:Optional[Dict] = None, verbose:int=0,
) -> Tuple[keras.Model, keras.callbacks.History]:
    """
    Build a model with `construct_model` and fit it on the training set,
    monitoring the validation set for early stopping and learning rate decay.

    Arguments
    ---------
    params: dict, optional
        Hyperparameters forwarded to `construct_model`. The optional key
        'batch_size' (default 128) sets the training batch size. `None`
        is treated as an empty dictionary, i.e. all defaults.

    verbose: int
        Verbosity level passed to the callbacks and to `model.fit`

    Returns
    ---------
    model: keras.Model
        The fitted model, with the weights of the best epoch restored
    history: keras.callbacks.History
        The training history returned by `model.fit`
    """
    # avoid a shared mutable default argument
    if params is None:
        params = {}

    EPOCHS = 100
    BATCH_SIZE = params.get('batch_size', 128)

    # Class weights, scaled so that both classes contribute equally to the loss.
    # The counts come from the training labels only, so the validation and
    # test labels play no part in fitting.
    n_neg, n_pos = np.bincount(y_train, minlength=2)
    n_total = n_neg + n_pos
    class_weight = {
        0: (n_total / n_neg) / 2.0,
        1: (n_total / n_pos) / 2.0,
    }

    # callbacks - reduce lr on plateau and early stopping, both driven by the
    # validation area under the precision-recall curve
    early_stopping = keras.callbacks.EarlyStopping(
        monitor='val_auc_pr',
        mode='max',
        verbose=verbose,
        patience=15,
        restore_best_weights=True
    )
    reduce_lr = keras.callbacks.ReduceLROnPlateau(
        monitor='val_auc_pr',
        mode='max',
        verbose=verbose,
        patience=5,
        factor= 0.5,
        min_lr = 1e-5
    )

    # create model with a single sigmoid output
    model = construct_model(params, X_train.shape[-1], 1)

    # train model
    history = model.fit(
        X_train,
        y_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        callbacks=[early_stopping, reduce_lr],
        validation_data=(X_val, y_val),
        verbose=verbose,
        class_weight=class_weight
    )

    return model, history

## Tuning hyperparameters with Optuna

Neural network models are usually sensitive to the choice of their hyperparameters. We will now tune the following hyperparameters of the network using Optuna:

1. learning rate for the Adam optimizer
2. the batch size
3. whether to include residual connections from the input or not
4. the number of hidden layers (1 - 4)
5. the activation function ('relu' or 'selu')
6. the number of units in each hidden layer
7. the dropout rates for each hidden layer

In [None]:
def objective(trial):
    """
    Optuna objective: sample a hyperparameter configuration from `trial`,
    train a model with it and return the best validation 'auc_pr' reached
    during training.
    """
    keras.backend.clear_session()

    # global hyperparameters
    config = {}
    config['learning_rate'] = trial.suggest_float('learning_rate', 1e-5, 0.1, log=True)
    config['batch_size'] = trial.suggest_int('batch_size', 128, 1024)
    config['n_hidden'] = trial.suggest_int('n_hidden', 1, 4)
    config['activation'] = trial.suggest_categorical('activation', choices=['selu', 'relu'])
    config['include_res_connections'] = trial.suggest_categorical(
        'include_res_connections', choices=[0, 1]
    )

    # per-layer hyperparameters: width and dropout rate of each hidden layer
    for layer in range(1, config['n_hidden'] + 1):
        size_key = f'hsize{layer}'
        dropout_key = f'dropout{layer}'
        config[size_key] = trial.suggest_int(size_key, 8, 512, log=True)
        config[dropout_key] = trial.suggest_float(dropout_key, 0, 0.25)

    _, history = build_and_fit(config)
    val_scores = history.history['val_auc_pr']

    return max(val_scores)

# The first `n_startup_trials` trials are drawn by random sampling; the TPE
# algorithm takes over afterwards. The seed makes the sampled values repeatable.
sampler = optuna.samplers.TPESampler(
    n_startup_trials=10,seed=0
)

study = optuna.create_study(
    directions=['maximize'],sampler=sampler,study_name='mlp'
)

# Stop after 100 trials or 1500 seconds, whichever comes first.
try:
    study.optimize(objective, n_trials=100, timeout=1500)
except Exception as e:
    # Best effort: keep whatever trials finished before the failure, but say
    # clearly what went wrong instead of printing a bare message.
    print(f'Hyperparameter search stopped early: {e!r}')
    completed_trials = [
        t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE
    ]
    if not completed_trials:
        # Nothing usable was produced; fail here rather than later when the
        # best parameters are requested from the study.
        raise


In [None]:
# One row per trial: validation score, hyperparameters and duration in seconds,
# ordered from best to worst score
results = (
    study.trials_dataframe(attrs=('number','duration','value','params'))
    .rename(columns={'value': 'auc_pr'})
    .assign(duration=lambda frame: frame['duration'] / np.timedelta64(1, 's'))
    .sort_values(by='auc_pr', ascending=False)
)
results.to_csv('cv_loss_history.csv', index=False)
# show the ten best trials
results.head(10)

## Final model

In [None]:
# Retrain from a clean Keras state using the best hyperparameters of the study
keras.backend.clear_session()
best_params = study.best_params
model, history = build_and_fit(best_params, verbose=2)

In [None]:
# save model and preprocessor
model.save('fnn_model.h5')
joblib.dump(qt_transform, 'input_preprocessor.pkl')

In [None]:
# Training vs. validation curves, one panel per tracked quantity
fig, axs = plt.subplots(1, 2, figsize=(8,3))
for ax, metric in zip(axs, ('loss', 'auc_pr')):
    train_curve = history.history[metric]
    val_curve = history.history[f'val_{metric}']
    ax.plot(history.epoch, train_curve, label='Training')
    ax.plot(history.epoch, val_curve, label='Validation')
    ax.legend()
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric)
fig.tight_layout()

## Evaluate on test set

In [None]:
# Predicted probabilities for the held-out test set, flattened to 1-D
test_pred_prob = model.predict(X_test).ravel()

pr_auc_metric = keras.metrics.AUC(curve='PR')
test_auc_pr = pr_auc_metric(y_test, test_pred_prob).numpy()
# the leading newline separates the result from the prediction progress output
print(f'\nArea under PR curve for test set: {test_auc_pr:.3f}')

### Precision Recall Curves

In [None]:
# Predicted probabilities for the training and validation sets, flattened to 1-D
train_pred_prob, val_pred_prob = (
    model.predict(features).ravel() for features in (X_train, X_val)
)

In [None]:
def plot_prc(name, labels, predictions, **kwargs):
    """
    Draw the precision-recall curve of `predictions` against `labels` on the
    current axes, using `name` as the legend label. Extra keyword arguments
    are forwarded to the line plot.
    """
    precision, recall, _ = precision_recall_curve(labels, predictions)

    ax = plt.gca()
    ax.plot(recall, precision, label=name, linewidth=2, **kwargs)
    ax.set_ylabel('Precision')
    ax.set_xlabel('Recall')
    ax.grid(True)
    ax.set_aspect('equal')

# Overlay the curves of all three splits on the same axes
prc_inputs = (
    ("Training", y_train, train_pred_prob),
    ("Validation", y_val, val_pred_prob),
    ("Test", y_test, test_pred_prob),
)
for split_name, split_labels, split_probs in prc_inputs:
    plot_prc(split_name, split_labels, split_probs)
plt.legend(loc='lower left');

### Confusion matrix

In [None]:
# Confusion matrices on the test set at several decision thresholds: a
# prediction counts as positive when its probability exceeds the threshold.
thresholds = [0.01, 0.1, 0.5]

fig, axs = plt.subplots(1, len(thresholds), figsize=(4.5*len(thresholds), 4))
for ax, threshold in zip(axs, thresholds):
    cm = confusion_matrix(y_test, test_pred_prob > threshold)
    # fmt='d' prints the cell counts as plain integers; the default format
    # would abbreviate large counts in scientific notation
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    ax.set_title(f'Confusion matrix @ {threshold}')

fig.tight_layout()