#### Copyright 2019 The TensorFlow Authors.

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

In [2]:
try:
  # `%tensorflow_version` is an IPython line magic that only exists in Colab;
  # there it asks for the TensorFlow 2.x runtime.
  %tensorflow_version 2.x
except Exception:
  # Outside Colab the magic is unavailable: deliberately ignore the failure
  # and keep whatever TensorFlow version is already installed.
  pass

In [3]:
import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_auc_score, average_precision_score

from imblearn.over_sampling import SMOTE, RandomOverSampler

In [4]:
"""
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""
"""

my_devices = tf.config.experimental.list_physical_devices()
print(my_devices)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'), PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [5]:
# Load the credit card fraud dataset into `raw_df`.
#
# A local copy is tried first to avoid a download. The path is written
# relative to the home directory (`~` is expanded by pandas) so the notebook is
# not tied to one user's machine. If no local copy exists, the publicly hosted
# copy of the CSV is read instead.
LOCAL_CSV_PATH = "~/.kaggle/creditcard.csv"
REMOTE_CSV_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv'

try:
  raw_df = pd.read_csv(LOCAL_CSV_PATH)
except FileNotFoundError:
  raw_df = pd.read_csv(REMOTE_CSV_URL)

raw_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Split the dataframe into train, validation, and test

Split the dataset into train, validation, and test sets. The validation set is used during model fitting to evaluate the loss and any metrics; however, the model is not fit on this data. The test set is completely unused during the training phase and is only used at the end to evaluate how well the model generalizes to new data. This is especially important with imbalanced datasets, where [overfitting](https://developers.google.com/machine-learning/crash-course/generalization/peril-of-overfitting) is a significant concern because of the lack of training data. Note that the code below only makes a train/test split and trains with `validation_split=0`, so no validation set is actually held out in this notebook.

In [9]:
# Shuffle the rows and hold out 33% of them as the test set; the fixed
# random_state makes the split repeatable.
train_df, test_df = train_test_split(raw_df, test_size=0.33, random_state=42)

# Pull the 'Class' column out of each frame as the label vector (pop removes it
# from the frame), then turn what is left into the feature matrix.
train_labels, test_labels = (
    np.array(frame.pop('Class')) for frame in (train_df, test_df))
train_features, test_features = np.array(train_df), np.array(test_df)

# Standardize each feature to zero mean and unit standard deviation.
# The scaler is fitted on the training rows only; the test rows are
# transformed with those same training statistics.
scaler = StandardScaler().fit(train_features)
train_features = scaler.transform(train_features)
test_features = scaler.transform(test_features)

print('Training labels shape:', train_labels.shape)
print('Test labels shape:', test_labels.shape)

print('Training features shape:', train_features.shape)
print('Test features shape:', test_features.shape)

Training labels shape: (190820,)
Test labels shape: (93987,)
Training features shape: (190820, 30)
Test features shape: (93987, 30)


In [28]:
def make_model(input_dim=None, dropout_rate=0.5):
  """Build and compile the fully connected binary classifier.

  The network is Dense(512) followed by four Dense blocks of 256, 128, 64 and
  32 units, each followed by a Dropout layer, and a single sigmoid output
  unit. It is compiled with the Adam optimizer, binary cross-entropy loss and
  an AUC metric named 'auc'.

  Args:
    input_dim: Number of input features. When None (the default), the width
      of the notebook-level `train_features` array is used, which keeps
      existing zero-argument callers working unchanged.
    dropout_rate: Rate passed to every Dropout layer. Defaults to 0.5.

  Returns:
    A compiled `keras.Sequential` model.
  """
  if input_dim is None:
    # Resolved at call time rather than in the signature so the function can
    # be defined before `train_features` exists.
    input_dim = train_features.shape[-1]

  layers = [
      keras.layers.Dense(512, activation='relu', input_shape=(input_dim,)),
  ]
  for units in (256, 128, 64, 32):
    layers.append(keras.layers.Dense(units, activation='relu'))
    layers.append(keras.layers.Dropout(dropout_rate))
  # Single sigmoid unit: the output is one value in [0, 1] per sample.
  layers.append(keras.layers.Dense(1, activation='sigmoid'))

  model = keras.Sequential(layers)

  metrics = [
      keras.metrics.AUC(name='auc')
  ]

  model.compile(
      optimizer='adam',
      loss='binary_crossentropy',
      metrics=metrics)

  return model

In [29]:
%%time

model = make_model()

EPOCHS = 40
BATCH_SIZE = 2048

history = model.fit(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0,
    verbose=2)

y_hat = model.predict_proba(test_features)
print("test score = %.3f" % average_precision_score(test_labels, y_hat))

Train on 190820 samples
Epoch 1/40
190820/190820 - 1s - loss: 0.0778 - auc: 0.6813
Epoch 2/40
190820/190820 - 1s - loss: 0.0081 - auc: 0.9258
Epoch 3/40
190820/190820 - 1s - loss: 0.0060 - auc: 0.9376
Epoch 4/40
190820/190820 - 1s - loss: 0.0060 - auc: 0.9496
Epoch 5/40
190820/190820 - 1s - loss: 0.0049 - auc: 0.9518
Epoch 6/40
190820/190820 - 1s - loss: 0.0048 - auc: 0.9563
Epoch 7/40
190820/190820 - 1s - loss: 0.0041 - auc: 0.9625
Epoch 8/40
190820/190820 - 1s - loss: 0.0040 - auc: 0.9608
Epoch 9/40
190820/190820 - 1s - loss: 0.0034 - auc: 0.9685
Epoch 10/40
190820/190820 - 1s - loss: 0.0036 - auc: 0.9700
Epoch 11/40
190820/190820 - 1s - loss: 0.0032 - auc: 0.9730
Epoch 12/40
190820/190820 - 1s - loss: 0.0030 - auc: 0.9790
Epoch 13/40
190820/190820 - 1s - loss: 0.0033 - auc: 0.9788
Epoch 14/40
190820/190820 - 1s - loss: 0.0029 - auc: 0.9806
Epoch 15/40
190820/190820 - 1s - loss: 0.0028 - auc: 0.9790
Epoch 16/40
190820/190820 - 1s - loss: 0.0027 - auc: 0.9806
Epoch 17/40
190820/190820

In [30]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the
# model built by make_model().
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 512)               15872     
_________________________________________________________________
dense_36 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_21 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_37 (Dense)             (None, 128)               32896     
_________________________________________________________________
dropout_22 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_38 (Dense)             (None, 64)                8256      
_________________________________________________________________
dropout_23 (Dropout)         (None, 64)               

In [32]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [33]:
# Same network and training settings as the direct Keras run, but wrapped in a
# scikit-learn style estimator: make_model is called by the wrapper to build a
# fresh model when fit() runs.
clf = KerasClassifier(
    build_fn=make_model,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0,
)
clf.fit(train_features, train_labels)

# The wrapper's predict_proba yields one column per class; column 1 is taken
# as the score for the positive class (Class == 1).
y_hat = clf.predict_proba(test_features)[:, 1]
print("test score = %.3f" % average_precision_score(test_labels, y_hat))

Train on 190820 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
test score = 0.840
