In [6]:
import csv
import numpy as np

# Get the real data from https://www.kaggle.com/mlg-ulb/creditcardfraud/
fname = "/content/drive/MyDrive/PIAIC/credit_card/creditcard.csv"

all_features = []
all_targets = []
with open(fname) as f:
    for i, line in enumerate(f):
        if i == 0:
            print("HEADER:", line.strip())
            continue  # Skip header
        fields = line.strip().split(",")
        all_features.append([float(v.replace('"', "")) for v in fields[:-1]])
        all_targets.append([int(fields[-1].replace('"', ""))])
        if i == 1:
            print("EXAMPLE FEATURES:", all_features[-1])

features = np.array(all_features, dtype="float32")
targets = np.array(all_targets, dtype="uint8")
print("features.shape:", features.shape)
print("targets.shape:", targets.shape)

HEADER: "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class"
EXAMPLE FEATURES: [0.0, -1.3598071336738, -0.0727811733098497, 2.53634673796914, 1.37815522427443, -0.338320769942518, 0.462387777762292, 0.239598554061257, 0.0986979012610507, 0.363786969611213, 0.0907941719789316, -0.551599533260813, -0.617800855762348, -0.991389847235408, -0.311169353699879, 1.46817697209427, -0.470400525259478, 0.207971241929242, 0.0257905801985591, 0.403992960255733, 0.251412098239705, -0.018306777944153, 0.277837575558899, -0.110473910188767, 0.0669280749146731, 0.128539358273528, -0.189114843888824, 0.133558376740387, -0.0210530534538215, 149.62]
features.shape: (284807, 30)
targets.shape: (284807, 1)


In [7]:
# from google.colab import drive
# drive.mount('/content/drive')

In [8]:
num_val_samples = int(len(features) * 0.2)
train_features = features[:-num_val_samples]
train_targets = targets[:-num_val_samples]
val_features = features[-num_val_samples:]
val_targets = targets[-num_val_samples:]

print("Number of training samples:", len(train_features))
print("Number of validation samples:", len(val_features))

Number of training samples: 227846
Number of validation samples: 56961


In [9]:
counts = np.bincount(train_targets[:, 0])
print(
    "Number of positive samples in training data: {} ({:.2f}% of total)".format(
        counts[1], 100 * float(counts[1]) / len(train_targets)
    )
)

weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]

Number of positive samples in training data: 417 (0.18% of total)


In [10]:
weight_for_0

4.396976638863118e-06

In [11]:
weight_for_1

0.002398081534772182

In [12]:
mean = np.mean(train_features, axis=0)
train_features -= mean
val_features -= mean
std = np.std(train_features, axis=0)
train_features /= std
val_features /= std 

In [20]:
from tensorflow import keras

model = keras.Sequential(
    [
        keras.layers.Dense(
            256, activation="relu", input_shape=(train_features.shape[-1],)
        ),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 256)               7936      
_________________________________________________________________
dense_5 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 257       
Total params: 139,777
Trainable params: 139,777
Non-trainable params: 0
________________________________________________

In [21]:
%tensorflow_version 2.x
import tensorflow as tf

In [22]:
with tf.device('/device:GPU:0'):
  metrics = [
      keras.metrics.FalseNegatives(name="fn"),
      keras.metrics.FalsePositives(name="fp"),
      keras.metrics.TrueNegatives(name="tn"),
      keras.metrics.TruePositives(name="tp"),
      keras.metrics.Precision(name="precision"),
      keras.metrics.Recall(name="recall"),
  ]

  model.compile(
      optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=metrics
  )

  callbacks = [keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")]
  class_weight = {0: weight_for_0, 1: weight_for_1}

  model.fit(
      train_features,
      train_targets,
      batch_size=2048,
      epochs=100,
      verbose=2,
      callbacks=callbacks,
      validation_data=(val_features, val_targets),
      class_weight=class_weight,
  )

Epoch 1/100
112/112 - 2s - loss: 2.2255e-06 - fn: 51.0000 - fp: 19344.0000 - tn: 208085.0000 - tp: 366.0000 - precision: 0.0186 - recall: 0.8777 - val_loss: 0.3441 - val_fn: 2.0000 - val_fp: 7111.0000 - val_tn: 49775.0000 - val_tp: 73.0000 - val_precision: 0.0102 - val_recall: 0.9733
Epoch 2/100
112/112 - 1s - loss: 1.3727e-06 - fn: 29.0000 - fp: 9266.0000 - tn: 218163.0000 - tp: 388.0000 - precision: 0.0402 - recall: 0.9305 - val_loss: 0.0362 - val_fn: 13.0000 - val_fp: 271.0000 - val_tn: 56615.0000 - val_tp: 62.0000 - val_precision: 0.1862 - val_recall: 0.8267
Epoch 3/100
112/112 - 1s - loss: 1.0529e-06 - fn: 26.0000 - fp: 5882.0000 - tn: 221547.0000 - tp: 391.0000 - precision: 0.0623 - recall: 0.9376 - val_loss: 0.0440 - val_fn: 10.0000 - val_fp: 385.0000 - val_tn: 56501.0000 - val_tp: 65.0000 - val_precision: 0.1444 - val_recall: 0.8667
Epoch 4/100
112/112 - 1s - loss: 1.1788e-06 - fn: 26.0000 - fp: 8442.0000 - tn: 218987.0000 - tp: 391.0000 - precision: 0.0443 - recall: 0.9376 - v

In [23]:
F_score = (0.3088 + 0.8400)/ 2

In [24]:
F_score

0.5744

Conclusions
At the end of training, out of 56,961 validation transactions, we are:

Correctly identifying 63 of them as fraudulent
Missing 12 fraudulent transactions
At the cost of incorrectly flagging 141 legitimate transactions
In the real world, one would put an even higher weight on class 1, so as to reflect that False Negatives are more costly than False Positives.

Next time your credit card gets declined in an online purchase -- this is why