In [2]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
import tensorflow as tf
import matplotlib.pyplot as plt 
import io
import keras.backend as K
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils import class_weight


# Import data

In [3]:
# Store the input data in a dataframe
fraud_data = pd.read_csv("creditcard.csv")


# Preprocess data

In [4]:
# shuffle the dataframe so that the inputs are in a random order
df = fraud_data.sample(frac=1).reset_index(drop=True)
#
# Scale the time -1 and 1, since the rest of the features are scaled
robust_scaler = RobustScaler()
scaled_amount = robust_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
scaled_time = robust_scaler.fit_transform(df['Time'].values.reshape(-1,1))
df.drop(['Time','Amount'], axis=1, inplace=True)

# Insert into beginning of df
df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)

# Show the result
df.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,1.546846,0.873553,-1.798448,-0.830291,2.185209,-1.820608,-0.854633,0.905317,-1.245491,-1.233351,...,-1.194357,0.891557,-1.315785,-0.237302,0.646676,0.134546,1.252547,0.345407,-0.046461,0
1,2.400335,-0.058412,0.973212,-0.874979,-0.448158,-1.121518,-0.847891,-1.471452,0.391356,-0.351814,...,0.209336,0.058744,-0.062798,-0.320673,0.4484,0.713861,0.078804,-0.056332,0.032475,0
2,-0.028645,0.875856,0.082126,0.655715,-2.612569,-1.396482,3.139501,2.977854,0.344041,0.93719,...,-0.215205,-0.127424,-0.341526,0.30108,0.608449,-0.746785,0.185163,-0.375947,-0.286098,0
3,-0.064976,-0.15765,1.211461,-0.098931,0.371881,0.056012,-0.714667,-0.940265,-0.145232,-0.057228,...,-0.106237,0.078102,0.144624,-0.033397,0.600429,0.284776,1.042134,-0.097524,-0.006914,0
4,0.237546,-0.865623,1.259835,-0.196615,0.467977,0.162764,-0.575144,-0.386814,-0.518503,-0.07623,...,-0.097335,-0.12377,-0.21212,-0.122714,-0.475957,0.262548,1.085564,-0.104039,-0.001713,0


In [5]:
# Split the data into training (80%) and testing (20%)
train, test = train_test_split(df, test_size=0.2)

# Split data into features and labels
train_features = np.array(train.values[:,:30])
train_labels = np.array(train.values[:,-1])
test_features = np.array(test.values[:,:30])
test_labels = np.array(test.values[:,-1])

print(train_features.shape)
print(test_features.shape)


(227845, 30)
(56962, 30)


# Set how much we care about each mis-classification data

In [6]:
# Set How much we value each mis-classification
class_weights = class_weight.compute_class_weight("balanced", np.unique(train_labels), train_labels)
class_weights[1] *=2
        

# Create the model

In [7]:
model = Sequential()

model.add(Dense(units=50, kernel_initializer='uniform', input_dim=train_features.shape[1], activation='relu'))
model.add(Dropout(.2))
model.add(Dense(units=25, kernel_initializer='uniform', activation='relu'))
model.add(Dropout(.2))
model.add(Dense(15, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))

# Including the custom loss fxn
model.compile(optimizer='adam', metrics=['accuracy'], loss="binary_crossentropy")


# Train the model

In [28]:
model.fit(train_features, train_labels, batch_size=50, epochs=25, verbose=1, class_weight=class_weights)



Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


accuracy= 0.9994557499885559
Predicted      0   1
Actual              
0.0        56869   9
1.0           22  62
True Negative: 0.9983673326077034
True Positive: 0.7380952380952381
False Negative: 0.0003862223938766195
False Positive: 0.10714285714285714


In [None]:

# Get model accuracy 
scores = model.evaluate(test_features, test_labels)
print('\n')
print('accuracy=',scores[1])

# Get predictions from model
output = model.predict_classes(test_features)

# Show confusion matrix
y_actu = pd.Series(test_labels, name='Actual')
y_pred = pd.Series(np.ndarray.flatten(output), name='Predicted')
df_confusion = pd.crosstab(y_actu, y_pred)
print(df_confusion)


TN = df_confusion[0][0]
TP = df_confusion[1][1]
FN = df_confusion[0][1]
FP = df_confusion[1][0]


num_positives =  (np.count_nonzero(y_actu))
num_negatives = y_actu.size - num_positives
print("Num Fraud: {}".format(num_positives))
print("True Negative: {}".format(TN/num_negatives))
print("True Positive: {}".format(TP/num_positives))
print("False Negative: {}".format(FN/(TP + FN)))
print("False Positive: {}".format(FP/(FP + TN)))