In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from imblearn.over_sampling import SMOTE

## Oversampling technique using SMOTE

In [2]:
# Read the credit-card transactions CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [3]:
# Remove the 'Time' and 'Amount' columns in one call, then preview the frame.
data = data.drop(columns=['Time', 'Amount'])
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


In [4]:
# Features: every column except the 'Class' label.
X = data.drop(columns='Class')
# Response variable determining if fraudulent or not (kept as a one-column frame).
y = data[['Class']]

In [5]:
# Oversample the minority class with SMOTE. The sampler is seeded (same value
# as the KFold splitter used later) so the synthetic samples, and therefore
# every downstream score, are reproducible across kernel restarts.
# NOTE(review): this resamples the whole dataset before any train/validation
# split, so later validation folds will contain synthetic points.
X_resample, y_resample = SMOTE(random_state=42).fit_resample(X, y.values.ravel())

In [6]:
# Display the oversampled feature frame returned by SMOTE.
X_resample

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,-2.261857,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-1.232622,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568625,0.080336,1.753815,-4.585452,3.030099,-0.703105,-0.688791,-2.407306,1.003981,-0.683689,-4.805479,...,0.024933,0.321407,0.621130,0.672167,-0.194684,0.324781,0.400954,-0.296751,0.632128,0.282417
568626,-3.515275,1.706033,-5.034653,1.714435,-1.027144,-2.222518,-2.741867,1.164951,-1.666647,-4.968971,...,0.757983,0.225446,0.728270,0.318337,-0.434215,-0.346688,-0.269902,0.594888,0.182170,-0.059084
568627,-18.542693,8.838664,-18.109620,8.906077,-14.607899,-1.947970,-18.372043,5.037072,-9.223515,-14.969396,...,3.020126,-0.423917,0.312915,0.619494,-0.490193,0.717340,-0.216534,-0.028461,-3.095152,-1.027596
568628,-2.283455,1.659386,-3.068982,2.584341,1.419369,-2.034624,-2.968712,-2.221369,-1.608618,-3.460412,...,0.061699,0.375638,-0.920467,0.785759,-0.830480,-0.170213,0.086671,-0.305330,0.225444,0.314754


In [7]:
# Display the oversampled label array returned by SMOTE.
y_resample

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [8]:
# Number of rows after oversampling.
len(X_resample)

568630

In [9]:
# Append a trailing channel axis so each sample has shape (n_features, 1) for
# the Conv1D layers. np.asarray accepts both the DataFrame produced by SMOTE
# and an already-reshaped array, so re-running this cell is a no-op instead of
# failing on `.values` (which an ndarray does not have).
n_samples, n_features = X_resample.shape[:2]
X_resample = np.asarray(X_resample).reshape(n_samples, n_features, 1)

# CNN

In [10]:
# defining the CNN model
def cnn_model(input_shape=None):
    """Build and compile the 1D-CNN binary classifier.

    Architecture: two Conv1D layers (32 then 16 filters, kernel size 3, ReLU),
    a MaxPooling1D layer (pool size 2), Flatten, two Dense(24, ReLU) layers,
    Dropout(0.5) and a single sigmoid output unit. The model is compiled with
    binary cross-entropy loss, the Adam optimizer and accuracy as the metric.

    Parameters
    ----------
    input_shape : tuple, optional
        Shape of a single sample fed to the first Conv1D layer. When omitted,
        the shape is taken from the notebook-level ``X_resample`` array at call
        time, which is what the zero-argument call has always done.

    Returns
    -------
    tensorflow.keras.models.Sequential
        The compiled model.
    """
    if input_shape is None:
        # Fall back to the notebook global so existing zero-argument callers
        # (e.g. a KerasClassifier build function) keep working unchanged.
        input_shape = X_resample[0].shape

    model = Sequential([
        Conv1D(filters=32, kernel_size=3, activation="relu", input_shape=input_shape),
        Conv1D(filters=16, kernel_size=3, activation="relu"),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(24, activation="relu"),
        Dense(24, activation="relu"),
        Dropout(0.5),
        Dense(1, activation="sigmoid"),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

## Using Keras Classifier

In [11]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [12]:
# Wrap the CNN builder in a scikit-learn compatible estimator.
# NOTE(review): the tensorflow.keras.wrappers.scikit_learn module this class is
# imported from appears to be deprecated/removed in newer TensorFlow releases —
# confirm against the TensorFlow version this notebook is pinned to.
keras_model = KerasClassifier(
    build_fn=cnn_model,
    epochs=5,
    batch_size=16,
    verbose=0,
)

In [13]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

# Define the pipeline that wraps the model (the scaling step is currently disabled).
# This pipeline, rather than the bare model, is the input to cross_val_score.
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Pipeline steps: only the Keras model. A ('scaler', MinMaxScaler()) step is
# intentionally left out here (it was disabled in this notebook).
steps = [('model', keras_model)]
pipeline = Pipeline(steps=steps)

In [15]:
# Cross-validation splitter handed to cross_val_score: 10 shuffled folds
# with a fixed seed.
cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [16]:
# Evaluate the pipeline: one accuracy value per cross-validation fold.
# NOTE(review): X_resample / y_resample were oversampled with SMOTE before
# these folds are drawn, so each validation fold contains synthetic samples
# derived from the full dataset; the reported accuracy is likely optimistic.
scores = cross_val_score(
    pipeline,
    X_resample,
    y_resample,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

In [17]:
# Print the accuracy obtained on each fold.
for fold_accuracy in scores:
    print("Score for this split is: ", fold_accuracy)

# Report the mean accuracy across all folds.
mean_accuracy = scores.mean()
print('Accuracy: ', mean_accuracy)

Score for this split is:  0.9978193201202891
Score for this split is:  0.9990327629565798
Score for this split is:  0.9983469039621546
Score for this split is:  0.9989800045723933
Score for this split is:  0.9970103582294286
Score for this split is:  0.9990679352127042
Score for this split is:  0.9983644900902169
Score for this split is:  0.9991206935968908
Score for this split is:  0.9982941455779681
Score for this split is:  0.9988920739320823
Accuracy:  0.9984928688250708
