## Credit Card Fraud Detection
Anonymized credit card transactions labeled as fraudulent or genuine

Note that most of the features in the dataset had already been transformed using PCA.

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('creditcard.csv')
df.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


### Data exploration

In [2]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.16598e-15,3.416908e-16,-1.37315e-15,2.086869e-15,9.604066e-16,1.490107e-15,-5.556467e-16,1.177556e-16,-2.406455e-15,...,1.656562e-16,-3.44485e-16,2.578648e-16,4.471968e-15,5.340915e-16,1.687098e-15,-3.666453e-16,-1.220404e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [6]:
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

This dataset is highly skewed considering less than 1% of the transactions are fraudulent. Hence, training and test data must be carefully sampled

In [7]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

### Data Engineering
Resampling skewed data

About 99.8% of the dataset are class 'false' while only 0.2% are 'true'. We will now remove some of false class to balance the dataset

In [206]:
from random import sample
from sklearn.utils import shuffle

def resample(df):
    indices_0 = df.index[df['Class'] == 0].tolist()
    indices_1 = df.index[df['Class'] == 1].tolist()
    indices_0 =  sample(indices_0, len(indices_1)*1) # select integer multiples of 
    df_0 = df.iloc[ indices_0]
    df_1 = df.iloc[ indices_1]
    df_resampled = pd.concat([df_0, df_1], ignore_index=True)
    return shuffle(df_resampled, random_state=0)

In [207]:
df_2 = resample(df)
df_2

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
231,36522.0,1.220480,-0.006770,-0.176154,0.975853,0.404602,0.869978,-0.192340,0.323839,0.407235,...,-0.162225,-0.344752,-0.218724,-1.385056,0.751952,-0.238833,0.022613,-0.012459,6.10,0
522,12093.0,-4.696795,2.693867,-4.475133,5.467685,-1.556758,-1.549420,-4.104215,0.553934,-1.498468,...,0.573898,-0.080163,0.318408,-0.245862,0.338238,0.032271,-1.508458,0.608075,0.00,1
27,48303.0,-0.727817,0.660857,1.939541,1.344661,0.441792,0.583337,0.215672,0.092201,-0.062299,...,0.020355,0.567745,-0.371446,-0.003447,0.009221,-0.182928,-0.054808,-0.106563,22.00,0
630,41851.0,-19.139733,9.286847,-20.134992,7.818673,-15.652208,-1.668348,-21.340478,0.641900,-8.550110,...,-2.182692,0.520543,-0.760556,0.662767,-0.948454,0.121796,-3.381843,-1.256524,139.90,1
930,150949.0,-2.423535,1.659093,-3.071421,2.588033,1.135791,-1.892388,-2.588418,-2.226592,-1.670173,...,-0.934127,0.922038,-0.180255,-0.281719,0.299285,-0.263801,0.150156,0.292112,9.29,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,102619.0,-2.488363,4.359019,-7.776410,5.364027,-1.823877,-2.445140,-4.964221,1.484890,-2.947899,...,1.325672,1.021226,-0.266476,-0.370880,0.365535,0.081372,0.184983,-0.211582,1.00,1
192,137351.0,0.062807,0.359092,1.539570,2.274677,0.316283,0.756500,-1.084735,-0.531611,-0.088392,...,0.222471,-1.836462,-1.320091,-0.457059,0.881731,-0.433342,0.232919,0.200724,59.33,0
629,41791.0,-7.222731,6.155773,-10.826460,4.180779,-6.123555,-3.114136,-6.895112,5.161516,-2.516477,...,0.912700,-0.630358,0.190887,-0.061263,0.379775,-0.266845,1.193695,0.257468,99.99,1
559,26931.0,-22.561699,13.208904,-24.643819,6.232532,-16.905611,-4.497439,-16.810184,14.955107,-3.871297,...,1.765987,-1.635517,-0.998317,0.138972,1.559350,-0.222125,1.504425,0.445920,99.99,1


### Modelling

In [211]:
def metric(y_true,y_pred):
    
    indices_tp = np.where(y_true==1)
    predict_1 = y_pred[indices_tp]
    tp = sum(predict_1==1)
    fn = sum(predict_1==0)
    
    indices_tn = np.where(y_true==0)
    predict_0 = y_pred[indices_tn]
    tn = sum(predict_0==0)
    fp = sum(predict_0==1)
    
    print('Summary:')
    print('-------')
    print('%3.2f percent of frauds predicted correctly'%(tp/len(predict_1)*100))
    print('%3.2f percent of frauds misclassified as valid'%(fn/len(predict_1)*100))
    print('%3.2f percent of valids predicted correctly'%(tn/len(predict_0)*100))
    print('%3.2f percent of valids misclassified as fraud'%(fp/len(predict_0)*100))
    print('Acc. -- %3.2f percent'%((tp+tn)/(tp+tn+fn+fp)*100))
    print('F1 score -- %2.2f'%(2*tp/(2*tp+fn+fp)))

#### 1. Let's train without resampling (using umbalanced data)

In [214]:
from sklearn.model_selection import train_test_split

X = df.drop(['Time','Class'],axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2)
sum(y_test==1) # check the proportion of fraudulent cases in test data

134

In [215]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error

scaler = MinMaxScaler()
lr = LogisticRegression()
xgc = XGBClassifier() # n_estimators=70, learning_rate=0.008, gamma=0, max_depth=6)
mlp = MLPClassifier(alpha=1, max_iter=500)
gnb = GaussianNB()
rfc =  RandomForestClassifier(max_depth=5, n_estimators=2)

pipeline = Pipeline(steps=[('Scale',scaler),('Classifier',xgc)])
model = pipeline.fit(X_train,y_train)

In [217]:
y_pred = model.predict(X_test)
sum(y_pred==1)

117

In [218]:
metric(y_test,y_pred)

Summary:
-------
81.34 percent of frauds predicted correctly
18.66 percent of frauds misclassified as valid
99.99 percent of valids predicted correctly
0.01 percent of valids misclassified as fraud
Acc. -- 99.96 percent
F1 score -- 0.87


#### 2. Let's train with resampled(balanced data)

In [220]:
X2 = df_2.drop(['Time','Class'],axis=1)
y2 = df_2['Class']
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=2)
y2.value_counts()

1    492
0    492
Name: Class, dtype: int64

In [221]:
pipeline2 = Pipeline(steps=[('Scale',scaler),('Classifier',xgc)])
model2 = pipeline2.fit(X_train2,y_train2)

In [222]:
y_pred2 = model2.predict(X_test2)
metric(y_test2,y_pred2)

Summary:
-------
93.53 percent of frauds predicted correctly
6.47 percent of frauds misclassified as valid
92.99 percent of valids predicted correctly
7.01 percent of valids misclassified as fraud
Acc. -- 93.24 percent
F1 score -- 0.93


There's much better performance in fraud detection!

### DNN Model

In [203]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
seed = 2
np.random.seed(seed)

dnn_model = Sequential()

dnn_model.add(Dense(20, input_shape=(X_train.shape[1],), activation='relu'))
#dnn_model.add(Dropout(0.3))
dnn_model.add(Dense(20, activation = 'relu'))
dnn_model.add(Dense(1, activation='sigmoid'))

dnn_model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001),
                 metrics=['accuracy'])

In [204]:
sc = MinMaxScaler().fit(X_train)
scaled_X = sc.transform(X_train)
sc_Xt = sc.transform(X_test)

dnn_model.fit(scaled_X,y_train, batch_size=10, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1b400d12470>

In [205]:
y_pd = dnn_model.predict(sc_Xt)
y_pd = np.rint(y_pd)
metric(y_test,y_pd)

Summary:
-------
86.62 percent of frauds predicted correctly
13.38 percent of frauds misclassified as valid
98.34 percent of valids predicted correctly
1.66 percent of valids misclassified as fraud
Acc. -- 94.58 percent
F1 score -- 0.91
