In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import datetime
import math
import seaborn as sns
sns.set()
%matplotlib inline
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', lambda x: '%.3f' % x) #avoid scientific notation

from matplotlib import pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Dropout, GRU
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score
from IPython.display import Image

In [2]:
dataset = pd.read_csv("/content/card_transdata.csv")
dataset

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.878,0.311,1.946,1.000,1.000,0.000,0.000,0.000
1,10.830,0.176,1.294,1.000,0.000,0.000,0.000,0.000
2,5.091,0.805,0.428,1.000,0.000,0.000,1.000,0.000
3,2.248,5.600,0.363,1.000,1.000,0.000,1.000,0.000
4,44.191,0.566,2.223,1.000,1.000,0.000,1.000,0.000
...,...,...,...,...,...,...,...,...
999995,2.207,0.113,1.627,1.000,1.000,0.000,0.000,0.000
999996,19.873,2.684,2.778,1.000,1.000,0.000,0.000,0.000
999997,2.915,1.473,0.218,1.000,1.000,0.000,1.000,0.000
999998,4.259,0.242,0.476,1.000,0.000,0.000,1.000,0.000


In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   distance_from_home              1000000 non-null  float64
 1   distance_from_last_transaction  1000000 non-null  float64
 2   ratio_to_median_purchase_price  1000000 non-null  float64
 3   repeat_retailer                 1000000 non-null  float64
 4   used_chip                       1000000 non-null  float64
 5   used_pin_number                 1000000 non-null  float64
 6   online_order                    1000000 non-null  float64
 7   fraud                           1000000 non-null  float64
dtypes: float64(8)
memory usage: 61.0 MB


In [4]:
dataset.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,26.629,5.037,1.824,0.882,0.35,0.101,0.651,0.087
std,65.391,25.843,2.8,0.323,0.477,0.301,0.477,0.282
min,0.005,0.0,0.004,0.0,0.0,0.0,0.0,0.0
25%,3.878,0.297,0.476,1.0,0.0,0.0,0.0,0.0
50%,9.968,0.999,0.998,1.0,0.0,0.0,1.0,0.0
75%,25.744,3.356,2.096,1.0,1.0,0.0,1.0,0.0
max,10632.724,11851.105,267.803,1.0,1.0,1.0,1.0,1.0


In [None]:
dataset.isna().sum()

In [None]:
# dataset.dropna(thresh=2)

In [6]:
dataset['fraud'] = dataset['fraud'].astype(int)

In [7]:
for col in dataset.columns[3:]:
    print(f'Value counts in {col}:\n{dataset[col].value_counts()}')

Value counts in repeat_retailer:
1.000    881536
0.000    118464
Name: repeat_retailer, dtype: int64
Value counts in used_chip:
0.000    649601
1.000    350399
Name: used_chip, dtype: int64
Value counts in used_pin_number:
0.000    899392
1.000    100608
Name: used_pin_number, dtype: int64
Value counts in online_order:
1.000    650552
0.000    349448
Name: online_order, dtype: int64
Value counts in fraud:
0    912597
1     87403
Name: fraud, dtype: int64


In [8]:
corr = dataset.corr()
corr

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
distance_from_home,1.0,0.0,-0.001,0.143,-0.001,-0.002,-0.001,0.188
distance_from_last_transaction,0.0,1.0,0.001,-0.001,0.002,-0.001,0.0,0.092
ratio_to_median_purchase_price,-0.001,0.001,1.0,0.001,0.001,0.001,-0.0,0.462
repeat_retailer,0.143,-0.001,0.001,1.0,-0.001,-0.0,-0.001,-0.001
used_chip,-0.001,0.002,0.001,-0.001,1.0,-0.001,-0.0,-0.061
used_pin_number,-0.002,-0.001,0.001,-0.0,-0.001,1.0,-0.0,-0.1
online_order,-0.001,0.0,-0.0,-0.001,-0.0,-0.0,1.0,0.192
fraud,0.188,0.092,0.462,-0.001,-0.061,-0.1,0.192,1.0


In [9]:
corr.style.background_gradient(cmap='Greens')

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
distance_from_home,1.0,0.000193,-0.001374,0.143124,-0.000697,-0.001622,-0.001301,0.187571
distance_from_last_transaction,0.000193,1.0,0.001013,-0.000928,0.002055,-0.000899,0.000141,0.091917
ratio_to_median_purchase_price,-0.001374,0.001013,1.0,0.001374,0.000587,0.000942,-0.00033,0.462305
repeat_retailer,0.143124,-0.000928,0.001374,1.0,-0.001345,-0.000417,-0.000532,-0.001357
used_chip,-0.000697,0.002055,0.000587,-0.001345,1.0,-0.001393,-0.000219,-0.060975
used_pin_number,-0.001622,-0.000899,0.000942,-0.000417,-0.001393,1.0,-0.000291,-0.100293
online_order,-0.001301,0.000141,-0.00033,-0.000532,-0.000219,-0.000291,1.0,0.191973
fraud,0.187571,0.091917,0.462305,-0.001357,-0.060975,-0.100293,0.191973,1.0


Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
y = dataset['fraud']
X = dataset.drop('fraud', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, stratify=y)
X_train, validation_data, y_train, validation_targets = train_test_split(X_train, y_train, test_size=.2)
X_train.shape, X_test.shape

((560000, 7), (300000, 7))

In [12]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [13]:
# List of continuous features
conts = list(dataset.columns[:3])
# List of categorical (0/1) features
cats = list(dataset.columns[3:-1])

transformations = [('continuous', StandardScaler(), conts), ('categorical', OrdinalEncoder(), cats)]
pipeline = ColumnTransformer(transformations)
pipeline

ColumnTransformer(transformers=[('continuous', StandardScaler(),
                                 ['distance_from_home',
                                  'distance_from_last_transaction',
                                  'ratio_to_median_purchase_price']),
                                ('categorical', OrdinalEncoder(),
                                 ['repeat_retailer', 'used_chip',
                                  'used_pin_number', 'online_order'])])

In [14]:
train_transformed = pipeline.fit_transform(X_train)
validation_transformed = pipeline.fit_transform(validation_data)
test_transformed = pipeline.fit_transform(X_test)

In [17]:
# Building the RNN

n_features = train_transformed.shape[1]

# activation settings # softsign , softplus, relu 
activation1 = 'relu'    # Hidden Layers

activation2 = 'sigmoid' # Input and Output Layer 

# dropout for the hidden layers
dropout = 0.2

# number of hidden layers
n_layers = 4 

# number of neurons of the hidden layers
n_neurons = [64, 64, 64, 32]

# optimizer settings with parameters
optimizer= tf.keras.optimizers.Adam(learning_rate=0.013, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

# Initialising the RNN
model = Sequential()

# Input layer
model.add(Dense(units=16,
                  input_shape=(n_features,),
                  activation= activation2, # activation function used for the RNN (softsign, relu, sigmoid)
                  use_bias=True,
                  kernel_initializer="glorot_uniform",
                  bias_initializer="zeros")) 

# Hidden layers
for i in range(n_layers):
    model.add(Dropout(dropout))
    model.add(Dense(units=n_neurons[i],
                      activation=activation1, # activation function used for the RNN (softsign, relu, sigmoid)
                      use_bias=True, 
                      kernel_initializer="glorot_uniform",  
                      bias_initializer="zeros"))
    
# Adding the output layer
model.add(Dense(units=1, activation=activation2))

# Compiling the RNN
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.BinaryAccuracy(name='binary_accuracy')]) 

# Model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 16)                128       
                                                                 
 dropout_4 (Dropout)         (None, 16)                0         
                                                                 
 dense_7 (Dense)             (None, 64)                1088      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_8 (Dense)             (None, 64)                4160      
                                                                 
 dropout_6 (Dropout)         (None, 64)                0         
                                                                 
 dense_9 (Dense)             (None, 64)               

In [18]:
results = model.fit(train_transformed, 
                    y_train, 
                    batch_size=64, 
                    epochs=100, 
                    validation_data=(validation_transformed, validation_targets))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [19]:
val_pred = np.around(model.predict(validation_transformed))

In [20]:
results = model.evaluate(validation_transformed, validation_targets, batch_size=128)
print("test loss, test acc:", results)

test loss, test acc: [0.018046921119093895, 0.995114266872406]


In [21]:
from sklearn.metrics import roc_auc_score, classification_report

roc_auc_score(validation_targets, val_pred) # AUC score for Dev Data --> 0,5: Worst < 1: Best

0.9765687924023082

In [22]:
print(classification_report(validation_targets, val_pred)) # Classification Report for Dev Data

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    127561
           1       0.99      0.95      0.97     12439

    accuracy                           1.00    140000
   macro avg       0.99      0.98      0.98    140000
weighted avg       1.00      1.00      1.00    140000



In [23]:
results = model.evaluate(test_transformed, y_test, batch_size=128)
print("test loss, test acc:", results)

test loss, test acc: [0.021375052630901337, 0.9959033131599426]


In [24]:
test_preds = np.around(model.predict(test_transformed))

In [25]:
roc_auc_score(y_test, test_preds) # AUC score for Test Data

0.9801510037676414

In [26]:
print(classification_report(y_test, test_preds)) # Classification Report for Test Data

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    273779
           1       0.99      0.96      0.98     26221

    accuracy                           1.00    300000
   macro avg       0.99      0.98      0.99    300000
weighted avg       1.00      1.00      1.00    300000

