In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import regularizers
import tensorboard
from pylab import rcParams

In [None]:
# Load the transactions CSV from the working directory and display the resulting frame.
data = pd.read_csv("creditcard.csv")
data

In [None]:
# Print the number of non-null entries in each column.
print(data.count())

In [None]:
# Configuring Modules
%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale = 1.5)
rcParams['figure.figsize'] = 14, 8
Random_Seed = 42

In [None]:
# Rename the label column from "Class" to "Fraud"; later cells refer to it by the new name.
data = data.rename(columns={"Class": "Fraud"})

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Summary statistics for every numeric column.
data.describe()

In [None]:
# Checking Data For Null Values: per-column count of missing entries.
data.isnull().sum()

In [None]:
# Same per-column missing-value count; pandas documents isnull() as an alias of isna().
data.isna().sum()

In [None]:
# Checking Correlations
corr = data.corr()

# Mask the upper triangle (diagonal included) so each pair of columns is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# figure size
f, ax = plt.subplots(figsize=(20, 10))

# draw the heatmap with the mask and correct aspect ratio, on the axes created above
sns.heatmap(corr, mask=mask, cmap='CMRmap_r', vmax=.3, center=0, square=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Transaction Class Distribution Plot
labels = ['Valid', 'Fraud']
plt.figure(figsize = (10,10))
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
# sort_index() pins the bar order to class 0 then class 1, so the bars always
# line up with `labels` regardless of which class is more frequent.
count_classes = data['Fraud'].value_counts().sort_index()
count_classes.plot(kind = 'bar', rot=0, color = 'red', alpha = 0.5)
plt.xticks(range(2), labels)
plt.title("Transaction Class Distribution Graph")
plt.xlabel("Class")
plt.ylabel("Number of Observations")
plt.grid()
plt.show()

In [None]:
# Total number of transactions (rows).
len(data)

In [None]:
# Number of rows labelled as fraud (Fraud == 1).
len_fraud = data[data["Fraud"] == 1].shape[0]
len_fraud

In [None]:
# Valid transactions are all rows that are not fraud. The count is derived from
# the data instead of hard-coding this dataset's row totals.
len_valid = len(data) - len_fraud
len_valid

In [None]:
# Fraud transactions as a percentage of all rows.
perc_fraud = 100 * len_fraud / len(data)
perc_fraud

As we can see, there are 284,807 records in total: 284,315 valid transactions and only 492 fraud transactions, which is just 0.17 percent of the data. The dataset is therefore highly imbalanced, and we will have to handle that.

In [None]:
# Build one frame per class so the statistics already checked on the whole set
# can be examined separately for fraud and valid transactions.
Fraud_df = data.loc[data["Fraud"] == 1]
Valid_df = data.loc[data["Fraud"] == 0]

In [None]:
# Summary statistics of the Amount column for fraud transactions only.
Fraud_df.Amount.describe()

In [None]:
# Summary statistics of the Amount column for valid transactions only.
Valid_df.Amount.describe()

In [None]:
# Amount per transaction by class: two stacked histograms sharing the x-axis.
bins = 50
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Amount Per Transaction Graph')

# Top panel: fraud amounts.
ax1.set_title('Fraud Transactions')
ax1.hist(Fraud_df.Amount, bins=bins, color='blue')

# Bottom panel: valid amounts.
ax2.set_title('Valid Transactions')
ax2.hist(Valid_df.Amount, bins=bins, color='red', alpha=0.5)

# Axis labels and the log y-scale are set on the bottom panel only; the
# x-limits reach the top panel as well because the x-axis is shared.
ax2.set_xlabel('Amount ($)')
ax2.set_ylabel('Number of Transactions')
ax2.set_xlim((0, 20000))
ax2.set_yscale('log')
plt.show();

In [None]:
# Time of transaction vs Amount by class: two stacked scatter plots sharing the x-axis.
f, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
f.suptitle('Time of Transaction VS Amount of Transaction Graph')

# Top panel: fraud transactions.
ax1.set_title('Fraud Transactions')
ax1.scatter(Fraud_df.Time, Fraud_df.Amount)

# Bottom panel: valid transactions.
ax2.set_title('Valid Transactions')
ax2.scatter(Valid_df.Time, Valid_df.Amount)

# Axis labels are attached to the bottom panel only.
ax2.set_xlabel('Time (In Seconds)')
ax2.set_ylabel('Amount')
plt.show()

In [None]:
# Normalizing Data
# We Need To Normalize Two Features: Time And Amount
from sklearn.preprocessing import StandardScaler
# Work on a copy. A plain `df_norm = data` only creates a second name for the
# same frame, so the scaling below would silently overwrite the raw Time and
# Amount columns of `data` as well.
df_norm = data.copy()
# Each column gets its own scaler, fitted on that column alone.
df_norm['Time'] = StandardScaler().fit_transform(df_norm['Time'].values.reshape(-1, 1))
df_norm['Amount'] = StandardScaler().fit_transform(df_norm['Amount'].values.reshape(-1, 1))

## Time to split the data into training and test sets

In [None]:
# Training Data: the first 199365 rows of df_norm, taken by position in file order.
train_data = df_norm.iloc[:199365,:]
train_data.shape

In [None]:
# Testing Data: every row from position 199365 onward, i.e. the remainder of df_norm.
test_data = df_norm.iloc[199365:,:]
test_data.shape

In [None]:
# Shuffling Training And Testing Data
# random_state makes the shuffle repeatable. That also fixes which rows end up
# at the tail of the frame, which is where Keras takes its validation_split from.
train_data = train_data.sample(frac=1, random_state=Random_Seed)
train_data.head()

In [None]:
# Shuffle the test rows with a fixed seed so that reruns see the same order.
test_data = test_data.sample(frac = 1, random_state=Random_Seed)
test_data.head()

In [None]:
# Dropping Labels From Training And Testing Data
# pop() removes the 'Fraud' column from each frame in place and returns it, so
# train_data / test_data hold features only afterwards. Re-running this cell
# raises KeyError because the column is already gone.
train_labels = train_data.pop('Fraud')
test_labels = test_data.pop('Fraud')

In [None]:
# Baseline classifier: three ReLU hidden layers (10, 8, 6 units) feeding a single
# sigmoid output. The input width is the number of feature columns in train_data.
model = Sequential([
    Dense(10, activation='relu', input_shape=[len(train_data.keys())]),
    Dense(8, activation='relu'),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Import the callbacks from the Keras bundled with TensorFlow, matching the
# tensorflow.keras layers used for the model. Mixing in the standalone `keras`
# package can pair incompatible versions.
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
# Writes the model to disk, keeping only the best epoch seen so far.
checkpointer = ModelCheckpoint(filepath = 'CreditCardFraudDetectionModel.h5', verbose = 0, save_best_only = True)
# Logs training to ./logs for TensorBoard.
# NOTE: this name rebinds the `tensorboard` module imported in the first cell.
tensorboard = TensorBoard(log_dir = './logs', histogram_freq = 0, write_graph = True, write_images = True)

In [None]:
# Fitting Base Model On 100 Epochs
# The checkpoint and TensorBoard callbacks are passed in here; if they are left
# out they are never invoked, so no checkpoint file or log is written.
# `.history` keeps only the per-epoch metrics dict from the returned History object.
history = model.fit(train_data, train_labels, epochs = 100,
                    batch_size = 32, shuffle=True, validation_split=0.2,
                    callbacks = [checkpointer, tensorboard],
                    verbose = 1).history

In [None]:
# Storing History In A Data Frame (one row per epoch, one column per metric)
# and showing the last few epochs.
history_df = pd.DataFrame(history)
history_df.tail()

In [None]:
# Plotting Loss and Accuracy: one line per recorded metric, labelled for the legend.
for column, label in [
    ('loss', 'Training Loss'),
    ('accuracy', 'Training Accuracy'),
    ('val_loss', 'Validation Loss'),
    ('val_accuracy', 'Validation Accuracy'),
]:
    plt.plot(history_df[column], label=label)
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper right');

In [None]:
# Making Predictions On Test Data
# The model's output is flattened into a 1-D array of scores.
predictions = model.predict(test_data).flatten()
predictions

In [None]:
# Testing Model On Testing Data
# The result is indexed below as [0] = loss and [1] = accuracy.
testing = model.evaluate(test_data, test_labels)

In [None]:
# Report the two values returned by model.evaluate on the test data.
print(f"Test Loss: {testing[0]}")
print(f"Test Accuracy: {testing[1]}")

In [None]:
# Plotting Confusion Matrix
from sklearn.metrics import confusion_matrix, classification_report
# Round the sigmoid scores to the nearest integer to obtain hard 0/1 predictions.
cm = confusion_matrix(test_labels, predictions.round())
sns.heatmap(cm, annot=True, fmt='.0f', cmap='cividis_r')
plt.title("Confusion Matrix")
# confusion_matrix puts the true classes on the rows and the predictions on the columns.
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

## Unbalanced Data
As we can see, the model appears biased toward the majority class: it is most probably unable to discriminate between valid and fraud transactions, because the number of fraud transactions is very low compared with the number of valid transactions, so the model can treat them as normal transactions. Let's overcome this problem.

## Building Second Model
There are many techniques to solve this problem, but here I am using the SMOTE algorithm. SMOTE (Synthetic Minority Over-sampling Technique) is one of the most commonly used oversampling methods for the class-imbalance problem. It aims to balance the class distribution by increasing the number of minority-class examples. Rather than simply replicating existing examples, SMOTE synthesises new minority instances between existing minority instances.

In [None]:
# Preview the first rows of `data` before preparing the inputs for the second model.
data.head()

In [None]:
# Standardizing The Amount Feature
# The scaled amount is stored in a new NormAmount column, then the Time and
# Amount columns are dropped. The drop rebinds `data`, so running this cell a
# second time fails because 'Amount' no longer exists.
data['NormAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data = data.drop(['Time','Amount'],axis=1)

In [None]:
# Separating Features And Labels
# The label column was renamed from "Class" to "Fraud" near the top of the
# notebook. Selecting on "Class" matches nothing, which leaves the label inside
# X and makes y empty.
X = data.drop(columns='Fraud')
# y is kept as a one-column DataFrame.
y = data[['Fraud']]

In [None]:
# Importing SMOTE
from imblearn.over_sampling import SMOTE
# Seed NumPy's global random generator — presumably so the SMOTE resampling is
# repeatable, since SMOTE is constructed without a random_state; TODO confirm.
np.random.seed(9)

In [None]:
# Counting The Number Of Fraud And Valid Transactions
# The label column is named "Fraud" after the rename near the top of the
# notebook, so `data.Class` no longer exists.
all_records= len(data)
number_records_fraud = len(data[data.Fraud == 1])
print(f"No. Of Transactions: {all_records}, Fraud Transactions: {number_records_fraud}")

In [None]:
# Applying SMOTE
# fit_resample is the current imbalanced-learn method; the older fit_sample
# alias has been removed from the library.
# NOTE(review): the resampling runs before the train/test split, so a synthetic
# sample and the real rows it was interpolated from can land on opposite sides
# of the split. Consider resampling the training portion only.
X_resample, y_resample = SMOTE().fit_resample(X, y.values.ravel())

In [None]:
# Transforming Into Pandas DataFrame
# Both resampled outputs are wrapped so the split below works on DataFrames.
y_resample = pd.DataFrame(y_resample)
X_resample = pd.DataFrame(X_resample)

In [None]:
# Splitting The Dataset Into Training And Testing Data
# train_test_split is not imported in any of the cells above, so it is imported here.
from sklearn.model_selection import train_test_split
# 70 % train / 30 % test, with a fixed random_state so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X_resample, y_resample, test_size=0.3, random_state=0)

In [None]:
# Making Second Model
# Four ReLU hidden layers (16, 18, 20, 24 units) with a 25 % dropout after the
# second one, ending in a single sigmoid unit. Every Dense layer uses the
# 'uniform' kernel initializer; the input is fixed at 29 features.
model2 = Sequential([
    Dense(16, kernel_initializer='uniform', activation='relu', input_shape=(29,)),
    Dense(18, kernel_initializer='uniform', activation='relu'),
    Dropout(0.25),
    Dense(20, kernel_initializer='uniform', activation='relu'),
    Dense(24, kernel_initializer='uniform', activation='relu'),
    Dense(1, kernel_initializer='uniform', activation='sigmoid'),
])
model2.summary()

In [None]:
# Compilation Step
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fitting Second Model On 5 Epochs, feeding the training frames as NumPy arrays.
model2.fit(
    x=np.array(X_train),
    y=np.array(Y_train),
    epochs=5,
    batch_size=15,
)

In [None]:
# Testing Second Model On Test Data
# score[1] is printed below as a percentage; the model was compiled with
# metrics=['accuracy'].
score = model2.evaluate(np.array(X_test), np.array(Y_test), batch_size=128)
print('\nTesting Score Is: ', score[1] * 100, '%')

In [None]:
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report 

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix on the current matplotlib figure.

    Each cell is coloured by its value and annotated with that value.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix. Rows are labelled as true classes and columns as
        predicted classes.
    classes : sequence
        Tick labels for both axes, one per class.
    normalize : bool, default False
        If True, every row is divided by its sum before plotting, so the
        colours and the cell text show per-class fractions.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the cells.

    Returns
    -------
    None
    """
    cm = np.asarray(cm)
    if normalize:
        # Normalise BEFORE drawing, so the image, the colour threshold and the
        # cell text all describe the same numbers.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so it stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Normalised fractions are shortened to two decimals; raw values are
        # written unchanged.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    # Set the labels first so tight_layout() can reserve room for them.
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()

In [None]:
# Score the held-out split of the resampled data with the second model.
y_pred = model2.predict(np.array(X_test))

In [None]:
# Make sure Y_test is a DataFrame and show its (rows, columns) shape.
Y_test = pd.DataFrame(Y_test)
Y_test.shape

In [None]:
# Plotting Confusion Matrix
np.set_printoptions(precision=2)
# Scores are rounded to hard 0/1 predictions before being compared with the labels.
cnf_matrix = confusion_matrix(Y_test, y_pred.round())

# Recall = true positives over all actual positives (row 1 of the binary matrix).
print("Recall metric in the testing dataset: ", cnf_matrix[1, 1] / cnf_matrix[1].sum())

# Plot non-normalized confusion matrix
class_names = [0, 1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()

In [None]:
# Score every row of X (the feature frame built before resampling) with the second model.
y_pred = model2.predict(np.array(X))

In [None]:
# Plotting Confusion Matrix For The Full (Un-Resampled) Dataset

# Scores are rounded to hard 0/1 predictions before being compared with the labels.
cnf_matrix = confusion_matrix(y,y_pred.round())
np.set_printoptions(precision=2)

# These predictions were made on all of X, not on the held-out split, so the
# message names the full dataset.
print("Recall metric in the full dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()