##### Import necessary libraries

In [1]:

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE
import numpy as np 
import pandas as pd
from sklearn.preprocessing import RobustScaler


**Tasks Performed:**
1. EDA
2. Scaling of data
3. Handle the imbalanced dataset
4. Train model with different techniques

In [2]:
## Load the credit card fraud dataset; the CSV can be downloaded from https://www.kaggle.com/mlg-ulb/creditcardfraud
path = "creditcard.csv"
dataset = pd.read_csv(filepath_or_buffer=path)
# Preview the first five rows
dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
## The target column holds two classes: 0 marks non-fraud and 1 marks fraud records
dataset.loc[:, 'Class'].unique()

In [None]:
## Report the absolute and relative size of the fraud and non-fraud classes
# Filter each class once and reuse the row counts for both the totals and the percentages.
n_total_rows = dataset.shape[0]
n_non_fraud_rows = len(dataset[dataset['Class'] == 0])
n_fraud_rows = len(dataset[dataset['Class'] == 1])

print('Total number of non-fraud data: {}'.format(n_non_fraud_rows))
print('Total number of fraud data: {}'.format(n_fraud_rows))

print('Percentage of non-fraud data: {}'.format(round((n_non_fraud_rows/n_total_rows) * 100, 2)))
print('Percentage of fraud data: {}'.format(round((n_fraud_rows/n_total_rows) * 100, 2)))

Conclusion: We can see that the data is heavily imbalanced with fraud data of just 0.17%

In [None]:
## Count missing values across every column of the dataset
# First sum gives per-column counts, second sum collapses them to one total.
dataset.isnull().sum().sum()

Conclusion: There are no null entries

In [None]:
## Count plot for the Class attribute
# The column is passed as the keyword `x`: seaborn >= 0.12 only accepts `data`
# positionally, so `countplot('Class', data=dataset)` collides with the `data` keyword.
sns.countplot(x='Class', data=dataset)
plt.title("Count plot for fraud data \n (0 = Non-Fraud, 1 = Fraud)")

In [None]:
## Distribution of amount
# `sns.distplot` is deprecated; `histplot` on a density scale with a KDE overlay
# draws the same kind of figure. `bins=50` mirrors the upper bin limit distplot applied.
sns.histplot(dataset['Amount'], bins=50, stat='density', kde=True)

In [None]:
fraud_dataset = dataset[dataset['Class'] == 1]

## Distribution of amount for fraud data
# The histogram and the box plot each get their own axes; drawing both with the
# default axes would paint one on top of the other in a single figure.
# `histplot` (density scale + KDE) replaces the deprecated `distplot`.
fig, (hist_ax, box_ax) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
sns.histplot(fraud_dataset['Amount'], bins=10, stat='density', kde=True, ax=hist_ax)
hist_ax.set_title("Amount distribution for fraud records")
sns.boxplot(x='Amount', data=fraud_dataset, ax=box_ax)
box_ax.set_title("Amount box plot for fraud records")
fig.tight_layout()

# Summary statistics of the fraud amounts (displayed as the cell output)
fraud_dataset['Amount'].describe()

Conclusion: From the above we can see that most fraud transactions have an amount of up to about 106

In [None]:
## Distribution of amount for normal (non-fraud) data records
non_fraud_dataset = dataset[dataset['Class'] == 0]
# `histplot` (density scale + KDE) replaces the deprecated `distplot`.
sns.histplot(non_fraud_dataset['Amount'], bins=10, stat='density', kde=True)
plt.title("Amount distribution for non-fraud records")

# Summary statistics of the non-fraud amounts (displayed as the cell output)
non_fraud_dataset['Amount'].describe()

**Conclusion: here we can see that most non-fraud amounts are less than 77, while the maximum is 35691**

In [None]:
## Distribution of time: overall, fraud records and non-fraud records
# Each distribution is drawn on its own titled axes; plotting all three with the
# default axes would overlay them in one unlabeled figure.
# `histplot` (density scale + KDE) replaces the deprecated `distplot`.
time_panels = [
    ("Overall distribution of time", dataset),
    ("Distribution of time for fraud records", fraud_dataset),
    ("Distribution of time for non-fraud records", non_fraud_dataset),
]
fig, axes = plt.subplots(len(time_panels), 1, figsize=(10, 12), sharex=True)
for ax, (panel_title, panel_data) in zip(axes, time_panels):
    sns.histplot(panel_data['Time'], bins=50, stat='density', kde=True, ax=ax)
    ax.set_title(panel_title)
fig.tight_layout()

In [None]:
## Time and Amount span wide ranges, so rescale both before modelling
robust_scaler = RobustScaler()

# The same scaler object is re-fitted for each column in turn (Amount first, then Time),
# so after this cell it holds the statistics of the Time column only.
for source_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    dataset[scaled_col] = robust_scaler.fit_transform(dataset[source_col].values.reshape(-1, 1))

# Remove the unscaled originals in place; re-running this cell alone will fail
# because 'Time' and 'Amount' no longer exist afterwards.
dataset.drop(columns=['Time', 'Amount'], inplace=True)

**Let us now handle the imbalanced dataset through the techniques below:**

Below are undersampling techniques:
1. Random undersampling
2. Imblearn random undersampling
3. Imblearn Tomek links undersampling

Below are oversampling techniques:
1. Random Oversampling
2. Imblearn Random oversampling
3. Imblearn SMOTE

In [None]:
## Count the records of each class and split the dataset by class
# Look the counts up by class label instead of unpacking `value_counts()`:
# unpacking depends on the frequency ordering of the result, so it would silently
# swap the two numbers if fraud ever became the more frequent class.
class_counts = dataset['Class'].value_counts()
non_fraud_count = int(class_counts.loc[0])
fraud_count = int(class_counts.loc[1])
print("Total number of non-fraud cases are {}, and fraud cases are {} ".format(non_fraud_count, fraud_count))

non_fraud = dataset[dataset['Class']==0]
fraud = dataset[dataset['Class']==1]

**Random Undersampling**

In [None]:
## Randomly keep as many non-fraud rows as there are fraud rows
non_fraud_undersample = non_fraud.sample(fraud_count)
## Create new dataset with fraud data and non-fraud undersampled data
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the rows the
# same way (fraud rows first, original index labels kept).
random_sampling_data = pd.concat([fraud, non_fraud_undersample])
x_random_undersample = random_sampling_data.drop('Class', axis=1)
y_random_undersample = random_sampling_data['Class']
sns.countplot(x='Class', data=random_sampling_data)

**Imblearn random undersampling**

In [None]:
## Undersample the majority class with imblearn's RandomUnderSampler
# Features are every column except the label; the label is the 'Class' column.
X, y = dataset.drop(columns=['Class']), dataset['Class']
# replacement=True: the retained majority rows are drawn with replacement.
under_sampler = RandomUnderSampler(replacement=True)
x_imb_random_undersample, y_imb_random_undersample = under_sampler.fit_resample(X, y)

**Random Oversampling**

In [None]:
## Draw fraud rows with replacement until they match the number of non-fraud rows
fraud_oversampling_data = fraud.sample(non_fraud_count, replace=True)
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the rows the
# same way (oversampled fraud rows first, original index labels kept).
oversampling_data = pd.concat([fraud_oversampling_data, non_fraud])
x_random_oversample = oversampling_data.drop(['Class'], axis=1)
y_random_oversample = oversampling_data['Class']

**Imblearn random oversampling**

In [None]:
## Oversample the minority class with imblearn's RandomOverSampler
# Features are every column except the label; the label is the 'Class' column.
X, y = dataset.drop(columns=['Class']), dataset['Class']
imb_random_oversampler = RandomOverSampler()
x_imb_random_oversample, y_imb_random_oversample = imb_random_oversampler.fit_resample(X, y)

**Imblearn SMOTE**

In [None]:
## Oversample with SMOTE, reusing the X / y defined in an earlier cell
smote = SMOTE()
x_imb_smote, y_imb_smote = smote.fit_resample(X=X, y=y)

In [None]:
## Different models to check performance
def createLogisticModel(x, y):
    """Fit a LogisticRegression with default settings on (x, y) and return the fitted model."""
    # scikit-learn's `fit` returns the estimator itself, so the call can be chained.
    return LogisticRegression().fit(x, y)

def createKnnModel(x, y):
    """Fit a KNeighborsClassifier using 5 neighbours on (x, y) and return the fitted model."""
    # scikit-learn's `fit` returns the estimator itself, so the call can be chained.
    return KNeighborsClassifier(n_neighbors=5).fit(x, y)

def createSvmModel(x, y):
    """Fit an SVC with default settings on (x, y) and return the fitted model."""
    # scikit-learn's `fit` returns the estimator itself, so the call can be chained.
    return SVC().fit(x, y)

def createSGDClassifier(x, y):
    """Fit an SGDClassifier with default settings on (x, y) and return the fitted model."""
    # scikit-learn's `fit` returns the estimator itself, so the call can be chained.
    return SGDClassifier().fit(x, y)

def createDecisionTreeClassifier(x, y):
    """Fit a DecisionTreeClassifier with default settings on (x, y) and return the fitted model."""
    # scikit-learn's `fit` returns the estimator itself, so the call can be chained.
    return DecisionTreeClassifier().fit(x, y)

def createRandomForestClassifier(x, y):
    """Fit a RandomForestClassifier with default settings on (x, y) and return the fitted model."""
    # scikit-learn's `fit` returns the estimator itself, so the call can be chained.
    return RandomForestClassifier().fit(x, y)

def createAdaBoostClassifier(x, y):
    """Fit an AdaBoostClassifier with default settings on (x, y) and return the fitted model."""
    # scikit-learn's `fit` returns the estimator itself, so the call can be chained.
    return AdaBoostClassifier().fit(x, y)

def createGradientBoostingClassifier(x, y):
    """Fit a GradientBoostingClassifier with default settings on (x, y) and return the fitted model."""
    # scikit-learn's `fit` returns the estimator itself, so the call can be chained.
    return GradientBoostingClassifier().fit(x, y)

def createMLPClassifier(x, y):
    """Fit an MLPClassifier on (x, y) and return the fitted model.

    The network is configured with hidden layer sizes (150, 100, 50), ReLU
    activation, the Adam solver, at most 150 iterations and a fixed
    random_state of 48.
    """
    mlp_settings = dict(
        random_state=48,
        hidden_layer_sizes=(150, 100, 50),
        max_iter=150,
        activation='relu',
        solver='adam',
    )
    # scikit-learn's `fit` returns the estimator itself, so the call can be chained.
    return MLPClassifier(**mlp_settings).fit(x, y)

In [None]:
## Different metrics to measure performance
def performanceMetrics(X_test, y_test, model, name):
    """Print evaluation metrics for a fitted model on a held-out set.

    Predicts on ``X_test`` once, then prints the confusion matrix, accuracy,
    precision, recall and F1 score (in that order) computed against ``y_test``.

    Parameters:
        X_test: features to predict on.
        y_test: true labels matching ``X_test``.
        model: fitted estimator exposing ``predict``.
        name: display name of the model, inserted into every printed line.

    Returns:
        None; the results are only printed.
    """
    predictions = model.predict(X_test)
    # Each entry pairs the message template with the metric that fills it in.
    report_lines = (
        ("The confusion matrix for the {} model is: {}", confusion_matrix),
        ("The accuracy score for the {} model is: {}", accuracy_score),
        ("The precision score for the {} model is: {}", precision_score),
        ("The recall score for the {} model is: {}", recall_score),
        ("The f1 score for the {} model is: {}", f1_score),
    )
    for template, metric in report_lines:
        print(template.format(name, metric(y_test, predictions)))
    

In [None]:
## Train model and check performance
def trainModelAndCheckPerformance(x, y, test_size=0.2, random_state=None):
    """Split (x, y), then train every classifier and print its metrics.

    The data is split once with ``train_test_split``; each model is fitted on
    the training part and reported on the test part via ``performanceMetrics``,
    followed by a separator line.

    Each model is trained exactly once. The Support Vector Machine used to be
    fitted and reported a second time at the end of the list; that repeated run
    is replaced by the Random Forest and SGD classifiers, whose builder
    functions exist in this notebook but were never evaluated.

    Parameters:
        x: feature matrix.
        y: labels matching ``x``.
        test_size: share of the data held out for evaluation (default 0.2).
        random_state: optional seed forwarded to ``train_test_split``; the
            default ``None`` keeps the split random on every call.

    Returns:
        None; the results are only printed.

    NOTE(review): callers pass data that was already resampled, so with
    oversampling the test part may contain copies of training rows and the
    printed scores may be optimistic -- confirm whether the split should
    happen before resampling.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)

    separator = "--------------------------------------------------------------------------------------"

    ## Create different models and see their performance
    model_builders = [
        ("Logistic Regression", createLogisticModel),
        ("Support Vector Machine", createSvmModel),
        ("Decision Tree Classifier", createDecisionTreeClassifier),
        ("Ada Boost classifier", createAdaBoostClassifier),
        ("Gradient Boosting classifier", createGradientBoostingClassifier),
        ("K nearest neighbour", createKnnModel),
        ("Multilayer perceptron", createMLPClassifier),
        ("Random Forest classifier", createRandomForestClassifier),
        ("SGD classifier", createSGDClassifier),
    ]
    for model_name, build_model in model_builders:
        model = build_model(X_train, y_train)
        performanceMetrics(X_test, y_test, model, model_name)
        print(separator)

In [None]:
## Train and evaluate every model on each resampled version of the dataset
# Each entry: (banner printed before the run, features, labels)
resampling_runs = [
    ("Random undersampling ----------------------------------- ",
     x_random_undersample, y_random_undersample),
    ("Imbalance undersampling ----------------------------------- ",
     x_imb_random_undersample, y_imb_random_undersample),
    ("Random Oversampling ----------------------------------- ",
     x_random_oversample, y_random_oversample),
    ("Imbalance randome oversampling ----------------------------------- ",
     x_imb_random_oversample, y_imb_random_oversample),
    ("Imbalance smote ----------------------------------- ",
     x_imb_smote, y_imb_smote),
]
for run_banner, run_features, run_labels in resampling_runs:
    print(run_banner)
    trainModelAndCheckPerformance(run_features, run_labels)