In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
df = pd.read_csv('creditcard.csv')

In [3]:
df.shape

(284807, 31)

In [4]:
#grouping the columns

X = df.iloc[:, :30]
y = df['Class']

In [5]:
#splitting the dataset into 70 30 portion

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0)  

In [6]:
#shape of the train and test dataset
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

Number transactions X_train dataset:  (199364, 30)
Number transactions y_train dataset:  (199364,)
Number transactions X_test dataset:  (85443, 30)
Number transactions y_test dataset:  (85443,)


In [7]:
#using standard scaler

X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

### USING SMOTE OVERSAMPLING TO BALANCE THE DATASET

In [8]:
#class label count after oversampling

print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

Before OverSampling, counts of label '1': 345
Before OverSampling, counts of label '0': 199019 



In [9]:
#importing the library

from imblearn.over_sampling import SMOTE

In [10]:
#smote oversampling operation

sm = SMOTE(random_state = 2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.values.ravel())

In [11]:
#shape of traings after oversampling
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {}'.format(y_train_res.shape))

After OverSampling, the shape of train_X: (398038, 30)
After OverSampling, the shape of train_y: (398038,)


In [12]:
#counts of class labels after over sampling

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

After OverSampling, counts of label '1': 199019
After OverSampling, counts of label '0': 199019


### Using Logistic Regression

In [13]:
#importing libraries

from sklearn.linear_model import LogisticRegression

In [14]:
#fitting and training the model

lr = LogisticRegression()
lr.fit(X_train_res, y_train_res)


LogisticRegression()

In [15]:
#predicting the model
pred_lr = lr.predict(X_test)

In [16]:
print(pred_lr)

[0 0 0 ... 0 0 0]


In [52]:
cm_lr = confusion_matrix(y_test, pred_lr)
print(cm_lr)

[[83273  2023]
 [   12   135]]


In [53]:
print(classification_report(y_test, pred_lr))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     85296
           1       0.06      0.92      0.12       147

    accuracy                           0.98     85443
   macro avg       0.53      0.95      0.55     85443
weighted avg       1.00      0.98      0.99     85443



### Random Forest



In [18]:
#importing libraries

from sklearn.ensemble import RandomForestClassifier

In [49]:
#fitting the model

rf_res = RandomForestClassifier(n_estimators = 35)
rf_res.fit(X_train_res, y_train_res.ravel())

RandomForestClassifier(n_estimators=35)

In [50]:
#predicting x_test
pred_rf = rf_res.predict(X_test)

In [51]:
#Confusion Matrix

cm_rf = confusion_matrix(y_test, pred_rf)
print(cm_rf)

[[85282    14]
 [   29   118]]


In [54]:
rf_res = RandomForestClassifier(n_estimators = 37)
rf_res.fit(X_train_res, y_train_res.ravel())
pred_rf = rf_res.predict(X_test)

cm_rf = confusion_matrix(y_test, pred_rf)
print(cm_rf)

print(classification_report(y_test, pred_rf))

[[85281    15]
 [   28   119]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.89      0.81      0.85       147

    accuracy                           1.00     85443
   macro avg       0.94      0.90      0.92     85443
weighted avg       1.00      1.00      1.00     85443



In [55]:
rf_res = RandomForestClassifier(n_estimators = 40)
rf_res.fit(X_train_res, y_train_res.ravel())
pred_rf = rf_res.predict(X_test)

cm_rf = confusion_matrix(y_test, pred_rf)
print(cm_rf)

print(classification_report(y_test, pred_rf))

[[85280    16]
 [   26   121]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.88      0.82      0.85       147

    accuracy                           1.00     85443
   macro avg       0.94      0.91      0.93     85443
weighted avg       1.00      1.00      1.00     85443



In [56]:
rf_res = RandomForestClassifier(n_estimators = 45)
rf_res.fit(X_train_res, y_train_res.ravel())
pred_rf = rf_res.predict(X_test)

cm_rf = confusion_matrix(y_test, pred_rf)
print(cm_rf)

print(classification_report(y_test, pred_rf))

[[85281    15]
 [   26   121]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.89      0.82      0.86       147

    accuracy                           1.00     85443
   macro avg       0.94      0.91      0.93     85443
weighted avg       1.00      1.00      1.00     85443



In [57]:
rf_res = RandomForestClassifier(n_estimators = 50)
rf_res.fit(X_train_res, y_train_res.ravel())
pred_rf = rf_res.predict(X_test)

cm_rf = confusion_matrix(y_test, pred_rf)
print(cm_rf)

print(classification_report(y_test, pred_rf))

[[85280    16]
 [   28   119]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.88      0.81      0.84       147

    accuracy                           1.00     85443
   macro avg       0.94      0.90      0.92     85443
weighted avg       1.00      1.00      1.00     85443



In [58]:
rf_res = RandomForestClassifier(n_estimators = 55)
rf_res.fit(X_train_res, y_train_res.ravel())
pred_rf = rf_res.predict(X_test)

cm_rf = confusion_matrix(y_test, pred_rf)
print(cm_rf)

print(classification_report(y_test, pred_rf))

[[85281    15]
 [   26   121]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.89      0.82      0.86       147

    accuracy                           1.00     85443
   macro avg       0.94      0.91      0.93     85443
weighted avg       1.00      1.00      1.00     85443



In [59]:
rf_res = RandomForestClassifier(n_estimators = 25)
rf_res.fit(X_train_res, y_train_res.ravel())
pred_rf = rf_res.predict(X_test)

cm_rf = confusion_matrix(y_test, pred_rf)
print(cm_rf)

print(classification_report(y_test, pred_rf))

[[85280    16]
 [   26   121]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.88      0.82      0.85       147

    accuracy                           1.00     85443
   macro avg       0.94      0.91      0.93     85443
weighted avg       1.00      1.00      1.00     85443



    n_estimators = 25, error > 30
    n_estimators = 10, error = 46
    n_estimators = 15, error = 45
    n_estimators = 30, error = 33
    n_estimators = 35, error = 43
    n_estimators = 40, error =
    n_estimators = 45, error = 
    n_estimators = 50, error =
    n_estimators = 55, error = 
    n_estimators = 60, error =

In [29]:
#classification report

print(classification_report(y_test, pred_rf))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.87      0.80      0.84       147

    accuracy                           1.00     85443
   macro avg       0.94      0.90      0.92     85443
weighted avg       1.00      1.00      1.00     85443



### Support Vector Machine Algorithm

In [60]:
#importing libraries
from sklearn.svm import SVC

In [61]:
#fitting the model and training

svc_res = SVC(kernel = 'linear', random_state = 0)
svc_res.fit(X_train_res, y_train_res)

SVC(kernel='linear', random_state=0)

In [62]:
#predicting the test set result

pred_svc = svc_res.predict(X_test)

In [63]:
#confusion matric

cm_svc = confusion_matrix(y_test, pred_svc)
print(cm_svc)

[[83517  1779]
 [   12   135]]


In [64]:
#classification report

print(classification_report(y_test, pred_svc))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     85296
           1       0.07      0.92      0.13       147

    accuracy                           0.98     85443
   macro avg       0.54      0.95      0.56     85443
weighted avg       1.00      0.98      0.99     85443

