In [26]:
# IMPORTING PACKAGES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [27]:
#IMPORTING DATA

# Read the transactions CSV (path is relative to the notebook's working
# directory) into the frame every later cell operates on.
csv_path = 'creditcard.csv'
df = pd.read_csv(csv_path)

In [28]:
# Plain-text preview of the first five rows of the freshly loaded frame.
print(df.head(n=5))

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...         V21       V22       V23       V24  \
0  0.098698  0.363787  ...   -0.018307  0.277838 -0.110474  0.066928   
1  0.085102 -0.255425  ...   -0.225775 -0.638672  0.101288 -0.339846   
2  0.247676 -1.514654  ...    0.247998  0.771679  0.909412 -0.689281   
3  0.377436 -1.387024  ...   -0.108300  0.005274 -0.190321 -1.175575   
4 -0.270533  0.817739  ...   -0.009431  0.798278 -0.137458  0.141267   

        V25       V26       V27       V28  Amount  Class  
0  0.128539 -0.189115

In [29]:
#DROP TIME COLUMN

# Rebind `df` to a copy without 'Time' instead of mutating in place, and use
# errors='ignore' so re-running this cell after the column is already gone
# does not raise KeyError.
df = df.drop(columns='Time', errors='ignore')

In [30]:
# Plain-text preview of the first five rows, to confirm 'Time' is gone.
print(df.head(n=5))

         V1        V2        V3        V4        V5        V6        V7  \
0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9       V10  ...         V21       V22       V23  \
0  0.098698  0.363787  0.090794  ...   -0.018307  0.277838 -0.110474   
1  0.085102 -0.255425 -0.166974  ...   -0.225775 -0.638672  0.101288   
2  0.247676 -1.514654  0.207643  ...    0.247998  0.771679  0.909412   
3  0.377436 -1.387024 -0.054952  ...   -0.108300  0.005274 -0.190321   
4 -0.270533  0.817739  0.753074  ...   -0.009431  0.798278 -0.137458   

        V24       V25       V26       V27       V28  Amount  Class  
0  0.066928  0.128539 -0.189115  0.133558 -0.02

In [31]:
#CHECK FOR MISSING DATA

# Reduce per column first, then across columns: the result is True when at
# least one cell anywhere in the frame is null.
df.isna().any(axis=0).any()

False

In [32]:
#FRAUD CASES AND NON-FRAUD CASES

# Count rows per class by summing boolean masks; this avoids materialising
# two filtered copies of the frame just to take their length.
cases = len(df)
fraud_count = int((df.Class == 1).sum())
non_fraud_count = int((df.Class == 0).sum())

# Percentage of fraud among ALL transactions. The denominator is the total
# number of cases; dividing by non_fraud_count would give the fraud-to-legit
# ratio instead. An empty frame yields 0.0 rather than ZeroDivisionError.
fraud_percentage = round((fraud_count / cases) * 100, 2) if cases else 0.0

#PRINT

print('Total number of cases are {}'.format(cases))
print('Number of fraud cases are {}'.format(fraud_count))
print('Total number Non-fraud cases are {}'.format(non_fraud_count))
print('Percentage of fraud cases are {}'.format(fraud_percentage))

Total number of cases are 284807
Number of fraud cases are 492
Total number Non-fraud cases are 284315
Percentage of fraud cases are 0.17


In [33]:
#STATISTICAL VIEW OF FRAUD CASES TRANSACTION AMOUNT

# Keep only the rows labelled Class == 1.
fraud_cases = df.loc[df['Class'] == 1]

# Summary statistics (count, mean, std, quartiles, min/max) of their amounts.
print('FRAUD CASES AMOUNT STATS')
print(fraud_cases['Amount'].describe())

FRAUD CASES AMOUNT STATS
count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64


In [34]:
#STATISTICAL VIEW OF NON-FRAUD CASES TRANSACTION AMOUNT

# Keep only the rows labelled Class == 0.
non_fraud_cases = df.loc[df['Class'] == 0]

# Summary statistics (count, mean, std, quartiles, min/max) of their amounts.
print('NON-FRAUD CASES AMOUNT STATS')
print(non_fraud_cases['Amount'].describe())

NON-FRAUD CASES AMOUNT STATS
count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64


In [46]:
#CHECK FOR SKEWNESS AS AMOUNT VARY ENORMOUSLY

# Sample skewness of the Amount column, shown as the cell's output.
df.Amount.skew()

16.977724453761024

In [47]:
#NORMALIZE AMOUNT

# NOTE(review): the scaler is fitted on the whole dataset here, before the
# train/test split further down, so test-set statistics leak into the scaled
# feature. Consider fitting on the training portion only.
amount = df['Amount'].to_numpy()

# StandardScaler expects a 2-D input, so the 1-D values are passed as a
# single-column matrix; the standardised result replaces the raw column.
amount_scaler = StandardScaler()
df['Amount'] = amount_scaler.fit_transform(amount[:, np.newaxis])

In [48]:
# Show the first 15 standardised amounts.
print(df.Amount.head(n=15))

0     0.244964
1    -0.342475
2     1.160686
3     0.140534
4    -0.073403
5    -0.338556
6    -0.333279
7    -0.190107
8     0.019392
9    -0.338516
10   -0.322044
11   -0.313289
12    0.132538
13   -0.243282
14   -0.118142
Name: Amount, dtype: float64


In [49]:
#DATA SPLITTING

# Target vector is the Class column; the feature matrix is every other column.
y = df['Class'].to_numpy()
X = df.drop(columns='Class').to_numpy()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified although the fraud class is very
# rare (see the class counts above) — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [50]:
# Peek at each partition: one feature row from X_train/X_test and the first
# ten labels from y_train/y_test.
for label, sample in (
    ('X_train samples: ', X_train[:1]),
    ('X_test samples: ', X_test[:1]),
    ('y_train samples: ', y_train[:10]),
    ('y_test samples: ', y_test[:10]),
):
    print(label, sample)

X_train samples:  [[-1.11504743  1.03558276  0.80071244 -1.06039825  0.03262117  0.85342216
  -0.61424348 -3.23116112  1.53994798 -0.81690879 -1.30559201  0.1081772
  -0.85960958 -0.07193421  0.90665563 -1.72092961  0.79785322 -0.0067594
   1.95677806 -0.64489556  3.02038533 -0.53961798  0.03315649 -0.77494577
   0.10586781 -0.43085348  0.22973694 -0.0705913  -0.30145418]]
X_test samples:  [[-0.32333357  1.05745525 -0.04834115 -0.60720431  1.25982115 -0.09176072
   1.1591015  -0.12433461 -0.17463954 -1.64440065 -1.11886302  0.20264731
   1.14596495 -1.80235956 -0.24717793 -0.06094535  0.84660574  0.37945439
   0.84726224  0.18640942 -0.20709827 -0.43389027 -0.26161328 -0.04665061
   0.2115123   0.00829721  0.10849443  0.16113917 -0.19330595]]
y_train samples:  [0 0 0 0 0 0 0 0 0 0]
y_test samples:  [0 0 0 0 0 0 0 0 0 0]


In [51]:
#BUILDING LOGISTIC REGRESSION MODEL

# Fit a logistic regression with the library's default settings on the
# training partition (fit() returns the estimator itself), then predict
# class labels for the held-out rows.
logistic_regression = LogisticRegression().fit(X_train, y_train)
logistic_regression_res = logistic_regression.predict(X_test)



In [52]:
#BUILDING K-NEAREST NEIGHBORS MODEL

# Number of neighbours consulted for each prediction.
n = 5

# Fit on the training partition (fit() returns the estimator itself), then
# predict class labels for the held-out rows.
knn = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
knn_res = knn.predict(X_test)

In [53]:
#ACCURACY OF LOGISTIC REGRESSION MODEL

# Fraction of held-out rows whose predicted label matches the true label.
logistic_regression_accuracy = accuracy_score(y_test, logistic_regression_res)
print('Accuracy of logistic regression model is ', logistic_regression_accuracy)

Accuracy of logistic regression model is  0.9991924440855307


In [54]:
#ACCURACY OF K-NEAREST NEIGHBORS MODEL

# Fraction of held-out rows whose predicted label matches the true label.
knn_accuracy = accuracy_score(y_test, knn_res)
print('Accuracy of K-Nearest Neighbors model is ', knn_accuracy)

Accuracy of K-Nearest Neighbors model is  0.9995259997893332


In [55]:
#F1 SCORE OF LOGISTIC REGRESSION MODEL

# F1 score of the logistic regression predictions on the held-out rows.
logistic_regression_f1 = f1_score(y_test, logistic_regression_res)
print('F1 score of logistic regression model is', logistic_regression_f1)

F1 score of logistic regression model is 0.7356321839080459


In [56]:
#F1 SCORE OF K-NEAREST NEIGHBORS MODEL

# F1 score of the K-Nearest Neighbors predictions on the held-out rows.
knn_f1 = f1_score(y_test, knn_res)
print('F1 score of K-Nearest Neighbors model is', knn_f1)

F1 score of K-Nearest Neighbors model is 0.8571428571428572
