In [1]:
# Necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.preprocessing import  PolynomialFeatures, KBinsDiscretizer, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, OrdinalEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, export_text
from sklearn.ensemble import BaggingClassifier, BaggingRegressor,RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor 
from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
# Reading and observing data
data = pd.read_csv(r"C:\Users\yigit\AppData\Local\Programs\Python\Python39\Scripts\datasets\credit card\creditcard.csv")
print(data.head())
print(data.shape)

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [3]:
# Number of Fraudulent - Normal Transactions
total_transactions = len(data)
normal = len(data[data.Class == 0])
fraudulent = len(data[data.Class == 1])
fraud_percentage = round(fraudulent/normal*100, 2)
print('Total number of Transactions are : ' + str(total_transactions))
print('Number of Normal Transactions are : ' + str(normal))
print('Number of fraudulent Transactions are : ' + str(fraudulent))
print('Percentage of fraud Transactions is : ' + str(fraud_percentage))

Total number of Transactions are : 284807
Number of Normal Transactions are : 284315
Number of fraudulent Transactions are : 492
Percentage of fraud Transactions is : 0.17


In [4]:

data.info()

min(data['Amount']),max(data['Amount'])


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

(0.0, 25691.16)

In [5]:
# Scaling 'Amount' column so the values will be similar type
sc = StandardScaler()
amount = data['Amount'].values
data['Amount'] = sc.fit_transform(amount.reshape(-1, 1))
# Dropping 'Time' column since it won't make huge effect
data.drop(['Time'], axis=1, inplace=True)
# Dropping duplicates if there are any
data.drop_duplicates(inplace=True)
data.shape

(275663, 30)

In [6]:
# Train - Test Split
X = data.drop('Class', axis = 1).values
y = data['Class'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)

In [7]:
# Decision Tree
DT = DecisionTreeClassifier(max_depth = 4, criterion = 'entropy')
DT.fit(X_train, y_train)
dt_yhat = DT.predict(X_test)

print('Accuracy score of the Decision Tree model is : ' + str(accuracy_score(y_test, dt_yhat)))
print('F1 score of the Decision Tree model is : ' + str(f1_score(y_test, dt_yhat)))
confusion_matrix(y_test, dt_yhat, labels = [0, 1])

Accuracy score of the Decision Tree model is : 0.9991583957281328
F1 score of the Decision Tree model is : 0.7521367521367521


array([[68770,    18],
       [   40,    88]], dtype=int64)

In [8]:
# KNN
n = 7
KNN = KNeighborsClassifier(n_neighbors = n)
KNN.fit(X_train, y_train)
knn_yhat = KNN.predict(X_test)

print('Accuracy score of the K-Nearest Neighbors model is : ' + str(accuracy_score(y_test, knn_yhat)))
print('F1 score of the K-Nearest Neighbors model is : ' + str(f1_score(y_test, knn_yhat)))


Accuracy score of the K-Nearest Neighbors model is : 0.999288989494457
F1 score of the K-Nearest Neighbors model is : 0.7949790794979079


In [9]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_yhat = lr.predict(X_test)

print('Accuracy score of the Logistic Regression model is : ' + str(accuracy_score(y_test, lr_yhat)))
print('F1 score of the Logistic Regression model is : ' + str(f1_score(y_test, lr_yhat)))

Accuracy score of the Logistic Regression model is : 0.9989552498694062
F1 score of the Logistic Regression model is : 0.6666666666666666


In [10]:
# SVM
svm = SVC()
svm.fit(X_train, y_train)
svm_yhat = svm.predict(X_test)

print('Accuracy score of the Support Vector Machines model is : ' + str(accuracy_score(y_test, svm_yhat)))
print('F1 score of the Support Vector Machines model is : ' + str(f1_score(y_test, svm_yhat)))

Accuracy score of the Support Vector Machines model is : 0.999318010331418
F1 score of the Support Vector Machines model is : 0.7813953488372093


In [11]:
# Random Forest
rf = RandomForestClassifier(max_depth = 4)
rf.fit(X_train, y_train)
rf_yhat = rf.predict(X_test)

print('Accuracy score of the Random Forest model is : ' + str(accuracy_score(y_test, rf_yhat)))
print('F1 score of the Random Forest model is : ' + str(f1_score(y_test, rf_yhat)))

Accuracy score of the Random Forest model is : 0.9991583957281328
F1 score of the Random Forest model is : 0.7339449541284404


In [12]:
# XGBoost
xgb = XGBClassifier(max_depth = 4)
xgb.fit(X_train, y_train)
xgb_yhat = xgb.predict(X_test)

print('Accuracy score of the XGBoost model is : ' + str(accuracy_score(y_test, xgb_yhat)))
print('F1 score of the XGBoost model is : ' + str(f1_score(y_test, xgb_yhat)))



Accuracy score of the XGBoost model is : 0.999506645771664
F1 score of the XGBoost model is : 0.8495575221238937
