In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the two credit-card CSV files from the local ./dataset directory.
data = pd.read_csv(filepath_or_buffer='./dataset/creditcard.csv')
new_data = pd.read_csv(filepath_or_buffer='./dataset/new_creditcard.csv')

In [3]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# One instance per scaling strategy. `sc` is created but not used in this cell.
sc, rb, mn = StandardScaler(), RobustScaler(), MinMaxScaler()

# Rescale 'Amount' with the robust scaler and 'Time' with the min-max scaler.
# Each scaler is re-fitted on every frame in turn, so after the loop `rb` and
# `mn` hold the statistics of `new_data` only.
# NOTE(review): the scalers are fitted on the complete frames, i.e. before any
# train/test split and before duplicates are removed - confirm this is intended.
for frame in (data, new_data):
    frame['Amount'] = rb.fit_transform(frame['Amount'].values.reshape(-1, 1))
    frame['Time'] = mn.fit_transform(frame['Time'].values.reshape(-1, 1))

# Drop exact duplicate rows, mutating both frames in place.
for frame in (data, new_data):
    frame.drop_duplicates(inplace=True)

### Feature Selection

In [4]:
# Columns retained for each dataset. 'Class' is kept as the last column in
# both lists; the modelling cells slice features/target by position.
data_sf_columns = ['V16', 'V14', 'V12', 'V9', 'V8', 'V4', 'V11', 'V10', 'Class']
new_sf_columns = ['V18', 'V16', 'V14', 'V12', 'V10', 'V5', 'V4', 'V3', 'V2', 'V17', 'V11', 'Class']

data_sf = data[data_sf_columns]
new_sf = new_data[new_sf_columns]

In [6]:
# Show the (rows, columns) shape of each reduced frame, data_sf first.
for selected_frame in (data_sf, new_sf):
    print(selected_frame.shape)

(283726, 9)
(567549, 12)


# data_sf 모델 비교

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Features: every column except the last. Target: the last column.
X, Y = data_sf.iloc[:, :-1], data_sf.iloc[:, -1]

In [21]:
# Hold out 40% of the rows for testing, stratified on the target, fixed seed.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.4,
    stratify=Y,
    random_state=2,
)

In [22]:
def mymodel(model):
    """Fit ``model`` on the current training split and print its evaluation.

    The function reads the notebook-level names ``xtrain``, ``xtest``,
    ``ytrain`` and ``ytest`` when it is called, so it evaluates against
    whichever split those names are bound to at that moment.

    Printed, in order: F1 score, test accuracy, confusion matrix,
    classification report, then training and testing accuracy.

    Parameters
    ----------
    model : estimator
        Object providing ``fit(X, y)``, ``predict(X)`` and ``score(X, y)``.

    Returns
    -------
    estimator
        The same ``model`` object, fitted on ``xtrain``/``ytrain``.
    """
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)

    tr = model.score(xtrain, ytrain)
    # Reuse the predictions computed above rather than calling
    # model.score(xtest, ytest), which would predict on the test set a second
    # time. For scikit-learn classifiers score() is the mean accuracy, so the
    # value is the same.
    te = accuracy_score(ytest, ypred)

    # Label the report with the class name. Slicing str(model)[:-2] is only
    # clean when the repr ends in "()" and truncates any estimator that was
    # constructed with parameters.
    name = type(model).__name__

    print('F1 score', f1_score(ytest, ypred))
    print(name, 'Accuracy: ', te,
          '\nConfusion Matrix: \n', confusion_matrix(ytest, ypred))
    print("Classification report: \n", classification_report(ytest, ypred))
    print(f'Training Accuracy: {tr}\nTesting Accuracy: {te}')

    return model

In [23]:
# k-nearest-neighbours with default settings, evaluated on the current split.
knn = mymodel(model=KNeighborsClassifier())

F1 score 0.8106508875739645
KNeighborsClassifier Accuracy:  0.9994360786317857 
Confusion Matrix: 
 [[113290     12]
 [    52    137]]
Classification report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    113302
           1       0.92      0.72      0.81       189

    accuracy                           1.00    113491
   macro avg       0.96      0.86      0.91    113491
weighted avg       1.00      1.00      1.00    113491

Training Accuracy: 0.9996475460392986
Testing Accuracy: 0.9994360786317857


In [25]:
# Decision tree with default settings, evaluated on the current split.
# NOTE(review): no random_state is passed, so the fitted tree (and the scores
# printed below) may differ between runs - consider seeding.
dt = mymodel(model=DecisionTreeClassifier())

F1 score 0.7075208913649025
DecisionTreeClassifier Accuracy:  0.9990748165052735 
Confusion Matrix: 
 [[113259     43]
 [    62    127]]
Classification report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    113302
           1       0.75      0.67      0.71       189

    accuracy                           1.00    113491
   macro avg       0.87      0.84      0.85    113491
weighted avg       1.00      1.00      1.00    113491

Training Accuracy: 1.0
Testing Accuracy: 0.9990748165052735


In [26]:
# Gaussian naive Bayes with default settings, evaluated on the current split.
gnb = mymodel(model=GaussianNB())

F1 score 0.2107466852756455
GaussianNB Accuracy:  0.9900344520710893 
Confusion Matrix: 
 [[112209   1093]
 [    38    151]]
Classification report: 
               precision    recall  f1-score   support

           0       1.00      0.99      0.99    113302
           1       0.12      0.80      0.21       189

    accuracy                           0.99    113491
   macro avg       0.56      0.89      0.60    113491
weighted avg       1.00      0.99      0.99    113491

Training Accuracy: 0.9899080682585837
Testing Accuracy: 0.9900344520710893


In [27]:
# Logistic regression with default settings, evaluated on the current split.
lr = mymodel(model=LogisticRegression())

F1 score 0.689873417721519
LogisticRegression Accuracy:  0.999136495404922 
Confusion Matrix: 
 [[113284     18]
 [    80    109]]
Classification report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    113302
           1       0.86      0.58      0.69       189

    accuracy                           1.00    113491
   macro avg       0.93      0.79      0.84    113491
weighted avg       1.00      1.00      1.00    113491

Training Accuracy: 0.9991541104943167
Testing Accuracy: 0.999136495404922


# new_sf 모델 비교

In [28]:
# Rebind X/Y to the second dataset: features are every column except the
# last, the target is the last column.
X, Y = new_sf.iloc[:, :-1], new_sf.iloc[:, -1]

In [29]:
# Hold out 40% of the rows for testing, stratified on the target, fixed seed.
# This rebinds the split names that mymodel() reads.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.4,
    stratify=Y,
    random_state=2,
)

In [30]:
def mymodel(model):
    """Fit ``model`` on the current training split and print its evaluation.

    The function reads the notebook-level names ``xtrain``, ``xtest``,
    ``ytrain`` and ``ytest`` when it is called, so it evaluates against
    whichever split those names are bound to at that moment.

    Printed, in order: F1 score, test accuracy, confusion matrix,
    classification report, then training and testing accuracy.

    Parameters
    ----------
    model : estimator
        Object providing ``fit(X, y)``, ``predict(X)`` and ``score(X, y)``.

    Returns
    -------
    estimator
        The same ``model`` object, fitted on ``xtrain``/``ytrain``.
    """
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)

    tr = model.score(xtrain, ytrain)
    # Reuse the predictions computed above rather than calling
    # model.score(xtest, ytest), which would predict on the test set a second
    # time. For scikit-learn classifiers score() is the mean accuracy, so the
    # value is the same.
    te = accuracy_score(ytest, ypred)

    # Label the report with the class name. Slicing str(model)[:-2] is only
    # clean when the repr ends in "()" and truncates any estimator that was
    # constructed with parameters.
    name = type(model).__name__

    print('F1 score', f1_score(ytest, ypred))
    print(name, 'Accuracy: ', te,
          '\nConfusion Matrix: \n', confusion_matrix(ytest, ypred))
    print("Classification report: \n", classification_report(ytest, ypred))
    print(f'Training Accuracy: {tr}\nTesting Accuracy: {te}')

    return model

In [31]:
# k-nearest-neighbours with default settings, evaluated on the new_sf split.
knn = mymodel(model=KNeighborsClassifier())

F1 score 0.9996393765612357
KNeighborsClassifier Accuracy:  0.9996387983437582 
Confusion Matrix: 
 [[113287     14]
 [    68 113651]]
Classification report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    113301
           1       1.00      1.00      1.00    113719

    accuracy                           1.00    227020
   macro avg       1.00      1.00      1.00    227020
weighted avg       1.00      1.00      1.00    227020

Training Accuracy: 0.9996828463948738
Testing Accuracy: 0.9996387983437582


In [32]:
# Decision tree with default settings, evaluated on the new_sf split.
# NOTE(review): no random_state is passed, so the fitted tree (and the scores
# printed below) may differ between runs - consider seeding.
dt = mymodel(model=DecisionTreeClassifier())

F1 score 0.9989800942533587
DecisionTreeClassifier Accuracy:  0.9989780636067307 
Confusion Matrix: 
 [[113168    133]
 [    99 113620]]
Classification report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    113301
           1       1.00      1.00      1.00    113719

    accuracy                           1.00    227020
   macro avg       1.00      1.00      1.00    227020
weighted avg       1.00      1.00      1.00    227020

Training Accuracy: 1.0
Testing Accuracy: 0.9989780636067307


In [33]:
# Gaussian naive Bayes with default settings, evaluated on the new_sf split.
gnb = mymodel(model=GaussianNB())

F1 score 0.9926572022281593
GaussianNB Accuracy:  0.9925909611487974 
Confusion Matrix: 
 [[111645   1656]
 [    26 113693]]
Classification report: 
               precision    recall  f1-score   support

           0       1.00      0.99      0.99    113301
           1       0.99      1.00      0.99    113719

    accuracy                           0.99    227020
   macro avg       0.99      0.99      0.99    227020
weighted avg       0.99      0.99      0.99    227020

Training Accuracy: 0.9924118063366115
Testing Accuracy: 0.9925909611487974


In [34]:
# Logistic regression with default settings, evaluated on the new_sf split.
lr = mymodel(model=LogisticRegression())

F1 score 0.9994150374947771
LogisticRegression Accuracy:  0.9994141485331689 
Confusion Matrix: 
 [[113271     30]
 [   103 113616]]
Classification report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    113301
           1       1.00      1.00      1.00    113719

    accuracy                           1.00    227020
   macro avg       1.00      1.00      1.00    227020
weighted avg       1.00      1.00      1.00    227020

Training Accuracy: 0.9993979954717513
Testing Accuracy: 0.9994141485331689


In [36]:
# Random forest with default settings, evaluated on the new_sf split.
# NOTE(review): no random_state is passed, so the fitted forest (and the
# scores printed below) may differ between runs - consider seeding.
rf = mymodel(model=RandomForestClassifier())

F1 score 0.9996922211083558
RandomForestClassifier Accuracy:  0.9996916571227205 
Confusion Matrix: 
 [[113267     34]
 [    36 113683]]
Classification report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    113301
           1       1.00      1.00      1.00    113719

    accuracy                           1.00    227020
   macro avg       1.00      1.00      1.00    227020
weighted avg       1.00      1.00      1.00    227020

Training Accuracy: 1.0
Testing Accuracy: 0.9996916571227205
