In [4]:
import warnings

import numpy as np  
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from sklearn.preprocessing import StandardScaler, RobustScaler

# X와 y 데이터 셋 만들기
from sklearn.model_selection import train_test_split
# from sklearn.model_selection import StratifiedShuffleSplit
# from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, recall_score
from imblearn.metrics import classification_report_imbalanced

In [1]:
# 2021-11-23

In [6]:
## 1. Data Load

In [7]:
get_ipython().run_line_magic('matplotlib', 'inline')
warnings.filterwarnings("ignore")
# 파일 불러오기
filepath = 'https://github.com/mchoimis/financialml/raw/main/fraud/'
DF = pd.read_csv(filepath + 'fraud.csv')
print(DF.shape)
DF.head()  

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [9]:
### Missing 여부 확인하기
#DF.isnull().sum()
#### 모두 0인 것을 확인할 수 있다.

In [10]:
### 불러온 데이터의 클래스 분포 확인하기
#### 99.8% 정상, 0.1% 비정상 이다
DF.groupby(by=['Class']).count()
print('Target class is ', '{0:0.4f}'. format(492/(284315+492)*100), '%')

Target class is  0.1727 %


In [11]:
################################################################################
## 2. Preprocess
################################################################################
### 2.1 Normalized the Data
#### robust scaler를 사용하여 scale을 조정
# 데이터 스케일 조정하기
rob_scaler = RobustScaler() 
DF['scaled_amount'] = rob_scaler.fit_transform(DF['Amount'].values.reshape(-1,1))
DF['scaled_time'] = rob_scaler.fit_transform(DF['Time'].values.reshape(-1,1))
DF.drop(['Time','Amount'], axis=1, inplace=True)

scaled_amount = DF['scaled_amount']
scaled_time = DF['scaled_time']
DF.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
DF.insert(0, 'scaled_amount', scaled_amount)
DF.insert(1, 'scaled_time', scaled_time)
DF.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,1.783274,-0.994983,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,-0.269825,-0.994983,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,4.983721,-0.994972,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,1.418291,-0.994972,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,0.670579,-0.99496,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


In [None]:
# 2.X undersampling 
#### target data 의 skew 정도가 매우 높기에 클래스 간 분포를 맞추는 것이 필요합니다.

In [22]:
DF = DF.sample(frac=1)
DF_1 = DF.loc[DF['Class'] == 1]
DF_0 = DF.loc[DF['Class'] == 0][:DF_1.shape[0]]
print(DF_0.shape, DF_1.shape)

(492, 31) (492, 31)


In [24]:
DF = pd.concat([DF_0, DF_1], axis = 0)
DF = DF.sample(frac = 1, random_state = 0)
DF.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
114610,-0.153706,-0.130934,-1.810249,0.288898,1.287987,-1.792334,-1.396449,-0.141126,-1.246051,1.206756,-1.081516,0.036736,0.831882,-0.630931,-1.296564,0.471338,0.155428,1.841071,0.241813,-0.472113,-0.402029,-0.166434,0.587575,1.253521,-0.180986,0.023556,-0.004132,-0.18855,-0.13816,-0.037239,0
143336,3.226717,0.006967,-6.713407,3.921104,-9.746678,5.148263,-5.151563,-2.099389,-5.937767,3.57878,-4.684952,-8.537758,6.348979,-8.681609,0.251179,-11.608002,-0.351569,-5.363566,-11.939092,-3.583603,0.897402,0.135711,0.954272,-0.451086,0.127214,-0.33945,0.394096,1.075295,1.649906,-0.394905,1
241394,-0.097813,0.779109,-1.845963,1.363048,1.055604,-1.365115,0.539691,1.464316,-0.079571,0.958088,0.138758,-0.451385,-0.487363,0.604479,1.045759,-0.171927,1.331936,-0.155597,0.112658,-1.466601,-1.226,0.203489,-0.164867,-0.319166,-0.102898,-0.548571,0.205613,0.503777,0.255659,0.156684,0
116139,0.767694,-0.123744,-1.548788,1.808698,-0.953509,2.213085,-2.015728,-0.913457,-2.356013,1.197169,-1.678374,-3.53865,3.10209,-3.993373,-1.937411,-3.822894,0.83097,-2.475359,-5.211875,-0.413872,0.933262,0.390786,0.855138,0.774745,0.059037,0.3432,-0.468938,-0.278338,0.625922,0.395573,1
154286,-0.294557,0.19219,-1.465316,-1.093377,-0.059768,1.064785,11.095089,-5.430971,-9.378025,-0.446456,1.99211,1.785922,1.368585,-1.471697,-0.724759,3.442422,-0.957403,-1.626129,1.418215,-1.417917,-1.651766,-1.45761,1.160623,-1.259697,-15.981649,-0.88367,-3.536716,-0.592965,0.675525,0.424849,1


In [36]:
# 3. Split the Data
X = DF.drop('Class', axis=1)
y = DF['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=.2,
                                                    random_state = 0,
                                                    stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(787, 30) (197, 30) (787,) (197,)


In [37]:
#### 데이터셋에서는 비율도 확인
print(y_train.value_counts()/y_train.shape *100)
print(y_test.value_counts()/y_test.shape *100)

1    50.063532
0    49.936468
Name: Class, dtype: float64
0    50.253807
1    49.746193
Name: Class, dtype: float64


In [38]:
# 모델 인풋에 들어가기 위한 데이터의 형태 바꾸기
xTrainNP = X_train.values
xTestNP  = X_test.values
yTrainNP = y_train.values
yTestNP  = y_test.values

In [39]:
# 학습시킬 모델 로드하기
classifiers = {
    "Logisitic Regression": LogisticRegression(),
    "K Nearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "LightGBM Classifier": LGBMClassifier()
}

In [40]:
# 모델별 cross validation 한 결과의 평균 정확도 점수 출력하기
for key, classifier in classifiers.items():
    classifier.fit(xTrainNP, yTrainNP)
    training_score = cross_val_score(classifier, xTrainNP, yTrainNP, cv=5)
    print(classifier.__class__.__name__, ':', round(training_score.mean(), 2) * 100, '% accuracy')

LogisticRegression : 94.0 % accuracy
KNeighborsClassifier : 92.0 % accuracy
SVC : 93.0 % accuracy
DecisionTreeClassifier : 89.0 % accuracy
RandomForestClassifier : 94.0 % accuracy
GradientBoostingClassifier : 93.0 % accuracy
LGBMClassifier : 94.0 % accuracy


In [42]:
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, confusion_matrix

In [45]:
### 올바른 예

# 모델별 분류결과 확인하기 (올바른 예)
for key, classifier in classifiers.items():
    y_pred = classifier.predict(X_test)  ####
    results = classification_report(y_test, y_pred)  ####
    print(classifier.__class__.__name__, '-------','\n', results)

LogisticRegression ------- 
               precision    recall  f1-score   support

           0       0.91      0.97      0.94        99
           1       0.97      0.90      0.93        98

    accuracy                           0.93       197
   macro avg       0.94      0.93      0.93       197
weighted avg       0.94      0.93      0.93       197

KNeighborsClassifier ------- 
               precision    recall  f1-score   support

           0       0.91      0.97      0.94        99
           1       0.97      0.90      0.93        98

    accuracy                           0.93       197
   macro avg       0.94      0.93      0.93       197
weighted avg       0.94      0.93      0.93       197

SVC ------- 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95        99
           1       1.00      0.90      0.95        98

    accuracy                           0.95       197
   macro avg       0.95      0.95      0.95       197


In [46]:
# 모델별 Confusion Matrix 확인하기 (올바른 예)
for key, classifier in classifiers.items():
    y_pred = classifier.predict(X_test) ####
    cm = confusion_matrix(y_test, y_pred)  ####
    print(classifier.__class__.__name__, '\n', cm, '\n')

LogisticRegression 
 [[96  3]
 [10 88]] 

KNeighborsClassifier 
 [[96  3]
 [10 88]] 

SVC 
 [[99  0]
 [10 88]] 

DecisionTreeClassifier 
 [[80 19]
 [10 88]] 

RandomForestClassifier 
 [[99  0]
 [12 86]] 

GradientBoostingClassifier 
 [[94  5]
 [11 87]] 

LGBMClassifier 
 [[94  5]
 [10 88]] 



In [47]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

In [49]:
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    print('오차행렬')
    print(confusion)
    print('정확도:{0:.4f}, 정밀도:{1:.4f}, 재현율:{2:.4f}'.format(accuracy,
                                                                    precision,
                                                                    recall))


In [50]:
get_clf_eval(y_test, y_pred)

오차행렬
[[94  5]
 [10 88]]
정확도:0.9239, 정밀도:0.9462, 재현율:0.8980


In [55]:
# 모델별 Confusion Matrix 확인하기 (올바른 예)
for key, classifier in classifiers.items():
    y_pred = classifier.predict(X_test) ####
    print(classifier.__class__.__name__)
    get_clf_eval(y_test, y_pred)
    print('\n')

LogisticRegression
오차행렬
[[96  3]
 [10 88]]
정확도:0.9340, 정밀도:0.9670, 재현율:0.8980


KNeighborsClassifier
오차행렬
[[96  3]
 [10 88]]
정확도:0.9340, 정밀도:0.9670, 재현율:0.8980


SVC
오차행렬
[[99  0]
 [10 88]]
정확도:0.9492, 정밀도:1.0000, 재현율:0.8980


DecisionTreeClassifier
오차행렬
[[80 19]
 [10 88]]
정확도:0.8528, 정밀도:0.8224, 재현율:0.8980


RandomForestClassifier
오차행렬
[[99  0]
 [12 86]]
정확도:0.9391, 정밀도:1.0000, 재현율:0.8776


GradientBoostingClassifier
오차행렬
[[94  5]
 [11 87]]
정확도:0.9188, 정밀도:0.9457, 재현율:0.8878


LGBMClassifier
오차행렬
[[94  5]
 [10 88]]
정확도:0.9239, 정밀도:0.9462, 재현율:0.8980


