In [26]:


import warnings

import numpy as np  
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from sklearn.preprocessing import StandardScaler, RobustScaler

# X와 y 데이터 셋 만들기
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier


from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, recall_score
from imblearn.metrics import classification_report_imbalanced


In [27]:
get_ipython().run_line_magic('matplotlib', 'inline')
warnings.filterwarnings("ignore")
################################################################################
## 1. Data Load
################################################################################
# 파일 불러오기
filepath = 'https://github.com/mchoimis/financialml/raw/main/fraud/'
DF = pd.read_csv(filepath + 'fraud.csv')
print(DF.shape)
DF.head()

### Missing 여부 확인하기
DF.isnull().sum()

### 불러온 데이터의 클래스 분포 확인하기
#### 99.8% 정상, 0.1% 비정상 이다
DF.groupby(by=['Class']).count()
print('Target class is ', '{0:0.4f}'. format(492/(284315+492)*100), '%')

################################################################################
## 2. Preprocess
################################################################################
### 2.1 Normalized the Data
#### robust scaler를 사용하여 scale을 조정
# 데이터 스케일 조정하기
rob_scaler = RobustScaler() 
DF['scaled_amount'] = rob_scaler.fit_transform(DF['Amount'].values.reshape(-1,1))
DF['scaled_time'] = rob_scaler.fit_transform(DF['Time'].values.reshape(-1,1))
DF.drop(['Time','Amount'], axis=1, inplace=True)

scaled_amount = DF['scaled_amount']
scaled_time = DF['scaled_time']
DF.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
DF.insert(0, 'scaled_amount', scaled_amount)
DF.insert(1, 'scaled_time', scaled_time)
DF.head()


### 2.2 Split the Data
X = DF.drop('Class', axis=1)
y = DF['Class']

# 데이터 나누기
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

for train_index, test_index in sss.split(X, y):
    xTrain, xTest = X.iloc[train_index], X.iloc[test_index]
    yTrain, yTest = y.iloc[train_index], y.iloc[test_index]




(284807, 31)
Target class is  0.1727 %


In [28]:


### 2.3 UnderSampling
DF = DF.sample(frac=1) ################ 왜 이걸 안하면 학습이 100% 정확도를 보이지?

# 클래스의 skew 정도가 매우 높기 때문에 클래스간 분포를 맞추는 것이 필요합니다.
# subsample 구축 전 셔플링을 통해 레이블이 한쪽에 몰려있지 않도록 하겠습니다.

# 데이터 준비
fraudDF  = DF.loc[DF['Class'] == 1]
normalDF = DF.loc[DF['Class'] == 0][:492]
underDF = pd.concat([fraudDF, normalDF])

# 데이터 셔플하기
underDF = underDF.sample(frac=1, random_state=0)
underDF.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
102782,-0.033676,-0.191908,1.232604,-0.548931,1.087873,0.894082,-1.433055,-0.356797,-0.717492,0.003167,-0.100397,0.543187,-1.039417,0.285262,-0.206007,-0.498522,-1.064108,-2.156037,0.564761,0.837857,-0.72899,-0.576274,-0.448671,-0.517568,0.012833,0.699217,0.527258,-0.322607,0.080805,0.035427,1
30,-0.065954,-0.994702,1.237429,0.061043,0.380526,0.761564,-0.359771,-0.494084,0.006494,-0.133862,0.43881,-0.207358,-0.929182,0.527106,0.348676,-0.152535,-0.218386,-0.191552,-0.116581,-0.633791,0.348416,-0.066351,-0.245682,-0.5309,-0.044265,0.079168,0.509136,0.288858,-0.022705,0.011836,0
8335,-0.29344,-0.864214,-1.426623,4.141986,-9.804103,6.666273,-4.749527,-2.073129,-10.089931,2.791345,-3.249516,-11.420451,10.853012,-15.969208,0.54669,-14.690729,0.912337,-12.227189,-18.587366,-6.920762,3.166999,1.410678,1.865679,0.407809,0.605809,-0.769348,-1.746337,0.50204,1.977258,0.711607,1
138,-0.139873,-0.993985,1.327884,-0.735012,1.502449,-0.544105,-1.90935,-0.712498,-1.29282,-0.002715,-0.246028,0.572523,0.008547,-0.336586,0.675825,-0.671876,1.253877,1.591792,0.050392,-0.785112,-0.024286,0.111537,0.461092,1.308647,-0.046031,0.764375,0.281841,-0.063753,0.060406,0.037296,0
438,0.670579,-0.991283,-1.014693,0.444918,2.259005,0.336946,-0.492326,-0.649977,0.54246,-0.201798,0.491987,0.013239,-0.171667,0.717766,0.855574,-0.890738,-0.241276,-0.435455,-0.083671,-0.326212,-0.143874,0.399388,0.090761,0.829358,-0.105669,1.005763,-0.098694,0.396781,0.307894,0.098746,0


In [29]:

################################################################################
## 3. Train 
################################################################################
# 재구축한 데이터의 클래스 분포 확인하기

underDF.groupby(by=['Class']).count()

Unnamed: 0_level_0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492
1,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492,492


In [30]:



# X와 y 데이터 셋 만들기
X = underDF.drop('Class', axis=1)
y = underDF['Class']


# 언더샘플링을 위한 샘플 데이터 구축
xMTrain, xMTest, yMTrain, yMTest = train_test_split(X, y, test_size=0.2, random_state=0)


# 모델 인풋에 들어가기 위한 데이터의 형태 바꾸기
xMTrain = xMTrain.values
xMTest = xMTest.values
yMTrain = yMTrain.values
yMTest = yMTest.values


In [31]:


# 학습시킬 모델 로드하기
classifiers = {
    "Logisitic Regression": LogisticRegression(),
    "K Nearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "LightGBM Classifier": LGBMClassifier()
}





In [32]:
print(xMTrain.shape, yMTrain.shape)

(787, 30) (787,)


In [33]:


# 모델별 cross validation 한 결과의 평균 정확도 점수 출력하기


for key, classifier in classifiers.items():
    classifier.fit(xMTrain, yMTrain)
    training_score = cross_val_score(classifier, xMTrain, yMTrain, cv=5)
    print(classifier.__class__.__name__, ':', round(training_score.mean(), 2) * 100, '% accuracy')



LogisticRegression : 98.0 % accuracy
KNeighborsClassifier : 95.0 % accuracy
SVC : 95.0 % accuracy
DecisionTreeClassifier : 100.0 % accuracy
RandomForestClassifier : 100.0 % accuracy
GradientBoostingClassifier : 100.0 % accuracy
LGBMClassifier : 100.0 % accuracy


In [34]:
### 올바른 예

# 모델별 분류결과 확인하기 (올바른 예)
for key, classifier in classifiers.items():
    y_pred = classifier.predict(xTest)  ####
    results = classification_report(yTest, y_pred)  ####
    print(classifier.__class__.__name__, '-------','\n', results)


LogisticRegression ------- 
               precision    recall  f1-score   support

           0       1.00      0.00      0.00     56863
           1       0.00      1.00      0.00        98

    accuracy                           0.00     56961
   macro avg       0.50      0.50      0.00     56961
weighted avg       1.00      0.00      0.00     56961

KNeighborsClassifier ------- 
               precision    recall  f1-score   support

           0       1.00      0.88      0.93     56863
           1       0.01      0.95      0.03        98

    accuracy                           0.88     56961
   macro avg       0.51      0.91      0.48     56961
weighted avg       1.00      0.88      0.93     56961

SVC ------- 
               precision    recall  f1-score   support

           0       1.00      0.85      0.92     56863
           1       0.01      0.98      0.02        98

    accuracy                           0.85     56961
   macro avg       0.51      0.91      0.47     56961


In [35]:
46+56817

56863

In [36]:
# 모델별 Confusion Matrix 확인하기 (올바른 예)
for key, classifier in classifiers.items():
    y_pred = classifier.predict(xTest) ####
    cm = confusion_matrix(yTest, y_pred)  ####
    print(classifier.__class__.__name__, '\n', cm, '\n')


LogisticRegression 
 [[   46 56817]
 [    0    98]] 

KNeighborsClassifier 
 [[49880  6983]
 [    5    93]] 

SVC 
 [[48074  8789]
 [    2    96]] 

DecisionTreeClassifier 
 [[    0 56863]
 [    0    98]] 

RandomForestClassifier 
 [[    0 56863]
 [    0    98]] 

GradientBoostingClassifier 
 [[    0 56863]
 [    0    98]] 

LGBMClassifier 
 [[    0 56863]
 [    0    98]] 

