In [14]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.datasets import make_moons
import collections


# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

In [15]:
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [16]:
# Since most of our data has already been scaled we should scale the columns that are left to scale (Amount and Time)
from sklearn.preprocessing import StandardScaler, RobustScaler

# RobustScaler is less prone to outliers.

std_scaler = StandardScaler()
rob_scaler = RobustScaler()

df['scaled_amount'] = rob_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df['scaled_time'] = rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))

df.drop(['Time','Amount'], axis=1, inplace=True)

In [17]:
scaled_amount = df['scaled_amount']
scaled_time = df['scaled_time']

df.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)

# Amount and Time are Scaled!

df.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,1.783274,-0.994983,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,-0.269825,-0.994983,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,4.983721,-0.994972,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,1.418291,-0.994972,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,0.670579,-0.99496,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


### Train & Test 분리

In [18]:
# Undersampling before cross validating (prone to overfit)
X = df.drop('Class', axis=1)
y = df['Class']

# This is explicitly used for undersampling.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Turn the values into an array for feeding the classification algorithms.
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values


### 새로운 분류기 결과 확인

In [20]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
knn_clf = KNeighborsClassifier()
svc_clf = SVC(probability = True)

ada_clf = AdaBoostClassifier()
xtree_clf = ExtraTreesClassifier()
gb_clf = GradientBoostingClassifier()

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report

for clf in (ada_clf, xtree_clf, gb_clf) :
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print(clf.__class__.__name__)

    rpt_result = classification_report(y_test, y_pred)
    
    print(rpt_result)

AdaBoostClassifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.86      0.72      0.78        98

    accuracy                           1.00     56962
   macro avg       0.93      0.86      0.89     56962
weighted avg       1.00      1.00      1.00     56962

ExtraTreesClassifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.95      0.78      0.85        98

    accuracy                           1.00     56962
   macro avg       0.97      0.89      0.93     56962
weighted avg       1.00      1.00      1.00     56962

GradientBoostingClassifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.74      0.60      0.66        98

    accuracy                           1.00     56962
   macro avg       0.87      0.80      0.83     56962
weighte

In [24]:
for clf in (ada_clf, xtree_clf, gb_clf) :
    
    y_pred = clf.predict(X_test)
    
    print("Classifiers: {}".format(clf.__class__.__name__))
    print('Accuracy Score: {:.2f}'.format(accuracy_score(y_test, y_pred)))
    print('Recall Score: {:.2f}'.format(recall_score(y_test, y_pred)))
    print('Precision Score: {:.2f}'.format(precision_score(y_test, y_pred)))
    print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
    print('ROC AUC score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))
    print('---' * 45)


Classifiers: AdaBoostClassifier
Accuracy Score: 1.00
Recall Score: 0.72
Precision Score: 0.86
F1 Score: 0.78
ROC AUC score: 0.86
---------------------------------------------------------------------------------------------------------------------------------------
Classifiers: ExtraTreesClassifier
Accuracy Score: 1.00
Recall Score: 0.78
Precision Score: 0.95
F1 Score: 0.85
ROC AUC score: 0.89
---------------------------------------------------------------------------------------------------------------------------------------
Classifiers: GradientBoostingClassifier
Accuracy Score: 1.00
Recall Score: 0.60
Precision Score: 0.74
F1 Score: 0.66
ROC AUC score: 0.80
---------------------------------------------------------------------------------------------------------------------------------------


### 간접 투표 방식

In [22]:
voting_clf = VotingClassifier(estimators=[('log', log_clf), ('rf', rnd_clf), ('knn',knn_clf),('svc',svc_clf)],  voting = 'soft')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('log', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('svc', SVC(probability=True))],
                 voting='soft')

### REPORT

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report


y_pred = voting_clf.predict(X_test)
    
print("Soft Voting Claasifier")

rpt_result = classification_report(y_test, y_pred)
print(rpt_result)

Soft Voting Claasifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      0.71      0.82        98

    accuracy                           1.00     56962
   macro avg       0.99      0.86      0.91     56962
weighted avg       1.00      1.00      1.00     56962



In [27]:
print('Accuracy Score: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Recall Score: {:.2f}'.format(recall_score(y_test, y_pred)))
print('Precision Score: {:.2f}'.format(precision_score(y_test, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
print('ROC AUC score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))
print('---' * 45)

Accuracy Score: 1.00
Recall Score: 0.71
Precision Score: 0.97
F1 Score: 0.82
ROC AUC score: 0.86
---------------------------------------------------------------------------------------------------------------------------------------


### 간접 투표 방식 2

In [29]:
voting_clf = VotingClassifier(estimators=[('log', log_clf), ('rf', rnd_clf), ('knn',knn_clf),('svc',svc_clf), ('ada', ada_clf), ('xt', xtree_clf)], voting = 'soft')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('log', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('svc', SVC(probability=True)),
                             ('ada', AdaBoostClassifier()),
                             ('xt', ExtraTreesClassifier())],
                 voting='soft')

In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report

y_pred = voting_clf.predict(X_test)
    
print("Soft Voting Claasifier")

rpt_result = classification_report(y_test, y_pred)
print(rpt_result)

Soft Voting Claasifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.97      0.71      0.82        98

    accuracy                           1.00     56962
   macro avg       0.99      0.86      0.91     56962
weighted avg       1.00      1.00      1.00     56962



In [35]:
y_pred = voting_clf.predict(X_test)

In [36]:
print('Accuracy Score: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Recall Score: {:.2f}'.format(recall_score(y_test, y_pred)))
print('Precision Score: {:.2f}'.format(precision_score(y_test, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
print('ROC AUC score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))
print('---' * 45)

Accuracy Score: 1.00
Recall Score: 0.71
Precision Score: 0.97
F1 Score: 0.82
ROC AUC score: 0.86
---------------------------------------------------------------------------------------------------------------------------------------


### 간접 투표 방식 3

In [32]:
voting_clf3 = VotingClassifier(estimators=[('rf', rnd_clf), ('ada', ada_clf), ('xt', xtree_clf)], voting = 'soft')
voting_clf3.fit(X_train, y_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier()),
                             ('ada', AdaBoostClassifier()),
                             ('xt', ExtraTreesClassifier())],
                 voting='soft')

In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report

y_pred = voting_clf3.predict(X_test)
    
print('Accuracy Score: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Recall Score: {:.2f}'.format(recall_score(y_test, y_pred)))
print('Precision Score: {:.2f}'.format(precision_score(y_test, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
print('ROC AUC score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))
print('---' * 45)

Accuracy Score: 1.00
Recall Score: 0.79
Precision Score: 0.97
F1 Score: 0.87
ROC AUC score: 0.89
---------------------------------------------------------------------------------------------------------------------------------------


# 스태킹 앙상블

In [38]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datsets(model, X_train, y_train, x_test, n_folds):
  #지정된 n_folds값으로 kfold생성
  kf=KFold(n_splits=n_folds, shuffle=False, random_state=0)
  #추후에 메타 모델이 사용할 학습 데이터 반환을 위한 넘파이 배열 초기화
  train_fold_pred = np.zeros((X_train.shape[0],1))
  test_pred = np.zeros((X_test.shape[0], n_folds))
  print(model.__class__.__name__,' model 시작')
  for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train)):
  # 입력된 학습데이터에서 기반 모델이 학습/예측 할 폴드 데이터 세트 추출
    print('\t 폴드 세트:' , folder_counter+1,'시작')
    X_tr = X_train[train_index]
    y_tr = y_train[train_index]
    x_te = X_train[valid_index]
# 폴드 세트 내부에서 다시 만들어진 학습데이터로 기반 모델 학습 수행
    model.fit(X_tr, y_tr)
# 폴드 세트 내부에서 다시 만들어진 검증 데이터로 기반모델 예측 후 데이저 저장
    train_fold_pred[valid_index, : ]=model.predict(x_te).reshape(-1,1)
# 입력된 원본 테스트 데이터를 폴드 세트 내 학습된 기반 모델에서 예측 후 데이터 저장
    test_pred[:, folder_counter] = model.predict(X_test)
#폴드 세트 내에서 원본 테스트 데이터를 예측한 데이터를 평균하여 테스트 데이터로 생성
  test_pred_mean= np.mean(test_pred,axis=1).reshape(-1,1)
  #train_fold_pred는 치종 메타 모델이 사용하는 학습 데이터, test_pred_mean은 테스트 데이터 
  return train_fold_pred, test_pred_mean

In [39]:
#knn_train, knn_test = get_stacking_base_datsets(knn_clf, X_train, y_train, X_test, 5)
rnd_train, rnd_test = get_stacking_base_datsets(rnd_clf, X_train, y_train, X_test, 5)
#log_train, log_test = get_stacking_base_datsets(log_clf, X_train, y_train, X_test, 5)
ada_train, ada_test = get_stacking_base_datsets(ada_clf, X_train, y_train, X_test, 5)
#svc_train, svc_test = get_stacking_base_datsets(svc_clf, X_train, y_train, X_test, 5)
xtree_train, xtree_test = get_stacking_base_datsets(xtree_clf, X_train, y_train, X_test, 5)

RandomForestClassifier  model 시작
	 폴드 세트: 1 시작
	 폴드 세트: 2 시작
	 폴드 세트: 3 시작
	 폴드 세트: 4 시작
	 폴드 세트: 5 시작
AdaBoostClassifier  model 시작
	 폴드 세트: 1 시작
	 폴드 세트: 2 시작
	 폴드 세트: 3 시작
	 폴드 세트: 4 시작
	 폴드 세트: 5 시작
ExtraTreesClassifier  model 시작
	 폴드 세트: 1 시작
	 폴드 세트: 2 시작
	 폴드 세트: 3 시작
	 폴드 세트: 4 시작
	 폴드 세트: 5 시작


In [57]:
Stack_final_X_train=np.concatenate((rnd_train, xtree_train, ada_train),axis=1)
Stack_final_X_test=np.concatenate((rnd_test, xtree_test, ada_test),axis=1)

In [58]:
print('원본 학습 피처데이터 shape', X_train.shape, '원본 테스트 피처 shape', X_test.shape)
print('스태킹 학습 피처 데이터 shape:', Stack_final_X_train.shape, '스태킹 테스트 피처 데이터 shape:', Stack_final_X_test.shape)

원본 학습 피처데이터 shape (227845, 30) 원본 테스트 피처 shape (56962, 30)
스태킹 학습 피처 데이터 shape: (227845, 3) 스태킹 테스트 피처 데이터 shape: (56962, 3)


In [45]:
lr_final = LogisticRegression()

lr_final.fit(Stack_final_X_train, y_train)
stack_final = lr_final.predict(Stack_final_X_test)

In [46]:
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, stack_final)))
print('Recall Score: {:.2f}'.format(recall_score(y_test, stack_final)))
print('Precision Score: {:.2f}'.format(precision_score(y_test, stack_final)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, stack_final)))
print('ROC AUC score: {:.2f}'.format(roc_auc_score(y_test,stack_final)))

ValueError: Found input variables with inconsistent numbers of samples: [56962, 227845]

# Over Sampling 으로 앙상블

In [49]:
from imblearn.over_sampling import SMOTE

# SMOTE Technique (OverSampling) After splitting and Cross Validating
sm = SMOTE( random_state=42)
# Xsm_train, ysm_train = sm.fit_sample(X_train, y_train)

# This will be the data were we are going to 
Xsm_train, ysm_train = sm.fit_sample(X_train, y_train)

### 간접 투표 1

In [55]:
voting_clf5 = VotingClassifier(estimators=[('log', log_clf), ('rf', rnd_clf), ('knn',knn_clf),('svc',svc_clf)],  voting = 'soft')
voting_clf5.fit(Xsm_train, ysm_train)

VotingClassifier(estimators=[('log', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('svc', SVC(probability=True))],
                 voting='soft')

In [56]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report

y_pred = voting_clf5.predict(X_test)
    
print('Accuracy Score: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Recall Score: {:.2f}'.format(recall_score(y_test, y_pred)))
print('Precision Score: {:.2f}'.format(precision_score(y_test, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
print('ROC AUC score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))
print('---' * 45)

Accuracy Score: 1.00
Recall Score: 0.89
Precision Score: 0.50
F1 Score: 0.64
ROC AUC score: 0.94
---------------------------------------------------------------------------------------------------------------------------------------


### 간접 투표 2

In [59]:
voting_clf6 = VotingClassifier(estimators=[('log', log_clf), ('rf', rnd_clf), ('knn',knn_clf),('svc',svc_clf), ('ada', ada_clf), ('xt', xtree_clf)], voting = 'soft')
voting_clf6.fit(Xsm_train, ysm_train)

VotingClassifier(estimators=[('log', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('svc', SVC(probability=True)),
                             ('ada', AdaBoostClassifier()),
                             ('xt', ExtraTreesClassifier())],
                 voting='soft')

In [60]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report

y_pred = voting_clf6.predict(X_test)
    
print('Accuracy Score: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Recall Score: {:.2f}'.format(recall_score(y_test, y_pred)))
print('Precision Score: {:.2f}'.format(precision_score(y_test, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
print('ROC AUC score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))
print('---' * 45)

Accuracy Score: 1.00
Recall Score: 0.89
Precision Score: 0.69
F1 Score: 0.78
ROC AUC score: 0.94
---------------------------------------------------------------------------------------------------------------------------------------


### 간접 투표 3

In [50]:
voting_clf4 = VotingClassifier(estimators=[ ('rf', rnd_clf), ('ada', ada_clf), ('xt', xtree_clf)],  voting = 'soft')
voting_clf4.fit(Xsm_train, ysm_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier()),
                             ('ada', AdaBoostClassifier()),
                             ('xt', ExtraTreesClassifier())],
                 voting='soft')

In [51]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report

y_pred = voting_clf4.predict(X_test)
    
print('Accuracy Score: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Recall Score: {:.2f}'.format(recall_score(y_test, y_pred)))
print('Precision Score: {:.2f}'.format(precision_score(y_test, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
print('ROC AUC score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))
print('---' * 45)

Accuracy Score: 1.00
Recall Score: 0.84
Precision Score: 0.93
F1 Score: 0.88
ROC AUC score: 0.92
---------------------------------------------------------------------------------------------------------------------------------------
