In [1]:
# ==============================
# 1. 기본 라이브러리 로드
# ==============================
import pandas as pd  # 데이터프레임 처리 라이브러리
import numpy as np  # 수학 연산 및 배열 처리 라이브러리
import matplotlib.pyplot as plt  # 데이터 시각화 라이브러리
%matplotlib inline  
# ==============================
# 2. 머신러닝 모델 관련 라이브러리
# ==============================

# (1) 앙상블 학습 모델
from sklearn.ensemble import VotingClassifier  # 여러 개의 분류 모델을 조합하는 투표 기반 앙상블 학습

# (2) 개별 분류 모델
from sklearn.linear_model import LogisticRegression  # 로지스틱 회귀 분류기
from sklearn.neighbors import KNeighborsClassifier  # K-최근접 이웃(KNN) 분류기
from sklearn.ensemble import RandomForestClassifier  # 랜덤 포레스트 분류기
from sklearn.ensemble import GradientBoostingClassifier  # 그래디언트 부스팅 트리 분류기
from sklearn.tree import DecisionTreeClassifier  # 결정 트리 분류기
from sklearn.ensemble import AdaBoostClassifier  # AdaBoost 분류기
from sklearn.svm import SVC  # 서포트 벡터 머신(SVM) 분류기

# (3) 부스팅 기반 분류 모델 (XGBoost & LightGBM)
import xgboost as xgb  # XGBoost 라이브러리 (트리 기반 부스팅 기법)
from xgboost import XGBClassifier  # XGBoost 분류기
from lightgbm import LGBMClassifier  # LightGBM 분류기

# ==============================
# 3. 모델 평가 및 성능 지표
# ==============================

from sklearn.metrics import confusion_matrix  # 혼동 행렬
from sklearn.metrics import precision_score, recall_score  # 정밀도(precision) 및 재현율(recall)
from sklearn.metrics import f1_score, roc_auc_score  # F1-score 및 ROC-AUC 점수
from sklearn.metrics import accuracy_score  # 분류 모델의 정확도(accuracy)

# ==============================
# 4. 하이퍼파라미터 최적화
# ==============================

from hyperopt import hp  # 하이퍼파라미터 탐색 공간 정의
from hyperopt import STATUS_OK  # 최적화 과정에서 상태 반환
from hyperopt import fmin, tpe, Trials  # 최적의 하이퍼파라미터 탐색을 위한 함수들

# ==============================
# 5. 데이터셋 로드 및 데이터 분할
# ==============================

from sklearn.datasets import load_breast_cancer  # 유방암 데이터셋 (예제 데이터셋)
from sklearn.model_selection import train_test_split  # 데이터 분할 (학습/테스트 세트)
from sklearn.model_selection import cross_val_score  # 교차 검증을 통한 모델 성능 평가

# ==============================
# 6. 데이터 전처리
# ==============================

from sklearn.preprocessing import StandardScaler  # 데이터 표준화 (평균 0, 분산 1 변환)
from sklearn.preprocessing import MinMaxScaler  # 데이터 Min-Max 스케일링 (0~1 범위 변환)
from sklearn.impute import SimpleImputer  # 결측값 처리
from imblearn.under_sampling import RandomUnderSampler  # 불균형 데이터의 샘플링을 위한 언더샘플링
from imblearn.over_sampling import RandomOverSampler  # 불균형 데이터의 샘플링을 위한 오버샘플링
from imblearn.over_sampling import SMOTE  # SMOTE 기법 (Synthetic Minority Over-sampling Technique)
from scipy.stats import randint  # 범위 내에서 랜덤값 생성

# ==============================
# 7. 통계 분석 관련 라이브러리
# ==============================

from scipy.stats import shapiro, skew, kurtosis  # 데이터 분포 분석을 위한 함수들

from sklearn.model_selection import GridSearchCV  # 그리드 서치를 통한 하이퍼파라미터 최적화
from sklearn.model_selection import RandomizedSearchCV  # 랜덤 서치를 통한 하이퍼파라미터 최적화

import seaborn as sns  # 고급 데이터 시각화 라이브러리

# ==============================
# 8. 기타 유틸리티
# ==============================

import time  # 코드 실행 시간 측정
import warnings  # 경고 메시지 무시 설정
warnings.filterwarnings('ignore') 

In [2]:
card_df=pd.read_csv(
    './creditcard/creditcard.csv'
    ,encoding='utf-8'
)

card_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
def get_preprocessed_df(df=None):
    df_copy = df.copy()
    df_copy.drop('Time', axis=1, inplace=True)
    return df_copy

def get_train_test_dataset(df=None):
    if df is None:
        return '데이터베이스가 존재하지 않습니다'
    else:
        df_copy = get_preprocessed_df(df)
        
        # 명시적으로 Class를 y로 설정
        y_target = df_copy['Class']
        X_features = df_copy.drop('Class', axis=1,inplace=False)

        X_train, X_test, y_train, y_test = train_test_split(
            X_features,
            y_target,
            test_size=0.3,
            random_state=0,
            stratify=y_target
        )
        return X_train, X_test, y_train, y_test


In [4]:
X_train,X_test,y_train,y_test=get_train_test_dataset(card_df)

print('='*50)
print(f'학습 데이터 레이블 값 비율\n{y_train.value_counts()/y_train.shape[0]*100}')
print('='*50)
print(f'테스트 데이터 레이블 값 비율\n{y_test.value_counts()/y_test.shape[0]*100}')


학습 데이터 레이블 값 비율
Class
0    99.827451
1     0.172549
Name: count, dtype: float64
테스트 데이터 레이블 값 비율
Class
0    99.826785
1     0.173215
Name: count, dtype: float64


## 원본 데이터 가공 없이 모델 학습, 일반화 성능 확인

In [6]:
def get_clf_eval(y_test, pred=None, pred_proba=None,clf_name=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred) # 정확도 점수
    precision = precision_score(y_test , pred) # 정밀도 점수
    recall = recall_score(y_test , pred) # 재현율 점수
    f1 = f1_score(y_test,pred) # 정밀도, 재현율 조화평균 값
    if clf_name:
        print(f'\n### {clf_name} ###')
    # ROC-AUC 추가
    roc_auc = roc_auc_score(y_test, pred_proba) # AUC 점수 : 불균형 데이터 셋에서 필요
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

In [7]:
def get_model_train_eval(
    model                   # 분류 알고리즘
    ,ftr_train=None         # 학습 데이터
    ,ftr_test=None          # 테스트 데이터
    ,tgt_train=None         # 학습 데이터 레이블
    ,tgt_test=None          # 테스트 데이터 레이블
    ):
    print('='*100)
    model_start=time.time()
    model.fit(ftr_train,tgt_train)
    pred = model.predict(ftr_test)
    pred_proba=model.predict_proba(ftr_test)[:,1] # 예측 확률
    model_end=time.time()
    get_clf_eval(tgt_test,pred,pred_proba,clf_name = model.__class__.__name__)
    print(f"{model.__class__.__name__} 시간: {model_end - model_start:.4f}초")


In [8]:
lr_clf = LogisticRegression(max_iter=1000)

rf_clf = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=156) 

gb_clf = GradientBoostingClassifier(n_estimators=1000, max_depth=10, random_state=156) 

xgb_clf = XGBClassifier(
    n_estimators=1000,
    max_depth=10,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=156,
    n_jobs=-1  
)


lgbm_clf = LGBMClassifier(
    n_estimators=1000,
    num_leaves=64,
    boosting_type='gbdt',
    boosting_from_average=False,
    random_state=156,
    n_jobs=-1  # 모든 CPU 코어 사용
)


svm_clf = SVC(kernel='linear', probability=True, random_state=156)


get_model_train_eval(lr_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(rf_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(gb_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(xgb_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(lgbm_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(svm_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)



### LogisticRegression ###
오차 행렬
[[85282    13]
 [   58    90]]
정확도: 0.9992, 정밀도: 0.8738, 재현율: 0.6081, F1: 0.7171, AUC:0.9734
LogisticRegression 시간: 5.0299초

### RandomForestClassifier ###
오차 행렬
[[85290     5]
 [   42   106]]
정확도: 0.9994, 정밀도: 0.9550, 재현율: 0.7162, F1: 0.8185, AUC:0.9787
RandomForestClassifier 시간: 1104.0554초

### GradientBoostingClassifier ###
오차 행렬
[[85269    26]
 [   38   110]]
정확도: 0.9993, 정밀도: 0.8088, 재현율: 0.7432, F1: 0.7746, AUC:0.9536
GradientBoostingClassifier 시간: 870.0048초

### XGBClassifier ###
오차 행렬
[[85278    17]
 [   41   107]]
정확도: 0.9993, 정밀도: 0.8629, 재현율: 0.7230, F1: 0.7868, AUC:0.8986
XGBClassifier 시간: 5.9249초
[LightGBM] [Info] Number of positive: 344, number of negative: 199020
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017765 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 199364, number of used fe

---

In [9]:
card_copy_df=get_preprocessed_df(card_df)

results = []

for col in card_copy_df.select_dtypes(include='number').columns:
    stat, p = shapiro(card_copy_df[col].dropna().values)  # 결측값 제거
    result = '정규성 만족' if p > 0.05 else '정규성 없음'
    results.append({'컬럼명': col, 'Shapiro 통계량': stat, 'p-value': p, '정규성': result})

normality_df = pd.DataFrame(results)

normality_df = normality_df.sort_values(by='정규성', ascending=False).reset_index(drop=True)

print(normality_df)

       컬럼명  Shapiro 통계량        p-value     정규성
0       V1     0.798984  6.507232e-156  정규성 없음
1       V2     0.739692  1.734920e-164  정규성 없음
2   Amount     0.310045  6.536480e-199  정규성 없음
3      V28     0.438524  2.331847e-191  정규성 없음
4      V27     0.578435  4.230113e-181  정규성 없음
5      V26     0.978638   2.731202e-91  정규성 없음
6      V25     0.974873   2.207008e-95  정규성 없음
7      V24     0.970827   3.146398e-99  정규성 없음
8      V23     0.467174  1.835392e-189  정규성 없음
9      V22     0.982956   9.501475e-86  정규성 없음
10     V21     0.485593  3.404182e-188  정규성 없음
11     V20     0.529521  5.377893e-185  정규성 없음
12     V19     0.983033   1.222334e-85  정규성 없음
13     V18     0.984649   3.055418e-83  정규성 없음
14     V17     0.790859  3.266145e-157  정규성 없음
15     V16     0.950301  1.398559e-113  정규성 없음
16     V15     0.993588   8.461927e-64  정규성 없음
17     V14     0.884268  2.237838e-138  정규성 없음
18     V13     0.998995   2.714300e-31  정규성 없음
19     V12     0.882798  9.137950e-139  정규성 없음
20     V11   

In [10]:
def get_preprocessed_df(df=None, skew_thresh=0.05):
    df_copy = df.copy()

    # 'Time', 'Class' 제외
    num_cols = df_copy.select_dtypes(include='number').columns
    num_cols = [col for col in num_cols if col not in ['Time', 'Class']]

    # 왜도가 특정 threshold 이상인 컬럼만 스케일링
    skewed_cols = [col for col in num_cols if abs(skew(df_copy[col])) >= skew_thresh]

    # 스케일링
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(df_copy[skewed_cols])
    
    # 스케일된 컬럼 추가하고 기존은 제거
    for i, col in enumerate(skewed_cols):
        df_copy[col + '_Scaled'] = scaled_features[:, i]
        df_copy.drop(col, axis=1, inplace=True)

    # 'Time' 제거
    if 'Time' in df_copy.columns:
        df_copy.drop(['Time'], axis=1, inplace=True)

    return df_copy


In [11]:
X_train, X_test, y_train, y_test = get_train_test_dataset(card_df)

In [12]:
get_model_train_eval(lr_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(rf_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(gb_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(xgb_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(lgbm_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(svm_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)


### LogisticRegression ###
오차 행렬
[[85280    15]
 [   53    95]]
정확도: 0.9992, 정밀도: 0.8636, 재현율: 0.6419, F1: 0.7364, AUC:0.9712
LogisticRegression 시간: 0.3393초

### RandomForestClassifier ###
오차 행렬
[[85290     5]
 [   42   106]]
정확도: 0.9994, 정밀도: 0.9550, 재현율: 0.7162, F1: 0.8185, AUC:0.9787
RandomForestClassifier 시간: 1222.1665초

### GradientBoostingClassifier ###
오차 행렬
[[85269    26]
 [   38   110]]
정확도: 0.9993, 정밀도: 0.8088, 재현율: 0.7432, F1: 0.7746, AUC:0.9536
GradientBoostingClassifier 시간: 959.6895초

### XGBClassifier ###
오차 행렬
[[85276    19]
 [   38   110]]
정확도: 0.9993, 정밀도: 0.8527, 재현율: 0.7432, F1: 0.7942, AUC:0.9030
XGBClassifier 시간: 6.9387초
[LightGBM] [Info] Number of positive: 344, number of negative: 199020
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 199364, number of used fe

In [13]:
def get_preprocessed_df(df=None, skew_thresh=0.05):
    df_copy = df.copy()

    # 'Time', 'Class' 제외
    num_cols = df_copy.select_dtypes(include='number').columns
    num_cols = [col for col in num_cols if col not in ['Time', 'Class']]

    # 왜도가 threshold 이상인 컬럼만 선택
    skewed_cols = [col for col in num_cols if abs(skew(df_copy[col])) >= skew_thresh]

    for col in skewed_cols:
        # 로그 변환을 위해 모든 값이 양수인지 확인
        min_val = df_copy[col].min()
        if min_val <= 0:
            # 0이나 음수가 있다면 log(1 + x - min_val)로 처리
            df_copy[col + '_Log'] = np.log1p(df_copy[col] - min_val + 1)
        else:
            df_copy[col + '_Log'] = np.log1p(df_copy[col])

        # 원래 컬럼 제거
        df_copy.drop(col, axis=1, inplace=True)

    # 'Time' 제거
    if 'Time' in df_copy.columns:
        df_copy.drop(['Time'], axis=1, inplace=True)

    return df_copy

In [14]:
X_train, X_test, y_train, y_test = get_train_test_dataset(card_df)

In [15]:
get_model_train_eval(lr_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(rf_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(gb_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(xgb_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(lgbm_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)
get_model_train_eval(svm_clf, ftr_train=X_train, ftr_test=X_test, tgt_train=y_train, tgt_test=y_test)


### LogisticRegression ###
오차 행렬
[[85280    15]
 [   71    77]]
정확도: 0.9990, 정밀도: 0.8370, 재현율: 0.5203, F1: 0.6417, AUC:0.9678
LogisticRegression 시간: 0.3944초

### RandomForestClassifier ###
오차 행렬
[[85290     5]
 [   42   106]]
정확도: 0.9994, 정밀도: 0.9550, 재현율: 0.7162, F1: 0.8185, AUC:0.9788
RandomForestClassifier 시간: 905.7109초

### GradientBoostingClassifier ###
오차 행렬
[[85270    25]
 [   38   110]]
정확도: 0.9993, 정밀도: 0.8148, 재현율: 0.7432, F1: 0.7774, AUC:0.9536
GradientBoostingClassifier 시간: 712.6737초

### XGBClassifier ###
오차 행렬
[[85280    15]
 [   38   110]]
정확도: 0.9994, 정밀도: 0.8800, 재현율: 0.7432, F1: 0.8059, AUC:0.9281
XGBClassifier 시간: 4.2549초
[LightGBM] [Info] Number of positive: 344, number of negative: 199020
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010720 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 199364, number of used fea