In [1]:
!pip install pytorch_tabnet

Collecting pytorch_tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch_tabnet
Successfully installed pytorch_tabnet-4.1.0


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from datetime import datetime
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier , StackingClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import cross_val_score,GridSearchCV

# Prevent broken Korean (Hangul) glyphs in matplotlib figures.
import matplotlib.pyplot as plt
plt.rcParams.update({'font.family': 'malgun Gothic'})

# OSS_1:4

In [3]:
# Read the pre-split OSS 0.25 train / test CSV files (both EUC-KR encoded).
train, test = (
    pd.read_csv('./OSS_0.25_train.csv', encoding='euc-kr'),
    pd.read_csv('./OSS_0.25_test.csv', encoding='euc-kr'),
)

In [5]:
# Feature columns used for this dataset variant; declared once so the train
# and test frames are guaranteed to share the same columns in the same order.
FEATURE_COLUMNS = [
    '자기자본증가율', '순운전자본회전률', '총자본증가율', '기업수명주기',
    '자기자본구성비율', '총자산대비잉여현금흐름', '당좌자산회전률', 'log자산총계',
    '총자본투자효율', '총자본회전률', '자기자본회전률', '총자본순이익률',
    '총자산대비현금흐름',
]
# Label column; selected with a list so the targets stay single-column DataFrames.
TARGET_COLUMN = 't-1감사의견코드'

X_train, y_train = train[FEATURE_COLUMNS], train[[TARGET_COLUMN]]
X_test, y_test = test[FEATURE_COLUMNS], test[[TARGET_COLUMN]]

In [6]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_tabnet(X_train, y_train, X_test, y_test):
    """Cross-validate a TabNet classifier and score the best fold model on the test set.

    Runs stratified 5-fold cross-validation on the training data with a fixed
    set of TabNet hyperparameters and prints accuracy, precision, recall, F1
    and the confusion matrix for every fold. The fold model with the highest
    validation F1 is then used to predict the test data, and the same metrics
    are printed for that prediction.

    Args:
        X_train: Training features (DataFrame or 2-D array-like of numbers).
        y_train: Binary training labels (Series, single-column DataFrame or
            1-D array-like); the metrics use scikit-learn's binary defaults.
        X_test: Test features laid out like ``X_train``.
        y_test: Test labels in the same format as ``y_train``.

    Returns:
        A 4-tuple of lists ``(accuracy, precision, recall, f1)`` holding one
        value per cross-validation fold, in fold order.

    Note:
        Each fold's held-out split is passed to ``fit`` as ``eval_set`` (early
        stopping) and is also the data the fold metrics are computed on, so
        the per-fold numbers are optimistic. Within this function only the
        final test-set metrics come from data that played no role in training
        or early stopping.
    """
    # Stratified 5-fold cross-validation setup.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Hyperparameters (fixed values, no search is performed here).
    best_params = {
        'n_d': 8,                # width of the decision prediction layer
        'n_a': 8,                # width of the attention embedding for each mask
        'n_steps': 3,            # number of sequential decision steps
        'gamma': 1.5,            # coefficient for feature reuse across the masks
        'n_independent': 3,      # independent GLU layers at each step
        'n_shared': 3,           # shared GLU layers at each step
        'lambda_sparse': 0.001,  # weight of the extra sparsity loss
    }

    # Convert once to plain arrays: TabNet needs numpy input, and flattening
    # the labels lets Series, single-column DataFrames and 1-D arrays all work.
    X_all = np.asarray(X_train)
    y_all = np.asarray(y_train).ravel()
    X_holdout = np.asarray(X_test)
    y_holdout = np.asarray(y_test).ravel()

    def _report(y_true, y_pred):
        """Compute and print the metrics; return (accuracy, precision, recall, f1)."""
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 score: {f1}")
        print("Confusion Matrix:")
        print(confusion_matrix(y_true, y_pred))
        return accuracy, precision, recall, f1

    # Per-fold metric histories (returned to the caller).
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_score_list = []

    # Start below any attainable F1 so a model is always selected, even when
    # every fold scores F1 == 0 (previously that left best_model as None and
    # the final predict call raised AttributeError).
    best_f1_score = float("-inf")
    best_model = None

    for fold_idx, (train_idx, valid_idx) in enumerate(cv.split(X_all, y_all), 1):
        X_fit, y_fit = X_all[train_idx], y_all[train_idx]
        X_valid, y_valid = X_all[valid_idx], y_all[valid_idx]

        # A fresh, untrained model for every fold.
        model = TabNetClassifier(**best_params, verbose=0)

        # Early stopping monitors AUC on this fold's validation split.
        model.fit(
            X_fit, y_fit,
            eval_set=[(X_valid, y_valid)],
            eval_metric=['auc'],
            patience=100,
            batch_size=1024,
            virtual_batch_size=128,
            max_epochs=1000,
        )

        y_pred = model.predict(X_valid)

        print(f"Fold {fold_idx}")
        accuracy, precision, recall, f1 = _report(y_valid, y_pred)
        print("------------------------------")

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_score_list.append(f1)

        # Keep the fold model with the highest validation F1 (earliest wins ties).
        if f1 > best_f1_score:
            best_f1_score = f1
            best_model = model

    # Final evaluation of the selected fold model on the held-out test data.
    y_pred_final = best_model.predict(X_holdout)

    print("Final Test Results")
    _report(y_holdout, y_pred_final)

    return accuracy_list, precision_list, recall_list, f1_score_list

In [7]:
# Run the cross-validation and the final test-set evaluation; the returned
# per-fold metric lists are displayed as the cell output.
evaluate_tabnet(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)


Early stopping occurred at epoch 137 with best_epoch = 37 and best_val_0_auc = 0.91482
Fold 1
Accuracy: 0.8952301603832535
Precision: 0.7462284482758621
Recall: 0.7213541666666666
F1 score: 0.7335805084745763
Confusion Matrix:
[[7211  471]
 [ 535 1385]]
------------------------------

Early stopping occurred at epoch 313 with best_epoch = 213 and best_val_0_auc = 0.93157
Fold 2
Accuracy: 0.9053322224536555
Precision: 0.7693127330847096
Recall: 0.7520833333333333
F1 score: 0.7606004740584672
Confusion Matrix:
[[7249  433]
 [ 476 1444]]
------------------------------

Early stopping occurred at epoch 265 with best_epoch = 165 and best_val_0_auc = 0.93182
Fold 3
Accuracy: 0.9073109768798167
Precision: 0.7946224256292906
Recall: 0.7234375
F1 score: 0.757360959651036
Confusion Matrix:
[[7323  359]
 [ 531 1389]]
------------------------------

Early stopping occurred at epoch 145 with best_epoch = 45 and best_val_0_auc = 0.91873
Fold 4
Accuracy: 0.8917933763799208
Precision: 0.7546189376443

([0.8952301603832535,
  0.9053322224536555,
  0.9073109768798167,
  0.8917933763799208,
  0.9027285982087065],
 [0.7462284482758621,
  0.7693127330847096,
  0.7946224256292906,
  0.7546189376443418,
  0.7761611639619473],
 [0.7213541666666666,
  0.7520833333333333,
  0.7234375,
  0.6803748047891723,
  0.722019781363873],
 [0.7335805084745763,
  0.7606004740584672,
  0.757360959651036,
  0.7155762387079114,
  0.7481121898597627])

# OSS_1:3

In [8]:
# Read the pre-split OSS 0.33 train / test CSV files (both EUC-KR encoded).
train, test = (
    pd.read_csv('./OSS_0.33_train.csv', encoding='euc-kr'),
    pd.read_csv('./OSS_0.33_test.csv', encoding='euc-kr'),
)

In [9]:
# Feature columns used for this dataset variant; declared once so the train
# and test frames are guaranteed to share the same columns in the same order.
FEATURE_COLUMNS = [
    '유동자산회전률', '기업수명주기', '총자산대비잉여현금흐름', '자기자본구성비율',
    'log자산총계', '자기자본회전률', '순운전자본회전률', '자기자본증가율',
    '총자본증가율', '총자산대비현금흐름', '총자본투자효율',
]
# Label column; selected with a list so the targets stay single-column DataFrames.
TARGET_COLUMN = 't-1감사의견코드'

X_train, y_train = train[FEATURE_COLUMNS], train[[TARGET_COLUMN]]
X_test, y_test = test[FEATURE_COLUMNS], test[[TARGET_COLUMN]]

In [10]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_tabnet(X_train, y_train, X_test, y_test):
    """Cross-validate a TabNet classifier and score the best fold model on the test set.

    Runs stratified 5-fold cross-validation on the training data with a fixed
    set of TabNet hyperparameters and prints accuracy, precision, recall, F1
    and the confusion matrix for every fold. The fold model with the highest
    validation F1 is then used to predict the test data, and the same metrics
    are printed for that prediction.

    Args:
        X_train: Training features (DataFrame or 2-D array-like of numbers).
        y_train: Binary training labels (Series, single-column DataFrame or
            1-D array-like); the metrics use scikit-learn's binary defaults.
        X_test: Test features laid out like ``X_train``.
        y_test: Test labels in the same format as ``y_train``.

    Returns:
        A 4-tuple of lists ``(accuracy, precision, recall, f1)`` holding one
        value per cross-validation fold, in fold order.

    Note:
        Each fold's held-out split is passed to ``fit`` as ``eval_set`` (early
        stopping) and is also the data the fold metrics are computed on, so
        the per-fold numbers are optimistic. Within this function only the
        final test-set metrics come from data that played no role in training
        or early stopping.
    """
    # Stratified 5-fold cross-validation setup.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Hyperparameters (fixed values, no search is performed here).
    best_params = {
        'n_d': 8,                # width of the decision prediction layer
        'n_a': 8,                # width of the attention embedding for each mask
        'n_steps': 3,            # number of sequential decision steps
        'gamma': 1.5,            # coefficient for feature reuse across the masks
        'n_independent': 3,      # independent GLU layers at each step
        'n_shared': 3,           # shared GLU layers at each step
        'lambda_sparse': 0.001,  # weight of the extra sparsity loss
    }

    # Convert once to plain arrays: TabNet needs numpy input, and flattening
    # the labels lets Series, single-column DataFrames and 1-D arrays all work.
    X_all = np.asarray(X_train)
    y_all = np.asarray(y_train).ravel()
    X_holdout = np.asarray(X_test)
    y_holdout = np.asarray(y_test).ravel()

    def _report(y_true, y_pred):
        """Compute and print the metrics; return (accuracy, precision, recall, f1)."""
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 score: {f1}")
        print("Confusion Matrix:")
        print(confusion_matrix(y_true, y_pred))
        return accuracy, precision, recall, f1

    # Per-fold metric histories (returned to the caller).
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_score_list = []

    # Start below any attainable F1 so a model is always selected, even when
    # every fold scores F1 == 0 (previously that left best_model as None and
    # the final predict call raised AttributeError).
    best_f1_score = float("-inf")
    best_model = None

    for fold_idx, (train_idx, valid_idx) in enumerate(cv.split(X_all, y_all), 1):
        X_fit, y_fit = X_all[train_idx], y_all[train_idx]
        X_valid, y_valid = X_all[valid_idx], y_all[valid_idx]

        # A fresh, untrained model for every fold.
        model = TabNetClassifier(**best_params, verbose=0)

        # Early stopping monitors AUC on this fold's validation split.
        model.fit(
            X_fit, y_fit,
            eval_set=[(X_valid, y_valid)],
            eval_metric=['auc'],
            patience=100,
            batch_size=1024,
            virtual_batch_size=128,
            max_epochs=1000,
        )

        y_pred = model.predict(X_valid)

        print(f"Fold {fold_idx}")
        accuracy, precision, recall, f1 = _report(y_valid, y_pred)
        print("------------------------------")

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_score_list.append(f1)

        # Keep the fold model with the highest validation F1 (earliest wins ties).
        if f1 > best_f1_score:
            best_f1_score = f1
            best_model = model

    # Final evaluation of the selected fold model on the held-out test data.
    y_pred_final = best_model.predict(X_holdout)

    print("Final Test Results")
    _report(y_holdout, y_pred_final)

    return accuracy_list, precision_list, recall_list, f1_score_list

In [11]:
# Run the cross-validation and the final test-set evaluation; the returned
# per-fold metric lists are displayed as the cell output.
evaluate_tabnet(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)


Early stopping occurred at epoch 328 with best_epoch = 228 and best_val_0_auc = 0.9237
Fold 1
Accuracy: 0.8864341085271318
Precision: 0.8
Recall: 0.7229166666666667
F1 score: 0.7595075239398085
Confusion Matrix:
[[5473  347]
 [ 532 1388]]
------------------------------

Early stopping occurred at epoch 243 with best_epoch = 143 and best_val_0_auc = 0.9207
Fold 2
Accuracy: 0.8900516795865633
Precision: 0.7975528364849833
Recall: 0.7464862051015096
F1 score: 0.7711750470556602
Confusion Matrix:
[[5455  364]
 [ 487 1434]]
------------------------------

Early stopping occurred at epoch 168 with best_epoch = 68 and best_val_0_auc = 0.9231
Fold 3
Accuracy: 0.8850129198966409
Precision: 0.8027011156782149
Recall: 0.7116085372201978
F1 score: 0.7544150110375277
Confusion Matrix:
[[5483  336]
 [ 554 1367]]
------------------------------

Early stopping occurred at epoch 165 with best_epoch = 65 and best_val_0_auc = 0.92322
Fold 4
Accuracy: 0.8865486496963432
Precision: 0.7950169875424689
Reca

([0.8864341085271318,
  0.8900516795865633,
  0.8850129198966409,
  0.8865486496963432,
  0.8859025713916526],
 [0.8,
  0.7975528364849833,
  0.8027011156782149,
  0.7950169875424689,
  0.7798165137614679],
 [0.7229166666666667,
  0.7464862051015096,
  0.7116085372201978,
  0.73125,
  0.7526041666666666],
 [0.7595075239398085,
  0.7711750470556602,
  0.7544150110375277,
  0.7618014107433533,
  0.7659687251523987])

# tomek_1:4

In [12]:
# Read the pre-split TomekLinks 0.25 train / test CSV files (both EUC-KR encoded).
train, test = (
    pd.read_csv('./TomekLinks_0.25_train.csv', encoding='euc-kr'),
    pd.read_csv('./TomekLinks_0.25_test.csv', encoding='euc-kr'),
)

In [13]:
# Feature columns used for this dataset variant; declared once so the train
# and test frames are guaranteed to share the same columns in the same order.
FEATURE_COLUMNS = [
    '순운전자본회전률', '자기자본구성비율', '자기자본증가율', '총자본증가율',
    '유동자산회전률', 'log자산총계', '총자산대비잉여현금흐름', '기업수명주기',
    '영업이익증가율', '총자본투자효율', '매출액대비잉여현금흐름',
]
# Label column; selected with a list so the targets stay single-column DataFrames.
TARGET_COLUMN = 't-1감사의견코드'

X_train, y_train = train[FEATURE_COLUMNS], train[[TARGET_COLUMN]]
X_test, y_test = test[FEATURE_COLUMNS], test[[TARGET_COLUMN]]

In [14]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_tabnet(X_train, y_train, X_test, y_test):
    """Cross-validate a TabNet classifier and score the best fold model on the test set.

    Runs stratified 5-fold cross-validation on the training data with a fixed
    set of TabNet hyperparameters and prints accuracy, precision, recall, F1
    and the confusion matrix for every fold. The fold model with the highest
    validation F1 is then used to predict the test data, and the same metrics
    are printed for that prediction.

    Args:
        X_train: Training features (DataFrame or 2-D array-like of numbers).
        y_train: Binary training labels (Series, single-column DataFrame or
            1-D array-like); the metrics use scikit-learn's binary defaults.
        X_test: Test features laid out like ``X_train``.
        y_test: Test labels in the same format as ``y_train``.

    Returns:
        A 4-tuple of lists ``(accuracy, precision, recall, f1)`` holding one
        value per cross-validation fold, in fold order.

    Note:
        Each fold's held-out split is passed to ``fit`` as ``eval_set`` (early
        stopping) and is also the data the fold metrics are computed on, so
        the per-fold numbers are optimistic. Within this function only the
        final test-set metrics come from data that played no role in training
        or early stopping.
    """
    # Stratified 5-fold cross-validation setup.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Hyperparameters (fixed values, no search is performed here).
    best_params = {
        'n_d': 8,                # width of the decision prediction layer
        'n_a': 8,                # width of the attention embedding for each mask
        'n_steps': 3,            # number of sequential decision steps
        'gamma': 1.5,            # coefficient for feature reuse across the masks
        'n_independent': 3,      # independent GLU layers at each step
        'n_shared': 3,           # shared GLU layers at each step
        'lambda_sparse': 0.001,  # weight of the extra sparsity loss
    }

    # Convert once to plain arrays: TabNet needs numpy input, and flattening
    # the labels lets Series, single-column DataFrames and 1-D arrays all work.
    X_all = np.asarray(X_train)
    y_all = np.asarray(y_train).ravel()
    X_holdout = np.asarray(X_test)
    y_holdout = np.asarray(y_test).ravel()

    def _report(y_true, y_pred):
        """Compute and print the metrics; return (accuracy, precision, recall, f1)."""
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 score: {f1}")
        print("Confusion Matrix:")
        print(confusion_matrix(y_true, y_pred))
        return accuracy, precision, recall, f1

    # Per-fold metric histories (returned to the caller).
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_score_list = []

    # Start below any attainable F1 so a model is always selected, even when
    # every fold scores F1 == 0 (previously that left best_model as None and
    # the final predict call raised AttributeError).
    best_f1_score = float("-inf")
    best_model = None

    for fold_idx, (train_idx, valid_idx) in enumerate(cv.split(X_all, y_all), 1):
        X_fit, y_fit = X_all[train_idx], y_all[train_idx]
        X_valid, y_valid = X_all[valid_idx], y_all[valid_idx]

        # A fresh, untrained model for every fold.
        model = TabNetClassifier(**best_params, verbose=0)

        # Early stopping monitors AUC on this fold's validation split.
        model.fit(
            X_fit, y_fit,
            eval_set=[(X_valid, y_valid)],
            eval_metric=['auc'],
            patience=100,
            batch_size=1024,
            virtual_batch_size=128,
            max_epochs=1000,
        )

        y_pred = model.predict(X_valid)

        print(f"Fold {fold_idx}")
        accuracy, precision, recall, f1 = _report(y_valid, y_pred)
        print("------------------------------")

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_score_list.append(f1)

        # Keep the fold model with the highest validation F1 (earliest wins ties).
        if f1 > best_f1_score:
            best_f1_score = f1
            best_model = model

    # Final evaluation of the selected fold model on the held-out test data.
    y_pred_final = best_model.predict(X_holdout)

    print("Final Test Results")
    _report(y_holdout, y_pred_final)

    return accuracy_list, precision_list, recall_list, f1_score_list

In [15]:
# Run the cross-validation and the final test-set evaluation; the returned
# per-fold metric lists are displayed as the cell output.
evaluate_tabnet(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)


Early stopping occurred at epoch 117 with best_epoch = 17 and best_val_0_auc = 0.91092
Fold 1
Accuracy: 0.8914809414705269
Precision: 0.758235294117647
Recall: 0.6713541666666667
F1 score: 0.7121546961325966
Confusion Matrix:
[[7271  411]
 [ 631 1289]]
------------------------------

Early stopping occurred at epoch 213 with best_epoch = 113 and best_val_0_auc = 0.9206
Fold 2
Accuracy: 0.9010622786919392
Precision: 0.7464430894308943
Recall: 0.7651041666666667
F1 score: 0.7556584362139916
Confusion Matrix:
[[7183  499]
 [ 451 1469]]
------------------------------

Early stopping occurred at epoch 171 with best_epoch = 71 and best_val_0_auc = 0.93036
Fold 3
Accuracy: 0.8979379295980005
Precision: 0.7393075356415478
Recall: 0.75625
F1 score: 0.7476828012358393
Confusion Matrix:
[[7170  512]
 [ 468 1452]]
------------------------------

Early stopping occurred at epoch 226 with best_epoch = 126 and best_val_0_auc = 0.91831
Fold 4
Accuracy: 0.8976254946886065
Precision: 0.7623042505592841

([0.8914809414705269,
  0.9010622786919392,
  0.8979379295980005,
  0.8976254946886065,
  0.8978337846282024],
 [0.758235294117647,
  0.7464430894308943,
  0.7393075356415478,
  0.7623042505592841,
  0.7804295942720764],
 [0.6713541666666667,
  0.7651041666666667,
  0.75625,
  0.7095262883914628,
  0.6808953669963561],
 [0.7121546961325966,
  0.7556584362139916,
  0.7476828012358393,
  0.7349689943380966,
  0.7272727272727274])

# tomek_1:3

In [16]:
# Read the pre-split TomekLinks 0.33 train / test CSV files (both EUC-KR encoded).
train, test = (
    pd.read_csv('./TomekLinks_0.33_train.csv', encoding='euc-kr'),
    pd.read_csv('./TomekLinks_0.33_test.csv', encoding='euc-kr'),
)

In [17]:
# Feature columns used for this dataset variant; declared once so the train
# and test frames are guaranteed to share the same columns in the same order.
FEATURE_COLUMNS = [
    '자기자본증가율', '자기자본회전률', '유동자산회전률', '기업수명주기',
    '총자산대비잉여현금흐름', '자기자본구성비율', '순운전자본회전률', '총자본증가율',
    'log자산총계', '총자본투자효율', '총자본순이익률', '매출액대비잉여현금흐름',
    '총자산대비현금흐름',
]
# Label column; selected with a list so the targets stay single-column DataFrames.
TARGET_COLUMN = 't-1감사의견코드'

X_train, y_train = train[FEATURE_COLUMNS], train[[TARGET_COLUMN]]
X_test, y_test = test[FEATURE_COLUMNS], test[[TARGET_COLUMN]]

In [18]:
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_tabnet(X_train, y_train, X_test, y_test):
    """Cross-validate a TabNet classifier and score the best fold model on the test set.

    Runs stratified 5-fold cross-validation on the training data with a fixed
    set of TabNet hyperparameters and prints accuracy, precision, recall, F1
    and the confusion matrix for every fold. The fold model with the highest
    validation F1 is then used to predict the test data, and the same metrics
    are printed for that prediction.

    Args:
        X_train: Training features (DataFrame or 2-D array-like of numbers).
        y_train: Binary training labels (Series, single-column DataFrame or
            1-D array-like); the metrics use scikit-learn's binary defaults.
        X_test: Test features laid out like ``X_train``.
        y_test: Test labels in the same format as ``y_train``.

    Returns:
        A 4-tuple of lists ``(accuracy, precision, recall, f1)`` holding one
        value per cross-validation fold, in fold order.

    Note:
        Each fold's held-out split is passed to ``fit`` as ``eval_set`` (early
        stopping) and is also the data the fold metrics are computed on, so
        the per-fold numbers are optimistic. Within this function only the
        final test-set metrics come from data that played no role in training
        or early stopping.
    """
    # Stratified 5-fold cross-validation setup.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # Hyperparameters (fixed values, no search is performed here).
    best_params = {
        'n_d': 8,                # width of the decision prediction layer
        'n_a': 8,                # width of the attention embedding for each mask
        'n_steps': 3,            # number of sequential decision steps
        'gamma': 1.5,            # coefficient for feature reuse across the masks
        'n_independent': 3,      # independent GLU layers at each step
        'n_shared': 3,           # shared GLU layers at each step
        'lambda_sparse': 0.001,  # weight of the extra sparsity loss
    }

    # Convert once to plain arrays: TabNet needs numpy input, and flattening
    # the labels lets Series, single-column DataFrames and 1-D arrays all work.
    X_all = np.asarray(X_train)
    y_all = np.asarray(y_train).ravel()
    X_holdout = np.asarray(X_test)
    y_holdout = np.asarray(y_test).ravel()

    def _report(y_true, y_pred):
        """Compute and print the metrics; return (accuracy, precision, recall, f1)."""
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 score: {f1}")
        print("Confusion Matrix:")
        print(confusion_matrix(y_true, y_pred))
        return accuracy, precision, recall, f1

    # Per-fold metric histories (returned to the caller).
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_score_list = []

    # Start below any attainable F1 so a model is always selected, even when
    # every fold scores F1 == 0 (previously that left best_model as None and
    # the final predict call raised AttributeError).
    best_f1_score = float("-inf")
    best_model = None

    for fold_idx, (train_idx, valid_idx) in enumerate(cv.split(X_all, y_all), 1):
        X_fit, y_fit = X_all[train_idx], y_all[train_idx]
        X_valid, y_valid = X_all[valid_idx], y_all[valid_idx]

        # A fresh, untrained model for every fold.
        model = TabNetClassifier(**best_params, verbose=0)

        # Early stopping monitors AUC on this fold's validation split.
        model.fit(
            X_fit, y_fit,
            eval_set=[(X_valid, y_valid)],
            eval_metric=['auc'],
            patience=100,
            batch_size=1024,
            virtual_batch_size=128,
            max_epochs=1000,
        )

        y_pred = model.predict(X_valid)

        print(f"Fold {fold_idx}")
        accuracy, precision, recall, f1 = _report(y_valid, y_pred)
        print("------------------------------")

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_score_list.append(f1)

        # Keep the fold model with the highest validation F1 (earliest wins ties).
        if f1 > best_f1_score:
            best_f1_score = f1
            best_model = model

    # Final evaluation of the selected fold model on the held-out test data.
    y_pred_final = best_model.predict(X_holdout)

    print("Final Test Results")
    _report(y_holdout, y_pred_final)

    return accuracy_list, precision_list, recall_list, f1_score_list

In [19]:
# Run the cross-validation and the final test-set evaluation; the returned
# per-fold metric lists are displayed as the cell output.
evaluate_tabnet(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)


Early stopping occurred at epoch 178 with best_epoch = 78 and best_val_0_auc = 0.91829
Fold 1
Accuracy: 0.8863049095607235
Precision: 0.7576808721506442
Recall: 0.7963541666666667
F1 score: 0.7765363128491619
Confusion Matrix:
[[5331  489]
 [ 391 1529]]
------------------------------

Early stopping occurred at epoch 131 with best_epoch = 31 and best_val_0_auc = 0.91808
Fold 2
Accuracy: 0.8890180878552971
Precision: 0.8020477815699659
Recall: 0.7339927121290994
F1 score: 0.7665126393041587
Confusion Matrix:
[[5471  348]
 [ 511 1410]]
------------------------------

Early stopping occurred at epoch 155 with best_epoch = 55 and best_val_0_auc = 0.92497
Fold 3
Accuracy: 0.8908268733850129
Precision: 0.8091954022988506
Recall: 0.7329515877147319
F1 score: 0.7691887462441955
Confusion Matrix:
[[5487  332]
 [ 513 1408]]
------------------------------

Early stopping occurred at epoch 313 with best_epoch = 213 and best_val_0_auc = 0.93257
Fold 4
Accuracy: 0.8962398242667011
Precision: 0.8260

([0.8863049095607235,
  0.8890180878552971,
  0.8908268733850129,
  0.8962398242667011,
  0.8813800232588189],
 [0.7576808721506442,
  0.8020477815699659,
  0.8091954022988506,
  0.8260361938120256,
  0.764799154334038],
 [0.7963541666666667,
  0.7339927121290994,
  0.7329515877147319,
  0.7369791666666666,
  0.7536458333333333],
 [0.7765363128491619,
  0.7665126393041587,
  0.7691887462441955,
  0.7789705477566748,
  0.7591815320041972])