In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from datetime import datetime
import time
import warnings
warnings.filterwarnings("ignore")

# 한글 깨짐 방지
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'malgun Gothic'

In [6]:
train = pd.read_csv('/Users/taewon/Documents/금융 빅데이터/Project_2/코딩/상장기업/sampling data/Undersampling_0.33_train.csv', encoding='euc-kr')
test = pd.read_csv('/Users/taewon/Documents/금융 빅데이터/Project_2/코딩/상장기업/sampling data/Undersampling_0.33_test.csv', encoding='euc-kr')

In [7]:
X_train = train[['부채비율', '자기자본순이익률', '부가가치율', '당좌자산회전률', '총자본회전률', '총자본증가율',
       '연구개발비대비매출액', '매출액대비잉여현금흐름', '차입금의존도']]

X_test = test[['부채비율', '자기자본순이익률', '부가가치율', '당좌자산회전률', '총자본회전률', '총자본증가율',
       '연구개발비대비매출액', '매출액대비잉여현금흐름', '차입금의존도']]

In [8]:
y_train = train[['t-1감사의견코드']]

y_test = test[['t-1감사의견코드']]

In [9]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

In [10]:
# 서브모델 및 메타모델 설정
submodels = [
    RandomForestClassifier(n_estimators=100, random_state=0),
    # LGBMClassifier(n_estimators=100, random_state=0),
    CatBoostClassifier(iterations=100, random_state=0, verbose=0),
    TabNetClassifier(n_d=8, n_a=8, n_steps=3, gamma=1.5, n_independent=2, n_shared=2, lambda_sparse=0.001, verbose=0)
]

meta_model = LogisticRegression()

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# 서브모델 및 메타모델을 활용한 스태킹 앙상블 함수
def stacking_ensemble(submodels, meta_model, X_train, y_train, X_test, y_test, n_folds=5):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

    meta_features_train = np.zeros((len(X_train), len(submodels)))
    meta_features_test = np.zeros((len(X_test), len(submodels)))

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
        X_train_fold, y_train_fold = X_train[train_idx], y_train[train_idx]
        X_val_fold, y_val_fold = X_train[val_idx], y_train[val_idx]

        for idx, model in enumerate(submodels):
            model.fit(X_train_fold, y_train_fold)
            val_pred = model.predict_proba(X_val_fold)[:, 1]
            meta_features_train[val_idx, idx] = val_pred

            test_pred = model.predict_proba(X_test)[:, 1]
            meta_features_test[:, idx] += test_pred / n_folds

    meta_model.fit(meta_features_train, y_train)
    meta_pred = meta_model.predict(meta_features_test)

    return meta_pred

In [12]:
# 메타모델 : Logistic Regression
# 스태킹 앙상블 수행
meta_predictions = stacking_ensemble(submodels, meta_model, X_train.values, y_train.values.ravel(), X_test.values, y_test.values.ravel(), n_folds=5)

# 최종 예측 결과 평가
accuracy = accuracy_score(y_test.values.ravel(), meta_predictions)
precision = precision_score(y_test.values.ravel(), meta_predictions)
recall = recall_score(y_test.values.ravel(), meta_predictions)
f1 = f1_score(y_test.values.ravel(), meta_predictions)
conf_matrix = confusion_matrix(y_test.values.ravel(), meta_predictions)

print("Final Stacking Ensemble Accuracy:", accuracy)
print("Final Stacking Ensemble Precision:", precision)
print("Final Stacking Ensemble Recall:", recall)
print("Final Stacking Ensemble F1 score:", f1)
print("Final Stacking Ensemble Confusion Matrix:")
print(conf_matrix)

Final Stacking Ensemble Accuracy: 0.8611111111111112
Final Stacking Ensemble Precision: 0.8
Final Stacking Ensemble Recall: 0.5925925925925926
Final Stacking Ensemble F1 score: 0.6808510638297872
Final Stacking Ensemble Confusion Matrix:
[[77  4]
 [11 16]]


In [13]:
# # 메타모델 : LGBMClassifier
# meta_model = LGBMClassifier()

# # 스태킹 앙상블 수행
# meta_predictions = stacking_ensemble(submodels, meta_model, X_train.values, y_train.values.ravel(), X_test.values, y_test.values.ravel(), n_folds=5)

# # 최종 예측 결과 평가
# accuracy = accuracy_score(y_test.values.ravel(), meta_predictions)
# precision = precision_score(y_test.values.ravel(), meta_predictions)
# recall = recall_score(y_test.values.ravel(), meta_predictions)
# f1 = f1_score(y_test.values.ravel(), meta_predictions)
# conf_matrix = confusion_matrix(y_test.values.ravel(), meta_predictions)

# print("Final Stacking Ensemble Accuracy:", accuracy)
# print("Final Stacking Ensemble Precision:", precision)
# print("Final Stacking Ensemble Recall:", recall)
# print("Final Stacking Ensemble F1 score:", f1)
# print("Final Stacking Ensemble Confusion Matrix:")
# print(conf_matrix)

In [14]:
# 메타모델 : RandomForestClassifier
meta_model = RandomForestClassifier()

# 스태킹 앙상블 수행
meta_predictions = stacking_ensemble(submodels, meta_model, X_train.values, y_train.values.ravel(), X_test.values, y_test.values.ravel(), n_folds=5)

# 최종 예측 결과 평가
accuracy = accuracy_score(y_test.values.ravel(), meta_predictions)
precision = precision_score(y_test.values.ravel(), meta_predictions)
recall = recall_score(y_test.values.ravel(), meta_predictions)
f1 = f1_score(y_test.values.ravel(), meta_predictions)
conf_matrix = confusion_matrix(y_test.values.ravel(), meta_predictions)

print("Final Stacking Ensemble Accuracy:", accuracy)
print("Final Stacking Ensemble Precision:", precision)
print("Final Stacking Ensemble Recall:", recall)
print("Final Stacking Ensemble F1 score:", f1)
print("Final Stacking Ensemble Confusion Matrix:")
print(conf_matrix)

Final Stacking Ensemble Accuracy: 0.8333333333333334
Final Stacking Ensemble Precision: 0.6956521739130435
Final Stacking Ensemble Recall: 0.5925925925925926
Final Stacking Ensemble F1 score: 0.6399999999999999
Final Stacking Ensemble Confusion Matrix:
[[74  7]
 [11 16]]


In [15]:
# 메타모델 : XGBClassifier
meta_model = XGBClassifier()

# 스태킹 앙상블 수행
meta_predictions = stacking_ensemble(submodels, meta_model, X_train.values, y_train.values.ravel(), X_test.values, y_test.values.ravel(), n_folds=5)

# 최종 예측 결과 평가
accuracy = accuracy_score(y_test.values.ravel(), meta_predictions)
precision = precision_score(y_test.values.ravel(), meta_predictions)
recall = recall_score(y_test.values.ravel(), meta_predictions)
f1 = f1_score(y_test.values.ravel(), meta_predictions)
conf_matrix = confusion_matrix(y_test.values.ravel(), meta_predictions)

print("Final Stacking Ensemble Accuracy:", accuracy)
print("Final Stacking Ensemble Precision:", precision)
print("Final Stacking Ensemble Recall:", recall)
print("Final Stacking Ensemble F1 score:", f1)
print("Final Stacking Ensemble Confusion Matrix:")
print(conf_matrix)

Final Stacking Ensemble Accuracy: 0.8055555555555556
Final Stacking Ensemble Precision: 0.6153846153846154
Final Stacking Ensemble Recall: 0.5925925925925926
Final Stacking Ensemble F1 score: 0.6037735849056604
Final Stacking Ensemble Confusion Matrix:
[[71 10]
 [11 16]]


In [16]:
# 메타모델 : TabNetClassifier
meta_model = TabNetClassifier()

# 스태킹 앙상블 수행
meta_predictions = stacking_ensemble(submodels, meta_model, X_train.values, y_train.values.ravel(), X_test.values, y_test.values.ravel(), n_folds=5)

# 최종 예측 결과 평가
accuracy = accuracy_score(y_test.values.ravel(), meta_predictions)
precision = precision_score(y_test.values.ravel(), meta_predictions)
recall = recall_score(y_test.values.ravel(), meta_predictions)
f1 = f1_score(y_test.values.ravel(), meta_predictions)
conf_matrix = confusion_matrix(y_test.values.ravel(), meta_predictions)

print("Final Stacking Ensemble Accuracy:", accuracy)
print("Final Stacking Ensemble Precision:", precision)
print("Final Stacking Ensemble Recall:", recall)
print("Final Stacking Ensemble F1 score:", f1)
print("Final Stacking Ensemble Confusion Matrix:")
print(conf_matrix)

epoch 0  | loss: 0.0     |  0:00:00s
epoch 1  | loss: 0.0     |  0:00:00s
epoch 2  | loss: 0.0     |  0:00:00s
epoch 3  | loss: 0.0     |  0:00:00s
epoch 4  | loss: 0.0     |  0:00:00s
epoch 5  | loss: 0.0     |  0:00:00s
epoch 6  | loss: 0.0     |  0:00:00s
epoch 7  | loss: 0.0     |  0:00:00s
epoch 8  | loss: 0.0     |  0:00:00s
epoch 9  | loss: 0.0     |  0:00:00s
epoch 10 | loss: 0.0     |  0:00:00s
epoch 11 | loss: 0.0     |  0:00:00s
epoch 12 | loss: 0.0     |  0:00:00s
epoch 13 | loss: 0.0     |  0:00:00s
epoch 14 | loss: 0.0     |  0:00:00s
epoch 15 | loss: 0.0     |  0:00:00s
epoch 16 | loss: 0.0     |  0:00:00s
epoch 17 | loss: 0.0     |  0:00:00s
epoch 18 | loss: 0.0     |  0:00:00s
epoch 19 | loss: 0.0     |  0:00:00s
epoch 20 | loss: 0.0     |  0:00:00s
epoch 21 | loss: 0.0     |  0:00:00s
epoch 22 | loss: 0.0     |  0:00:00s
epoch 23 | loss: 0.0     |  0:00:00s
epoch 24 | loss: 0.0     |  0:00:00s
epoch 25 | loss: 0.0     |  0:00:00s
epoch 26 | loss: 0.0     |  0:00:00s
e

### LGBM

In [17]:
# Final Stacking Ensemble Accuracy: 0.8518518518518519
# Final Stacking Ensemble Precision: 0.7037037037037037
# Final Stacking Ensemble Recall: 0.7037037037037037
# Final Stacking Ensemble F1 score: 0.7037037037037037
# Final Stacking Ensemble Confusion Matrix:
# [[73  8]
#  [ 8 19]]