In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from datetime import datetime
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier , StackingClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import cross_val_score,GridSearchCV

# 한글 깨짐 방지
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'malgun Gothic'

In [8]:
train = pd.read_csv('./Dataset/Undersampling/OSS_0.33_train.csv', encoding='euc-kr')
test = pd.read_csv('./Dataset/Undersampling/OSS_0.33_test.csv', encoding='euc-kr')

In [9]:
X_train_sum=train[['유동자산회전률', '총자산대비잉여현금흐름','자기자본구성비율', 'log자산총계','자기자본회전률', '순운전자본회전률', '자기자본증가율', '총자본증가율', '총자산대비현금흐름', '총자본투자효율']]
y_train = train[['t-1감사의견코드']]
X_test_sum=test[['유동자산회전률', '총자산대비잉여현금흐름','자기자본구성비율', 'log자산총계','자기자본회전률', '순운전자본회전률', '자기자본증가율', '총자본증가율', '총자산대비현금흐름', '총자본투자효율']]
y_test = test[['t-1감사의견코드']]

---
---
- Logistic Regression
- Random Forest
- XGBoost
- LightGBM

---
---
# Stacking

In [10]:
# !pip install CatBoost
# !pip install pytorch_tabnet

Collecting pytorch_tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
                                              0.0/44.5 kB ? eta -:--:--
     ---------------------------------------- 44.5/44.5 kB ? eta 0:00:00
Collecting torch>=1.3 (from pytorch_tabnet)
  Downloading torch-2.0.1-cp310-cp310-win_amd64.whl (172.3 MB)
                                              0.0/172.3 MB ? eta -:--:--
                                             1.6/172.3 MB 50.9 MB/s eta 0:00:04
                                             4.2/172.3 MB 52.5 MB/s eta 0:00:04
     -                                       6.8/172.3 MB 54.5 MB/s eta 0:00:04
     -                                       8.4/172.3 MB 53.4 MB/s eta 0:00:04
     --                                      9.6/172.3 MB 43.9 MB/s eta 0:00:04
     --                                     10.5/172.3 MB 43.5 MB/s eta 0:00:04
     --                                     10.5/172.3 MB 43.5 MB/s eta 0:00:04
     --                      


[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

In [14]:
# 서브모델 및 메타모델 설정

submodels = [
    LogisticRegression(C=1, penalty='l2', random_state=0),
    RandomForestClassifier(n_estimators=150, random_state=0, max_depth=15, min_samples_leaf=14, max_features='auto'),
    XGBClassifier(n_estimators=100, random_state=0, enable_categorical=True),
    LGBMClassifier(n_estimators=110, random_state=0, learning_rate=0.08, max_depth=12)
    # CatBoostClassifier(iterations=120, random_state=0, verbose=0, cat_features=categorical_feature, depth=10, learning_rate=0.1)
    # SVC(probability=True, random_state=0)

]

meta_model = LogisticRegression()

In [15]:
# 서브모델 및 메타모델을 활용한 스태킹 앙상블 함수
def stacking_ensemble(submodels, meta_model, X_train, y_train, X_test, y_test, n_folds=5):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

    meta_features_train = np.zeros((len(X_train), len(submodels)))
    meta_features_test = np.zeros((len(X_test), len(submodels)))

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
        X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_val_fold, y_val_fold = X_train.iloc[val_idx], y_train.iloc[val_idx]

        for idx, model in enumerate(submodels):
            if isinstance(model, CatBoostClassifier):
                model.fit(X_train_fold, y_train_fold, cat_features=categorical_feature)
            else:
                model.fit(X_train_fold, y_train_fold)

            val_pred = model.predict_proba(X_val_fold)[:, 1]
            meta_features_train[val_idx, idx] = val_pred

            test_pred = model.predict_proba(X_test)[:, 1]
            meta_features_test[:, idx] += test_pred / n_folds

    meta_model.fit(meta_features_train, y_train)
    meta_pred = meta_model.predict(meta_features_test)

    return meta_pred

In [16]:
# 스태킹 앙상블 수행
meta_predictions = stacking_ensemble(submodels, meta_model, X_train_sum, y_train, X_test_sum, y_test, n_folds=5)

# 최종 예측 결과 평가
accuracy = accuracy_score(y_test, meta_predictions)
precision = precision_score(y_test, meta_predictions)
recall = recall_score(y_test, meta_predictions)
f1 = f1_score(y_test, meta_predictions)
conf_matrix = confusion_matrix(y_test, meta_predictions)

print("Final Stacking Ensemble Accuracy:", accuracy)
print("Final Stacking Ensemble Precision:", precision)
print("Final Stacking Ensemble Recall:", recall)
print("Final Stacking Ensemble F1 score:", f1)
print("Final Stacking Ensemble Confusion Matrix:")
print(conf_matrix)

ValueError: Experimental support for categorical data is not implemented for current tree method yet.

In [30]:
meta_model.coef_

array([[0.04774367, 2.88861109, 2.75394855, 0.83247379]])

---

In [None]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np

from pytorch_tabnet.tab_model import TabNetClassifier

# 서브모델 및 메타모델 설정
categorical_feature = [3]
submodels = [
    RandomForestClassifier(n_estimators=150, random_state=0, max_depth=15, min_samples_leaf=14, max_features='auto'),
    LGBMClassifier(n_estimators=110, random_state=0, categorical_feature=categorical_feature, learning_rate=0.08, max_depth=12),
    CatBoostClassifier(iterations=120, random_state=0, verbose=0, cat_features=categorical_feature, depth=10, learning_rate=0.1),
    SVC(probability=True, random_state=0),
    TabNetClassifier(n_d=8, n_a=8, n_steps=3, gamma=1.5, n_independent=2, n_shared=2, lambda_sparse=0.001, verbose=0)
]

meta_model = LogisticRegression(C=1, penalty='l2', random_state=0)

# 서브모델 및 메타모델을 활용한 스태킹 앙상블 함수
def stacking_ensemble(submodels, meta_model, X_train, y_train, X_test, y_test, n_folds=5):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

    meta_features_train = np.zeros((len(X_train), len(submodels)))
    meta_features_test = np.zeros((len(X_test), len(submodels)))

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
        X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_val_fold, y_val_fold = X_train.iloc[val_idx], y_train.iloc[val_idx]

        for idx, model in enumerate(submodels):
            if isinstance(model, CatBoostClassifier):
                model.fit(X_train_fold, y_train_fold, cat_features=categorical_feature)
            elif isinstance(model, TabNetClassifier):
                model.fit(X_train_fold.values.reshape(-1, 1), y_train_fold.values)  # 데이터를 1차원으로 변환하여 학습
            else:
                model.fit(X_train_fold, y_train_fold)

            val_pred = model.predict_proba(X_val_fold)[:, 1]
            meta_features_train[val_idx, idx] = val_pred

            test_pred = model.predict_proba(X_test)[:, 1]
            meta_features_test[:, idx] += test_pred / n_folds

    meta_model.fit(meta_features_train, y_train)
    meta_pred = meta_model.predict(meta_features_test)

    return meta_pred

# 스태킹 앙상블 수행
meta_predictions = stacking_ensemble(submodels, meta_model, X_train, y_train, X_test, y_test, n_folds=5)

# 최종 예측 결과 평가
accuracy = accuracy_score(y_test, meta_predictions)
precision = precision_score(y_test, meta_predictions)
recall = recall_score(y_test, meta_predictions)
f1 = f1_score(y_test, meta_predictions)
conf_matrix = confusion_matrix(y_test, meta_predictions)

print("Final Stacking Ensemble Accuracy:", accuracy)
print("Final Stacking Ensemble Precision:", precision)
print("Final Stacking Ensemble Recall:", recall)
print("Final Stacking Ensemble F1 score:", f1)
print("Final Stacking Ensemble Confusion Matrix:")
print(conf_matrix)


ValueError: ignored

In [None]:
# 스태킹 앙상블 수행
meta_predictions = stacking_ensemble(submodels, meta_model, X_train, y_train, X_test, y_test, n_folds=5)

# 최종 예측 결과 평가
accuracy = accuracy_score(y_test, meta_predictions)
precision = precision_score(y_test, meta_predictions)
recall = recall_score(y_test, meta_predictions)
f1 = f1_score(y_test, meta_predictions)
conf_matrix = confusion_matrix(y_test, meta_predictions)

print("Final Stacking Ensemble Accuracy:", accuracy)
print("Final Stacking Ensemble Precision:", precision)
print("Final Stacking Ensemble Recall:", recall)
print("Final Stacking Ensemble F1 score:", f1)
print("Final Stacking Ensemble Confusion Matrix:")
print(conf_matrix)


---
---
---

In [None]:
# 서브모델 및 메타모델 설정
submodels = [
    RandomForestClassifier(n_estimators=100, random_state=0),
    XGBClassifier(n_estimators=100, random_state=0),
    LGBMClassifier(n_estimators=100, random_state=0),
    CatBoostClassifier(iterations=100, random_state=0, verbose=0),
    SVC(probability=True, random_state=0),
    TabNetClassifier(n_d=8, n_a=8, n_steps=3, gamma=1.5, n_independent=2, n_shared=2, lambda_sparse=0.001, verbose=0)
]

meta_model = LogisticRegression()

In [None]:
# 서브모델 및 메타모델을 활용한 스태킹 앙상블 함수
def stacking_ensemble(submodels, meta_model, X_train, y_train, X_test, y_test, n_folds=5):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

    meta_features_train = np.zeros((len(X_train), len(submodels)))
    meta_features_test = np.zeros((len(X_test), len(submodels)))

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
        X_train_fold, y_train_fold = X_train[train_idx], y_train[train_idx]
        X_val_fold, y_val_fold = X_train[val_idx], y_train[val_idx]

        for idx, model in enumerate(submodels):
            model.fit(X_train_fold, y_train_fold)
            val_pred = model.predict_proba(X_val_fold)[:, 1]
            meta_features_train[val_idx, idx] = val_pred

            test_pred = model.predict_proba(X_test)[:, 1]
            meta_features_test[:, idx] += test_pred / n_folds

    meta_model.fit(meta_features_train, y_train)
    meta_pred = meta_model.predict(meta_features_test)

    return meta_pred

In [None]:
# 메타모델 : Logistic Regression
# 스태킹 앙상블 수행
meta_predictions = stacking_ensemble(submodels, meta_model, X_train.values, y_train.values.ravel(), X_test.values, y_test.values.ravel(), n_folds=5)

# 최종 예측 결과 평가
accuracy = accuracy_score(y_test.values.ravel(), meta_predictions)
precision = precision_score(y_test.values.ravel(), meta_predictions)
recall = recall_score(y_test.values.ravel(), meta_predictions)
f1 = f1_score(y_test.values.ravel(), meta_predictions)
conf_matrix = confusion_matrix(y_test.values.ravel(), meta_predictions)

print("Final Stacking Ensemble Accuracy:", accuracy)
print("Final Stacking Ensemble Precision:", precision)
print("Final Stacking Ensemble Recall:", recall)
print("Final Stacking Ensemble F1 score:", f1)
print("Final Stacking Ensemble Confusion Matrix:")
print(conf_matrix)

KeyboardInterrupt: ignored

---
---
---

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from pytorch_tabnet.tab_model import TabNetClassifier

In [35]:
# 서브모델 및 메타모델 설정
submodels = [
    RandomForestClassifier(n_estimators=100, random_state=0),
    LGBMClassifier(n_estimators=100, random_state=0),
    CatBoostClassifier(iterations=100, random_state=0, verbose=0),
    TabNetClassifier(n_d=8, n_a=8, n_steps=3, gamma=1.5, n_independent=2, n_shared=2, lambda_sparse=0.001, verbose=0),
    # LogisticRegression(C=1, penalty='l2', random_state=0)
]

meta_model = LogisticRegression()

In [36]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# 서브모델 및 메타모델을 활용한 스태킹 앙상블 함수
def stacking_ensemble(submodels, meta_model, X_train, y_train, X_test, y_test, n_folds=5):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

    meta_features_train = np.zeros((len(X_train), len(submodels)))
    meta_features_test = np.zeros((len(X_test), len(submodels)))

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
        X_train_fold, y_train_fold = X_train[train_idx], y_train[train_idx]
        X_val_fold, y_val_fold = X_train[val_idx], y_train[val_idx]

        for idx, model in enumerate(submodels):
            model.fit(X_train_fold, y_train_fold)
            val_pred = model.predict_proba(X_val_fold)[:, 1]
            meta_features_train[val_idx, idx] = val_pred

            test_pred = model.predict_proba(X_test)[:, 1]
            meta_features_test[:, idx] += test_pred / n_folds

    meta_model.fit(meta_features_train, y_train)
    meta_pred = meta_model.predict(meta_features_test)

    return meta_pred

In [None]:
# 메타모델 : Logistic Regression
# 스태킹 앙상블 수행
meta_predictions = stacking_ensemble(submodels, meta_model, X_train.values, y_train.values.ravel(), X_test.values, y_test.values.ravel(), n_folds=5)

# 최종 예측 결과 평가
accuracy = accuracy_score(y_test.values.ravel(), meta_predictions)
precision = precision_score(y_test.values.ravel(), meta_predictions)
recall = recall_score(y_test.values.ravel(), meta_predictions)
f1 = f1_score(y_test.values.ravel(), meta_predictions)
conf_matrix = confusion_matrix(y_test.values.ravel(), meta_predictions)

print("Final Stacking Ensemble Accuracy:", accuracy)
print("Final Stacking Ensemble Precision:", precision)
print("Final Stacking Ensemble Recall:", recall)
print("Final Stacking Ensemble F1 score:", f1)
print("Final Stacking Ensemble Confusion Matrix:")
print(conf_matrix)

In [34]:
meta_model.coef_

array([[2.90632453, 2.13061917, 1.03781337, 0.39391811, 0.18262964]])