In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from datetime import datetime
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier , StackingClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import cross_val_score,GridSearchCV

# Prevent broken Korean (Hangul) text in matplotlib figures.
import matplotlib.pyplot as plt
plt.rc('font', family='malgun Gothic')

In [69]:
# Both CSV files are read with the EUC-KR encoding.
csv_read_options = {'encoding': 'euc-kr'}

train = pd.read_csv(
    './Dataset/Undersampling/OSS_0.33_train.csv',
    **csv_read_options,
)
test = pd.read_csv(
    './Dataset/test_다시.csv',
    **csv_read_options,
)

In [70]:
# Feature columns shared by the train and test design matrices.
feature_columns = [
    '유동자산회전률',
    '총자산대비잉여현금흐름',
    '자기자본구성비율',
    'log자산총계',
    '자기자본회전률',
    '순운전자본회전률',
    '자기자본증가율',
    '총자본증가율',
    '총자산대비현금흐름',
    '총자본투자효율',
]
# Label column; selected with a one-element list so the labels stay a
# single-column DataFrame rather than a Series.
target_columns = ['t-1감사의견코드']

X_train_sum = train[feature_columns]
y_train = train[target_columns]
X_test_sum = test[feature_columns]
y_test = test[target_columns]

In [71]:
# Count how many test rows carry each label value.
test.loc[:, 't-1감사의견코드'].value_counts()

0.0    38257
1.0     2868
Name: t-1감사의견코드, dtype: int64

In [72]:
from imblearn.under_sampling import OneSidedSelection

# Create undersampled test data using One-Sided Selection.
# random_state is pinned so the resampled set, and every metric computed on it,
# is the same on each run.
# NOTE(review): this resamples the *test* split, which changes its class balance;
# metrics measured on it may not reflect performance on the original
# distribution. Confirm this is intended.
oss = OneSidedSelection(random_state=0)
x_test_undersampled, y_test_undersampled = oss.fit_resample(X_test_sum, y_test)

In [73]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def _binary_metrics(y_true, y_pred):
    """Return (accuracy, precision, recall, f1, confusion_matrix) for the given labels."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        confusion_matrix(y_true, y_pred),
    )


def _print_metrics(accuracy, precision, recall, f1, conf_matrix):
    """Print one metrics report in the text format used throughout this notebook."""
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 score: {f1}")
    print("Confusion Matrix:")
    print(conf_matrix)


def evaluate_lgbm(X_train, y_train, X_test, y_test, k_fold=5, threshold=0.5):
    """Cross-validate a fixed-parameter LightGBM classifier and score the best fold model.

    The training data is split with a shuffled, seeded ``StratifiedKFold``. One
    ``LGBMClassifier`` is fitted per fold and evaluated on that fold's held-out
    part. The fold model with the highest F1 score is then used to predict
    ``X_test``, and the resulting metrics are printed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; rows are selected positionally with ``.iloc``.
    y_train : array-like
        Training labels. A single-column DataFrame, a Series or a 1-D array is
        accepted; it is flattened to 1-D before use.
        NOTE(review): the metrics use scikit-learn's default positive label (1),
        so labels are presumably coded 0/1 - confirm.
    X_test : array-like
        Features of the final evaluation set.
    y_test : array-like
        Labels of the final evaluation set; flattened to 1-D before use.
    k_fold : int, default 5
        Number of stratified folds.
    threshold : float, default 0.5
        A sample is predicted as class 1 when its predicted probability for the
        second class (column 1 of ``predict_proba``) is strictly greater than
        this value.

    Returns
    -------
    tuple of four lists
        ``(accuracy_list, precision_list, recall_list, f1_score_list)`` with one
        entry per fold. The final test metrics are printed, not returned.
    """
    # Stratified k-fold cross-validation (seeded so the folds are reproducible).
    cv = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=0)

    # LightGBM hyperparameters (fixed values, no tuning here).
    params = {
        'max_depth': 8,
        'min_child_samples': 20,
        'n_estimators':100,
        'learning_rate': 0.01,
        'objective': 'binary',
        'random_state': 0
    }

    # Flatten labels so (n, 1) frames and 1-D inputs are handled the same way
    # and the estimator / metrics receive the 1-D target they expect.
    y_train_1d = np.asarray(y_train).ravel()
    y_test_1d = np.asarray(y_test).ravel()

    # Per-fold metric histories.
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_score_list = []

    best_f1_score = None
    best_model = None

    for fold_idx, (train_idx, valid_idx) in enumerate(cv.split(X_train, y_train_1d), 1):
        model = lgb.LGBMClassifier(**params)
        model.fit(X_train.iloc[train_idx], y_train_1d[train_idx])

        # Probability of the second class on the held-out part of this fold.
        positive_proba = model.predict_proba(X_train.iloc[valid_idx])[:, 1]
        # Built-in int: the np.int alias was removed from NumPy.
        predicted_classes = (positive_proba > threshold).astype(int)

        accuracy, precision, recall, f1, conf_matrix = _binary_metrics(
            y_train_1d[valid_idx], predicted_classes
        )

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_score_list.append(f1)

        print(f"Fold {fold_idx}")
        _print_metrics(accuracy, precision, recall, f1, conf_matrix)
        print("------------------------------")

        # Keep the model with the best F1. The first fold is always accepted so
        # a model exists even when every fold scores F1 == 0.
        if best_model is None or f1 > best_f1_score:
            best_f1_score = f1
            best_model = model

    # Final prediction on the evaluation set with the best fold model.
    final_proba = best_model.predict_proba(X_test)[:, 1]
    y_pred_final = (final_proba > threshold).astype(int)

    print("Final Test Results")
    _print_metrics(*_binary_metrics(y_test_1d, y_pred_final))

    return accuracy_list, precision_list, recall_list, f1_score_list


In [74]:
# Cross-validate on the training split, then score the best fold model on the
# undersampled test split; the per-fold metric lists are the cell's output.
evaluate_lgbm(
    X_train=X_train_sum,
    y_train=y_train,
    X_test=x_test_undersampled,
    y_test=y_test_undersampled,
)

Fold 1
Accuracy: 0.8901808785529716
Precision: 0.8390367553865653
Recall: 0.6895833333333333
F1 score: 0.7570040022870212
Confusion Matrix:
[[5566  254]
 [ 596 1324]]
------------------------------
Fold 2
Accuracy: 0.8890180878552971
Precision: 0.8269704433497537
Recall: 0.6991150442477876
F1 score: 0.7576868829337094
Confusion Matrix:
[[5538  281]
 [ 578 1343]]
------------------------------
Fold 3
Accuracy: 0.8829457364341086
Precision: 0.8169893816364772
Recall: 0.6808953669963561
F1 score: 0.7427597955706985
Confusion Matrix:
[[5526  293]
 [ 613 1308]]
------------------------------
Fold 4
Accuracy: 0.8893913942369815
Precision: 0.8288009888751545
Recall: 0.6984375
F1 score: 0.758055398530243
Confusion Matrix:
[[5542  277]
 [ 579 1341]]
------------------------------
Fold 5
Accuracy: 0.8835766894947668
Precision: 0.8239033693579149
Recall: 0.675
F1 score: 0.7420555396507301
Confusion Matrix:
[[5542  277]
 [ 624 1296]]
------------------------------
Final Test Results
Accuracy: 0.93

([0.8901808785529716,
  0.8890180878552971,
  0.8829457364341086,
  0.8893913942369815,
  0.8835766894947668],
 [0.8390367553865653,
  0.8269704433497537,
  0.8169893816364772,
  0.8288009888751545,
  0.8239033693579149],
 [0.6895833333333333,
  0.6991150442477876,
  0.6808953669963561,
  0.6984375,
  0.675],
 [0.7570040022870212,
  0.7576868829337094,
  0.7427597955706985,
  0.758055398530243,
  0.7420555396507301])