In [1]:
import pandas as pd
import numpy as np
#워닝 메시지
import warnings
warnings.filterwarnings('ignore')


In [2]:
train = pd.read_csv('./train_1대1.csv', encoding='euc-kr')
test = pd.read_csv('./test_1대1.csv', encoding='euc-kr')

In [3]:
X_train_sum=train[['유동자산회전률', '총자산대비잉여현금흐름','자기자본구성비율', 'log자산총계','자기자본회전률', '순운전자본회전률', '자기자본증가율', '총자본증가율', '총자산대비현금흐름', '총자본투자효율']]
y_train = train[['t-1감사의견코드']]
X_test_sum=test[['유동자산회전률', '총자산대비잉여현금흐름','자기자본구성비율', 'log자산총계','자기자본회전률', '순운전자본회전률', '자기자본증가율', '총자본증가율', '총자산대비현금흐름', '총자본투자효율']]
y_test = test[['t-1감사의견코드']]

In [4]:
X_train = train.drop('t-1감사의견코드',axis=1)
y_train = train[['t-1감사의견코드']]

X_test = test.drop('t-1감사의견코드',axis=1)
y_test = test[['t-1감사의견코드']]

In [15]:
# ------------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------------

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC


def perform_model_grid_search(model_class, X_train, y_train, param_grid, k_fold=5):
    # Stratified k-fold 교차검증 설정
    cv = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=0)

    # 모델 초기화
    model = model_class()

    # 그리드 서치 설정
    grid_search = GridSearchCV(model, param_grid, scoring='f1', cv=cv, verbose=0, n_jobs=-1)

    # 모델 학습 및 튜닝
    grid_search.fit(X_train, y_train)

    # 최적 하이퍼파라미터 출력
    print("Best Hyperparameters:", grid_search.best_params_)

    # 평균 평가 지표 계산
    mean_f1_score = np.mean(grid_search.cv_results_['mean_test_score'])
    print("Mean F1 Score:", mean_f1_score)

    return grid_search.best_params_, mean_f1_score

def evaluate_model_with_best_params_and_threshold(model_class, X_train, y_train, X_test, y_test, best_params, threshold=0.5, k_fold=5):
    cv = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=0)

    model = model_class(**best_params, verbose=0)

    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_score_list = []
    confusion_matrix_list = []

    best_f1_score = 0
    best_model = None

    for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train), 1):
        X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_test_fold, y_test_fold = X_train.iloc[test_idx], y_train.iloc[test_idx]

        model.fit(X_train_fold, y_train_fold)

        y_pred_fold_prob = model.predict_proba(X_test_fold)[:, 1]
        y_pred_fold = (y_pred_fold_prob > threshold).astype(int)

        accuracy = accuracy_score(y_test_fold, y_pred_fold)
        precision = precision_score(y_test_fold, y_pred_fold)
        recall = recall_score(y_test_fold, y_pred_fold)
        f1 = f1_score(y_test_fold, y_pred_fold)
        conf_matrix = confusion_matrix(y_test_fold, y_pred_fold)

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_score_list.append(f1)
        confusion_matrix_list.append(conf_matrix)

        print(f"Fold {fold_idx}")
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 score: {f1}")
        print("Confusion Matrix:")
        print(conf_matrix)
        print("------------------------------")

        if f1 > best_f1_score:
            best_f1_score = f1
            best_model = model

    y_pred_final_prob = best_model.predict_proba(X_test)[:, 1]
    y_pred_final = (y_pred_final_prob > threshold).astype(int)

    accuracy_final = accuracy_score(y_test, y_pred_final)
    precision_final = precision_score(y_test, y_pred_final)
    recall_final = recall_score(y_test, y_pred_final)
    f1_final = f1_score(y_test, y_pred_final)
    conf_matrix_final = confusion_matrix(y_test, y_pred_final)

    print("Final Test Results with Threshold =", threshold)
    print(f"Accuracy: {accuracy_final}")
    print(f"Precision: {precision_final}")
    print(f"Recall: {recall_final}")
    print(f"F1 score: {f1_final}")
    print("Confusion Matrix:")
    print(conf_matrix_final)

    return accuracy_list, precision_list, recall_list, f1_score_list

# --------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------
# 모델 선택 및 하이퍼파라미터 그리드 설정
model_choices = {
    "LogisticRegression": (LogisticRegression, {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l1', 'l2']
    }),
    "RandomForest": (RandomForestClassifier, {
        'n_estimators': [200],
        'max_depth': [14],
        # 'min_samples_split': [2, 5, 10],
        # 'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt']
    }),
    "XGBoost": (xgb.XGBClassifier, {
        'max_depth': [8],
        'learning_rate': [0.09],
        'n_estimators': [108]
    }),
    "LightGBM": (LGBMClassifier, {
        'max_depth': [11,12,13],
        'learning_rate': [0.057,0.055],
        'n_estimators': [120,110]
    }),
    "CatBoost": (CatBoostClassifier, {
        'depth': [9],
        'learning_rate': [0.1],
        'iterations': [190]
    }),
    "SVC": (SVC, {
        'C': [ 1],
        'kernel': ['linear']
        # 'gamma': ['scale', 'auto']
    })
}


In [6]:
# 모델 선택
selected_model = "LightGBM"

# 모델 선택 및 설정
model_class, param_grid = model_choices[selected_model]

# 모델 하이퍼파라미터 튜닝
best_params, mean_f1_score = perform_model_grid_search(model_class, X_train_sum, y_train, param_grid)

# 모델 평가
evaluate_model_with_best_params_and_threshold(model_class, X_train_sum, y_train, X_test_sum, y_test, best_params)

# ----------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------

Best Hyperparameters: {'learning_rate': 0.055, 'max_depth': 11, 'n_estimators': 120}
Mean F1 Score: 0.8724155746432741
You can set `force_col_wise=true` to remove the overhead.
Fold 1
Accuracy: 0.8693048685238219
Precision: 0.9060710194730813
Recall: 0.8239583333333333
F1 score: 0.8630660120021821
Confusion Matrix:
[[1757  164]
 [ 338 1582]]
------------------------------
You can set `force_col_wise=true` to remove the overhead.
Fold 2
Accuracy: 0.8614944024993492
Precision: 0.9011560693641618
Recall: 0.8119791666666667
F1 score: 0.8542465753424657
Confusion Matrix:
[[1750  171]
 [ 361 1559]]
------------------------------
You can set `force_col_wise=true` to remove the overhead.
Fold 3
Accuracy: 0.8664410309815153
Precision: 0.9107351225204201
Recall: 0.812597605413847
F1 score: 0.8588720770288858
Confusion Matrix:
[[1767  153]
 [ 360 1561]]
------------------------------
You can set `force_col_wise=true` to remove the overhead.
Fold 4
Accuracy: 0.8729497526685759
Precision: 0.9064095

([0.8693048685238219,
  0.8614944024993492,
  0.8664410309815153,
  0.8729497526685759,
  0.8768229166666667],
 [0.9060710194730813,
  0.9011560693641618,
  0.9107351225204201,
  0.9064095292115711,
  0.9085262563523433],
 [0.8239583333333333,
  0.8119791666666667,
  0.812597605413847,
  0.831858407079646,
  0.8380208333333333],
 [0.8630660120021821,
  0.8542465753424657,
  0.8588720770288858,
  0.8675352877307275,
  0.8718504470333243])

In [7]:
# 모델 선택
selected_model = "LogisticRegression"

# 모델 선택 및 설정
model_class, param_grid = model_choices[selected_model]

# 모델 하이퍼파라미터 튜닝
best_params, mean_f1_score = perform_model_grid_search(model_class, X_train_sum, y_train, param_grid)

# 모델 평가
evaluate_model_with_best_params_and_threshold(model_class, X_train_sum, y_train, X_test_sum, y_test, best_params)

# ----------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------

Best Hyperparameters: {'C': 1, 'penalty': 'l2'}
Mean F1 Score: nan
Fold 1
Accuracy: 0.7878156730018224
Precision: 0.8705566733735748
Recall: 0.6760416666666667
F1 score: 0.7610671357373204
Confusion Matrix:
[[1728  193]
 [ 622 1298]]
------------------------------
Fold 2
Accuracy: 0.7758396250976308
Precision: 0.8463047743623283
Recall: 0.6739583333333333
F1 score: 0.750362423890983
Confusion Matrix:
[[1686  235]
 [ 626 1294]]
------------------------------
Fold 3
Accuracy: 0.7872949752668575
Precision: 0.8729729729729729
Recall: 0.672566371681416
F1 score: 0.7597765363128492
Confusion Matrix:
[[1732  188]
 [ 629 1292]]
------------------------------
Fold 4
Accuracy: 0.7742775318927363
Precision: 0.8624484181568088
Recall: 0.6527850078084331
F1 score: 0.7431111111111111
Confusion Matrix:
[[1720  200]
 [ 667 1254]]
------------------------------
Fold 5
Accuracy: 0.7942708333333334
Precision: 0.8697643979057592
Recall: 0.6921875
F1 score: 0.7708816705336428
Confusion Matrix:
[[1721  199]

([0.7878156730018224,
  0.7758396250976308,
  0.7872949752668575,
  0.7742775318927363,
  0.7942708333333334],
 [0.8705566733735748,
  0.8463047743623283,
  0.8729729729729729,
  0.8624484181568088,
  0.8697643979057592],
 [0.6760416666666667,
  0.6739583333333333,
  0.672566371681416,
  0.6527850078084331,
  0.6921875],
 [0.7610671357373204,
  0.750362423890983,
  0.7597765363128492,
  0.7431111111111111,
  0.7708816705336428])

In [11]:
# 모델 선택
selected_model = "XGBoost"

# 모델 선택 및 설정
model_class, param_grid = model_choices[selected_model]

# 모델 하이퍼파라미터 튜닝
best_params, mean_f1_score = perform_model_grid_search(model_class, X_train_sum, y_train, param_grid)

# 모델 평가
evaluate_model_with_best_params_and_threshold(model_class, X_train_sum, y_train, X_test_sum, y_test, best_params)

# ----------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------

Best Hyperparameters: {'learning_rate': 0.09, 'max_depth': 8, 'n_estimators': 108}
Mean F1 Score: 0.8701729278268135
Parameters: { "verbose" } are not used.

Fold 1
Accuracy: 0.8695652173913043
Precision: 0.8997183098591549
Recall: 0.8317708333333333
F1 score: 0.8644113667117725
Confusion Matrix:
[[1743  178]
 [ 323 1597]]
------------------------------
Parameters: { "verbose" } are not used.

Fold 2
Accuracy: 0.8622754491017964
Precision: 0.8976558033161807
Recall: 0.8177083333333334
F1 score: 0.8558190242572908
Confusion Matrix:
[[1742  179]
 [ 350 1570]]
------------------------------
Parameters: { "verbose" } are not used.

Fold 3
Accuracy: 0.8653996355115855
Precision: 0.903448275862069
Recall: 0.8183237896928683
F1 score: 0.8587817536192297
Confusion Matrix:
[[1752  168]
 [ 349 1572]]
------------------------------
Parameters: { "verbose" } are not used.

Fold 4
Accuracy: 0.874251497005988
Precision: 0.9071347678369196
Recall: 0.8339406559083811
F1 score: 0.8689991863303499
Confu

([0.8695652173913043,
  0.8622754491017964,
  0.8653996355115855,
  0.874251497005988,
  0.87578125],
 [0.8997183098591549,
  0.8976558033161807,
  0.903448275862069,
  0.9071347678369196,
  0.9069373942470389],
 [0.8317708333333333,
  0.8177083333333334,
  0.8183237896928683,
  0.8339406559083811,
  0.8375],
 [0.8644113667117725,
  0.8558190242572908,
  0.8587817536192297,
  0.8689991863303499,
  0.8708367181153533])

In [14]:
# 모델 선택
selected_model = "CatBoost"

# 모델 선택 및 설정
model_class, param_grid = model_choices[selected_model]

# 모델 하이퍼파라미터 튜닝
best_params, mean_f1_score = perform_model_grid_search(model_class, X_train_sum, y_train, param_grid)

# 모델 평가
evaluate_model_with_best_params_and_threshold(model_class, X_train_sum, y_train, X_test_sum, y_test, best_params)

# ----------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------

0:	learn: 0.6397955	total: 154ms	remaining: 29.1s
1:	learn: 0.5969108	total: 165ms	remaining: 15.5s
2:	learn: 0.5602154	total: 175ms	remaining: 10.9s
3:	learn: 0.5289158	total: 186ms	remaining: 8.64s
4:	learn: 0.5012140	total: 197ms	remaining: 7.3s
5:	learn: 0.4789660	total: 208ms	remaining: 6.39s
6:	learn: 0.4588299	total: 220ms	remaining: 5.74s
7:	learn: 0.4410960	total: 232ms	remaining: 5.27s
8:	learn: 0.4269038	total: 243ms	remaining: 4.89s
9:	learn: 0.4146278	total: 254ms	remaining: 4.58s
10:	learn: 0.4032017	total: 265ms	remaining: 4.32s
11:	learn: 0.3936151	total: 277ms	remaining: 4.11s
12:	learn: 0.3850322	total: 288ms	remaining: 3.92s
13:	learn: 0.3776465	total: 298ms	remaining: 3.75s
14:	learn: 0.3711135	total: 314ms	remaining: 3.66s
15:	learn: 0.3655256	total: 324ms	remaining: 3.52s
16:	learn: 0.3598707	total: 335ms	remaining: 3.4s
17:	learn: 0.3551859	total: 345ms	remaining: 3.3s
18:	learn: 0.3510535	total: 356ms	remaining: 3.21s
19:	learn: 0.3474345	total: 366ms	remaining:

([0.8698255662587868,
  0.8612340536318667,
  0.8687841707888571,
  0.872429054933611,
  0.8799479166666667],
 [0.9052511415525114,
  0.8974212034383954,
  0.9078871617731722,
  0.9076923076923077,
  0.911914172783738],
 [0.8260416666666667,
  0.815625,
  0.8209266007287871,
  0.8292555960437272,
  0.8411458333333334],
 [0.8638344226579521,
  0.8545702592087313,
  0.8622197922361947,
  0.8667029379760609,
  0.875101598482796])

In [10]:
# 모델 선택
selected_model = "RandomForest"

# 모델 선택 및 설정
model_class, param_grid = model_choices[selected_model]

# 모델 하이퍼파라미터 튜닝
best_params, mean_f1_score = perform_model_grid_search(model_class, X_train_sum, y_train, param_grid)

# 모델 평가
evaluate_model_with_best_params_and_threshold(model_class, X_train_sum, y_train, X_test_sum, y_test, best_params)

# ----------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------

Best Hyperparameters: {'max_depth': 14, 'max_features': 'sqrt', 'n_estimators': 200}
Mean F1 Score: 0.8712630582571343
Fold 1
Accuracy: 0.8719083571986462
Precision: 0.9122401847575058
Recall: 0.8229166666666666
F1 score: 0.8652792990142388
Confusion Matrix:
[[1769  152]
 [ 340 1580]]
------------------------------
Fold 2
Accuracy: 0.8633168445717261
Precision: 0.9071803852889667
Recall: 0.809375
F1 score: 0.8554913294797688
Confusion Matrix:
[[1762  159]
 [ 366 1554]]
------------------------------
Fold 3
Accuracy: 0.8677427753189274
Precision: 0.9119533527696793
Recall: 0.8141592920353983
F1 score: 0.8602860286028603
Confusion Matrix:
[[1769  151]
 [ 357 1564]]
------------------------------
Fold 4
Accuracy: 0.8732101015360583
Precision: 0.9111238532110092
Recall: 0.8271733472149921
F1 score: 0.8671214188267394
Confusion Matrix:
[[1765  155]
 [ 332 1589]]
------------------------------
Fold 5
Accuracy: 0.8739583333333333
Precision: 0.9102857142857143
Recall: 0.8296875
F1 score: 0.868

([0.8719083571986462,
  0.8633168445717261,
  0.8677427753189274,
  0.8732101015360583,
  0.8739583333333333],
 [0.9122401847575058,
  0.9071803852889667,
  0.9119533527696793,
  0.9111238532110092,
  0.9102857142857143],
 [0.8229166666666666,
  0.809375,
  0.8141592920353983,
  0.8271733472149921,
  0.8296875],
 [0.8652792990142388,
  0.8554913294797688,
  0.8602860286028603,
  0.8671214188267394,
  0.8681198910081743])

In [16]:
# 모델 선택
selected_model = "SVC"

# 모델 선택 및 설정
model_class, param_grid = model_choices[selected_model]

# 모델 하이퍼파라미터 튜닝
best_params, mean_f1_score = perform_model_grid_search(model_class, X_train_sum, y_train, param_grid)

# 모델 평가
evaluate_model_with_best_params_and_threshold(model_class, X_train_sum, y_train, X_test_sum, y_test, best_params)

# ----------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------

Best Hyperparameters: {'C': 1, 'kernel': 'linear'}
Mean F1 Score: 0.809788263755643


AttributeError: predict_proba is not available when  probability=False