In [14]:
import pandas as pd
import numpy as np
# Suppress all warning messages (note: this also hides library deprecation and fit-failure warnings)
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the pre-split train/test CSV files from the Undersampling dataset folder.
# The files are read with the EUC-KR encoding (the column names used below are Korean).
train = pd.read_csv('./Dataset/Undersampling/TomekLinks_0.33_train.csv', encoding='euc-kr')
test = pd.read_csv('./Dataset/Undersampling/TomekLinks_0.33_test.csv', encoding='euc-kr')

In [3]:
# Feature subset used for modelling. Declared once so that the train and
# test matrices are always built from exactly the same columns, in the same order.
selected_feature_columns = [
    '자기자본증가율',
    '자기자본회전률',
    '유동자산회전률',
    '총자산대비잉여현금흐름',
    '자기자본구성비율',
    '순운전자본회전률',
    '총자본증가율',
    'log자산총계',
    '총자본투자효율',
    '총자본순이익률',
    '매출액대비잉여현금흐름',
    '총자산대비현금흐름',
]

X_train_sum = train[selected_feature_columns]
X_test_sum = test[selected_feature_columns]

In [4]:
# Name of the label column that is separated from the features.
target_column = 't-1감사의견코드'

# Feature matrices: every column except the label.
X_train = train.drop(columns=[target_column])
X_test = test.drop(columns=[target_column])

# Labels, kept as single-column DataFrames (double-bracket selection).
y_train = train[[target_column]]
y_test = test[[target_column]]

In [21]:
# ------------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------------

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC


def perform_model_grid_search(model_class, X_train, y_train, param_grid, k_fold=5):
    """Tune ``model_class`` with an exhaustive grid search scored by F1.

    Parameters
    ----------
    model_class : callable
        Called with no arguments to build the estimator that is tuned.
    X_train, y_train
        Training features and labels handed to ``GridSearchCV.fit``.
    param_grid : dict
        Hyperparameter grid passed to ``GridSearchCV``.
    k_fold : int, default 5
        Number of stratified folds (shuffled, ``random_state=0``).

    Returns
    -------
    tuple
        ``(best_params, mean_f1_score)`` where ``mean_f1_score`` is the mean
        cross-validated F1 score of the best hyperparameter combination.
    """
    # Stratified k-fold cross-validation setup
    cv = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=0)

    # Model initialisation with the class defaults
    model = model_class()

    # Grid search configuration
    grid_search = GridSearchCV(model, param_grid, scoring='f1', cv=cv, verbose=0, n_jobs=-1)

    # Fit and tune
    grid_search.fit(X_train, y_train)

    # Report the best hyperparameters
    print("Best Hyperparameters:", grid_search.best_params_)

    # Report the cross-validated mean F1 of the *selected* parameters.
    # Averaging cv_results_['mean_test_score'] would mix in every other
    # candidate of the grid (and turn into NaN if any candidate failed to fit),
    # so it would not describe the parameters that are actually returned.
    mean_f1_score = grid_search.best_score_
    print("Mean F1 Score:", mean_f1_score)

    return grid_search.best_params_, mean_f1_score

def evaluate_model_with_best_params(model_class, X_train, y_train, X_test, y_test, best_params, k_fold=5):
    """Cross-validate a model built from ``best_params`` and test the best fold's model.

    For every stratified fold a fresh estimator is trained on the fold's
    training part and scored on its held-out part (accuracy, precision,
    recall, F1 and confusion matrix, all printed). The estimator of the fold
    with the highest F1 is then used to predict ``X_test`` and the same
    metrics are printed for the test set.

    Parameters
    ----------
    model_class : callable
        Called as ``model_class(**params)``; the result must provide
        ``fit`` and ``predict``.
    X_train, y_train
        Training features and labels. They are sliced with ``.iloc``, so
        pandas objects are expected.
    X_test, y_test
        Hold-out features and labels used for the final report.
    best_params : dict
        Keyword arguments for ``model_class``. ``verbose=0`` is added unless
        ``best_params`` already provides its own ``verbose`` value.
    k_fold : int, default 5
        Number of stratified folds (shuffled, ``random_state=0``).

    Returns
    -------
    tuple of lists
        ``(accuracy_list, precision_list, recall_list, f1_score_list)`` with
        one entry per fold.
    """
    # Stratified k-fold cross-validation setup
    cv = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=0)

    # Quiet training by default; an explicit 'verbose' in best_params wins
    # instead of raising "got multiple values for keyword argument".
    model_params = {'verbose': 0, **best_params}

    # Per-fold metric storage
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_score_list = []

    best_f1_score = None
    best_model = None

    for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train), 1):
        X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_test_fold, y_test_fold = X_train.iloc[test_idx], y_train.iloc[test_idx]

        # A new estimator per fold: re-fitting one shared object would make
        # every stored "best model" reference point at the last fold's fit.
        model = model_class(**model_params)
        model.fit(X_train_fold, y_train_fold)

        # Metrics on the held-out part of this fold
        y_pred_fold = model.predict(X_test_fold)
        accuracy = accuracy_score(y_test_fold, y_pred_fold)
        precision = precision_score(y_test_fold, y_pred_fold)
        recall = recall_score(y_test_fold, y_pred_fold)
        f1 = f1_score(y_test_fold, y_pred_fold)
        conf_matrix = confusion_matrix(y_test_fold, y_pred_fold)

        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_score_list.append(f1)

        print(f"Fold {fold_idx}")
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 score: {f1}")
        print("Confusion Matrix:")
        print(conf_matrix)
        print("------------------------------")

        # Keep the estimator of the best fold. The first fold is always
        # accepted so that a model exists even when every F1 is 0.
        if best_model is None or f1 > best_f1_score:
            best_f1_score = f1
            best_model = model

    # Final prediction with the estimator of the best-scoring fold
    y_pred_final = best_model.predict(X_test)

    # Test-set metrics
    accuracy_final = accuracy_score(y_test, y_pred_final)
    precision_final = precision_score(y_test, y_pred_final)
    recall_final = recall_score(y_test, y_pred_final)
    f1_final = f1_score(y_test, y_pred_final)
    conf_matrix_final = confusion_matrix(y_test, y_pred_final)

    print("Final Test Results")
    print(f"Accuracy: {accuracy_final}")
    print(f"Precision: {precision_final}")
    print(f"Recall: {recall_final}")
    print(f"F1 score: {f1_final}")
    print("Confusion Matrix:")
    print(conf_matrix_final)

    return accuracy_list, precision_list, recall_list, f1_score_list
# --------------------------------------------------------------------------------------------
# --------------------------------------------------------------------------------------------
# 모델 선택 및 하이퍼파라미터 그리드 설정
# Maps a model name to (estimator class, hyperparameter grid for GridSearchCV).
model_choices = {
    "LogisticRegression": (LogisticRegression, {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        # The default lbfgs solver does not support the L1 penalty, so every
        # 'l1' candidate would fail to fit; liblinear handles both penalties.
        'solver': ['liblinear']
    }),
    "RandomForest": (RandomForestClassifier, {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        # 'auto' was only an alias of 'sqrt' for classifiers and has been
        # removed from recent scikit-learn releases; 'log2' is searched instead.
        'max_features': ['sqrt', 'log2']
    }),
    "XGBoost": (xgb.XGBClassifier, {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'n_estimators': [100, 200, 300]
    }),
    "LightGBM": (LGBMClassifier, {
        'max_depth': [12,14],
        'learning_rate': [0.082, 0.08,0.083,],
        'n_estimators': [120,100,110]
    }),
    "CatBoost": (CatBoostClassifier, {
        'depth': [10,12,14],
        'learning_rate': [0.09, 0.1,0.12],
        'iterations': [120,140,160]
    }),
    "SVC": (SVC, {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    })
}


In [22]:
# Choose which entry of model_choices to run
selected_model = "CatBoost"

# Look up the estimator class and its hyperparameter grid
model_class, param_grid = model_choices[selected_model]

# Hyperparameter tuning on the selected feature subset
best_params, mean_f1_score = perform_model_grid_search(model_class, X_train_sum, y_train, param_grid)

# Per-fold evaluation plus final test-set report; as the last expression of the
# cell, the returned per-fold metric lists are also displayed as the cell output
evaluate_model_with_best_params(model_class, X_train_sum, y_train, X_test_sum, y_test, best_params)

# ----------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------

0:	learn: 0.6366125	total: 59.9ms	remaining: 8.33s
1:	learn: 0.5886020	total: 105ms	remaining: 7.26s
2:	learn: 0.5475953	total: 143ms	remaining: 6.52s
3:	learn: 0.5134040	total: 180ms	remaining: 6.11s
4:	learn: 0.4845862	total: 218ms	remaining: 5.89s
5:	learn: 0.4592335	total: 261ms	remaining: 5.82s
6:	learn: 0.4368703	total: 311ms	remaining: 5.92s
7:	learn: 0.4174644	total: 351ms	remaining: 5.79s
8:	learn: 0.4010262	total: 392ms	remaining: 5.71s
9:	learn: 0.3867007	total: 441ms	remaining: 5.73s
10:	learn: 0.3742173	total: 494ms	remaining: 5.79s
11:	learn: 0.3624524	total: 536ms	remaining: 5.72s
12:	learn: 0.3522980	total: 589ms	remaining: 5.75s
13:	learn: 0.3431179	total: 632ms	remaining: 5.68s
14:	learn: 0.3351459	total: 673ms	remaining: 5.61s
15:	learn: 0.3276018	total: 720ms	remaining: 5.58s
16:	learn: 0.3216180	total: 768ms	remaining: 5.55s
17:	learn: 0.3159459	total: 815ms	remaining: 5.53s
18:	learn: 0.3110117	total: 859ms	remaining: 5.47s
19:	learn: 0.3067350	total: 898ms	remain

([0.8931524547803618,
  0.8997416020671835,
  0.896124031007752,
  0.9007623723995348,
  0.892363354438558],
 [0.7974959172563963,
  0.817526344980588,
  0.811142061281337,
  0.8147540983606557,
  0.7856016815554387],
 [0.7630208333333334,
  0.76730869338886,
  0.7579385736595523,
  0.7765625,
  0.7786458333333334],
 [0.7798775618844823,
  0.7916219119226637,
  0.7836383207750269,
  0.7952,
  0.7821082919173424])