In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import StandardScaler

In [66]:
train=pd.read_csv('./Dataset/Undersampling_0.33_train.csv',index_col=False, encoding='euc-kr')
test=pd.read_csv('./Dataset/Undersampling_0.33_test.csv',index_col=False,encoding='euc-kr')

In [67]:
X_train_int=train[['부채비율', '총자본회전률', '매출액대비잉여현금흐름', 'PBR', '총자산대비영업현금흐름',
'자기자본증가율', '총자본투자효율', '총자본순이익률', '매출액영업이익률']]

X_test_int=test[['부채비율', '총자본회전률', '매출액대비잉여현금흐름', 'PBR', '총자산대비영업현금흐름',
'자기자본증가율', '총자본투자효율', '총자본순이익률', '매출액영업이익률']]

In [68]:
X_train = train.drop('t-1감사의견코드',axis=1)
y_train = train[['t-1감사의견코드']]

X_test = test.drop('t-1감사의견코드',axis=1)
y_test = test[['t-1감사의견코드']]

In [69]:
X_train_sc = X_train
X_test_sc = X_test

In [70]:
X_train_sum =X_train_sc
X_test_sum =X_test_sc

In [71]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np

def perform_logit_grid_search(X_train, y_train, k_fold=5):
    # Stratified k-fold cross validation setup
    cv = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=0)

    # Logistic Regression hyperparameter candidate list setup
    param_grid = {
        'C': [0.1,0.5,1],
        'penalty': ['l1', 'l2', 'elasticnet'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'random_state': [0]
    }

    # Initialize a Logistic Regression model
    model = LogisticRegression()

    # Grid search setup
    grid_search = GridSearchCV(model, param_grid, scoring='f1', cv=cv, verbose=1, n_jobs=-1)

    # Fit the model and tune
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters
    print("Best Hyperparameters:", grid_search.best_params_)

    # Calculate the average evaluation metric
    mean_f1_score = np.mean(grid_search.cv_results_['mean_test_score'])
    print("Mean F1 Score:", mean_f1_score)

    return grid_search.best_params_, mean_f1_score


In [72]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import numpy as np

def evaluate_logit_with_best_params(X_train, y_train, X_test, y_test, best_params, k_fold=5):
    # Stratified k-fold cross validation setup
    cv = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=0)

    # Initialize a Logistic Regression model
    model = LogisticRegression(**best_params)

    # Lists to save the evaluation metrics for each fold
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_score_list = []
    confusion_matrix_list = []

    best_f1_score = 0
    best_model = None

    for fold_idx, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train), 1):
        X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_test_fold, y_test_fold = X_train.iloc[test_idx], y_train.iloc[test_idx]

        # Fit the model
        model.fit(X_train_fold, y_train_fold)

        # Get the predicted probabilities on the test data
        probabilities = model.predict_proba(X_test_fold)

        # Adjust the predicted classes with a threshold of 0.5
        threshold = 0.395
        predicted_classes = (probabilities[:, 1] > threshold).astype(np.int)

        # Calculate the evaluation metrics
        accuracy = accuracy_score(y_test_fold, predicted_classes)
        precision = precision_score(y_test_fold, predicted_classes)
        recall = recall_score(y_test_fold, predicted_classes)
        f1 = f1_score(y_test_fold, predicted_classes)
        conf_matrix = confusion_matrix(y_test_fold, predicted_classes)

        # Append the evaluation metrics for each fold to the lists
        accuracy_list.append(accuracy)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_score_list.append(f1)
        confusion_matrix_list.append(conf_matrix)

        print(f"Fold {fold_idx}")
        print(f"Accuracy: {accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 score: {f1}")
        print("Confusion Matrix:")
        print(conf_matrix)
        print("------------------------------")

        # Save the model with the best F1 score
        if f1 > best_f1_score:
            best_f1_score = f1
            best_model = model


    # Perform the final prediction with the model with the best F1 score
    probabilities_final = best_model.predict_proba(X_test)
    y_pred_final = (probabilities_final[:, 1] > threshold).astype(np.int)

    # Calculate the evaluation metrics
    accuracy_final = accuracy_score(y_test, y_pred_final)
    precision_final = precision_score(y_test, y_pred_final)
    recall_final = recall_score(y_test, y_pred_final)
    f1_final = f1_score(y_test, y_pred_final)
    conf_matrix_final = confusion_matrix(y_test, y_pred_final)

    print("Final Test Results")
    print(f"Accuracy: {accuracy_final}")
    print(f"Precision: {precision_final}")
    print(f"Recall: {recall_final}")
    print(f"F1 score: {f1_final}")
    print("Confusion Matrix:")
    print(conf_matrix_final)

    return accuracy_list, precision_list, recall_list, f1_score_list


In [73]:
best_params, mean_f1_score = perform_logit_grid_search(X_train, y_train, k_fold=7)
evaluate_logit_with_best_params(X_train_int, y_train, X_test_int, y_test, best_params, k_fold=7)

Fitting 7 folds for each of 45 candidates, totalling 315 fits
Best Hyperparameters: {'C': 0.5, 'penalty': 'l2', 'random_state': 0, 'solver': 'liblinear'}
Mean F1 Score: nan
Fold 1
Accuracy: 0.8269230769230769
Precision: 0.6666666666666666
Recall: 0.6153846153846154
F1 score: 0.64
Confusion Matrix:
[[35  4]
 [ 5  8]]
------------------------------
Fold 2
Accuracy: 0.8653846153846154
Precision: 0.8
Recall: 0.6153846153846154
F1 score: 0.6956521739130435
Confusion Matrix:
[[37  2]
 [ 5  8]]
------------------------------
Fold 3
Accuracy: 0.7307692307692307
Precision: 0.42857142857142855
Recall: 0.23076923076923078
F1 score: 0.3
Confusion Matrix:
[[35  4]
 [10  3]]
------------------------------
Fold 4
Accuracy: 0.8076923076923077
Precision: 0.6
Recall: 0.6923076923076923
F1 score: 0.6428571428571429
Confusion Matrix:
[[33  6]
 [ 4  9]]
------------------------------
Fold 5
Accuracy: 0.8653846153846154
Precision: 0.7142857142857143
Recall: 0.7692307692307693
F1 score: 0.7407407407407408
Co

168 fits failed out of a total of 315.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
21 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dgh06\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dgh06\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\dgh06\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver 

([0.8269230769230769,
  0.8653846153846154,
  0.7307692307692307,
  0.8076923076923077,
  0.8653846153846154,
  0.7450980392156863,
  0.8823529411764706],
 [0.6666666666666666,
  0.8,
  0.42857142857142855,
  0.6,
  0.7142857142857143,
  0.4444444444444444,
  1.0],
 [0.6153846153846154,
  0.6153846153846154,
  0.23076923076923078,
  0.6923076923076923,
  0.7692307692307693,
  0.3333333333333333,
  0.5384615384615384],
 [0.64,
  0.6956521739130435,
  0.3,
  0.6428571428571429,
  0.7407407407407408,
  0.380952380952381,
  0.7000000000000001])