In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from datetime import datetime
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier , StackingClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import cross_val_score,GridSearchCV

# 한글 깨짐 방지
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'malgun Gothic'

In [5]:
train=pd.read_csv('./Dataset/train_Winsorization.csv',encoding='euc-kr')
test=pd.read_csv('./Dataset/test.csv',encoding='euc-kr')

In [6]:
train['기업수명주기'] = train['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

test['기업수명주기'] = test['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

In [7]:
train['파부비초과여부']=train['파부비초과여부'].astype('category')
test['파부비초과여부']=test['파부비초과여부'].astype('category')
train['파차의초과여부']=train['파차의초과여부'].astype('category')
test['파차의초과여부']=test['파차의초과여부'].astype('category')

In [8]:
train=train[['총자본증가율',
'총자산대비잉여현금흐름',
'당좌자산회전률',
'순운전자본회전률',
'자기자본회전률',
'log자산총계',
'자기자본증가율',
'기업수명주기',
'유동자산회전률',
'총자산대비현금흐름',
'매출액대비잉여현금흐름',
'총자본투자효율',
'총자본순이익률',
'파부비초과여부',
'파차의초과여부','t-1감사의견코드']]

test=test[['총자본증가율',
'총자산대비잉여현금흐름',
'당좌자산회전률',
'순운전자본회전률',
'자기자본회전률',
'log자산총계',
'자기자본증가율',
'기업수명주기',
'유동자산회전률',
'총자산대비현금흐름',
'매출액대비잉여현금흐름',
'총자본투자효율',
'총자본순이익률',
'파부비초과여부',
'파차의초과여부','t-1감사의견코드']]

In [9]:
X_train = train.drop('t-1감사의견코드',axis=1)
y_train = train[['t-1감사의견코드']]
X_test = test.drop('t-1감사의견코드',axis=1)
y_test = test[['t-1감사의견코드']]

In [10]:
from imblearn.over_sampling import SMOTENC

# 카테고리형 변수의 인덱스를 지정 (예: 7번, 13번, 14번 컬럼이 카테고리형 변수인 경우)
categorical_features_indices = [7, 13, 14]

# SMOTENC 객체 생성
smote_nc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=0.3)

# 데이터셋과 타겟을 넣고 샘플링
X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train)


In [11]:
from imblearn.over_sampling import SMOTENC

# 카테고리형 변수의 인덱스를 지정 (예: 7번, 13번, 14번 컬럼이 카테고리형 변수인 경우)
categorical_features_indices = [7, 13, 14]

# SMOTENC 객체 생성
smote_nc = SMOTENC(categorical_features=categorical_features_indices, sampling_strategy=0.3)

# 데이터셋과 타겟을 넣고 샘플링
X_resampled2, y_resampled2 = smote_nc.fit_resample(X_test, y_test)

In [22]:
def model_basic(x_train, y_train, x_test, y_test): 
    models = [
        LogisticRegression(C=0.01,penalty='l2',random_state=0),
        RandomForestClassifier(random_state=0),
        # XGBClassifier(random_state=0,enable_categorical=True ),
        LGBMClassifier(random_state=0)
    ]

    rdict={'model':[],'acc_train':[], 'auc_train':[], 'acc_test':[],'precision':[],'recall':[],'f1_score':[], 'AUC_test':[]}


    for clf in models:
        clf = clf.fit(x_train, y_train)
    #1열:Train
        y_hat = clf.predict(x_train)
        results_train  = (round(accuracy_score(y_train,y_hat),4),round(roc_auc_score(y_train,y_hat),4))
    #2열:Test
        y_hat = clf.predict(x_test)
        results = (round(accuracy_score(y_test,y_hat),4),
                        round(precision_score(y_test,y_hat),4),
                        round(recall_score(y_test,y_hat),4),
                        round(f1_score(y_test,y_hat),4),
                        round(roc_auc_score(y_test,y_hat),4))

        rdict['model'].append(clf); 
        rdict['acc_train'].append(results_train[0])
        rdict['auc_train'].append(results_train[1])
        
        rdict['acc_test'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['AUC_test'].append(results[4])   

        confusion = confusion_matrix(y_test, y_hat)

        print(confusion)

    rdf_final = pd.DataFrame(data=rdict)
    return rdf_final

In [23]:
X_resampled2.shape

(49734, 15)

In [24]:
y_resampled.shape

(166498, 1)

In [25]:
y_resampled2.shape

(49734, 1)

In [26]:
model_basic(X_resampled, y_resampled ,X_resampled2, y_resampled2)

[[36499  1758]
 [ 7039  4438]]
[[36626  1631]
 [ 3569  7908]]
[[36028  2229]
 [ 2904  8573]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,"LogisticRegression(C=0.01, random_state=0)",0.8267,0.6764,0.8231,0.7163,0.3867,0.5022,0.6704
1,"(DecisionTreeClassifier(max_features='sqrt', r...",1.0,1.0,0.8954,0.829,0.689,0.7526,0.8232
2,LGBMClassifier(random_state=0),0.9091,0.8652,0.8968,0.7936,0.747,0.7696,0.8444
