In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from datetime import datetime
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score,f1_score,recall_score,precision_score,confusion_matrix,ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_recall_curve
from sklearn.ensemble import RandomForestClassifier , StackingClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression,Lasso
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import cross_val_score,GridSearchCV

# 한글 깨짐 방지
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'malgun Gothic'

---
## 1.feature selection된 데이터 불러오기

In [2]:
train=pd.read_csv('./Dataset/train_Winsorization.csv',encoding='euc-kr')
test=pd.read_csv('./Dataset/test.csv',encoding='euc-kr')

In [3]:
train['기업수명주기'] = train['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

test['기업수명주기'] = test['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

train['파부비초과여부']=train['파부비초과여부'].astype('category')
test['파부비초과여부']=test['파부비초과여부'].astype('category')
train['파차의초과여부']=train['파차의초과여부'].astype('category')
test['파차의초과여부']=test['파차의초과여부'].astype('category')

In [4]:
train=train[['총자본증가율',
'총자산대비잉여현금흐름',
'당좌자산회전률',
'순운전자본회전률',
'자기자본회전률',
'log자산총계',
'자기자본증가율',
'기업수명주기',
'유동자산회전률',
'총자산대비현금흐름',
'매출액대비잉여현금흐름',
'총자본투자효율',
'총자본순이익률',
'파부비초과여부',
'파차의초과여부','t-1감사의견코드']]

test=test[['총자본증가율',
'총자산대비잉여현금흐름',
'당좌자산회전률',
'순운전자본회전률',
'자기자본회전률',
'log자산총계',
'자기자본증가율',
'기업수명주기',
'유동자산회전률',
'총자산대비현금흐름',
'매출액대비잉여현금흐름',
'총자본투자효율',
'총자본순이익률',
'파부비초과여부',
'파차의초과여부','t-1감사의견코드']]


In [5]:
X_train = train.drop('t-1감사의견코드',axis=1)
X_train = X_train.iloc[:,3:]
y_train = train[['t-1감사의견코드']]
X_test = test.drop('t-1감사의견코드',axis=1)
X_test = X_test.iloc[:,3:]
y_test = test[['t-1감사의견코드']]

In [6]:
scaler = StandardScaler()
train_sc = scaler.fit_transform(X_train)
test_sc = scaler.transform(X_test)

---
## 2. 리셈플링 X

In [45]:
def model_basic(x_train, y_train, x_test, y_test): 
    models = [
        LogisticRegression(C=1,penalty='l2',random_state=0)
        # ,
        # RandomForestClassifier(random_state=0),
        # XGBClassifier(random_state=0),
        # LGBMClassifier(random_state=0)
    ]

    rdict={'model':[],'acc_train':[], 'auc_train':[], 'acc_test':[],'precision':[],'recall':[],'f1_score':[], 'AUC_test':[]}


    for clf in models:
        clf = clf.fit(x_train, y_train)
    #1열:Train
        y_hat = clf.predict(x_train)
        results_train  = (round(accuracy_score(y_train,y_hat),4),round(roc_auc_score(y_train,y_hat),4))
    #2열:Test
        y_hat = clf.predict(x_test)
        results = (round(accuracy_score(y_test,y_hat),4),
                        round(precision_score(y_test,y_hat),4),
                        round(recall_score(y_test,y_hat),4),
                        round(f1_score(y_test,y_hat),4),
                        round(roc_auc_score(y_test,y_hat),4))

        rdict['model'].append(clf); 
        rdict['acc_train'].append(results_train[0])
        rdict['auc_train'].append(results_train[1])
        
        rdict['acc_test'].append(results[0])
        rdict['precision'].append(results[1])
        rdict['recall'].append(results[2])
        rdict['f1_score'].append(results[3])
        rdict['AUC_test'].append(results[4])   

        confusion = confusion_matrix(y_test, y_hat)

        print(confusion)

    rdf_final = pd.DataFrame(data=rdict)
    return rdf_final

In [46]:
model_basic(train_sc, y_train, test_sc, y_test)

[[37906   351]
 [ 2629   239]]


Unnamed: 0,model,acc_train,auc_train,acc_test,precision,recall,f1_score,AUC_test
0,"LogisticRegression(C=1, random_state=0)",0.9283,0.5361,0.9275,0.4051,0.0833,0.1382,0.5371


---
# 3. 리샘플링 O