In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [2]:
train = pd.read_csv('./Dataset/train_Winsorization.csv', encoding='euc-kr')
test = pd.read_csv('./Dataset/test.csv', encoding='euc-kr')

In [3]:
train['기업수명주기'] = train['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

test['기업수명주기'] = test['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

In [4]:
len(train.columns)

50

In [5]:
train_int = train[['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
      '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률', '재고자산회전률',
       '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율', '순이익증가율',
       '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율', '부가가치율',
       '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계',
       'OCF이자보상배율', '부채상환계수', '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름',
       '매출액대비잉여현금흐름', '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름',
       't-1감사의견코드']]
test_int = test[['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
      '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률', '재고자산회전률',
       '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율', '순이익증가율',
       '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율', '부가가치율',
       '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계',
       'OCF이자보상배율', '부채상환계수', '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름',
       '매출액대비잉여현금흐름', '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름',
       't-1감사의견코드']]

# Standard Scaler

In [6]:
from sklearn.preprocessing import StandardScaler
X_train = train_int.drop('t-1감사의견코드', axis=1)
y_train = train_int[['t-1감사의견코드']]

X_test = test_int.drop('t-1감사의견코드', axis=1)
y_test = test_int[['t-1감사의견코드']]

In [7]:
train_cat = train[['기업수명주기','이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부']]
test_cat = test[['기업수명주기','이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부']]

In [8]:
scaler = StandardScaler()
train_sc = scaler.fit_transform(X_train)
test_sc = scaler.transform(X_test)

train_sc = pd.DataFrame(train_sc, columns=X_train.columns)
test_sc = pd.DataFrame(test_sc, columns=X_test.columns)

# 스케일링 + 카테고리 변수 +타겟 데이터프레임
train_sc_total = pd.concat([train_sc,train_cat ,y_train], axis=1)
test_sc_total = pd.concat([test_sc, test_cat,y_test], axis=1)

- train : 50개
- train_int : 41개(타겟 포함)
- train_cat : 6개
- train_sc_total : 47개(타겟 포함)
- '회사명', '거래소코드', 'Year' 제외 (3개)

---

# Undersampling

In [9]:
from collections import Counter

In [10]:
print(train_sc_total['t-1감사의견코드'].value_counts())
print(test_sc_total['t-1감사의견코드'].value_counts())

t-1감사의견코드
0.0    128076
1.0      9602
Name: count, dtype: int64
t-1감사의견코드
0.0    38257
1.0     2868
Name: count, dtype: int64


In [11]:
train_sc_total.drop('t-1감사의견코드',axis=1)

Unnamed: 0,부채비율,당좌비율,유동비율,이자보상배율,차입금의존도,자기자본구성비율,매출액영업이익률,자기자본순이익률,총자본순이익률,총자본회전률,...,매출액대비잉여현금흐름,총자산대비현금흐름,총자산대비영업현금흐름,총자산대비잉여현금흐름,기업수명주기,이보배초과여부,파부비초과여부,파당비초과여부,파차의초과여부,파로이초과여부
0,0.298383,0.173476,0.082420,-0.252413,-1.049208,-1.162786,-1.581511,-11.727650,-5.864452,-0.679320,...,0.000194,8.381664,-1.599831,-0.843661,5,0,1,0,0,1
1,-0.228212,-0.232380,-0.262760,-0.245176,0.250579,0.331320,0.192509,0.060807,0.210495,-0.561237,...,0.117963,-0.112725,0.335578,0.424888,3,0,0,0,0,0
2,-0.093818,-0.172586,-0.213088,-0.313191,-1.247746,-0.700586,0.022235,-0.309371,-0.362267,-0.722824,...,-0.038364,-4.234050,-3.140765,-2.132309,5,0,1,0,0,0
3,-0.154482,-0.251870,-0.278951,-0.243238,1.053533,-0.438039,0.333791,0.803830,0.997786,-0.536377,...,-0.055781,-1.765124,0.004584,-3.364207,2,0,0,0,1,0
4,-0.097999,-0.287596,-0.194989,-0.243697,-1.190369,-0.687056,0.207911,5.605906,1.659084,-0.057830,...,0.125221,-0.141415,2.120649,1.949162,4,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137673,-0.093123,-0.209503,-0.189677,-0.243783,-0.608773,-0.702780,0.116263,0.819827,0.444094,0.582305,...,0.110991,-0.158204,0.057808,0.339583,3,0,1,0,0,0
137674,-0.215400,-0.116949,-0.154168,-0.246047,-0.425558,0.122525,0.108682,-0.002059,-0.013570,1.104357,...,0.111585,-0.331277,0.438023,0.675581,4,0,0,0,0,0
137675,-0.254527,-0.006997,0.016630,-0.230915,-0.635506,1.003777,0.172913,0.691009,2.212775,0.458007,...,0.115423,2.250956,1.243888,1.053845,2,0,0,0,0,0
137676,-0.265038,0.063887,-0.007410,-0.255136,-1.015303,1.441477,0.098965,-0.106601,-0.142288,-0.238062,...,0.113132,0.572191,0.056037,0.411884,4,0,0,0,0,0


In [12]:
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import RandomUnderSampler

# 't-1감사의견코드' 열을 제외한 데이터와 해당 열만을 추출합니다.
train_X = train_sc_total.drop('t-1감사의견코드', axis=1)
train_y = train_sc_total['t-1감사의견코드']

# Tomel links 방법 적용
undersampler = TomekLinks()
train_X_resampled, train_y_resampled = undersampler.fit_resample(train_X, train_y)

# 원하는 샘플링 비율로 데이터셋을 다시 조정합니다.
ratio = 0.25  # 4:1 비율을 의미합니다.
custom_undersampler = RandomUnderSampler(sampling_strategy=ratio, random_state=42)
train_X_resampled, train_y_resampled = custom_undersampler.fit_resample(train_X_resampled, train_y_resampled)


In [13]:
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import RandomUnderSampler

# 't-1감사의견코드' 열을 제외한 데이터와 해당 열만을 추출합니다.
test_X = test_sc_total.drop('t-1감사의견코드', axis=1)
test_y = test_sc_total['t-1감사의견코드']

# TomekLinks 방법 적용
undersampler = TomekLinks()
test_X_resampled, test_y_resampled = undersampler.fit_resample(test_X, test_y)

# 원하는 샘플링 비율로 데이터셋을 다시 조정합니다.
ratio = 0.25  # 4:1 비율을 의미합니다.
custom_undersampler = RandomUnderSampler(sampling_strategy=ratio, random_state=42)
test_X_resampled, test_y_resampled = custom_undersampler.fit_resample(test_X_resampled, test_y_resampled)


In [14]:
print(train_y_resampled.value_counts())
print(test_y_resampled.value_counts())

t-1감사의견코드
0.0    38408
1.0     9602
Name: count, dtype: int64
t-1감사의견코드
0.0    11472
1.0     2868
Name: count, dtype: int64


---


- 데이터 프레임 정리

In [16]:
train_y_resampled=pd.DataFrame(train_y_resampled,columns=['t-1감사의견코드'])
test_y_resampled=pd.DataFrame(test_y_resampled,columns=['t-1감사의견코드'])

In [17]:
train_int_resampled = train_X_resampled.drop(['기업수명주기','이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부'],axis=1)
test_int_resampled = test_X_resampled.drop(['기업수명주기','이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부'],axis=1)
train_cat_resampled= train_X_resampled[['기업수명주기','이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부']]
test_cat_resampled= test_X_resampled[['기업수명주기','이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부']]

In [18]:
train_resampled_sum = pd.concat([train_X_resampled,train_y_resampled],axis=1)
test_resampled_sum = pd.concat([test_X_resampled,test_y_resampled],axis=1)

train_int_resampled_sum = pd.concat([train_int_resampled,train_y_resampled],axis=1)
train_cat_resampled_sum =pd.concat([train_cat_resampled,train_y_resampled],axis=1)
test_cat_resampled_sum=pd.concat([test_cat_resampled,test_y_resampled],axis=1)

---

# MDA

In [18]:
# 1-1 정규성 테스트(샤피로)
from scipy.stats import norm
from scipy import stats
from statsmodels.formula.api import ols
from scipy.stats import kstest

for j in train_int_resampled_sum.columns:
    a = stats.shapiro(train_int_resampled_sum[j])
    p = a.pvalue
    print(j,a)

부채비율 ShapiroResult(statistic=0.25084060430526733, pvalue=0.0)
당좌비율 ShapiroResult(statistic=0.2566691040992737, pvalue=0.0)
유동비율 ShapiroResult(statistic=0.2560475468635559, pvalue=0.0)
이자보상배율 ShapiroResult(statistic=0.27902841567993164, pvalue=0.0)
차입금의존도 ShapiroResult(statistic=0.9452553391456604, pvalue=0.0)
자기자본구성비율 ShapiroResult(statistic=0.9503960609436035, pvalue=0.0)
매출액영업이익률 ShapiroResult(statistic=0.14688855409622192, pvalue=0.0)
자기자본순이익률 ShapiroResult(statistic=0.5209904313087463, pvalue=0.0)
총자본순이익률 ShapiroResult(statistic=0.7702715396881104, pvalue=0.0)
총자본회전률 ShapiroResult(statistic=0.6647973656654358, pvalue=0.0)
자기자본회전률 ShapiroResult(statistic=0.3933231234550476, pvalue=0.0)
운전자본회전률 ShapiroResult(statistic=0.11871939897537231, pvalue=0.0)
순운전자본회전률 ShapiroResult(statistic=0.7271249294281006, pvalue=0.0)
재고자산회전률 ShapiroResult(statistic=0.18216991424560547, pvalue=0.0)
당좌자산회전률 ShapiroResult(statistic=0.5794222354888916, pvalue=0.0)
유동자산회전률 ShapiroResult(statistic=0.635965108

In [19]:
# 1-2 정규성 테스트(K-S)
for j in train_resampled_sum.columns:
    a = kstest(train_resampled_sum[j],'norm')
    p = a.pvalue
    print(j,a)

부채비율 KstestResult(statistic=0.3906751098050474, pvalue=0.0, statistic_location=-0.2775598980715014, statistic_sign=-1)
당좌비율 KstestResult(statistic=0.36984663905227333, pvalue=0.0, statistic_location=-0.3322595537125994, statistic_sign=-1)
유동비율 KstestResult(statistic=0.36477221111440783, pvalue=0.0, statistic_location=-0.34573161526781654, statistic_sign=-1)
이자보상배율 KstestResult(statistic=0.4484498220678388, pvalue=0.0, statistic_location=-0.1911027722638315, statistic_sign=1)
차입금의존도 KstestResult(statistic=0.10606206285734993, pvalue=0.0, statistic_location=-1.247745901800868, statistic_sign=-1)
자기자본구성비율 KstestResult(statistic=0.08004376611825903, pvalue=5.237534018092004e-268, statistic_location=-0.4358448028582186, statistic_sign=1)
매출액영업이익률 KstestResult(statistic=0.41790692664383744, pvalue=0.0, statistic_location=-0.010666213688669017, statistic_sign=-1)
자기자본순이익률 KstestResult(statistic=0.2688878578967383, pvalue=0.0, statistic_location=-0.248750634732277, statistic_sign=-1)
총자본순이익률 K

-p-value가 0 이는 데이터개수가 많아서 p-value 자체가 너무작아 계산이 불가능하다고 판단. 중심극한 정리에 의해서 정규성이 있다고 가정하고 진행

In [20]:
# 부도기업과 정상기업의 피처별 등분산비교 (정규성 가정으로 bartlett)
Bad = train_int_resampled_sum[train_int_resampled_sum['t-1감사의견코드']== 1] #Existing Customer
Good = train_int_resampled_sum[train_int_resampled_sum['t-1감사의견코드']== 0] #Attrited Customer

c = []
for i in train_int_resampled_sum:
    # lresult = stats.levene(close[i], normal[i])
    lresult = stats.bartlett(Bad[i], Good[i])
    c.append([i,lresult[-1]])

c= pd.DataFrame(c)
c.columns=["피처값",'F-test']
c

Unnamed: 0,피처값,F-test
0,부채비율,9.289546e-235
1,당좌비율,0.0
2,유동비율,2.42445e-304
3,이자보상배율,7.119720999999999e-212
4,차입금의존도,0.0001954632
5,자기자본구성비율,1.44078e-83
6,매출액영업이익률,0.9095311
7,자기자본순이익률,0.0
8,총자본순이익률,0.0
9,총자본회전률,0.0


In [21]:
# F-test 결과 0.05 이상이면 homo 0.05 이하이면 hetero
c["분산"] =''
c["T-test"] =""
for i in c.index:
    if c.loc[i,"F-test"]>=0.05:
        c.loc[i,"분산"] = "homo"
    else:
        c.loc[i,"분산"] = "hetero"
c

Unnamed: 0,피처값,F-test,분산,T-test
0,부채비율,9.289546e-235,hetero,
1,당좌비율,0.0,hetero,
2,유동비율,2.42445e-304,hetero,
3,이자보상배율,7.119720999999999e-212,hetero,
4,차입금의존도,0.0001954632,hetero,
5,자기자본구성비율,1.44078e-83,hetero,
6,매출액영업이익률,0.9095311,homo,
7,자기자본순이익률,0.0,hetero,
8,총자본순이익률,0.0,hetero,
9,총자본회전률,0.0,hetero,


In [22]:
c[c["분산"]=='homo']

Unnamed: 0,피처값,F-test,분산,T-test
6,매출액영업이익률,0.909531,homo,
24,총자본투자효율,0.24135,homo,
34,연구개발비대비매출액,0.520433,homo,
37,총자산대비현금흐름,0.906221,homo,


In [23]:
# homo 인 feature 는 student t-test, hetero이면 Welchs T-Test 진행
c["분산"] =''
c["T-test"] =""
for i in c.index:
    if c.loc[i,"F-test"]>=0.05:
        c.loc[i,"분산"] = "homo"
        result = stats.ttest_ind(Bad[c.loc[i,"피처값"]], Good[c.loc[i,"피처값"]], equal_var=True)       ## equal_var = True Student T-test
        c.loc[i,"T-test"] = result[-1]
        print(Bad[c.loc[i,"피처값"]])
    else:
        c.loc[i,"분산"] = "hetero"
        result = stats.ttest_ind(Bad[c.loc[i,"피처값"]], Good[c.loc[i,"피처값"]], equal_var=False)      ## equal_var = False Welchs T-Test
        c.loc[i,"T-test"] = result[-1]
c

38408    -1.581632
38409   -12.349735
38410     0.029493
38411    -0.025061
38412    -0.125942
           ...    
48005     0.100578
48006     0.104771
48007     0.102997
48008     0.102392
48009     0.096989
Name: 매출액영업이익률, Length: 9602, dtype: float64
38408   -0.614022
38409   -0.776401
38410   -0.463187
38411   -0.746979
38412   -0.010685
           ...   
48005   -0.460953
48006   -0.295594
48007    0.117431
48008    0.542374
48009    0.197503
Name: 총자본투자효율, Length: 9602, dtype: float64
38408   -0.258036
38409   -0.258036
38410   -0.258036
38411   -0.258036
38412   -0.258036
           ...   
48005   -0.258036
48006   -0.258036
48007   -0.258036
48008   -0.258036
48009   -0.258036
Name: 연구개발비대비매출액, Length: 9602, dtype: float64
38408   -0.123710
38409   -0.069908
38410   -0.031936
38411   -0.060454
38412    1.673774
           ...   
48005    0.262653
48006    0.315837
48007    0.849720
48008   -0.197040
48009   -0.121098
Name: 총자산대비현금흐름, Length: 9602, dtype: float64


Unnamed: 0,피처값,F-test,분산,T-test
0,부채비율,9.289546e-235,hetero,0.0
1,당좌비율,0.0,hetero,0.0
2,유동비율,2.42445e-304,hetero,0.0
3,이자보상배율,7.119720999999999e-212,hetero,0.0
4,차입금의존도,0.0001954632,hetero,0.0
5,자기자본구성비율,1.44078e-83,hetero,0.0
6,매출액영업이익률,0.9095311,homo,0.041937
7,자기자본순이익률,0.0,hetero,0.00045
8,총자본순이익률,0.0,hetero,0.0
9,총자본회전률,0.0,hetero,0.0


In [24]:
# 0.05 이상이면 유의미하지 않으므로 0.05 이하인 것만 추출
d = c[c["T-test"]<= 0.05]
d.sort_values('T-test',ascending=False)["피처값"].unique()
# d.sort_values('T-test',ascending=False)["피처값"]

array(['매출액영업이익률', '운전자본회전률', '재고자산회전률', '총자본투자효율', '자기자본순이익률',
       '매출액대비잉여현금흐름', '이윤분배율', '매출액대비금융비용상환능력', '노동소득분배율', '자본분배율',
       '부가가치율', '장기부채상환능력', '유동비율', '총자산대비현금흐름', '이자보상배율', '부채비율',
       '총자본순이익률', '유형자산증가율', '당좌비율', '총자산대비영업현금흐름', '매출액증가율', 'OCF이자보상배율',
       '재고자산증가율', '영업이익증가율', '유동자산증가율', '순이익증가율', '자기자본증가율', '총자본증가율',
       '차입금의존도', '자기자본회전률', '유동자산회전률', 'log자산총계', '당좌자산회전률', '순운전자본회전률',
       '총자본회전률', '자기자본구성비율', '총자산대비잉여현금흐름', 't-1감사의견코드'], dtype=object)

In [25]:
d.sort_values('T-test',ascending=False).dropna()

Unnamed: 0,피처값,F-test,분산,T-test
6,매출액영업이익률,0.9095311,homo,0.041937
11,운전자본회전률,5.596723e-151,hetero,0.014824
13,재고자산회전률,9.090963e-276,hetero,0.008474
24,총자본투자효율,0.2413503,homo,0.002351
7,자기자본순이익률,0.0,hetero,0.00045
36,매출액대비잉여현금흐름,3.395141e-26,hetero,0.00015
28,이윤분배율,2.538784e-84,hetero,7.5e-05
33,매출액대비금융비용상환능력,0.0,hetero,1.1e-05
26,노동소득분배율,3.065944e-37,hetero,3e-06
27,자본분배율,1.656922e-30,hetero,0.0


In [26]:
d['피처값'].values

array(['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
       '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률',
       '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율',
       '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율',
       '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계', 'OCF이자보상배율',
       '장기부채상환능력', '매출액대비금융비용상환능력', '매출액대비잉여현금흐름', '총자산대비현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름', 't-1감사의견코드'], dtype=object)

In [27]:

fea = ['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
       '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률',
       '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율',
       '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율',
       '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계', 'OCF이자보상배율',
       '장기부채상환능력', '매출액대비금융비용상환능력', '매출액대비잉여현금흐름', '총자산대비현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름', 't-1감사의견코드']
mda_feature = train_resampled_sum[fea]
mda_feature

Unnamed: 0,부채비율,당좌비율,유동비율,이자보상배율,차입금의존도,자기자본구성비율,매출액영업이익률,자기자본순이익률,총자본순이익률,총자본회전률,...,이윤분배율,log자산총계,OCF이자보상배율,장기부채상환능력,매출액대비금융비용상환능력,매출액대비잉여현금흐름,총자산대비현금흐름,총자산대비영업현금흐름,총자산대비잉여현금흐름,t-1감사의견코드
0,-0.250439,-0.067503,0.003614,-0.246993,-0.547158,0.867750,0.097957,-0.108987,-0.165444,-0.182128,...,0.061987,-0.633804,-0.246720,0.054645,0.164870,0.111182,0.360514,0.035409,0.235663,0.0
1,-0.265341,0.177080,0.077385,1.738226,-1.247746,1.456470,0.115376,0.117639,0.689271,0.644454,...,0.074153,-0.831143,1.814798,-0.039992,0.049888,0.111854,0.606382,0.869524,0.540841,0.0
2,-0.230371,-0.089246,-0.136576,-0.244825,-1.247746,0.372274,0.094731,-0.169186,-0.328214,1.669913,...,-0.047341,-0.266465,-0.247428,-0.039992,0.435532,0.110080,-0.111170,-0.103463,0.131736,0.0
3,-0.106322,-0.181713,-0.193891,-0.246055,-0.769169,-0.658535,0.097352,0.047757,-0.061924,0.930340,...,0.210364,-0.088285,-0.250441,-0.039992,0.182314,0.109531,-0.487379,-0.399237,-0.026219,0.0
4,2.519262,-0.213020,-0.245579,-0.239757,-1.130710,-1.374140,0.099207,-2.563839,-2.439463,0.476652,...,0.037247,1.178310,-0.248790,-0.041700,1.184598,0.110085,-0.475827,-0.213286,0.128986,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48005,1.343621,-0.199881,-0.221781,1.738226,1.445392,-1.332454,0.100578,2.808800,0.154649,1.589119,...,0.448428,-0.883544,-0.412296,-0.039992,-14.735013,0.099393,0.262653,-2.958654,-2.005218,1.0
48006,-0.165591,-0.203398,-0.216638,-0.229166,-0.962816,-0.367831,0.104771,0.123813,0.123321,0.209411,...,0.218533,-0.522700,-0.285575,-0.039992,-0.548727,0.113594,0.315837,-0.585401,0.419038,1.0
48007,-0.255627,0.031301,0.019563,-0.227858,-1.106911,1.043269,0.102997,-0.043034,0.047724,0.433148,...,0.161465,-0.851760,-0.235670,-0.039992,-0.474477,0.110941,0.849720,0.036248,0.303795,1.0
48008,0.006268,-0.256943,-0.270608,-0.246838,0.803812,-0.922910,0.102392,1.168115,0.641598,3.316862,...,0.158104,-1.094728,-0.247405,-0.001196,0.105620,0.109466,-0.197040,0.738049,-0.080856,1.0


In [28]:
def vif(data):
    import pandas as pd
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # VIF 출력을 위한 데이터 프레임 형성
    vif = pd.DataFrame()

    # VIF 값과 각 Feature 이름에 대해 설정
    vif["VIF Factor"] = [variance_inflation_factor(data.values, i) for i in range(len(data.columns))]
    vif["features"] = data.columns

    # VIF 값이 높은 순으로 정렬
    vif = vif.sort_values(by="VIF Factor", ascending=False)
    vif = vif.reset_index().drop(columns='index')

    return vif

vif(mda_feature)

Unnamed: 0,VIF Factor,features
0,10.096441,당좌비율
1,9.908995,유동비율
2,7.20673,자본분배율
3,7.116331,노동소득분배율
4,5.420327,순운전자본회전률
5,4.616109,총자본회전률
6,4.448245,유동자산회전률
7,3.393592,당좌자산회전률
8,3.192621,자기자본구성비율
9,3.147623,차입금의존도


In [29]:
fea_hetero= ['부채비율', '당좌비율',  '이자보상배율', '차입금의존도', '자기자본구성비율', 
       '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률',
       '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율',
       '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', 
       '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계', 'OCF이자보상배율',
       '장기부채상환능력', '매출액대비금융비용상환능력', '매출액대비잉여현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름']
fea_homo=['매출액영업이익률','총자본투자효율','총자산대비현금흐름']

In [30]:
# Welch's t-test 수행
t_stat, p_value = stats.ttest_ind(Bad[fea_hetero], Good[fea_hetero], equal_var=False)
result_df_hetero = pd.DataFrame({'t-statistic': t_stat, 'p-value': p_value}, index=fea_hetero)

# t-test 수행
t_stat, p_value = stats.ttest_ind(Bad[fea_homo], Good[fea_homo], equal_var=True)
result_df_homo = pd.DataFrame({'t-statistic': t_stat, 'p-value': p_value}, index=fea_homo)

result_df = pd.concat([result_df_hetero,result_df_homo],axis=0)


result_df = result_df.sort_values('p-value', ascending=True).reset_index()

In [31]:
result_df = result_df[['index', 'p-value']]
result_df.columns = ['Variable', 'p-value']

---

# Chi 2

* 카이제곱 검정 조건
    * 종속변인은 범주형 자료여야 한다.
    * 기대빈도가 5이하인 셀이 전체의 20%가 넘지 않아야 한다.
    * 각 칸의 빈도는 다른 칸의 빈도와 독립적이어야 한다.

In [32]:
from scipy.stats import chi2_contingency

# 기대빈도가 5 이하인 항목의 비율을 저장할 딕셔너리
expected_freq_5_ratio = {}

# 각 독립 변수에 대해 기대빈도 계산 및 비율 확인
for column in train_cat_resampled_sum.columns[:-1]:
    # 교차 테이블 생성
    contingency_table = pd.crosstab(train_cat_resampled_sum[column], train_cat_resampled_sum['t-1감사의견코드'])

    # 카이제곱 검정 수행
    chi2, p_value, dof, expected_freq = chi2_contingency(contingency_table)

    # 기대빈도가 5 이하인 항목의 비율 계산
    expected_freq_5 = (expected_freq <= 5).mean()

    # 결과 저장
    expected_freq_5_ratio[column] = expected_freq_5

# 결과 출력
for column, ratio in expected_freq_5_ratio.items():
    print(f"변수 '{column}'의 기대빈도가 5 이하인 항목 비율: {ratio}")

변수 '기업수명주기'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '이보배초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '파부비초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '파당비초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '파차의초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '파로이초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0


In [33]:
# 독립변수와 종속변수 간의 카이제곱 검정 수행
chi2_scores = []

for column in train_cat_resampled_sum.columns:
    # 교차 테이블 생성
    contingency_table = pd.crosstab(train_cat_resampled_sum[column], test_cat_resampled_sum['t-1감사의견코드'])
    chi2, p_value, dof, expected_freq = chi2_contingency(contingency_table)
    # 카이제곱 통계량(chi2), p-value(p_value), 자유도(dof), 예상빈도(expected_freq)
    print(p_value)
    chi2_scores.append((column, chi2))

# 카이제곱 검정 결과를 기준으로 변수 정렬
sorted_features = sorted(chi2_scores, key=lambda x : x[1], reverse=True)

# 선택된 변수 출력
sorted_features

0.03432644920768823
0.28023165335659916
0.8401590681003678
0.5633064213462493
0.7914532085623038
0.28821070917838865
1.0


[('기업수명주기', 10.391387147798698),
 ('이보배초과여부', 1.1659660900683912),
 ('파로이초과여부', 1.1279608610413745),
 ('파당비초과여부', 0.33401157410732923),
 ('파차의초과여부', 0.06992005971443019),
 ('파부비초과여부', 0.04067886264793981),
 ('t-1감사의견코드', 0.0)]

In [34]:
chi2_scores

[('기업수명주기', 10.391387147798698),
 ('이보배초과여부', 1.1659660900683912),
 ('파부비초과여부', 0.04067886264793981),
 ('파당비초과여부', 0.33401157410732923),
 ('파차의초과여부', 0.06992005971443019),
 ('파로이초과여부', 1.1279608610413745),
 ('t-1감사의견코드', 0.0)]

In [35]:
import pandas as pd
from scipy.stats import chi2_contingency

# 독립변수와 종속변수 간의 카이제곱 검정 수행
chi2_scores = []

p_values = []
for column in test_cat_resampled_sum.columns:
    # 교차 테이블 생성
    contingency_table = pd.crosstab(test_cat_resampled_sum[column], test_cat_resampled_sum['t-1감사의견코드'])
    chi2, p_value, dof, expected_freq = chi2_contingency(contingency_table)
    p_values.append(p_value)
    chi2_scores.append((column, chi2))

# p-value가 0.05보다 작은 값을 출력하는 데이터프레임 생성
result_df_1 = pd.DataFrame({'Variable': test_cat_resampled_sum.columns, 'p-value': p_values})
filtered_df_chi = result_df_1[result_df_1['p-value'] < 0.05]

filtered_df_chi


Unnamed: 0,Variable,p-value
0,기업수명주기,1.0010070000000001e-162
2,파부비초과여부,1.343308e-94
4,파차의초과여부,1.051892e-105
5,파로이초과여부,5.1832440000000007e-23
6,t-1감사의견코드,0.0


In [36]:
result = pd.concat([result_df, filtered_df_chi], axis=0)

In [37]:
result.sort_values('p-value', ascending=True).reset_index(drop=True)

Unnamed: 0,Variable,p-value
0,순운전자본회전률,0.0
1,log자산총계,0.0
2,당좌자산회전률,0.0
3,t-1감사의견코드,0.0
4,자기자본구성비율,0.0
5,총자산대비잉여현금흐름,0.0
6,총자본회전률,0.0
7,유동자산회전률,1.004416e-297
8,자기자본회전률,7.849399e-271
9,차입금의존도,4.3729479999999996e-260


---
# feature개수 정하기 위한 Logit

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
import statsmodels.api as sm
import numpy as np
lr_clf = LogisticRegression()

feature = train_resampled_sum.drop(['t-1감사의견코드'],axis=1)
target = train_int_resampled_sum[['t-1감사의견코드']]

logit = SelectFromModel(LogisticRegression())
logit.fit(feature, target)
logit_support = logit.get_support()
lr_feature = feature.loc[:,logit_support].columns.tolist()

In [50]:
lr_feature

['자기자본구성비율',
 '총자본순이익률',
 '순운전자본회전률',
 '당좌자산회전률',
 '유동자산회전률',
 '총자본증가율',
 '자기자본증가율',
 '순이익증가율',
 '총자본투자효율',
 'log자산총계',
 '총자산대비현금흐름',
 '총자산대비잉여현금흐름',
 '파당비초과여부',
 '파차의초과여부',
 '파로이초과여부']

In [51]:
len(lr_feature)

15

---

### Embedded Method

> Randomforeset

In [52]:
from sklearn.ensemble import RandomForestClassifier

In [59]:
selector = SelectFromModel(estimator=RandomForestClassifier(), threshold=0.0192).fit(feature, target)
rf = selector.get_support()
count = np.count_nonzero(rf)
count

15

In [60]:
rf_features = feature.loc[:, rf].columns.tolist()
rf_features

['자기자본순이익률',
 '총자본회전률',
 '자기자본회전률',
 '순운전자본회전률',
 '재고자산회전률',
 '당좌자산회전률',
 '유동자산회전률',
 '매출액증가율',
 '총자본증가율',
 '자기자본증가율',
 '유형자산증가율',
 '유동자산증가율',
 'log자산총계',
 '매출액대비잉여현금흐름',
 '총자산대비잉여현금흐름']

> LASSO

In [61]:
lasso = SelectFromModel(estimator=LogisticRegression(penalty='l1', solver='liblinear', C=0.00185)).fit(feature, target)
lasso_support = lasso.get_support()
lasso_feature = feature.loc[:,lasso_support].columns.tolist()

In [62]:
len(lasso_feature)

15

In [63]:
lasso_feature

['자기자본구성비율',
 '총자본순이익률',
 '총자본회전률',
 '자기자본회전률',
 '순운전자본회전률',
 '유동자산회전률',
 '총자본증가율',
 '자기자본증가율',
 '영업이익증가율',
 '총자본투자효율',
 'log자산총계',
 '매출액대비잉여현금흐름',
 '총자산대비현금흐름',
 '총자산대비잉여현금흐름',
 '기업수명주기']

# Wrapper Method

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

In [65]:
selector = SequentialFeatureSelector(estimator=LogisticRegression(), n_features_to_select=15, direction='backward', scoring='f1', cv=5, n_jobs=-1)

# 변수 선택 수행
selector.fit(feature, target)

# 선택된 변수의 인덱스
selected_features = selector.get_support(indices=True)

# 선택된 변수 출력
for i in selected_features:
    print(feature.columns[i])

자기자본구성비율
총자본순이익률
순운전자본회전률
당좌자산회전률
유동자산회전률
총자본증가율
자기자본증가율
영업이익증가율
총자본투자효율
log자산총계
매출액대비잉여현금흐름
총자산대비현금흐름
총자산대비영업현금흐름
총자산대비잉여현금흐름
기업수명주기


In [66]:
selected_columns = feature.columns[selected_features]
wrapper_features = list(selected_columns)

---

# 종합

In [79]:
result = result.sort_values('p-value', ascending=True).reset_index(drop=True)
result = result.head(15)
filter = result[['Variable']]

In [80]:
rf_features = pd.DataFrame(rf_features)
lasso_features = pd.DataFrame(lasso_feature)
wrapper_features = pd.DataFrame(wrapper_features)

In [81]:
total = pd.concat([filter, rf_features, lasso_features, wrapper_features], axis=1)

In [82]:
total.columns = ['t&chi', 'rf', 'lasso', 'wrapper']
total

Unnamed: 0,t&chi,rf,lasso,wrapper
0,순운전자본회전률,자기자본순이익률,자기자본구성비율,자기자본구성비율
1,log자산총계,총자본회전률,총자본순이익률,총자본순이익률
2,당좌자산회전률,자기자본회전률,총자본회전률,순운전자본회전률
3,t-1감사의견코드,순운전자본회전률,자기자본회전률,당좌자산회전률
4,자기자본구성비율,재고자산회전률,순운전자본회전률,유동자산회전률
5,총자산대비잉여현금흐름,당좌자산회전률,유동자산회전률,총자본증가율
6,총자본회전률,유동자산회전률,총자본증가율,자기자본증가율
7,유동자산회전률,매출액증가율,자기자본증가율,영업이익증가율
8,자기자본회전률,총자본증가율,영업이익증가율,총자본투자효율
9,차입금의존도,자기자본증가율,총자본투자효율,log자산총계


In [83]:
filter = total['t&chi'].tolist()
rf = total['rf'].tolist()
lasso = total['lasso'].tolist()
wrapper = total['wrapper'].tolist()

In [84]:
feature.columns

Index(['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
       '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률',
       '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율',
       '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율',
       '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계', 'OCF이자보상배율', '부채상환계수',
       '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름', '매출액대비잉여현금흐름',
       '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름', '기업수명주기', '이보배초과여부',
       '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부'],
      dtype='object')

In [85]:
total_result = pd.DataFrame(index=feature.columns)

# 각 컬럼의 값에 인덱스 포함 여부에 따라 True 또는 False 입력
total_result['t&chi'] = total_result.index.isin(filter)
total_result['wrapper'] = total_result.index.isin(wrapper)
total_result['rf'] = total_result.index.isin(rf)
total_result['lasso'] = total_result.index.isin(lasso)
total_result["true_sum"] = total_result.sum(axis=1)

total_result.sort_values('true_sum', ascending=False, inplace=True)
total_result

Unnamed: 0,t&chi,wrapper,rf,lasso,true_sum
자기자본증가율,True,True,True,True,4
log자산총계,True,True,True,True,4
총자본증가율,True,True,True,True,4
유동자산회전률,True,True,True,True,4
순운전자본회전률,True,True,True,True,4
총자산대비잉여현금흐름,True,True,True,True,4
자기자본회전률,True,False,True,True,3
당좌자산회전률,True,True,True,False,3
기업수명주기,True,True,False,True,3
총자본회전률,True,False,True,True,3


In [86]:
total_result_2 = total_result[total_result['true_sum']>=3]
total_result_2.reset_index()

Unnamed: 0,index,t&chi,wrapper,rf,lasso,true_sum
0,자기자본증가율,True,True,True,True,4
1,log자산총계,True,True,True,True,4
2,총자본증가율,True,True,True,True,4
3,유동자산회전률,True,True,True,True,4
4,순운전자본회전률,True,True,True,True,4
5,총자산대비잉여현금흐름,True,True,True,True,4
6,자기자본회전률,True,False,True,True,3
7,당좌자산회전률,True,True,True,False,3
8,기업수명주기,True,True,False,True,3
9,총자본회전률,True,False,True,True,3


In [None]:
total

In [None]:
total_result_2.index

Index(['총자산대비잉여현금흐름', '총자본증가율', 'log자산총계', '순운전자본회전률', '자기자본증가율',
       '매출액대비잉여현금흐름', '자기자본구성비율', '총자본회전률', '자기자본회전률', '총자산대비현금흐름', '총자본투자효율',
       '총자본순이익률', '당좌자산회전률', '유동자산회전률'],
      dtype='object')

In [87]:
total_result = pd.DataFrame(index=feature.columns)

# 각 컬럼의 값에 인덱스 포함 여부에 따라 True 또는 False 입력
total_result['t&chi'] = total_result.index.isin(filter)
total_result['wrapper'] = total_result.index.isin(wrapper)
# total_result['rf'] = total_result.index.isin(rf)
total_result['lasso'] = total_result.index.isin(lasso)
total_result["true_sum"] = total_result.sum(axis=1)

total_result.sort_values('true_sum', ascending=False, inplace=True)
total_result

Unnamed: 0,t&chi,wrapper,lasso,true_sum
순운전자본회전률,True,True,True,3
자기자본구성비율,True,True,True,3
자기자본증가율,True,True,True,3
총자본증가율,True,True,True,3
유동자산회전률,True,True,True,3
log자산총계,True,True,True,3
총자산대비잉여현금흐름,True,True,True,3
기업수명주기,True,True,True,3
영업이익증가율,False,True,True,2
총자본투자효율,False,True,True,2


In [88]:
total_result_3 = total_result[total_result['true_sum']>=2]
total_result_3.reset_index()

Unnamed: 0,index,t&chi,wrapper,lasso,true_sum
0,순운전자본회전률,True,True,True,3
1,자기자본구성비율,True,True,True,3
2,자기자본증가율,True,True,True,3
3,총자본증가율,True,True,True,3
4,유동자산회전률,True,True,True,3
5,log자산총계,True,True,True,3
6,총자산대비잉여현금흐름,True,True,True,3
7,기업수명주기,True,True,True,3
8,영업이익증가율,False,True,True,2
9,총자본투자효율,False,True,True,2


In [89]:
total_result_3.index[:11]

Index(['순운전자본회전률', '자기자본구성비율', '자기자본증가율', '총자본증가율', '유동자산회전률', 'log자산총계',
       '총자산대비잉여현금흐름', '기업수명주기', '영업이익증가율', '총자본투자효율', '매출액대비잉여현금흐름'],
      dtype='object')

In [20]:
# train_resampled_sum.to_csv('TomekLinks_0.25_train.csv',index=False,encoding='euc-kr')
# test_resampled_sum.to_csv('TomekLinks_0.25_test.csv',index=False,encoding='euc-kr')