In [84]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np

In [85]:
train = pd.read_csv('./Dataset/train_Winsorization.csv', encoding='euc-kr')
test = pd.read_csv('./Dataset/test.csv', encoding='euc-kr')

In [86]:
train['기업수명주기'] = train['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

test['기업수명주기'] = test['기업수명주기'].map({
    '도입기' : 1,
    '성장기' : 2,
    '성숙기' : 3,
    '수축기' : 4,
    '쇠퇴기' : 5
}).astype('category')

In [95]:
len(train.columns)

50

# Filter Method
## t-test & chi-square

- t-test 하기 위해 연속형 컬럼만 추출

In [5]:
train.columns

Index(['회사명', '거래소코드', 'Year', '부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도',
       '자기자본구성비율', '매출액영업이익률', '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률',
       '운전자본회전률', '순운전자본회전률', '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율',
       '총자본증가율', '자기자본증가율', '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율',
       '영업이익증가율', '총자본투자효율', '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'OCF이자보상배율',
       '부채상환계수', '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름',
       '매출액대비잉여현금흐름', '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름', '기업수명주기',
       '이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부', 'log자산총계',
       't-1감사의견코드'],
      dtype='object')

In [6]:
train_int = train[['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
      '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률', '재고자산회전률',
       '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율', '순이익증가율',
       '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율', '부가가치율',
       '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계',
       'OCF이자보상배율', '부채상환계수', '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름',
       '매출액대비잉여현금흐름', '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름',
       't-1감사의견코드']]

In [7]:
test_int = test[['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
      '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률', '재고자산회전률',
       '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율', '순이익증가율',
       '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율', '부가가치율',
       '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계',
       'OCF이자보상배율', '부채상환계수', '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름',
       '매출액대비잉여현금흐름', '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름',
       't-1감사의견코드']]

# Standard Scaler

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
X_train = train_int.drop('t-1감사의견코드', axis=1)
y_train = train_int[['t-1감사의견코드']]

X_test = test_int.drop('t-1감사의견코드', axis=1)
y_test = test_int[['t-1감사의견코드']]

In [10]:
scaler = StandardScaler()
train_sc = scaler.fit_transform(X_train)
test_sc = scaler.transform(X_test)

In [11]:
train_sc = pd.DataFrame(train_sc, columns=X_train.columns)
test_sc = pd.DataFrame(test_sc, columns=X_test.columns)

In [12]:
# 스케일링 + 타겟 데이터프레임
train_sc_total = pd.concat([train_sc, y_train], axis=1)
test_sc_total = pd.concat([test_sc, y_test], axis=1)

# MDA

In [13]:
# 1-1 정규성 테스트(샤피로)
from scipy.stats import norm
from scipy import stats
from statsmodels.formula.api import ols
from scipy.stats import kstest

for j in train_sc_total.columns:
    a = stats.shapiro(train_sc_total[j])
    p = a.pvalue
    print(j,a)

부채비율 ShapiroResult(statistic=0.24403750896453857, pvalue=0.0)
당좌비율 ShapiroResult(statistic=0.2609418034553528, pvalue=0.0)
유동비율 ShapiroResult(statistic=0.25839483737945557, pvalue=0.0)
이자보상배율 ShapiroResult(statistic=0.28382623195648193, pvalue=0.0)
차입금의존도 ShapiroResult(statistic=0.9394765496253967, pvalue=0.0)
자기자본구성비율 ShapiroResult(statistic=0.9559506177902222, pvalue=0.0)
매출액영업이익률 ShapiroResult(statistic=0.1464797854423523, pvalue=0.0)
자기자본순이익률 ShapiroResult(statistic=0.49838918447494507, pvalue=0.0)
총자본순이익률 ShapiroResult(statistic=0.7577058672904968, pvalue=0.0)
총자본회전률 ShapiroResult(statistic=0.6562888622283936, pvalue=0.0)
자기자본회전률 ShapiroResult(statistic=0.36522388458251953, pvalue=0.0)
운전자본회전률 ShapiroResult(statistic=0.12048518657684326, pvalue=0.0)
순운전자본회전률 ShapiroResult(statistic=0.7150239944458008, pvalue=0.0)
재고자산회전률 ShapiroResult(statistic=0.1800088882446289, pvalue=0.0)
당좌자산회전률 ShapiroResult(statistic=0.5666403770446777, pvalue=0.0)
유동자산회전률 ShapiroResult(statistic=0.61591798

In [14]:
# 1-2 정규성 테스트(K-S)
for j in train_sc_total.columns:
    a = kstest(train_sc_total[j],'norm')
    p = a.pvalue
    print(j,a)

부채비율 KstestResult(statistic=0.3906751098050474, pvalue=0.0, statistic_location=-0.2775598980715014, statistic_sign=-1)
당좌비율 KstestResult(statistic=0.36984663905227333, pvalue=0.0, statistic_location=-0.3322595537125994, statistic_sign=-1)
유동비율 KstestResult(statistic=0.36477221111440783, pvalue=0.0, statistic_location=-0.34573161526781654, statistic_sign=-1)
이자보상배율 KstestResult(statistic=0.44220548347789634, pvalue=0.0, statistic_location=-0.1909002445930694, statistic_sign=1)
차입금의존도 KstestResult(statistic=0.10606206285734993, pvalue=0.0, statistic_location=-1.247745901800868, statistic_sign=-1)
자기자본구성비율 KstestResult(statistic=0.0758168364052138, pvalue=0.0, statistic_location=-1.4333778461001914, statistic_sign=-1)
매출액영업이익률 KstestResult(statistic=0.4196833421850839, pvalue=0.0, statistic_location=-0.010666213688669017, statistic_sign=-1)
자기자본순이익률 KstestResult(statistic=0.27273314810430427, pvalue=0.0, statistic_location=-0.2716236761529744, statistic_sign=-1)
총자본순이익률 KstestResult(stati

p-value가 0 이는 데이터개수가 많아서 p-value 자체가 너무작아 계산이 불가능하다고 판단. 중심극한 정리에 의해서 정규성이 있다고 가정하고 진행

In [15]:
# 부도기업과 정상기업의 피처별 등분산비교 (정규성 가정으로 bartlett)
Bad = train_sc_total[train_sc_total['t-1감사의견코드']== 1] #Existing Customer
Good = train_sc_total[train_sc_total['t-1감사의견코드']== 0] #Attrited Customer

c = []
for i in train_sc_total:
    # lresult = stats.levene(close[i], normal[i])
    lresult = stats.bartlett(Bad[i], Good[i])
    c.append([i,lresult[-1]])

c= pd.DataFrame(c)
c.columns=["피처값",'F-test']
c

Unnamed: 0,피처값,F-test
0,부채비율,8.773001000000001e-268
1,당좌비율,0.0
2,유동비율,0.0
3,이자보상배율,8.292491999999999e-237
4,차입금의존도,5.671281e-05
5,자기자본구성비율,1.167465e-94
6,매출액영업이익률,0.2306937
7,자기자본순이익률,0.0
8,총자본순이익률,0.0
9,총자본회전률,0.0


In [16]:
# F-test 결과 0.05 이상이면 homo 0.05 이하이면 hetero
c["분산"] =''
c["T-test"] =""
for i in c.index:
    if c.loc[i,"F-test"]>=0.05:
        c.loc[i,"분산"] = "homo"
    else:
        c.loc[i,"분산"] = "hetero"
c

Unnamed: 0,피처값,F-test,분산,T-test
0,부채비율,8.773001000000001e-268,hetero,
1,당좌비율,0.0,hetero,
2,유동비율,0.0,hetero,
3,이자보상배율,8.292491999999999e-237,hetero,
4,차입금의존도,5.671281e-05,hetero,
5,자기자본구성비율,1.167465e-94,hetero,
6,매출액영업이익률,0.2306937,homo,
7,자기자본순이익률,0.0,hetero,
8,총자본순이익률,0.0,hetero,
9,총자본회전률,0.0,hetero,


In [17]:
c[c["분산"]=='homo']

Unnamed: 0,피처값,F-test,분산,T-test
6,매출액영업이익률,0.230694,homo,
24,총자본투자효율,0.4463,homo,
25,부가가치율,0.295177,homo,
37,총자산대비현금흐름,0.876309,homo,


In [18]:
c

Unnamed: 0,피처값,F-test,분산,T-test
0,부채비율,8.773001000000001e-268,hetero,
1,당좌비율,0.0,hetero,
2,유동비율,0.0,hetero,
3,이자보상배율,8.292491999999999e-237,hetero,
4,차입금의존도,5.671281e-05,hetero,
5,자기자본구성비율,1.167465e-94,hetero,
6,매출액영업이익률,0.2306937,homo,
7,자기자본순이익률,0.0,hetero,
8,총자본순이익률,0.0,hetero,
9,총자본회전률,0.0,hetero,


In [19]:
# homo 인 feature 는 student t-test, hetero이면 Welchs T-Test 진행
c["분산"] =''
c["T-test"] =""
for i in c.index:
    if c.loc[i,"F-test"]>=0.05:
        c.loc[i,"분산"] = "homo"
        result = stats.ttest_ind(Bad[c.loc[i,"피처값"]], Good[c.loc[i,"피처값"]], equal_var=True)       ## equal_var = True Student T-test
        c.loc[i,"T-test"] = result[-1]
        print(Bad[c.loc[i,"피처값"]])
    else:
        c.loc[i,"분산"] = "hetero"
        result = stats.ttest_ind(Bad[c.loc[i,"피처값"]], Good[c.loc[i,"피처값"]], equal_var=False)      ## equal_var = False Welchs T-Test
        c.loc[i,"T-test"] = result[-1]
c

8         -1.581632
9        -12.349735
25         0.029493
27        -0.025061
33        -0.125942
            ...    
137604     0.100578
137622     0.104771
137624     0.102997
137628     0.102392
137663     0.096989
Name: 매출액영업이익률, Length: 9602, dtype: float64
8        -0.614022
9        -0.776401
25       -0.463187
27       -0.746979
33       -0.010685
            ...   
137604   -0.460953
137622   -0.295594
137624    0.117431
137628    0.542374
137663    0.197503
Name: 총자본투자효율, Length: 9602, dtype: float64
8        -0.294838
9        -5.558117
25       -0.006468
27       -0.064376
33        0.441486
            ...   
137604   -0.026444
137622    0.001288
137624   -0.000765
137628   -0.002785
137663    0.005785
Name: 부가가치율, Length: 9602, dtype: float64
8        -0.123710
9        -0.069908
25       -0.031936
27       -0.060454
33        1.673774
            ...   
137604    0.262653
137622    0.315837
137624    0.849720
137628   -0.197040
137663   -0.121098
Name: 총자산대비현금흐름, Lengt

Unnamed: 0,피처값,F-test,분산,T-test
0,부채비율,8.773001000000001e-268,hetero,0.0
1,당좌비율,0.0,hetero,0.0
2,유동비율,0.0,hetero,0.0
3,이자보상배율,8.292491999999999e-237,hetero,0.0
4,차입금의존도,5.671281e-05,hetero,0.0
5,자기자본구성비율,1.167465e-94,hetero,0.0
6,매출액영업이익률,0.2306937,homo,0.034306
7,자기자본순이익률,0.0,hetero,0.000104
8,총자본순이익률,0.0,hetero,0.0
9,총자본회전률,0.0,hetero,0.0


In [20]:
# 0.05 이상이면 유의미하지 않으므로 0.05 이하인 것만 추출
d = c[c["T-test"]<= 0.05]
d.sort_values('T-test',ascending=False)["피처값"].unique()
# d.sort_values('T-test',ascending=False)["피처값"]

array(['매출액영업이익률', '운전자본회전률', '재고자산회전률', '총자본투자효율', '이윤분배율', '자기자본순이익률',
       '매출액대비잉여현금흐름', '노동소득분배율', '매출액대비금융비용상환능력', '자본분배율', '부가가치율',
       '장기부채상환능력', '유동비율', '부채비율', '총자본순이익률', '총자산대비현금흐름', '이자보상배율',
       '총자산대비영업현금흐름', '유형자산증가율', '당좌비율', 'OCF이자보상배율', '매출액증가율', '재고자산증가율',
       '유동자산증가율', '영업이익증가율', '자기자본증가율', '순이익증가율', '총자본증가율', '자기자본회전률',
       '차입금의존도', '유동자산회전률', 'log자산총계', '당좌자산회전률', '순운전자본회전률', '총자본회전률',
       '자기자본구성비율', '총자산대비잉여현금흐름', 't-1감사의견코드'], dtype=object)

In [21]:
d.sort_values('T-test',ascending=False).dropna()

Unnamed: 0,피처값,F-test,분산,T-test
6,매출액영업이익률,0.2306937,homo,0.034306
11,운전자본회전률,9.391575e-152,hetero,0.025077
13,재고자산회전률,6.380853e-264,hetero,0.018824
24,총자본투자효율,0.4463002,homo,0.006248
28,이윤분배율,4.0640039999999996e-44,hetero,0.000997
7,자기자본순이익률,0.0,hetero,0.000104
36,매출액대비잉여현금흐름,5.3744000000000006e-55,hetero,1.6e-05
26,노동소득분배율,6.581281e-30,hetero,2e-06
33,매출액대비금융비용상환능력,0.0,hetero,1e-06
27,자본분배율,9.076037e-24,hetero,0.0


In [22]:
d['피처값'].values

array(['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
       '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률',
       '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율',
       '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율',
       '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계', 'OCF이자보상배율',
       '장기부채상환능력', '매출액대비금융비용상환능력', '매출액대비잉여현금흐름', '총자산대비현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름', 't-1감사의견코드'], dtype=object)

In [23]:

fea = ['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
       '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률',
       '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율',
       '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율',
       '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계', 'OCF이자보상배율',
       '장기부채상환능력', '매출액대비금융비용상환능력', '매출액대비잉여현금흐름', '총자산대비현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름']
mda_feature = train[fea]
mda_feature

Unnamed: 0,부채비율,당좌비율,유동비율,이자보상배율,차입금의존도,자기자본구성비율,매출액영업이익률,자기자본순이익률,총자본순이익률,총자본회전률,...,자본분배율,이윤분배율,log자산총계,OCF이자보상배율,장기부채상환능력,매출액대비금융비용상환능력,매출액대비잉여현금흐름,총자산대비현금흐름,총자산대비영업현금흐름,총자산대비잉여현금흐름
0,1251.02,347.96,354.61,-13.357516,6.09,7.40,-414.90,-826.27,-82.59,0.17,...,0.00,0.00,22.828067,-9.332108,-0.000000,4.174742,-2.629424,0.772454,-0.244115,-0.221444
1,107.19,68.72,68.72,8.709388,45.96,48.26,25.08,13.81,6.61,0.36,...,79.24,42.80,23.977288,9.548280,2.352094,56.802819,0.189178,0.000884,0.100369,0.069046
2,399.11,109.86,109.86,-198.699143,0.00,20.04,-17.15,-12.57,-1.80,0.10,...,6.81,-30.69,23.592060,-618.027372,-0.000000,-23.752940,-3.552263,-0.373468,-0.518387,-0.516536
3,267.34,55.31,55.31,14.620331,70.59,27.22,60.12,66.76,18.17,0.40,...,86.58,61.48,23.265147,5.010647,4.843848,26.994572,-3.969096,-0.149208,0.041456,-0.798633
4,390.03,30.73,124.85,13.221831,1.76,20.41,28.90,408.97,27.88,1.17,...,95.29,76.32,23.488822,16.602420,0.000000,46.704264,0.362891,-0.001722,0.418095,0.418095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137673,400.62,84.46,129.25,12.957288,19.60,19.98,6.17,67.90,10.04,2.20,...,55.69,32.77,23.805059,4.818200,0.000000,213.217431,0.022309,-0.003247,0.050929,0.049512
137674,135.02,148.14,158.66,6.055733,25.22,42.55,4.29,9.33,3.32,3.04,...,56.58,18.05,25.560289,4.835895,0.036460,195.625970,0.036524,-0.018967,0.118604,0.126453
137675,50.03,223.79,300.12,52.200088,18.78,66.65,20.22,58.72,36.01,2.00,...,77.57,59.76,22.865585,40.753692,0.163392,288.357414,0.128391,0.215584,0.262040,0.213073
137676,27.20,272.56,280.21,-21.661011,7.13,78.62,1.88,1.88,1.43,0.88,...,65.89,26.06,24.057147,40.446933,0.295099,-288.743530,0.073553,0.063097,0.050614,0.066068


In [24]:
def vif(data):
    import pandas as pd
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # VIF 출력을 위한 데이터 프레임 형성
    vif = pd.DataFrame()

    # VIF 값과 각 Feature 이름에 대해 설정
    vif["VIF Factor"] = [variance_inflation_factor(data.values, i) for i in range(len(data.columns))]
    vif["features"] = data.columns

    # VIF 값이 높은 순으로 정렬
    vif = vif.sort_values(by="VIF Factor", ascending=False)
    vif = vif.reset_index().drop(columns='index')

    return vif

vif(mda_feature)

Unnamed: 0,VIF Factor,features
0,31.387494,log자산총계
1,12.714899,당좌비율
2,12.586102,유동비율
3,9.932378,자본분배율
4,9.735564,자기자본구성비율
5,9.513582,노동소득분배율
6,9.266639,순운전자본회전률
7,7.621522,차입금의존도
8,6.981453,유동자산회전률
9,6.938925,총자본회전률


In [25]:
# t-test가 가장 작은순으로 의미 있으므로 가장작은 순위로 12개 피처 선정
fea = ['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
       '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률',
       '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율',
       '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율',
       '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'OCF이자보상배율',
       '장기부채상환능력', '매출액대비금융비용상환능력', '매출액대비잉여현금흐름', '총자산대비현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름']
mda_feature2 = train[fea]
mda_feature2


Unnamed: 0,부채비율,당좌비율,유동비율,이자보상배율,차입금의존도,자기자본구성비율,매출액영업이익률,자기자본순이익률,총자본순이익률,총자본회전률,...,노동소득분배율,자본분배율,이윤분배율,OCF이자보상배율,장기부채상환능력,매출액대비금융비용상환능력,매출액대비잉여현금흐름,총자산대비현금흐름,총자산대비영업현금흐름,총자산대비잉여현금흐름
0,1251.02,347.96,354.61,-13.357516,6.09,7.40,-414.90,-826.27,-82.59,0.17,...,0.00,0.00,0.00,-9.332108,-0.000000,4.174742,-2.629424,0.772454,-0.244115,-0.221444
1,107.19,68.72,68.72,8.709388,45.96,48.26,25.08,13.81,6.61,0.36,...,20.76,79.24,42.80,9.548280,2.352094,56.802819,0.189178,0.000884,0.100369,0.069046
2,399.11,109.86,109.86,-198.699143,0.00,20.04,-17.15,-12.57,-1.80,0.10,...,93.19,6.81,-30.69,-618.027372,-0.000000,-23.752940,-3.552263,-0.373468,-0.518387,-0.516536
3,267.34,55.31,55.31,14.620331,70.59,27.22,60.12,66.76,18.17,0.40,...,13.42,86.58,61.48,5.010647,4.843848,26.994572,-3.969096,-0.149208,0.041456,-0.798633
4,390.03,30.73,124.85,13.221831,1.76,20.41,28.90,408.97,27.88,1.17,...,4.71,95.29,76.32,16.602420,0.000000,46.704264,0.362891,-0.001722,0.418095,0.418095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137673,400.62,84.46,129.25,12.957288,19.60,19.98,6.17,67.90,10.04,2.20,...,44.31,55.69,32.77,4.818200,0.000000,213.217431,0.022309,-0.003247,0.050929,0.049512
137674,135.02,148.14,158.66,6.055733,25.22,42.55,4.29,9.33,3.32,3.04,...,43.42,56.58,18.05,4.835895,0.036460,195.625970,0.036524,-0.018967,0.118604,0.126453
137675,50.03,223.79,300.12,52.200088,18.78,66.65,20.22,58.72,36.01,2.00,...,22.43,77.57,59.76,40.753692,0.163392,288.357414,0.128391,0.215584,0.262040,0.213073
137676,27.20,272.56,280.21,-21.661011,7.13,78.62,1.88,1.88,1.43,0.88,...,34.11,65.89,26.06,40.446933,0.295099,-288.743530,0.073553,0.063097,0.050614,0.066068


In [26]:
vif(mda_feature2)

Unnamed: 0,VIF Factor,features
0,12.714899,당좌비율
1,12.585796,유동비율
2,9.228735,순운전자본회전률
3,7.392161,자본분배율
4,7.003755,노동소득분배율
5,6.981452,유동자산회전률
6,6.659666,총자본회전률
7,5.23915,당좌자산회전률
8,5.009313,자기자본구성비율
9,3.222909,차입금의존도


In [32]:
fea = ['부채비율', '당좌비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
       '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률',
       '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율',
       '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율',
       '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'OCF이자보상배율',
       '장기부채상환능력', '매출액대비금융비용상환능력', '매출액대비잉여현금흐름', '총자산대비현금흐름',
       '총자산대비영업현금흐름', '총자산대비잉여현금흐름']
mda_feature3 = train[fea]
mda_feature3


Unnamed: 0,부채비율,당좌비율,이자보상배율,차입금의존도,자기자본구성비율,매출액영업이익률,자기자본순이익률,총자본순이익률,총자본회전률,자기자본회전률,...,노동소득분배율,자본분배율,이윤분배율,OCF이자보상배율,장기부채상환능력,매출액대비금융비용상환능력,매출액대비잉여현금흐름,총자산대비현금흐름,총자산대비영업현금흐름,총자산대비잉여현금흐름
0,1251.02,347.96,-13.357516,6.09,7.40,-414.90,-826.27,-82.59,0.17,2.28,...,0.00,0.00,0.00,-9.332108,-0.000000,4.174742,-2.629424,0.772454,-0.244115,-0.221444
1,107.19,68.72,8.709388,45.96,48.26,25.08,13.81,6.61,0.36,0.76,...,20.76,79.24,42.80,9.548280,2.352094,56.802819,0.189178,0.000884,0.100369,0.069046
2,399.11,109.86,-198.699143,0.00,20.04,-17.15,-12.57,-1.80,0.10,0.68,...,93.19,6.81,-30.69,-618.027372,-0.000000,-23.752940,-3.552263,-0.373468,-0.518387,-0.516536
3,267.34,55.31,14.620331,70.59,27.22,60.12,66.76,18.17,0.40,1.48,...,13.42,86.58,61.48,5.010647,4.843848,26.994572,-3.969096,-0.149208,0.041456,-0.798633
4,390.03,30.73,13.221831,1.76,20.41,28.90,408.97,27.88,1.17,17.19,...,4.71,95.29,76.32,16.602420,0.000000,46.704264,0.362891,-0.001722,0.418095,0.418095
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137673,400.62,84.46,12.957288,19.60,19.98,6.17,67.90,10.04,2.20,14.88,...,44.31,55.69,32.77,4.818200,0.000000,213.217431,0.022309,-0.003247,0.050929,0.049512
137674,135.02,148.14,6.055733,25.22,42.55,4.29,9.33,3.32,3.04,8.55,...,43.42,56.58,18.05,4.835895,0.036460,195.625970,0.036524,-0.018967,0.118604,0.126453
137675,50.03,223.79,52.200088,18.78,66.65,20.22,58.72,36.01,2.00,3.27,...,22.43,77.57,59.76,40.753692,0.163392,288.357414,0.128391,0.215584,0.262040,0.213073
137676,27.20,272.56,-21.661011,7.13,78.62,1.88,1.88,1.43,0.88,1.15,...,34.11,65.89,26.06,40.446933,0.295099,-288.743530,0.073553,0.063097,0.050614,0.066068


In [33]:
vif(mda_feature3)

Unnamed: 0,VIF Factor,features
0,9.228716,순운전자본회전률
1,7.392028,자본분배율
2,7.003737,노동소득분배율
3,6.906392,유동자산회전률
4,6.658317,총자본회전률
5,5.169877,당좌자산회전률
6,5.009258,자기자본구성비율
7,3.212213,차입금의존도
8,2.768261,OCF이자보상배율
9,2.767082,이자보상배율


In [63]:
fea_hetero= ['부채비율', '당좌비율', '이자보상배율', '차입금의존도', '자기자본구성비율', 
       '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률',
       '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율',
       '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율',
        '노동소득분배율', '자본분배율', '이윤분배율', 'OCF이자보상배율',
       '장기부채상환능력', '매출액대비금융비용상환능력', '매출액대비잉여현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름']
fea_homo=['매출액영업이익률','총자본투자효율','부가가치율','총자산대비현금흐름']

In [68]:
# Welch's t-test 수행
t_stat, p_value = stats.ttest_ind(Bad[fea_hetero], Good[fea_hetero], equal_var=False)
result_df_hetero = pd.DataFrame({'t-statistic': t_stat, 'p-value': p_value}, index=fea_hetero)

# t-test 수행
t_stat, p_value = stats.ttest_ind(Bad[fea_homo], Good[fea_homo], equal_var=True)
result_df_homo = pd.DataFrame({'t-statistic': t_stat, 'p-value': p_value}, index=fea_homo)

result_df = pd.concat([result_df_hetero,result_df_homo],axis=0)


result_df = result_df.sort_values('p-value', ascending=True).reset_index()

In [69]:
result_df = result_df[['index', 'p-value']]
result_df.columns = ['Variable', 'p-value']

In [70]:
result_df

Unnamed: 0,Variable,p-value
0,자기자본구성비율,0.0
1,총자산대비잉여현금흐름,0.0
2,유동자산회전률,0.0
3,총자본회전률,0.0
4,당좌자산회전률,0.0
5,순운전자본회전률,0.0
6,차입금의존도,6.457497e-287
7,자기자본회전률,1.349787e-276
8,총자본증가율,4.577537e-271
9,순이익증가율,1.849513e-221


---

# Chi 2

* 카이제곱 검정 조건
    * 종속변인은 범주형 자료여야 한다.
    * 기대빈도가 5이하인 셀이 전체의 20%가 넘지 않아야 한다.
    * 각 칸의 빈도는 다른 칸의 빈도와 독립적이어야 한다.

- 범주형 변수만 추출

In [87]:
train.columns

Index(['회사명', '거래소코드', 'Year', '부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도',
       '자기자본구성비율', '매출액영업이익률', '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률',
       '운전자본회전률', '순운전자본회전률', '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율',
       '총자본증가율', '자기자본증가율', '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율',
       '영업이익증가율', '총자본투자효율', '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'OCF이자보상배율',
       '부채상환계수', '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름',
       '매출액대비잉여현금흐름', '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름', '기업수명주기',
       '이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부', 'log자산총계',
       't-1감사의견코드'],
      dtype='object')

In [72]:
train_cat = train[['기업수명주기', 't-1감사의견코드', '이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부']].astype('category')
test_cat = test[['기업수명주기', 't-1감사의견코드', '이보배초과여부', '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부']].astype('category')

In [73]:
X_train_cat = train_cat.drop('t-1감사의견코드', axis=1)
y_train_cat = train['t-1감사의견코드']
X_test_cat = test_cat.drop('t-1감사의견코드', axis=1)
y_test_cat = test['t-1감사의견코드']

In [74]:
from scipy.stats import chi2_contingency

# 기대빈도가 5 이하인 항목의 비율을 저장할 딕셔너리
expected_freq_5_ratio = {}

# 각 독립 변수에 대해 기대빈도 계산 및 비율 확인
for column in train_cat.columns[:-1]:
    # 교차 테이블 생성
    contingency_table = pd.crosstab(train_cat[column], train_cat['t-1감사의견코드'])

    # 카이제곱 검정 수행
    chi2, p_value, dof, expected_freq = chi2_contingency(contingency_table)

    # 기대빈도가 5 이하인 항목의 비율 계산
    expected_freq_5 = (expected_freq <= 5).mean()

    # 결과 저장
    expected_freq_5_ratio[column] = expected_freq_5

# 결과 출력
for column, ratio in expected_freq_5_ratio.items():
    print(f"변수 '{column}'의 기대빈도가 5 이하인 항목 비율: {ratio}")

변수 '기업수명주기'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 't-1감사의견코드'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '이보배초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '파부비초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '파당비초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0
변수 '파차의초과여부'의 기대빈도가 5 이하인 항목 비율: 0.0


In [75]:
# 독립변수와 종속변수 간의 카이제곱 검정 수행
chi2_scores = []

for column in X_train_cat.columns:
    # 교차 테이블 생성
    contingency_table = pd.crosstab(X_train_cat[column], y_train_cat)
    chi2, p_value, dof, expected_freq = chi2_contingency(contingency_table)
    # 카이제곱 통계량(chi2), p-value(p_value), 자유도(dof), 예상빈도(expected_freq)
    print(p_value)
    chi2_scores.append((column, chi2))

# 카이제곱 검정 결과를 기준으로 변수 정렬
sorted_features = sorted(chi2_scores, key=lambda x : x[1], reverse=True)

# 선택된 변수 출력
sorted_features

0.0
0.12908771021386392
0.0
0.00039229715645192347
0.0
1.9096172238484248e-59


[('기업수명주기', 3089.2507048662806),
 ('파차의초과여부', 1985.8292770838254),
 ('파부비초과여부', 1906.060822214597),
 ('파로이초과여부', 264.3747900999849),
 ('파당비초과여부', 12.568529415290646),
 ('이보배초과여부', 2.303441378969885)]

In [76]:
chi2_scores

[('기업수명주기', 3089.2507048662806),
 ('이보배초과여부', 2.303441378969885),
 ('파부비초과여부', 1906.060822214597),
 ('파당비초과여부', 12.568529415290646),
 ('파차의초과여부', 1985.8292770838254),
 ('파로이초과여부', 264.3747900999849)]

In [43]:
import pandas as pd
from scipy.stats import chi2_contingency

# 독립변수와 종속변수 간의 카이제곱 검정 수행
chi2_scores = []

p_values = []
for column in X_train_cat.columns:
    # 교차 테이블 생성
    contingency_table = pd.crosstab(X_train_cat[column], y_train_cat)
    chi2, p_value, dof, expected_freq = chi2_contingency(contingency_table)
    p_values.append(p_value)
    chi2_scores.append((column, chi2))

# p-value가 0.05보다 작은 값을 출력하는 데이터프레임 생성
result_df_1 = pd.DataFrame({'Variable': X_train_cat.columns, 'p-value': p_values})
filtered_df_chi = result_df_1[result_df_1['p-value'] < 0.05]

filtered_df_chi


Unnamed: 0,Variable,p-value
0,기업수명주기,0.0
2,파부비초과여부,0.0
3,파당비초과여부,0.0003922972
4,파차의초과여부,0.0
5,파로이초과여부,1.909617e-59


In [44]:
result_df

Unnamed: 0,Variable,p-value
0,총자산대비잉여현금흐름,0.0
1,자기자본구성비율,0.0
2,유동자산회전률,0.0
3,당좌자산회전률,0.0
4,순운전자본회전률,0.0
5,총자본회전률,0.0
6,차입금의존도,6.457497e-287
7,자기자본회전률,1.349787e-276
8,총자본증가율,4.577537e-271
9,순이익증가율,1.849513e-221


In [77]:
result = pd.concat([result_df, filtered_df_chi], axis=0)

In [78]:
result

Unnamed: 0,Variable,p-value
0,자기자본구성비율,0.0
1,총자산대비잉여현금흐름,0.0
2,유동자산회전률,0.0
3,총자본회전률,0.0
4,당좌자산회전률,0.0
5,순운전자본회전률,0.0
6,차입금의존도,6.457497e-287
7,자기자본회전률,1.349787e-276
8,총자본증가율,4.577537e-271
9,순이익증가율,1.849513e-221


In [79]:
result.sort_values('p-value', ascending=True).reset_index(drop=True)

Unnamed: 0,Variable,p-value
0,자기자본구성비율,0.0
1,파부비초과여부,0.0
2,기업수명주기,0.0
3,순운전자본회전률,0.0
4,파차의초과여부,0.0
5,총자본회전률,0.0
6,유동자산회전률,0.0
7,총자산대비잉여현금흐름,0.0
8,당좌자산회전률,0.0
9,차입금의존도,6.457497e-287


---

In [80]:
# 스케일링된 연속형 변수와 범주형 변수 합치기
logit_ = pd.concat([train_sc_total, X_train_cat], axis=1)
X_logit = logit_.drop('t-1감사의견코드', axis=1)
y_logit = logit_[['t-1감사의견코드']]

#### Feature 개수 정하기 위해 Logit

In [88]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
import statsmodels.api as sm
import numpy as np
lr_clf = LogisticRegression()

feature = X_logit
target = y_logit

logit = SelectFromModel(LogisticRegression())
logit.fit(feature, target)
logit_support = logit.get_support()
lr_feature = feature.loc[:,logit_support].columns.tolist()

In [89]:
lr_feature

['자기자본구성비율',
 '총자본순이익률',
 '순운전자본회전률',
 '당좌자산회전률',
 '유동자산회전률',
 '총자본증가율',
 '자기자본증가율',
 '순이익증가율',
 '총자본투자효율',
 'log자산총계',
 '총자산대비현금흐름',
 '총자산대비잉여현금흐름',
 '파부비초과여부',
 '파당비초과여부',
 '파차의초과여부']

In [90]:
len(lr_feature)

15

---

### Embedded Method

> Randomforeset

In [52]:
from sklearn.ensemble import RandomForestClassifier

In [53]:
selector = SelectFromModel(estimator=RandomForestClassifier(), threshold=0.02398).fit(X_logit, y_logit)
rf = selector.get_support()
count = np.count_nonzero(rf)
count

13

In [53]:
rf_features = X_logit.loc[:, rf].columns.tolist()
rf_features

['자기자본순이익률',
 '총자본회전률',
 '자기자본회전률',
 '순운전자본회전률',
 '재고자산회전률',
 '당좌자산회전률',
 '매출액증가율',
 '총자본증가율',
 '자기자본증가율',
 '유동자산증가율',
 'log자산총계',
 '매출액대비금융비용상환능력',
 '총자산대비잉여현금흐름']

> LASSO

In [106]:
lasso = SelectFromModel(estimator=LogisticRegression(penalty='l1', solver='liblinear', C=0.0115)).fit(X_logit, y_logit)
lasso_support = lasso.get_support()
lasso_feature = X_logit.loc[:,lasso_support].columns.tolist()

In [107]:
len(lasso_feature)

13

# Wrapper Method

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector

In [61]:
selector = SequentialFeatureSelector(estimator=LogisticRegression(), n_features_to_select=13, direction='backward', scoring='f1', cv=5, n_jobs=-1)

# 변수 선택 수행
selector.fit(X_logit, y_logit)

# 선택된 변수의 인덱스
selected_features = selector.get_support(indices=True)

# 선택된 변수 출력
for i in selected_features:
    print(X_logit.columns[i])

차입금의존도
총자본순이익률
총자본회전률
자기자본회전률
총자본증가율
총자본투자효율
노동소득분배율
log자산총계
총자산대비현금흐름
총자산대비잉여현금흐름
기업수명주기
파부비초과여부
파차의초과여부


In [62]:
selected_columns = X_logit.columns[selected_features]
wrapper_features = list(selected_columns)

# 종합

In [108]:
result = result.sort_values('p-value', ascending=True).reset_index(drop=True)
result = result.head(13)
filter = result[['Variable']]

In [109]:
rf_features = pd.DataFrame(rf_features)
lasso_features = pd.DataFrame(lasso_feature)
wrapper_features = pd.DataFrame(wrapper_features)

In [110]:
total = pd.concat([filter, rf_features, lasso_features, wrapper_features], axis=1)

In [115]:
total.columns = ['t&chi', 'rf', 'lasso', 'wrapper']
total

Unnamed: 0,t&chi,rf,lasso,wrapper
0,자기자본구성비율,자기자본순이익률,자기자본구성비율,차입금의존도
1,log자산총계,총자본회전률,총자본순이익률,총자본순이익률
2,파부비초과여부,자기자본회전률,총자본회전률,총자본회전률
3,기업수명주기,순운전자본회전률,자기자본회전률,자기자본회전률
4,파차의초과여부,재고자산회전률,총자본증가율,총자본증가율
5,총자본증가율,당좌자산회전률,유동자산증가율,총자본투자효율
6,파로이초과여부,매출액증가율,log자산총계,노동소득분배율
7,총자본회전률,총자본증가율,부채상환계수,log자산총계
8,총자산대비현금흐름,자기자본증가율,장기부채상환능력,총자산대비현금흐름
9,재고자산회전률,유동자산증가율,총자산대비현금흐름,총자산대비잉여현금흐름


In [118]:
filter = total['t&chi'].tolist()
rf = total['rf'].tolist()
lasso = total['lasso'].tolist()
wrapper = total['wrapper'].tolist()

In [113]:
X_logit.columns

Index(['부채비율', '당좌비율', '유동비율', '이자보상배율', '차입금의존도', '자기자본구성비율', '매출액영업이익률',
       '자기자본순이익률', '총자본순이익률', '총자본회전률', '자기자본회전률', '운전자본회전률', '순운전자본회전률',
       '재고자산회전률', '당좌자산회전률', '유동자산회전률', '매출액증가율', '총자본증가율', '자기자본증가율',
       '순이익증가율', '유형자산증가율', '유동자산증가율', '재고자산증가율', '영업이익증가율', '총자본투자효율',
       '부가가치율', '노동소득분배율', '자본분배율', '이윤분배율', 'log자산총계', 'OCF이자보상배율', '부채상환계수',
       '장기부채상환능력', '매출액대비금융비용상환능력', '연구개발비대비매출액', '매출액대비현금흐름', '매출액대비잉여현금흐름',
       '총자산대비현금흐름', '총자산대비영업현금흐름', '총자산대비잉여현금흐름', '기업수명주기', '이보배초과여부',
       '파부비초과여부', '파당비초과여부', '파차의초과여부', '파로이초과여부'],
      dtype='object')

In [122]:
total_result = pd.DataFrame(index=X_logit.columns)

# 각 컬럼의 값에 인덱스 포함 여부에 따라 True 또는 False 입력
total_result['t&chi'] = total_result.index.isin(filter)
total_result['wrapper'] = total_result.index.isin(wrapper)
total_result['rf'] = total_result.index.isin(rf)
total_result['lasso'] = total_result.index.isin(lasso)
total_result["true_sum"] = total_result.sum(axis=1)

total_result.sort_values('true_sum', ascending=False, inplace=True)
total_result

Unnamed: 0,t&chi,wrapper,rf,lasso,true_sum
총자본회전률,True,True,True,True,4
log자산총계,True,True,True,True,4
총자본증가율,True,True,True,True,4
파차의초과여부,True,True,False,True,3
파부비초과여부,True,True,False,True,3
총자본순이익률,True,True,False,True,3
자기자본회전률,False,True,True,True,3
총자산대비현금흐름,True,True,False,True,3
기업수명주기,True,True,False,True,3
자기자본구성비율,True,False,False,True,2


In [126]:
total_result_2 = total_result[total_result['true_sum']>=2]
total_result_2.reset_index()

Unnamed: 0,index,t&chi,wrapper,rf,lasso,true_sum
0,총자본회전률,True,True,True,True,4
1,log자산총계,True,True,True,True,4
2,총자본증가율,True,True,True,True,4
3,파차의초과여부,True,True,False,True,3
4,파부비초과여부,True,True,False,True,3
5,총자본순이익률,True,True,False,True,3
6,자기자본회전률,False,True,True,True,3
7,총자산대비현금흐름,True,True,False,True,3
8,기업수명주기,True,True,False,True,3
9,자기자본구성비율,True,False,False,True,2


In [127]:
total_result_2.index

Index(['총자본회전률', 'log자산총계', '총자본증가율', '파차의초과여부', '파부비초과여부', '총자본순이익률',
       '자기자본회전률', '총자산대비현금흐름', '기업수명주기', '자기자본구성비율', '재고자산회전률', '유동자산증가율',
       '총자산대비잉여현금흐름'],
      dtype='object')