# 0. 라이브러리

In [91]:
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm

In [92]:
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['axes.unicode_minus'] = False

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# 1. 데이터 불러오기

In [93]:
data = pd.read_csv('./data/기업별재무거시데이터.csv', dtype={'종목코드':str})

In [94]:
data.head(1)

Unnamed: 0,기업명,종목코드,상장일,지속기간,기준연도,부채비율,자기자본비율,총자산영업이익율,ROA,ROE,매출액영업이익율,총자산증가율,매출액증가율,당기순이익증가율,영업이익증가율,유동비율,자산회전율,부채회전율,자본회전율,총매출액규모,총자산규모,GDP성장률,원달러환율,CD91일,코스닥종가,전산업생산지수,경제심리지수,뉴스심리지수,부도,폐지일
0,3S,60310,2002-04-23,7861,2022,69.873715,0.588673,0.022297,0.017986,0.031284,0.053563,16.142265,13.399616,40.639176,196.979059,1.182665,0.416268,1.012011,0.707129,24.023192,24.899618,2.61,1291.4,2.49,679.29,110.1,100.28,94.21,0,


# 2. 데이터 전처리

In [95]:
data.상장일 = pd.to_datetime(data.상장일)
data.폐지일 = pd.to_datetime(data.폐지일)

In [96]:
data.shape

(1522, 30)

In [97]:
data.columns

Index(['기업명', '종목코드', '상장일', '지속기간', '기준연도', '부채비율', '자기자본비율', '총자산영업이익율',
       'ROA', 'ROE', '매출액영업이익율', '총자산증가율', '매출액증가율', '당기순이익증가율', '영업이익증가율',
       '유동비율', '자산회전율', '부채회전율', '자본회전율', '총매출액규모', '총자산규모', 'GDP성장률', '원달러환율',
       'CD91일', '코스닥종가', '전산업생산지수', '경제심리지수', '뉴스심리지수', '부도', '폐지일'],
      dtype='object')

In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522 entries, 0 to 1521
Data columns (total 30 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   기업명       1522 non-null   object        
 1   종목코드      1522 non-null   object        
 2   상장일       1522 non-null   datetime64[ns]
 3   지속기간      1522 non-null   int64         
 4   기준연도      1522 non-null   int64         
 5   부채비율      1522 non-null   float64       
 6   자기자본비율    1522 non-null   float64       
 7   총자산영업이익율  1522 non-null   float64       
 8   ROA       1522 non-null   float64       
 9   ROE       1522 non-null   float64       
 10  매출액영업이익율  1522 non-null   float64       
 11  총자산증가율    1522 non-null   float64       
 12  매출액증가율    1522 non-null   float64       
 13  당기순이익증가율  1522 non-null   float64       
 14  영업이익증가율   1522 non-null   float64       
 15  유동비율      1522 non-null   float64       
 16  자산회전율     1522 non-null   float64       
 17  부채회전율     1522

# 3. train test split

In [99]:
X = data.drop(['기업명', '종목코드', '상장일', '기준연도', '폐지일', '부도'], axis=1)
y = data['부도']

In [100]:
X.shape

(1522, 24)

In [101]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

In [102]:
train_X.shape, test_X.shape, train_y.shape, test_y.shape

((1217, 24), (305, 24), (1217,), (305,))

In [103]:
train_X.head(2)

Unnamed: 0,지속기간,부채비율,자기자본비율,총자산영업이익율,ROA,ROE,매출액영업이익율,총자산증가율,매출액증가율,당기순이익증가율,영업이익증가율,유동비율,자산회전율,부채회전율,자본회전율,총매출액규모,총자산규모,GDP성장률,원달러환율,CD91일,코스닥종가,전산업생산지수,경제심리지수,뉴스심리지수
1279,6846,53.757556,0.650375,0.001696,0.010262,0.015766,0.007433,5.575008,5.990395,-21.150772,-443.470011,3.590756,0.228236,0.652802,0.35093,25.405702,26.883076,2.61,1291.4,2.49,679.29,110.1,100.28,94.21
81,8708,295.241389,0.25301,0.034108,0.05341,0.230802,0.021797,39.21865,14.140817,-33.812269,-108.034685,1.119656,1.564759,2.094753,6.184577,28.051922,27.60419,2.61,1291.4,2.49,679.29,110.1,100.28,94.21


# 4. Scaling

In [104]:
train_X.head(7)

Unnamed: 0,지속기간,부채비율,자기자본비율,총자산영업이익율,ROA,ROE,매출액영업이익율,총자산증가율,매출액증가율,당기순이익증가율,영업이익증가율,유동비율,자산회전율,부채회전율,자본회전율,총매출액규모,총자산규모,GDP성장률,원달러환율,CD91일,코스닥종가,전산업생산지수,경제심리지수,뉴스심리지수
1279,6846,53.757556,0.650375,0.001696,0.010262,0.015766,0.007433,5.575008,5.990395,-21.150772,-443.470011,3.590756,0.228236,0.652802,0.35093,25.405702,26.883076,2.61,1291.4,2.49,679.29,110.1,100.28,94.21
81,8708,295.241389,0.25301,0.034108,0.05341,0.230802,0.021797,39.21865,14.140817,-33.812269,-108.034685,1.119656,1.564759,2.094753,6.184577,28.051922,27.60419,2.61,1291.4,2.49,679.29,110.1,100.28,94.21
1425,1133,68.510592,0.593435,0.099035,0.03844,0.06541,0.08739,10.038812,20.43131,-41.014341,52.233493,1.323628,1.133261,2.787401,1.909665,25.996203,25.871103,2.61,1291.4,2.49,679.29,110.1,100.28,94.21
1033,7238,35.671961,0.737072,-0.008471,0.264934,0.44444,-0.040286,9.921957,3.589038,111.162384,-131.147855,3.059514,0.210268,0.799717,0.285275,23.850424,25.409796,2.61,1291.4,2.49,679.29,110.1,100.28,94.21
1172,4774,63.547444,0.611443,-0.129295,-0.987039,-0.949737,-1.151751,-93.401284,-45.997471,31.608254,6.755652,0.547695,0.11226,0.288915,0.183598,22.655886,24.842826,2.61,1291.4,2.49,679.29,110.1,100.28,94.21
111,10163,177.770564,0.360009,0.007672,0.015022,0.042897,0.009896,44.395443,7.700744,-0.914019,-90.742796,0.889228,0.775292,1.211411,2.153533,24.803173,25.057689,2.61,1291.4,2.49,679.29,110.1,100.28,94.21
1296,1441,60.479955,0.623131,-0.073245,-0.16937,-0.278274,-0.300194,-10.248669,-39.567754,16.481461,-38.147798,0.479562,0.243991,0.647417,0.391557,22.743539,24.154161,2.61,1291.4,2.49,679.29,110.1,100.28,94.21


In [105]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

scaler = StandardScaler()

cols = train_X.columns
idx_tr = train_X.index
idx_te = test_X.index

# DataFrame으로 변환 후 Scaler 적용
train_X = pd.DataFrame(scaler.fit_transform(train_X), columns=cols, index=idx_tr)
test_X = pd.DataFrame(scaler.transform(test_X), columns=cols, index=idx_te)

In [106]:
train_X.head(7)

Unnamed: 0,지속기간,부채비율,자기자본비율,총자산영업이익율,ROA,ROE,매출액영업이익율,총자산증가율,매출액증가율,당기순이익증가율,영업이익증가율,유동비율,자산회전율,부채회전율,자본회전율,총매출액규모,총자산규모,GDP성장률,원달러환율,CD91일,코스닥종가,전산업생산지수,경제심리지수,뉴스심리지수
1279,0.569487,-0.144282,0.219479,0.045882,0.203046,0.099474,0.037661,0.171676,0.036516,-0.003324,-0.626682,0.081248,-1.011632,-0.767653,-0.463091,0.2497,1.001909,0.066897,0.230381,0.222797,-0.16273,0.222095,0.143297,-0.182148
81,1.190862,0.637741,-1.194185,0.253265,0.341916,0.254117,0.037873,1.020212,0.050238,-0.009571,-0.162656,-0.332657,1.76625,-0.144732,1.997024,1.838046,1.562452,0.066897,0.230381,0.222797,-0.16273,0.222095,0.143297,-0.182148
1425,-1.337018,-0.096506,0.016909,0.668706,0.293735,0.135175,0.038841,0.284259,0.060829,-0.013125,0.059052,-0.298492,0.869406,0.154491,0.194245,0.604138,0.215273,0.066897,0.230381,0.222797,-0.16273,0.222095,0.143297,-0.182148
1033,0.700303,-0.202851,0.527913,-0.019172,1.022706,0.407754,0.036957,0.281311,0.032473,0.061957,-0.19463,-0.007735,-1.048978,-0.704186,-0.490778,-0.683827,-0.143314,0.066897,0.230381,0.222797,-0.16273,0.222095,0.143297,-0.182148
1172,-0.121967,-0.112579,0.080977,-0.792269,-3.00677,-0.594869,0.020558,-2.324635,-0.051012,0.022706,-0.00386,-0.42846,-1.252682,-0.924852,-0.533657,-1.400827,-0.584037,0.066897,0.230381,0.222797,-0.16273,0.222095,0.143297,-0.182148
111,1.676415,0.257322,-0.813524,0.084117,0.218366,0.118984,0.037698,1.150778,0.039396,0.00666,-0.138735,-0.371253,0.12539,-0.526334,0.297087,-0.111957,-0.417017,0.066897,0.230381,0.222797,-0.16273,0.222095,0.143297,-0.182148
1296,-1.234234,-0.122512,0.122556,-0.433629,-0.375101,-0.111986,0.033123,-0.227418,-0.040186,0.015243,-0.065978,-0.439872,-0.978886,-0.76998,-0.445958,-1.348215,-1.119355,0.066897,0.230381,0.222797,-0.16273,0.222095,0.143297,-0.182148


# 5. SMOTE

In [107]:
train_y.value_counts()

0    1148
1      69
Name: 부도, dtype: int64

In [108]:
test_y.value_counts()

0    287
1     18
Name: 부도, dtype: int64

In [109]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)

# Use fit_resample instead of fit_sample
train_X_over, train_y_over = smote.fit_resample(train_X, train_y)

print("SMOTE 적용 전 training set:", train_X.shape, train_y.shape)
print("SMOTE 적용 전 부도여부 count:\n", train_y.value_counts())
print("----------- SMOTE 적용 -----------")
print("SMOTE 적용 후 training set:", train_X_over.shape, train_y_over.shape)
print("SMOTE 적용 후 부도여부 count:\n", train_y_over.value_counts())

SMOTE 적용 전 training set: (1217, 24) (1217,)
SMOTE 적용 전 부도여부 count:
 0    1148
1      69
Name: 부도, dtype: int64
----------- SMOTE 적용 -----------
SMOTE 적용 후 training set: (2296, 24) (2296,)
SMOTE 적용 후 부도여부 count:
 0    1148
1    1148
Name: 부도, dtype: int64


In [110]:
train_X = train_X_over.copy()
train_y = train_y_over.copy()

## 6. 기업 부도 예측

In [111]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score

### DecisionTree

In [112]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(train_X, train_y)

print(f"Accuracy for DecisionTree : {accuracy_score(test_y, dt.predict(test_X))}")
print(f"F1_score for DecisionTree : {f1_score(test_y, dt.predict(test_X))}")

Accuracy for DecisionTree : 0.9967213114754099
F1_score for DecisionTree : 0.972972972972973


#### DecisionTree Tuning

In [113]:
from sklearn.model_selection import GridSearchCV

# Create the random forest regressor
dt = DecisionTreeClassifier(random_state=42)

# Define the parameter grid to search
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a grid search object using the defined parameter grid
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid, scoring='f1_macro', cv=5, n_jobs=-1)

# Fit the grid search to the training data
grid_search_dt.fit(train_X, train_y)

# Get the best parameters from the grid search
best_params = grid_search_dt.best_params_
print("Best hyperparameters for DecisionTree: ", grid_search_dt.best_params_)

Best hyperparameters for DecisionTree:  {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}


In [114]:
dt = DecisionTreeClassifier(max_depth=None, min_samples_leaf=2, min_samples_split=2, random_state=42)
dt.fit(train_X, train_y)

print(f"Accuracy for DecisionTree : {accuracy_score(test_y, dt.predict(test_X))}")
print(f"F1_score for DecisionTree : {f1_score(test_y, dt.predict(test_X))}")

Accuracy for DecisionTree : 1.0
F1_score for DecisionTree : 1.0


### RandomForest

In [115]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(train_X, train_y)

print(f"Accuracy for RandomForest : {accuracy_score(test_y, rf.predict(test_X))}")
print(f"F1_score for RandomForest : {f1_score(test_y, rf.predict(test_X))}")

Accuracy for RandomForest : 0.9967213114754099
F1_score for RandomForest : 0.972972972972973


#### RandomForest Tuning

In [116]:
from sklearn.model_selection import GridSearchCV

# Create the random forest regressor
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a grid search object using the defined parameter grid
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='f1_macro', cv=5, n_jobs=-1)

# Fit the grid search to the training data
grid_search_rf.fit(train_X, train_y)

# Get the best parameters from the grid search
best_params = grid_search_rf.best_params_
print("Best hyperparameters for RandomForest: ", grid_search_rf.best_params_)

Best hyperparameters for RandomForest:  {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}


In [117]:
rf = RandomForestClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=50, random_state=42)
rf.fit(train_X, train_y)

print(f"Accuracy for RandomForest : {accuracy_score(test_y, rf.predict(test_X))}")
print(f"F1_score for RandomForest : {f1_score(test_y, rf.predict(test_X))}")

Accuracy for RandomForest : 0.9967213114754099
F1_score for RandomForest : 0.9714285714285714


### LightGBM

In [118]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(random_state=42)
lgbm.fit(train_X, train_y)

print(f"Accuracy for LightGBM : {accuracy_score(test_y, lgbm.predict(test_X))}")
print(f"F1_score for LightGBM : {f1_score(test_y, lgbm.predict(test_X))}")

Accuracy for LightGBM : 0.9901639344262295
F1_score for LightGBM : 0.923076923076923


#### LightGBM Tuning

In [119]:
from sklearn.model_selection import GridSearchCV

# Define the model
lgbm = LGBMClassifier(random_state=42)

# Define the hyperparameters to tune
param_grid = {
    'num_leaves': [5, 10],
    'learning_rate': [0.001, 0.01, 0.1],
    'n_estimators': [100, 500, 1000]
}

# Perform grid search
grid_search_lgbm = GridSearchCV(lgbm, param_grid, scoring='f1_macro', cv=5, n_jobs=-1)
grid_search_lgbm.fit(train_X, train_y)

# Print the best hyperparameters
print("Best hyperparameters for LightGBM: ", grid_search_lgbm.best_params_)

Best hyperparameters for LightGBM:  {'learning_rate': 0.01, 'n_estimators': 1000, 'num_leaves': 10}


In [120]:
lgbm = LGBMClassifier(learning_rate=0.01, n_estimators=1000, num_leaves=10, random_state=42)
lgbm.fit(train_X, train_y)

print(f"Accuracy for LightGBM : {accuracy_score(test_y, lgbm.predict(test_X))}")
print(f"F1_score for LightGBM : {f1_score(test_y, lgbm.predict(test_X))}")

Accuracy for LightGBM : 0.9901639344262295
F1_score for LightGBM : 0.923076923076923


### XGBoost

In [121]:
from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=42)
xgb.fit(train_X, train_y)

print(f"Accuracy for XGBoost : {accuracy_score(test_y, xgb.predict(test_X))}")
print(f"F1_score for XGBoost : {f1_score(test_y, xgb.predict(test_X))}")

Accuracy for XGBoost : 0.9868852459016394
F1_score for XGBoost : 0.9


#### XGBoost Tuning

In [122]:
from sklearn.model_selection import GridSearchCV

# Define the model
xgb = XGBClassifier(random_state=42)

# Define the hyperparameters to be tuned
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.001, 0.01, 0.1]
}

# Define the GridSearchCV
grid_search_xgb = GridSearchCV(xgb, param_grid, scoring='f1_macro', cv=5, n_jobs=-1)

# Fit the GridSearchCV with the data
grid_search_xgb.fit(train_X, train_y)

# Get the best hyperparameters and print the best score
best_params = grid_search_xgb.best_params_
print("Best hyperparameters for XGBoost: ", best_params)

Best hyperparameters for XGBoost:  {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50}


In [123]:
xgb = XGBClassifier(learning_rate=0.1, max_depth=7, n_estimators=50, random_state=42)
xgb.fit(train_X, train_y)

print(f"Accuracy for XGBoost : {accuracy_score(test_y, xgb.predict(test_X))}")
print(f"F1_score for XGBoost : {f1_score(test_y, xgb.predict(test_X))}")

Accuracy for XGBoost : 0.9868852459016394
F1_score for XGBoost : 0.9


### MLP

In [124]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(max_iter=300, random_state=42)
mlp.fit(train_X, train_y)

print(f"Accuracy for MLP : {accuracy_score(test_y, mlp.predict(test_X))}")
print(f"F1_score for MLP : {f1_score(test_y, mlp.predict(test_X))}")

Accuracy for MLP : 0.9934426229508196
F1_score for MLP : 0.9473684210526316


#### MLP Tuning

In [125]:
from sklearn.model_selection import GridSearchCV

# Define the MLPRegressor model
mlp = MLPClassifier(max_iter=300, random_state=42)

# Define the hyperparameters to be tuned
param_grid = {'hidden_layer_sizes': [(10,), (50,)], 
              'activation': ['logistic', 'tanh', 'relu'],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'alpha': [0.001, 0.01, 0.1]}

# Create the GridSearchCV object
grid_search_mlp = GridSearchCV(mlp, param_grid, scoring='f1_macro', cv=5, n_jobs=-1)

# Fit the GridSearchCV object to the data
grid_search_mlp.fit(train_X, train_y)

# Get the best parameters and the best MLPRegressor model
best_params = grid_search_mlp.best_params_

print("Best hyperparameters for MLP: ", best_params)



Best hyperparameters for MLP:  {'activation': 'relu', 'alpha': 0.1, 'hidden_layer_sizes': (50,), 'solver': 'lbfgs'}


In [126]:
mlp = MLPClassifier(activation='relu', alpha=0.1, hidden_layer_sizes=(50,), solver='lbfgs', max_iter=300, random_state=42)
mlp.fit(train_X, train_y)

print(f"Accuracy for MLP : {accuracy_score(test_y, mlp.predict(test_X))}")
print(f"F1_score for MLP : {f1_score(test_y, mlp.predict(test_X))}")

Accuracy for MLP : 0.9901639344262295
F1_score for MLP : 0.918918918918919


### SVM

In [127]:
from sklearn.svm import SVC

svm = SVC(random_state=42)
svm.fit(train_X, train_y)

print(f"Accuracy for SVM: {accuracy_score(test_y, svm.predict(test_X))}")
print(f"F1 Score for SVM: {f1_score(test_y, svm.predict(test_X))}")

Accuracy for SVM: 0.9934426229508196
F1 Score for SVM: 0.9411764705882353


### Naive Bayes

In [128]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(train_X, train_y)

print(f"Accuracy for Naive Bayes: {accuracy_score(test_y, nb.predict(test_X))}")
print(f"F1 Score for Naive Bayes: {f1_score(test_y, nb.predict(test_X))}")


Accuracy for Naive Bayes: 0.9934426229508196
F1 Score for Naive Bayes: 0.9473684210526316
