# **<< Early model development : 초기 모델 작성 >>**

# **1. 모듈 호출**

In [191]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

# 1. 데이터(학습/평가) 분리
from sklearn.model_selection import train_test_split


# 2. 피처 엔지니어링
# 오버샘플링 --> 타겟 데이터 불균형 해결을 위한
from imblearn.over_sampling import SMOTE   # imblearn 의 pipline을 써야함. (사이킷런의 pipline을 쓰면 잘못된 것.)
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
# 정규화 --> 연속형 피처 불균형 해결을 위한
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# # 범주형 변수 원핫인코딩
# from sklearn.preprocessing import OneHotEncoder


# 3. (분류) 학습 모델
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


# 4. 학습 검정 모델
# from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# 5. (분류) 평가 모델
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

# **2. 함수 만들기**

### **1) 분류 모델의 성능 검정**

In [192]:
def cross_model_validation (model, X, y, X_val, y_val) :
    """
    선정한 모델과 학습 피처, 학습 타겟값을 인자받아 
    cross_val_score를 활용한 모델 검증 함수.
    """
    
    # K-Fold Cross Validation
    cross_val_scores = cross_validate(model, X=X, y=y, cv=5, scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'])

    model.fit(X, y)
    pred = model.predict(X_val)

    # 점수 
    accuracy_scores = cross_val_scores['test_accuracy']
    precision_scores = cross_val_scores['test_precision']
    recall_scores = cross_val_scores['test_recall']
    f1_scores = cross_val_scores['test_f1']
    auc_scores = cross_val_scores['test_roc_auc']


    # 각각의 지표 점수 출력
    print('<<', model, '>>')
    print('===' * 50)
    print('1. 예측 Accuracy(정확도) : {0:.4f}'.format(accuracy_score(y_val, pred)))
    print('===' * 50)
    print('2. 예측 Precision(정밀도) : {0:.4f}'.format(precision_score(y_val, pred)))
    print('===' * 50)
    print('3. 예측 Recall(재현율): {0:.4f}'.format(recall_score(y_val, pred)))
    print('===' * 50)
    print('4. 예측 F1 Score : {0:.4f}'.format(f1_score(y_val, pred)))
    print('===' * 50)
    print('5. 예측 ROC-AUC Score : {0:.4f}'.format(roc_auc_score(y_val, pred)))

# **3. 데이터 로드(Data Loading)**

In [193]:
Bankruptcy_org_df = pd.read_csv('data.csv')

In [194]:
print('Bankruptcy 데이터 세트 크기 :\n', 
      '(1) row 갯수 :', Bankruptcy_org_df.shape[0], '\n',
      '(2) features 갯수 :', Bankruptcy_org_df.shape[1])


# 모든 컬럼을 보디 위한 설정
pd.set_option('display.max_columns', None)

Bankruptcy_org_df = pd.read_csv('data.csv')
Bankruptcy_org_df.tail(1)

Bankruptcy 데이터 세트 크기 :
 (1) row 갯수 : 6819 
 (2) features 갯수 : 96


Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),Operating Expense Rate,Research and development expense rate,Cash flow rate,Interest-bearing debt interest rate,Tax rate (A),Net Value Per Share (B),Net Value Per Share (A),Net Value Per Share (C),Persistent EPS in the Last Four Seasons,Cash Flow Per Share,Revenue Per Share (Yuan ¥),Operating Profit Per Share (Yuan ¥),Per Share Net profit before tax (Yuan ¥),Realized Sales Gross Profit Growth Rate,Operating Profit Growth Rate,After-tax Net Profit Growth Rate,Regular Net Profit Growth Rate,Continuous Net Profit Growth Rate,Total Asset Growth Rate,Net Value Growth Rate,Total Asset Return Growth Rate Ratio,Cash Reinvestment %,Current Ratio,Quick Ratio,Interest Expense Ratio,Total debt/Total net worth,Debt ratio %,Net worth/Assets,Long-term fund suitability ratio (A),Borrowing dependency,Contingent liabilities/Net worth,Operating profit/Paid-in capital,Net profit before tax/Paid-in capital,Inventory and accounts receivable/Net value,Total Asset Turnover,Accounts Receivable Turnover,Average Collection Days,Inventory Turnover Rate (times),Fixed Assets Turnover Frequency,Net Worth Turnover Rate (times),Revenue per person,Operating profit per person,Allocation rate per person,Working Capital to Total Assets,Quick Assets/Total Assets,Current Assets/Total Assets,Cash/Total Assets,Quick Assets/Current Liability,Cash/Current Liability,Current Liability to Assets,Operating Funds to Liability,Inventory/Working Capital,Inventory/Current Liability,Current Liabilities/Liability,Working Capital/Equity,Current Liabilities/Equity,Long-term Liability to Current Assets,Retained Earnings to Total Assets,Total income/Total expense,Total expense/Assets,Current Asset Turnover Rate,Quick Asset Turnover Rate,Working capitcal Turnover Rate,Cash Turnover Rate,Cash Flow to Sales,Fixed Assets to Assets,Current Liability to Liability,Current Liability to Equity,Equity to Long-term Liability,Cash Flow to Total Assets,Cash Flow to Liability,CFO to Assets,Cash Flow to Equity,Current Liability to Current Assets,Liability-Assets Flag,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
6818,0,0.493053,0.570105,0.549548,0.627409,0.627409,0.99808,0.801987,0.8138,0.313415,0.786079,0.001432,0.0,0.427721,590000000.0,0.0,0.220766,0.220766,0.220766,0.227758,0.305793,0.000665,0.092501,0.182119,0.025316,0.848053,0.689527,0.689527,0.217605,9350000000.0,0.000519,0.264186,0.360102,0.051348,0.040897,0.630618,0.000461,0.014149,0.985851,0.058476,0.370049,0.006368,0.092465,0.179911,0.393883,0.002999,0.000325,0.019474,19100000.0,0.0003,0.009194,0.002097,0.385767,0.000963,0.873759,0.527136,0.50501,0.238147,0.051481,0.066674,0.018517,0.239585,0.276975,0.0,1.0,0.737286,0.32669,0.0,0.938005,0.002791,0.006089,0.007864,0.008238,0.598674,0.009506,0.672096,0.005016,1.0,0.32669,0.110933,0.659917,0.483285,0.505531,0.316238,0.005579,0,0.815956,0.000707,0.62668,0.627408,0.841019,0.275114,0.026793,0.565167,1,0.233902


# **4. 범주형과 연속형 변수 나누기**

In [195]:
Bankruptcy_org_df.columns.tolist()

# 컬럼명 앞에 공백 지우기
Bankruptcy_org_df.columns = Bankruptcy_org_df.columns.str.strip()

In [196]:
# 카테고리형 변수
categorical_vars = ['Liability-Assets Flag', 'Net Income Flag']


# 연속형 변수
continuous_vars = [
    col 
    for col in Bankruptcy_org_df.columns 
    if col not in categorical_vars and col != 'Bankrupt?']


categorical = Bankruptcy_org_df[categorical_vars]
print('범주형 피처의 갯수 : ', categorical.shape[1], '개')

continuous = Bankruptcy_org_df[continuous_vars] 
print('연속형 피처의 갯수 : ', continuous.shape[1], '개')

범주형 피처의 갯수 :  2 개
연속형 피처의 갯수 :  93 개


# **5. 피처 엔지니어링(Feature Engineering)**

### **~~1) Oversampling - 타겟 불균형 처리~~**
- 학습 데이터에 대해서만
- 주의 : imblearn 의 pipline을 써야함. (사이킷런의 pipline을 쓰면 잘못된 것.)
- pipline 안에 oversample = SMOTE()

In [197]:
# y_target = Bankruptcy_org_df['Bankrupt?']
# X_features = Bankruptcy_org_df.drop(columns=['Bankrupt?'], axis=1)

In [198]:
# X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, stratify=y_target)

In [199]:
# pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=42)],
#                                 ['scaler', MinMaxScaler()],
#                                 ['classifier', LogisticRegression(random_state=11, max_iter=1000)]])

In [200]:
# pipeline.get_params()

In [201]:
# stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
# param_grid = {'classifier__C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}

# grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='roc_auc', cv=stratified_kfold, n_jobs=-1)
# grid_search.fit(X_train, y_train)

In [202]:
# cv_score = grid_search.best_score_
# print(f'Cross-validation score: {cv_score}')

In [203]:
# test_score = grid_search.score(X_test, y_test)
# print(f'Test score: {test_score}')

### **2) 연속형 피처의 불균형 --> 스케일링 (StandardScaler)**

In [204]:
# 연속형 피처의 이름만 가져오기
continuous_features_name = continuous.columns

In [205]:
scaler = StandardScaler()

Bankruptcy_new_df = Bankruptcy_org_df.copy()
Bankruptcy_new_df[continuous_features_name] = scaler.fit_transform(Bankruptcy_new_df[continuous_features_name])

In [207]:
scaler = StandardScaler()

s_scaler_df = Bankruptcy_org_df.copy()
s_scaler_df[continuous_features_name] = scaler.fit_transform(s_scaler_df[continuous_features_name])

### **3) 연속형 피처의 불균형 --> 정규화 (Normalization)**

In [208]:
norm_scaler = MinMaxScaler()

n_scaler_df = Bankruptcy_org_df.copy()
n_scaler_df[continuous_features_name] = norm_scaler.fit_transform(n_scaler_df[continuous_features_name])

# **6. 훈련 및 시험 데이터 분리**

### **1) 스케일링 적용 후 사용할 변수**

In [209]:
s_scaler_y_target = s_scaler_df['Bankrupt?']
s_scaler_X_features = s_scaler_df.drop(columns=['Bankrupt?'], axis=1)

In [210]:
s_scaler_X_train, s_scaler_X_test, s_scaler_y_train, s_scaler_y_test = train_test_split(s_scaler_X_features, s_scaler_y_target, test_size=0.2, random_state=42)

In [211]:
print('s스케일러 적용된 학습용 피처의 Shape :', s_scaler_X_train.shape)
print('s스케일러 적용된 시험용 피처의 Shape :', s_scaler_X_test.shape)
print('s스케일러 적용된 학습용 라벨의 Shape :', s_scaler_y_train.shape)
print('s스케일러 적용된 시험용 라벨의 Shape :', s_scaler_y_test.shape)

s스케일러 적용된 학습용 피처의 Shape : (5455, 95)
s스케일러 적용된 시험용 피처의 Shape : (1364, 95)
s스케일러 적용된 학습용 라벨의 Shape : (5455,)
s스케일러 적용된 시험용 라벨의 Shape : (1364,)


In [212]:
n_scaler_y_target = n_scaler_df['Bankrupt?']
n_scaler_X_features = n_scaler_df.drop(columns=['Bankrupt?'], axis=1)

In [213]:
n_scaler_X_train, n_scaler_X_test, n_scaler_y_train, n_scaler_y_test = train_test_split(n_scaler_X_features, n_scaler_y_target, test_size=0.2,random_state=42)

In [214]:
print('n스케일러 적용된 학습용 피처의 Shape :', n_scaler_X_train.shape)
print('n스케일러 적용된 시험용 피처의 Shape :', n_scaler_X_test.shape)
print('n스케일러 적용된 학습용 라벨의 Shape :', n_scaler_y_train.shape)
print('n스케일러 적용된 시험용 라벨의 Shape :', n_scaler_y_test.shape)

n스케일러 적용된 학습용 피처의 Shape : (5455, 95)
n스케일러 적용된 시험용 피처의 Shape : (1364, 95)
n스케일러 적용된 학습용 라벨의 Shape : (5455,)
n스케일러 적용된 시험용 라벨의 Shape : (1364,)


# **7. 모델 훈련 및 예측 검증**

#### **(1) 로지스틱 회귀(LogisticRegression) 모델의 성능검증**

In [253]:
lr_clf = LogisticRegression(max_iter=100, solver='saga', random_state=42)

In [248]:
print('1. 로지스틱 회귀 모델 성능검증 (Standardization)')
cross_model_validation(lr_clf, s_scaler_X_train, s_scaler_y_train, s_scaler_X_test, s_scaler_y_test)

1. 로지스틱 회귀 모델 성능검증 (Standardization)
<< LogisticRegression(random_state=42, solver='saga') >>
1. 예측 Accuracy(정확도) : 0.9685
2. 예측 Precision(정밀도) : 0.7857
3. 예측 Recall(재현율): 0.2157
4. 예측 F1 Score : 0.3385
5. 예측 ROC-AUC Score : 0.6067


In [249]:
print('1-2. 로지스틱 회귀 모델 성능검증 (MinMaxScaler)')
cross_model_validation(lr_clf, n_scaler_X_train, n_scaler_y_train, n_scaler_X_test, n_scaler_y_test)

1-2. 로지스틱 회귀 모델 성능검증 (MinMaxScaler)
<< LogisticRegression(random_state=42, solver='saga') >>
1. 예측 Accuracy(정확도) : 0.9663
2. 예측 Precision(정밀도) : 0.8571
3. 예측 Recall(재현율): 0.1176
4. 예측 F1 Score : 0.2069
5. 예측 ROC-AUC Score : 0.5584


#### **(2) 랜덤포레스트(RandomForestClassifier) 모델의 성능검증**

In [259]:
rf_clf = RandomForestClassifier(n_estimators=300, max_depth=5, random_state=42) 

In [255]:
print('2. 랜덤포레스트 모델 성능검증 (Standardization)')
cross_model_validation(rf_clf, s_scaler_X_train, s_scaler_y_train, s_scaler_X_test, s_scaler_y_test)

2. 랜덤포레스트 모델 성능검증 (Standardization)
<< RandomForestClassifier(max_depth=5, n_estimators=300, random_state=42) >>
1. 예측 Accuracy(정확도) : 0.9670
2. 예측 Precision(정밀도) : 1.0000
3. 예측 Recall(재현율): 0.1176
4. 예측 F1 Score : 0.2105
5. 예측 ROC-AUC Score : 0.5588


In [256]:
print('2-2. 랜덤포레스트 모델 성능검증 (MinMaxScaler)')
cross_model_validation(rf_clf, n_scaler_X_train, n_scaler_y_train, n_scaler_X_test, n_scaler_y_test)

2-2. 랜덤포레스트 모델 성능검증 (MinMaxScaler)
<< RandomForestClassifier(max_depth=5, n_estimators=300, random_state=42) >>
1. 예측 Accuracy(정확도) : 0.9670
2. 예측 Precision(정밀도) : 1.0000
3. 예측 Recall(재현율): 0.1176
4. 예측 F1 Score : 0.2105
5. 예측 ROC-AUC Score : 0.5588


#### **(3) XGBClassifier 모델의 성능검증**

In [262]:
xgb_clf = XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=5, random_state=42)

In [263]:
print('3. XGB 모델 성능검증 (Standardization)')
cross_model_validation(xgb_clf, s_scaler_X_train, s_scaler_y_train, s_scaler_X_test, s_scaler_y_test)

3. XGB 모델 성능검증 (Standardization)
<< XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...) >>
1. 예측 Accuracy(정확도) : 0.9670
2. 예측 Precision(정밀도) : 0.6667
3. 예측 Recall(재현율): 0.2353
4. 예측 F1 Score : 0.3478
5. 예측 ROC-AUC Score : 0.6154


In [264]:
print('3. XGB 모델 성능검증 (MinMaxScaler)')
cross_model_validation(xgb_clf, n_scaler_X_train, n_scaler_y_train, n_scaler_X_test, n_scaler_y_test)

3. XGB 모델 성능검증 (MinMaxScaler)
<< XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=500, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, ...) >>
1. 예측 Accuracy(정확도) : 0.9641
2. 예측 Precision(정밀도) : 0.5500
3. 예측 Recall(재현율): 0.2157
4. 예측 F1 Score : 0.3099
5. 예측 ROC-AUC Score : 0.6044


#### **(4) LGBMClassifier 모델의 성능검증**

In [266]:
lgbm_clf = LGBMClassifier(learning_rate=0.1, n_estimators=100, max_depth=5, num_leaves=20, random_state=42)

In [267]:
print('4. LGBM 모델 성능검증 (Standardization)')
cross_model_validation(lgbm_clf, s_scaler_X_train, s_scaler_y_train, s_scaler_X_test, s_scaler_y_test)

4. LGBM 모델 성능검증 (Standardization)
<< LGBMClassifier(max_depth=5, num_leaves=20, random_state=42) >>
1. 예측 Accuracy(정확도) : 0.9663
2. 예측 Precision(정밀도) : 0.6000
3. 예측 Recall(재현율): 0.2941
4. 예측 F1 Score : 0.3947
5. 예측 ROC-AUC Score : 0.6433


In [268]:
print('4. LGBM 모델 성능검증 (MinMaxScaler)')
cross_model_validation(lgbm_clf, n_scaler_X_train, n_scaler_y_train, n_scaler_X_test, n_scaler_y_test)

4. LGBM 모델 성능검증 (MinMaxScaler)
<< LGBMClassifier(max_depth=5, num_leaves=20, random_state=42) >>
1. 예측 Accuracy(정확도) : 0.9655
2. 예측 Precision(정밀도) : 0.5909
3. 예측 Recall(재현율): 0.2549
4. 예측 F1 Score : 0.3562
5. 예측 ROC-AUC Score : 0.6240
