In [1]:
import pandas as pd
import numpy as np
import pickle, os
import warnings
# Install a global 'ignore' filter so library warnings do not clutter cell output.
warnings.filterwarnings('ignore')


In [2]:
# Location of the preprocessed Lending Club pickle.
# The data directory can be overridden with the LENDING_CLUB_DATA_DIR environment
# variable so the notebook is not tied to one user's home directory; when the
# variable is unset the original location is used unchanged.
path_read = os.path.join(
    os.environ.get("LENDING_CLUB_DATA_DIR", "/Users/a06411/Documents/data_hub/lending_club"),
    "df_preprocessed_ld_0.pkl",
)

In [3]:
# Load the preprocessed frame from the pickle file at `path_read`.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df = pd.read_pickle(filepath_or_buffer=path_read)

In [4]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(1382351, 104)

# 분산분석(ANOVA)
- 연속형 변수에 대해 분산분석을 적용한다

In [5]:
from sklearn.feature_selection import chi2, f_classif
from sklearn.feature_selection import SelectKBest


In [6]:
# Candidate predictors: every numeric column except the row identifier and the target.
# np.setdiff1d returns the remaining names sorted and de-duplicated.
num_col = np.setdiff1d(
    df.select_dtypes(include='number').columns.values,
    ['id', 'loan_status'],
)
X = df[num_col]
y = df['loan_status']

In [7]:
# Display the numeric column names selected as candidate predictors.
num_col

array(['acc_now_delinq', 'acc_open_past_24mths', 'addr_state', 'all_util',
       'annual_inc', 'application_type', 'avg_cur_bal', 'bc_open_to_buy',
       'bc_util', 'chargeoff_within_12_mths', 'collection_recovery_fee',
       'collections_12_mths_ex_med', 'debt_settlement_flag',
       'delinq_2yrs', 'delinq_amnt', 'disbursement_method', 'dti',
       'emp_length', 'fico_range_high', 'fico_range_low', 'funded_amnt',
       'funded_amnt_inv', 'grade', 'hardship_flag', 'home_ownership',
       'il_util', 'initial_list_status', 'inq_fi', 'inq_last_12m',
       'inq_last_6mths', 'installment', 'int_rate',
       'last_fico_range_high', 'last_fico_range_low', 'last_pymnt_amnt',
       'loan_amnt', 'max_bal_bc', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_last_delinq',
       'mths_since_last_major_derog', 'mths_since_last_record',
       'mths_since_rcnt_il', 'mths_since_recent_bc',
       'mths_since_rece

In [8]:
# Replace every missing value with 0 before fitting the selector.
X = X.fillna(value=0)

In [9]:
# Show the (rows, columns) dimensions of the numeric feature matrix.
X.shape

(1382351, 101)

In [10]:
# Score every feature with f_classif against the target and keep the 50 best.
selector = SelectKBest(score_func=f_classif, k=50)
selector.fit(X, y)

In [11]:
# Integer positions (within X's columns) of the features the selector kept.
cols = selector.get_support(indices=True)


In [12]:
# Display the positional indices of the selected features.
cols

array([  1,   3,   6,   7,   8,  10,  12,  16,  18,  19,  20,  21,  22,
        24,  25,  27,  28,  29,  30,  31,  32,  33,  34,  35,  41,  52,
        58,  63,  65,  66,  67,  68,  69,  70,  71,  72,  74,  79,  81,
        82,  84,  86,  87,  91,  94,  95,  96,  97,  98, 100])

In [13]:
# `cols` holds positions within the numeric feature matrix (whose columns follow
# `num_col`), not within `df`. Indexing `df` positionally would pick unrelated
# columns, so translate the positions to column names first.
df_anova = df.loc[:, num_col[cols]]
print(df_anova.shape)

(1382351, 50)


## F-value를 기준으로 가장 유의한 50개 변수를 확인한다.

- F-value: 집단(사기/정상) 간 분산을 집단 내 분산으로 나눈 비율로, 값이 클수록 두 집단의 평균 차이가 뚜렷한 유의한 변수이다.

In [14]:
# Pair each selected feature name with its score from the fitted selector.
# NOTE(review): the scores are multiplied by 0.1 before rounding, so the values
# shown are one tenth of the raw F-statistics (ranking is unaffected) — confirm
# this scaling is intended.
cols = X.columns.values[selector.get_support()]
scores = np.round(0.1 * selector.scores_[selector.get_support()], decimals=2)
cols_scores = [(name, score) for name, score in zip(cols, scores)]


In [15]:
# Tabulate the (feature, score) pairs, highest score first; ties broken by name.
ns_df = pd.DataFrame(cols_scores, columns=['Feat_names', 'F_Scores']).sort_values(
    by=['F_Scores', 'Feat_names'], ascending=[False, True]
)
ns_df.head(10)

Unnamed: 0,Feat_names,F_Scores
20,last_fico_range_high,108219.69
21,last_fico_range_low,65255.72
37,recoveries,40854.02
5,collection_recovery_fee,35764.78
48,total_rec_prncp,33685.1
22,last_pymnt_amnt,19964.12
44,total_pymnt,15679.36
45,total_pymnt_inv,15617.2
6,debt_settlement_flag,14237.99
39,sub_grade,10537.48


# 카이제곱검정(Chi-Squared Test)
- 범주형 변수에 대해 카이제곱검정을 적용한다.

In [16]:
# Categorical columns evaluated with the chi-squared test.
obj_col = [
    'application_type',
    'debt_settlement_flag',
    'disbursement_method',
    'hardship_flag',
    'home_ownership',
    'initial_list_status',
    'purpose',
    'pymnt_plan',
    'sub_grade',
    'term',
    'verification_status',
    'grade',
]

In [17]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [18]:
# Feature matrix restricted to the categorical columns, plus the target.
X = df[obj_col]
y = df['loan_status']

In [19]:
# Score every categorical column with chi2; k='all' keeps all of them so each
# column's statistic and p-value can be inspected afterwards.
selector = SelectKBest(score_func=chi2, k='all')
selector.fit(X, y)


In [20]:
# Positions of the retained features, relative to the columns the selector was
# fitted on (`obj_col`).
cols = selector.get_support(indices = True)
# Map the positions back to column names before indexing `df`; using them
# directly with `df.iloc` would return the first columns of `df` instead of
# the categorical ones.
df_chi = df.loc[:, [obj_col[i] for i in cols]]
print(df_chi.shape)

(1382351, 12)


## P-value를 기준으로 유의한 변수를 확인한다.

- P-value: 사기 데이터와 정상 데이터가 동일한 분포를 가지는지에 대한 척도(낮을수록 유의한 변수)

In [21]:
# Pair each retained feature name with its chi-squared p-value.
cols = X.columns.values[selector.get_support()]
scores = selector.pvalues_[selector.get_support()]
cols_scores = [(name, p_value) for name, p_value in zip(cols, scores)]


In [22]:
# Tabulate the (feature, p-value) pairs, smallest p-value first; ties broken by name.
ns_df = pd.DataFrame(cols_scores, columns=['Feat_names', 'P_Values']).sort_values(
    by=['P_Values', 'Feat_names'], ascending=[True, True]
)
ns_df

Unnamed: 0,Feat_names,P_Values
0,application_type,0.0
1,debt_settlement_flag,0.0
11,grade,0.0
3,hardship_flag,0.0
4,home_ownership,0.0
6,purpose,0.0
7,pymnt_plan,0.0
8,sub_grade,0.0
9,term,0.0
10,verification_status,0.0


# 머신러닝 모델(LightGBM)
- 머신러닝 모델을 적용해 각 변수의 중요도를 살펴본다.

In [23]:
import time
import lightgbm as lgb
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 

import gc, warnings
# Re-apply the global 'ignore' warnings filter.
warnings.filterwarnings('ignore')

## 모델링에 적용되지 않는 변수(ID, 타겟변수, 날짜변수 등) 제외:

In [24]:
SEED = 2021
TARGET = 'loan_status'

# Columns that must not be fed to the model: identifier, target, date columns
# and the leftover 'Unnamed: 0' column.
remove_features = [
    'id', 'loan_status',
    'earliest_cr_line', 'issue_d', 'last_credit_pull_d', 'last_pymnt_d', 'next_pymnt_d',
    'Unnamed: 0',
]
# Everything else in `df`, in its original column order, is a model input.
features_columns = [name for name in df.columns if name not in remove_features]

## 데이터를 7:3 비율로 나눠 학습/평가 데이터 생성:

In [25]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X = df[features_columns]
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

## LightGBM 모델 학습:

In [None]:
# Fit a LightGBM classifier with default hyper-parameters on the training split.
clf = LGBMClassifier()
clf.fit(X_train, y_train)

## Feature Importance Value를 기준으로 유의한 변수를 확인한다.

In [None]:
# One row per feature with the importance reported by the fitted classifier;
# the pairs are sorted (by name) before the frame is built.
feature_imp = pd.DataFrame(
    data=sorted(zip(X.columns, clf.feature_importances_)),
    columns=['Feature', 'Value'],
)
# Show the ten most important features.
feature_imp.sort_values("Value", ascending=False).head(10)

# SHAP
- SHAP을 적용해 각 변수의 중요도를 살펴본다.


## 데이터 크기로 인한 RAM 이슈를 막기 위해 일정 비율 랜덤 샘플링한다.

In [None]:
df_sample = X_train.copy()
df_sample.sample(frac=0.1, replace=True, random_state=2020)

## SHAP value를 기준으로 유의한 변수를 확인한다.

In [None]:
import shap
shap_values = shap.TreeExplainer(clf).shap_values(df_sample)
shap.summary_plot(shap_values, X_train, plot_type="bar")

## 추가 변수 조정
- 4가지 변수선택법(ANOVA, Chi-Squared Test, ML Model, SHAP)에서 공통으로 유의한 변수를 추출한다.
- 그 결과, 머신러닝 모델 기준으로 지나치게 높은 성능(AUC: 0.9914)을 보인다.
- 과도한 성능을 낮추고 더 적은 지표에도 안정적인 성능을 가지도록 특정 변수를 직접 제거 후 최종 독립변수를 선택한다.

### 기존 성능 지표 확인:

In [None]:
# Score with the predicted probability of the positive class. ROC AUC is a
# ranking metric; hard labels from predict() collapse the ROC curve to a single
# operating point and misstate the AUC.
# Assumes loan_status is binary, with the positive class in the second column
# of predict_proba — confirm against the label encoding.
y_pred = clf.predict_proba(X_test)[:, 1]
print('LightGBM Model AUC score: {0:0.4f}'.format(roc_auc_score(y_test, y_pred)))

### 영향력이 높은 변수 확인:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One row per feature with the importance reported by the fitted classifier.
feature_imp = pd.DataFrame(
    data=sorted(zip(X.columns, clf.feature_importances_)),
    columns=['Feature', 'Value'],
)

# Horizontal bar chart of all features, most important at the top.
fig, ax = plt.subplots(figsize=(30, 20))
sns.barplot(
    x="Value",
    y="Feature",
    data=feature_imp.sort_values(by="Value", ascending=False),
    ax=ax,
)
ax.set_title('LightGBM Features')
fig.tight_layout()
plt.show()

### 총 58개 독립변수 선택:

In [None]:
# The 58 hand-picked independent variables. Order is kept as-is because it
# determines the column order of every frame built from this list.
feat_imp = [
    "int_rate", "dti", "annual_inc", "mo_sin_old_rev_tl_op",
    "acc_open_past_24mths", "loan_amnt", "emp_length", "revol_bal",
    "term", "funded_amnt_inv", "installment", "purpose",
    "total_rev_hi_lim", "fico_range_low", "debt_settlement_flag", "mort_acc",
    "total_bc_limit", "home_ownership", "avg_cur_bal", "all_util",
    "mths_since_recent_bc", "total_acc", "open_acc_6m", "bc_util",
    "num_actv_rev_tl", "funded_amnt", "hardship_flag", "num_rev_tl_bal_gt_0",
    "mths_since_recent_inq", "inq_last_6mths", "num_il_tl", "mo_sin_old_il_acct",
    "num_rev_accts", "num_tl_120dpd_2m", "total_il_high_credit_limit", "application_type",
    "revol_util", "tot_hi_cred_lim", "delinq_2yrs", "mo_sin_rcnt_tl",
    "num_actv_bc_tl", "mths_since_last_record", "percent_bc_gt_75", "bc_open_to_buy",
    "max_bal_bc", "grade", "open_rv_24m", "mo_sin_rcnt_rev_tl_op",
    "pct_tl_nvr_dlq", "verification_status", "tot_cur_bal", "total_bal_ex_mort",
    "mths_since_last_major_derog", "inq_fi", "mths_since_rcnt_il", "inq_last_12m",
    "mths_since_last_delinq", "num_bc_tl",
]

### LightGBM 모델 재학습:

In [None]:
# Rebuild the 70/30 split using only the hand-picked features, with the same
# random_state as the first split.
X = df[feat_imp]
y = df[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Refit a default LightGBM classifier on the reduced feature set.
clf = LGBMClassifier()
clf.fit(X_train, y_train)

### 낮춘 모델 성능 지표 확인:

In [None]:
# Score with the predicted probability of the positive class. ROC AUC is a
# ranking metric; hard labels from predict() collapse the ROC curve to a single
# operating point and misstate the AUC.
# Assumes loan_status is binary, with the positive class in the second column
# of predict_proba — confirm against the label encoding.
y_pred = clf.predict_proba(X_test)[:, 1]
print('LightGBM Model AUC score: {0:0.4f}'.format(roc_auc_score(y_test, y_pred)))

### 최종 데이터셋(ID, 58개 독립변수, 종속변수, 날짜변수) 저장:

In [None]:
# Add the target, issue date and ID to the export column list. Only names not
# already present are appended, so re-running this cell does not grow the list
# with duplicate columns the way a plain `+=` would.
feat_imp = feat_imp + [col for col in ['loan_status', 'issue_d', 'id'] if col not in feat_imp]

In [None]:
# Destination of the feature-selected dataset.
# The data directory can be overridden with the LENDING_CLUB_DATA_DIR environment
# variable so the notebook is not tied to one user's home directory; when the
# variable is unset the original location is used unchanged.
path_write = os.path.join(
    os.environ.get("LENDING_CLUB_DATA_DIR", "/Users/a06411/Documents/data_hub/lending_club"),
    "feature_selected_ld_1.pkl",
)

In [None]:
# Persist only the columns listed in `feat_imp` to the pickle at `path_write`.
df.loc[:, feat_imp].to_pickle(path_write)