## 용어
- 과소표본(undersampling): 분류 모델에서 개수가 많은 클래스 데이터 중 일부 소수만을 사용하는 것(유의어: 다운샘플)
- 과잉표본(oversampling): 분류 모델에서 희귀 클래스 데이터를 중복하여, 필요하면 부트스트랩해서 사용하는 것
- 상향 가중치(up weight) 또는 하향 가중치(down weight): 모델에서 희귀(혹은 다수) 클래스에 높은(혹은 낮은) 가중치를 주는 것
- 데이터 생성(data generation): 부트스트랩과 비슷하지만, 다시 샘플링한 각 레코드를 원본과 살짝 다르게 만들어 데이터를 생성하는 것
- z 점수(z-score): 표준화 결과
- k: 최근접 이웃 알고리즘에서 이웃들의 개수

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE, ADASYN

In [2]:
# Load the full loan training set from the CSV export (path is relative to
# the notebook's working directory).
train_csv_path = '../../data/full_train_set.csv'
full_train_set = pd.read_csv(train_csv_path)

In [3]:
# Bare expression: shows the loaded DataFrame as the cell's output.
full_train_set

Unnamed: 0,status,loan_amnt,term,annual_inc,dti,payment_inc_ratio,revol_bal,revol_util,purpose,home_ownership,delinq_2yrs_zero,pub_rec_zero,open_acc,grade,outcome,emp_length,purpose_,home_,emp_len_
0,Fully Paid,5000,36 months,24000,27.65,8.14350,13648.0,83.7,credit_card,RENT,1,1,3,5.4,paid off,11,credit_card,RENT,> 1 Year
1,Charged Off,2500,60 months,30000,1.00,2.39320,1687.0,9.4,car,RENT,1,1,3,4.8,default,1,major_purchase,RENT,> 1 Year
2,Fully Paid,2400,36 months,12252,8.72,8.25955,2956.0,98.5,small_business,RENT,1,1,2,5.0,paid off,11,small_business,RENT,> 1 Year
3,Fully Paid,10000,36 months,49200,20.00,8.27585,5598.0,21.0,other,RENT,1,1,10,4.2,paid off,11,other,RENT,> 1 Year
4,Fully Paid,5000,36 months,36000,11.20,5.21533,7963.0,28.3,wedding,RENT,1,1,9,6.8,paid off,4,other,RENT,> 1 Year
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119982,Fully Paid,7000,36 months,46200,30.05,6.11325,6835.0,43.0,debt_consolidation,MORTGAGE,1,1,12,5.8,paid off,11,debt_consolidation,MORTGAGE,> 1 Year
119983,Fully Paid,21000,60 months,91800,8.04,6.82092,12187.0,23.9,medical,RENT,1,1,14,3.2,paid off,3,medical,RENT,> 1 Year
119984,Fully Paid,28775,60 months,60000,13.72,15.40780,16542.0,78.4,home_improvement,MORTGAGE,1,0,14,2.2,paid off,11,home_improvement,MORTGAGE,> 1 Year
119985,Fully Paid,6000,36 months,57500,23.61,4.25113,5188.0,31.6,debt_consolidation,RENT,0,1,12,6.0,paid off,4,debt_consolidation,RENT,> 1 Year


## 데이터 불균형이 있을 때의 문제점

In [4]:
# Share of rows in the full data set whose outcome is 'default'.
default_share = (full_train_set['outcome'] == 'default').mean()
print(f"percentage of loans in default: {default_share *100}")

percentage of loans in default: 18.894546909248504


In [5]:
# Baseline: train on the data exactly as it is, with no correction for the
# class imbalance, and see how many loans the model flags as 'default'.

label = 'outcome'
features = [
    'payment_inc_ratio',
    'purpose_',
    'home_',
    'emp_len_',
    'dti',
    'revol_bal',
    'revol_util',
]

# One-hot encode the selected columns; the first level of each encoded
# column is dropped and no prefix is added to the dummy column names.
X = pd.get_dummies(
    full_train_set[features],
    prefix='',
    prefix_sep='',
    drop_first=True,
)
y = full_train_set[label]

full_model = LogisticRegression(C=1)
full_model.fit(X, y)

# Predictions are made on the same rows the model was fitted on.
predicted_default = full_model.predict(X) == 'default'
print(f"percentage of loans predicted to default: {np.mean(predicted_default)*100}")

# As the recorded output shows, every loan was predicted as 'paid off'.

percentage of loans predicted to default: 0.0


## 상향/하향 가중치(Up/Down Weighting)

In [6]:
# Up-weight the rare 'default' class by the ratio of 'paid off' rows to
# 'default' rows. If 'outcome' holds only these two labels (not verified
# here), both classes end up with the same total weight.
outcomes = full_train_set['outcome']
n_paid_off = len(full_train_set[outcomes == 'paid off'])
n_default = len(full_train_set[outcomes == 'default'])
default_wt = n_paid_off / n_default

# One weight per row: default_wt for 'default' rows, 1 for everything else.
wt = [1 if outcome != 'default' else default_wt for outcome in outcomes]

full_model = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
full_model.fit(X, y, sample_weight=wt)

# Share of rows (same rows used for fitting) now predicted as 'default'.
flagged_default = full_model.predict(X) == 'default'
print(f"percentage of loans predicted to default: {np.mean(flagged_default)*100}")

percentage of loans predicted to default: 42.69545867468976


## 데이터 생성(Data Generation)

In [7]:
# Both samplers synthesize new minority-class rows using random draws.
# Without a seed the resampled data -- and every percentage printed below --
# changes from run to run, so a fixed seed is passed to make the cell
# reproducible.
# NOTE: outputs recorded from an unseeded run will not match a seeded rerun.
RESAMPLE_SEED = 42

# --- SMOTE ---------------------------------------------------------------
X_resampled, y_resampled = SMOTE(random_state=RESAMPLE_SEED).fit_resample(X, y)
print('percentage of loans in default (SMOTE resampled): ', 
      100 * np.mean(y_resampled == 'default'))

# Fit on the resampled data, but measure predictions on the original X.
full_model = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
full_model.fit(X_resampled, y_resampled)
print('percentage of loans predicted to default (SMOTE): ', 
      100 * np.mean(full_model.predict(X) == 'default'))


# --- ADASYN --------------------------------------------------------------
X_resampled, y_resampled = ADASYN(random_state=RESAMPLE_SEED).fit_resample(X, y)
print('percentage of loans in default (ADASYN resampled): ', 
      100 * np.mean(y_resampled == 'default'))

# Same procedure as above: fit on resampled rows, predict on the original X.
full_model = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
full_model.fit(X_resampled, y_resampled)
print(f"percentage of loans predicted to default (ADASYN): {100 * np.mean(full_model.predict(X) == 'default')}")

percentage of loans in default (SMOTE resampled):  50.0
percentage of loans predicted to default (SMOTE):  29.300674239709302
percentage of loans in default (ADASYN resampled):  48.56040383751355
percentage of loans predicted to default (ADASYN): 27.91219048730279
