# SMOTE（Synthetic Minority Over-sampling Technique）の実装
- オーバーサンプリングの一種で、既存の正例（少数クラス）のサンプルとその近傍にある正例サンプルとの間を補間して、新しい正例を合成する。既存の正例の近傍にしかデータが生成されないため、合成されたデータは実際のデータ分布とは異なる可能性がある。

### 不均衡データの準備

In [1]:
import pandas as pd
from sklearn.preprocessing import Imputer

df = pd.read_csv('./data/av_loan_u6lujuX_CVtuZ9i.csv', header=0)
X = df.iloc[:,:-1] # 最終行列以外を特徴量X
X = X.drop('Loan_ID', axis=1) # Loan_IDはID情報で不要の為削除
y = df.iloc[:,-1] # 最終列を正解データ

In [2]:
# ローン審査でNOとなったサンプルを1に変換する
class_mapping = {'N':1, 'Y':0}
y = y.map(class_mapping)

In [3]:
# one-hotエンコーディング
ohe_columns = ['Dependents', 'Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

X_ohe = pd.get_dummies(X, dummy_na=True, columns=ohe_columns)

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X_ohe)

X_ohe_columns = X_ohe.columns.values
X_ohe = pd.DataFrame(imp.transform(X_ohe), columns=X_ohe_columns)

display(X_ohe.head())



Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,...,Education_Graduate,Education_Not Graduate,Education_nan,Self_Employed_No,Self_Employed_Yes,Self_Employed_nan,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849.0,0.0,146.412162,360.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


### アンダーサンプリング
- 多数派である負例をランダムに間引き、正例と同数になるようにアンダーサンプリングする

In [9]:
from collections import Counter
# Module that performs random under-sampling
from imblearn.under_sampling import RandomUnderSampler

# Create the sampler with a fixed seed
rus = RandomUnderSampler(random_state=0)
# fit_resample returns the under-sampled features and labels.
# (fit_sample was deprecated in imbalanced-learn 0.4 and removed in 0.8.)
X_under, y_under = rus.fit_resample(X_ohe, y)

# Count the samples per class after resampling
Counter(y_under)

Counter({0: 192, 1: 192})

### オーバーサンプリング
- ランダムオーバーサンプリングとSMOTEを行う

In [11]:
# Import the random over-sampler and SMOTE
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Create one instance of each sampler with a fixed seed
ros = RandomOverSampler(random_state=0)
smt = SMOTE(random_state=0)

# fit_resample replaces fit_sample, which was deprecated in
# imbalanced-learn 0.4 and removed in 0.8.
X_over, y_over = ros.fit_resample(X_ohe, y)
X_smt, y_smt = smt.fit_resample(X_ohe, y)

# Show the class counts produced by each sampler
print('Random Over Samoler', Counter(y_over))
print('SMOTE', Counter(y_smt))

Random Over Samoler Counter({0: 422, 1: 422})
SMOTE Counter({0: 422, 1: 422})


### 比較してみる

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,f1_score

# Hold-out split: 20% of the samples are set aside for testing
X_train, X_test, y_train, y_test = train_test_split(X_ohe, y, test_size=0.20, random_state=0)

# Resample the training split only; the test split is left as it is.
# fit_resample replaces fit_sample, which was deprecated in
# imbalanced-learn 0.4 and removed in 0.8.
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)  # random under-sampling
X_train_over, y_train_over = ros.fit_resample(X_train, y_train)  # random over-sampling
X_train_smt, y_train_smt = smt.fit_resample(X_train, y_train)  # SMOTE

# Build the pipeline: standardisation followed by gradient boosting
pipe_gb = Pipeline([('scl', StandardScaler()),
                    ('est', GradientBoostingClassifier(random_state=1))])

# Train and evaluate on each training set in turn. Every variant is scored
# on its own (possibly resampled) training data and on the same test split.
training_sets = [
    ('Original', X_train, y_train),
    ('Undersampling', X_train_under, y_train_under),
    ('Oversampling', X_train_over, y_train_over),
    ('SMOTE', X_train_smt, y_train_smt),
]
for label, X_fit, y_fit in training_sets:
    # The same pipeline object is refitted for every training set
    pipe_gb.fit(X_fit, y_fit)
    print(label + ' Train:', f1_score(y_fit, pipe_gb.predict(X_fit)))
    print(label + ' Test:', f1_score(y_test, pipe_gb.predict(X_test)))

Original Train: 0.7940074906367042
Original Test: 0.6181818181818182
Undersampling Train: 0.9411764705882353
Undersampling Test: 0.6024096385542169
Oversampling Train: 0.9079365079365079
Oversampling Test: 0.6086956521739131
SMOTE Train: 0.8990228013029316
SMOTE Test: 0.5555555555555556
