## データの準備

In [34]:
import pandas as pd

from sklearn.impute import SimpleImputer

# Widen the DataFrame display limit to 50 columns before anything is rendered
pd.options.display.max_columns = 50

# Load the raw loan data from CSV (the first row holds the column names)
df = pd.read_csv('./data/av_loan_u6lujuX_CVtuZ9i.csv', header=0)
# The last column is the target; every other column is a candidate feature
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Loan_ID is only an identifier, so drop it from the features
X = X.drop('Loan_ID', axis=1)

# Treat rejected loans ('N') as the positive class (1)
class_mapping = {'N': 1, 'Y': 0}
y = y.map(class_mapping)
# Series.map turns any label missing from class_mapping into NaN; fail early instead
assert y.notna().all(), 'target contains labels other than N / Y'
# A bare expression in the middle of a cell is not rendered, so display() it explicitly
display(X.join(y).head())

# Categorical columns to one-hot encode
ohe_columns = ['Dependents', 'Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']

# dummy_na=True adds a "<column>_nan" indicator column for missing values;
# drop_first=True drops the first level of each encoded variable
X_new = pd.get_dummies(X, dummy_na=True, columns=ohe_columns, drop_first=True)

# Fill the remaining missing values; SimpleImputer defaults to the column mean
imp = SimpleImputer()
imp.fit(X_new)

# transform() returns a bare array, so restore the column names and keep the
# original row index so the rows stay aligned with y
X_new_columns = X_new.columns.values
X_new = pd.DataFrame(imp.transform(X_new), columns=X_new_columns, index=X_new.index)

display(X_new.head())

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Male,Gender_nan,Married_Yes,Married_nan,Education_Not Graduate,Education_nan,Self_Employed_Yes,Self_Employed_nan,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849.0,0.0,146.412162,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### RFE（Recursive Feature Elimination）
- 推定器が算出する特徴量の重要度をもとに、重要度の低い特徴量を繰り返し取り除き、指定した次元数まで削減する

In [40]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Build the RFE selector.
#   estimator            : classifier whose feature importances rank the features
#   n_features_to_select : number of features to keep at the end
#   step                 : features removed per iteration; a float in (0, 1) is
#                          read as a fraction of the features rather than a count
selector = RFE(
    estimator=RandomForestClassifier(n_estimators=100, random_state=0),
    n_features_to_select=10,
    step=.05,
)

# Train the selector
selector.fit(X_new, y)
print('Done normally')

# support_ is the boolean mask of the features that were kept
print(selector.support_)

Done normally
[ True  True  True  True  True  True False False False False False  True
 False  True False False False  True  True False]


In [43]:
# transform() keeps only the columns RFE retained; selector.support_ is a boolean
# mask, so indexing the saved column names with it yields the matching labels
X_new_selected = pd.DataFrame(
    selector.transform(X_new),
    columns=X_new_columns[selector.support_],
)

print('X shape after RFE', X_new_selected.shape)
print('--------------------------------')
print(X_new_selected.dtypes)
X_new_selected.head()

X shape after RFE (614, 10)
--------------------------------
ApplicantIncome            float64
CoapplicantIncome          float64
LoanAmount                 float64
Loan_Amount_Term           float64
Credit_History             float64
Dependents_1               float64
Married_Yes                float64
Education_Not Graduate     float64
Property_Area_Semiurban    float64
Property_Area_Urban        float64
dtype: object


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_1,Married_Yes,Education_Not Graduate,Property_Area_Semiurban,Property_Area_Urban
0,5849.0,0.0,146.412162,360.0,1.0,0.0,0.0,0.0,0.0,1.0
1,4583.0,1508.0,128.0,360.0,1.0,1.0,1.0,0.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,0.0,1.0,0.0,0.0,1.0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,1.0,1.0,0.0,1.0
4,6000.0,0.0,141.0,360.0,1.0,0.0,0.0,0.0,0.0,1.0


### PCA（Principal Component Analysis）
- データの分散が最大となる方向（主成分）に座標軸を回転させ、上位の主成分だけを残すことで次元を圧縮する

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score

# Standardise -> project onto 10 principal components -> gradient boosting.
# Embedding PCA in the Pipeline makes the dimensionality reduction automatic.
clf = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('reduct', PCA(n_components=10, random_state=1)),
    ('clf', GradientBoostingClassifier(random_state=1)),
])

# Fitting the pipeline fits every step in order, the PCA reduction included
clf.fit(X_new, y)

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('reduct',
                 PCA(copy=True, iterated_power='auto', n_components=10,
                     random_state=1, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf',
                 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
        

### データの永続化を行う

In [50]:
import pickle

# Pass the object to persist (here the fitted pipeline `clf`) to pickle.dumps().
# dumps() returns the serialised bytes in memory; this cell writes nothing to disk.
s = pickle.dumps(clf)

In [55]:
# pickle.loads() rebuilds the object from the serialised bytes held in `s`.
# NOTE: only unpickle bytes you trust -- loading a pickle can execute arbitrary code.
clf2 = pickle.loads(s)
# Print the restored pipeline to inspect its steps
print(clf2)

Pipeline(memory=None,
         steps=[('scl',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('reduct',
                 PCA(copy=True, iterated_power='auto', n_components=10,
                     random_state=1, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf',
                 GradientBoostingClassifier(criterion='friedman_mse', init=None,
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
        