### スコアリングフェーズにおける前処理

### データの読み込み

In [1]:
import pandas as pd

# Load the model (training) data; the first row of the CSV is the header.
df = pd.read_csv('./data/av_loan_u6lujuX_CVtuZ9i.csv', header=0)

# The last column is the target; every column before it is an ID or a feature.
y = df.iloc[:, -1]
# Keep the first column as a one-column DataFrame (list indexer) so it can be joined back later.
ID = df.iloc[:, :-1].iloc[:, [0]]
# Features: everything except the target and the Loan_ID identifier.
X = df.iloc[:, :-1].drop(columns='Loan_ID')

print('---------------------')
print('Raw shape:', df.shape)
print('X shape:', X.shape)

---------------------
Raw shape: (614, 13)
X shape: (614, 11)


### モデル用データの前処理

In [2]:
# Raise the number of columns shown by rich displays.
# Set before the first display() call in this cell so that it actually applies to it.
pd.options.display.max_columns = 50

# Encode the target with class_mapping: 'N' -> 1, 'Y' -> 0.
# The mapping is applied to the raw last column of df rather than to y itself, so that
# re-running this cell does not map the already-encoded 0/1 values to NaN.
class_mapping = {'N':1, 'Y':0}
y = df.iloc[:, -1].map(class_mapping)
# Series.map yields NaN for any label missing from the mapping; fail here instead of downstream.
assert y.notna().all(), 'Loan_Status contains values other than N/Y'

print('---------------------')
print(y.value_counts())
print('---------------------')
# Re-assemble ID + features + encoded target once, purely for inspection.
model_data_preview = ID.join(X).join(y)
print(model_data_preview.dtypes)
display(model_data_preview.head())

---------------------
0    422
1    192
Name: Loan_Status, dtype: int64
---------------------
Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status            int64
dtype: object


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,0
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,1
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,0
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,0
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,0


### モデル用データの前処理（One-hotエンコーディング）

In [3]:
# Categorical columns to expand into indicator (dummy) columns.
ohe_columns = [
    'Dependents',
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
]

# dummy_na=True adds a "<column>_nan" indicator for missing values;
# drop_first=True drops the first level of each encoded column.
X_ohe = pd.get_dummies(
    X,
    columns=ohe_columns,
    dummy_na=True,
    drop_first=True,
)

print('X_ohe shape:', X_ohe.shape)
display(X_ohe.head())

X_ohe shape: (614, 20)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Male,Gender_nan,Married_Yes,Married_nan,Education_Not Graduate,Education_nan,Self_Employed_Yes,Self_Employed_nan,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849,0.0,,360.0,1.0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,4583,1508.0,128.0,360.0,1.0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0
3,2583,2358.0,120.0,360.0,1.0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0
4,6000,0.0,141.0,360.0,1.0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0


### モデル用データの前処理（欠損値補完）

In [4]:
from sklearn.impute import SimpleImputer

# Remember the column labels so the imputed values can be wrapped back into a DataFrame.
X_ohe_columns = X_ohe.columns.values

# SimpleImputer with default arguments replaces NaN with the column mean.
# fit() returns the fitted imputer itself, so it can be chained on construction.
imp = SimpleImputer().fit(X_ohe)

# Apply the fitted imputer and restore the column labels.
X_ohe = pd.DataFrame(
    imp.transform(X_ohe),
    columns=X_ohe_columns,
)

# Show the result.
display(X_ohe.head())

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Male,Gender_nan,Married_Yes,Married_nan,Education_Not Graduate,Education_nan,Self_Employed_Yes,Self_Employed_nan,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849.0,0.0,146.412162,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### モデル用データの前処理（次元圧縮）

In [7]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination using a random forest as the ranking estimator.
# Keeps 10 features; step=.05 is a float, which scikit-learn interprets as the
# fraction of features to remove at each iteration.
selector = RFE(
    estimator=RandomForestClassifier(n_estimators=100, random_state=1),
    n_features_to_select=10,
    step=.05,
).fit(X_ohe, y)

# selector.support_ is a boolean mask over the input columns; use it to label the kept ones.
X_fin = pd.DataFrame(
    selector.transform(X_ohe),
    columns=X_ohe_columns[selector.support_],
)

print('X_fin shape:', X_fin.shape)
display(X_fin.head())

X_fin shape: (614, 10)


Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_1,Gender_Male,Married_Yes,Education_Not Graduate,Property_Area_Semiurban
0,5849.0,0.0,146.412162,360.0,1.0,0.0,1.0,0.0,0.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,1.0,1.0,1.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,0.0,1.0,1.0,0.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,1.0,1.0,1.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,0.0,1.0,0.0,0.0,0.0


### スコア用データの前処理

In [9]:
# The score data has the same columns as the model data (minus the target),
# so it is split the same way: ID column vs. feature columns.
# NOTE(review): the captured dtypes show Dependents as float64 here but object in the
# model data - confirm the one-hot encoded columns line up between the two sets.
df_s = pd.read_csv('./data/av_loan_test_Y3wMUE5_7gLdaTN.csv', header=0)
X_s = df_s.drop(columns='Loan_ID')
ID_s = df_s.iloc[:, [0]]

print('Raw shape:', df_s.shape)
print('X shape:',X_s.shape)
print('-----------------------')
print(X_s.dtypes)

Raw shape: (333, 12)
X shape: (333, 11)
-----------------------
Gender                object
Married               object
Dependents           float64
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome      int64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
dtype: object


### スコア用データの前処理（One-hotエンコーディング）