## データの読み込み

In [11]:
import pandas as pd

# Load the loan dataset; the first CSV row holds the column names.
df = pd.read_csv('./data/av_loan_u6lujuX_CVtuZ9i.csv', header=0)

# Every column except the last one is treated as a feature ...
X = df.iloc[:, :-1]
display(X.head())
# ... and the last column is the ground-truth label.
y = df.iloc[:, -1]
display(y.head())

# Loan_ID is only an identifier, so it is removed from the features.
X = X.drop(columns='Loan_ID')
display(X.head())

# Check the dimensions of the feature matrix.
print(f'X shape: {X.shape}')

# Encode the label so that a rejected loan ('N') becomes the positive class (1).
class_mapping = {'N': 1, 'Y': 0}
y = y.map(class_mapping)
print('-----------------------------')
# Inspect the class balance of the encoded label.
print(y.value_counts())
# Show features and encoded label side by side as the cell output.
X.join(y).head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


0    Y
1    N
2    Y
3    Y
4    Y
Name: Loan_Status, dtype: object

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


X shape: (614, 11)
-----------------------------
0    422
1    192
Name: Loan_Status, dtype: int64


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,0
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,1
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,0
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,0
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,0


In [12]:
# Show up to 50 columns when a DataFrame is rendered, so wide frames
# (e.g. after one-hot encoding) are not truncated.
pd.set_option('display.max_columns', 50)

### One-hotエンコーディング
- Genderなどのカテゴリ変数を0/1のバイナリ変数に変換する

In [18]:
# Categorical columns that will be one-hot encoded.
ohe_columns = [
    'Dependents',
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
]

# dummy_na=True adds a "<column>_nan" indicator that is 1 where the value is missing.
# drop_first=True drops the first level of each variable (N-1 encoding).
X_new = pd.get_dummies(
    X,
    columns=ohe_columns,
    dummy_na=True,
    drop_first=True,
)

display(X_new.head())

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Male,Gender_nan,Married_Yes,Married_nan,Education_Not Graduate,Education_nan,Self_Employed_Yes,Self_Employed_nan,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849,0.0,,360.0,1.0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,4583,1508.0,128.0,360.0,1.0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0
3,2583,2358.0,120.0,360.0,1.0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,0
4,6000,0.0,141.0,360.0,1.0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0


### 欠損値補完
- 各列の欠損値をその列の平均値で置き換える（例: LoanAmountの1行目の欠損値はLoanAmount列の平均値で補完される）

In [26]:
from sklearn.impute import SimpleImputer

# Instantiate the imputer; with no arguments SimpleImputer uses the 'mean' strategy.
imp = SimpleImputer()
# Learn the per-column fill values from the data.
imp.fit(X_new)

# Applying the fitted imputer replaces the missing values. The result is wrapped
# back into a DataFrame, so the column names are captured first.
X_new_columns = X_new.columns.values
# Pass the original index as well: rebuilding the frame with only `columns=`
# would reset the rows to 0..n-1 and could misalign them with `y` whenever
# the index is not the default RangeIndex.
X_new = pd.DataFrame(
    imp.transform(X_new),
    columns=X_new_columns,
    index=X_new.index,
)

display(X_new.head())

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Dependents_1,Dependents_2,Dependents_3+,Dependents_nan,Gender_Male,Gender_nan,Married_Yes,Married_nan,Education_Not Graduate,Education_nan,Self_Employed_Yes,Self_Employed_nan,Property_Area_Semiurban,Property_Area_Urban,Property_Area_nan
0,5849.0,0.0,146.412162,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4583.0,1508.0,128.0,360.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3000.0,0.0,66.0,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,2583.0,2358.0,120.0,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,6000.0,0.0,141.0,360.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


fancyimputeを使用すると、KNN法など他の手法による欠損値補完も可能となる