## 欠損値補完方法を複数試す

In [1]:
# ライブラリーのインポート
import pandas as pd
import numpy as np
import random as rnd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import GradientBoostingRegressor

import warnings
warnings.simplefilter('ignore')

In [2]:
# データの読み込み（タイタニックデータ）
df = pd.read_csv('../data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# データの基本情報を確認
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# 基本統計量を確認
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# 欠損値の確認
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# 欠損値の欠損率を表示す
df.isnull().apply(lambda col: col.value_counts(), axis=0).fillna(0).astype(np.float).apply(lambda col: col/col.sum(), axis=0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
False,1.0,1.0,1.0,1.0,1.0,0.801347,1.0,1.0,1.0,1.0,0.228956,0.997755
True,0.0,0.0,0.0,0.0,0.0,0.198653,0.0,0.0,0.0,0.0,0.771044,0.002245


In [7]:
# 欠損値の欠損率を表示す
df.isnull().apply(lambda col: col.value_counts(), axis=0).fillna(0).astype(np.float).apply(lambda col: col/col.sum(), axis=0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
False,1.0,1.0,1.0,1.0,1.0,0.801347,1.0,1.0,1.0,1.0,0.228956,0.997755
True,0.0,0.0,0.0,0.0,0.0,0.198653,0.0,0.0,0.0,0.0,0.771044,0.002245


In [8]:
#性別の数値変換
df.loc[(df['Sex'] == 'male'), 'Sex'] = 0
df.loc[(df['Sex'] == 'female'), 'Sex'] = 1

In [9]:
# 不要なカラムの削除
df = df.drop(['Name',
              'Cabin',
              'Ticket',
              'SibSp',
              'Parch',
              'Embarked',
              'Fare'],
             axis=1)

df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
0,1,0,3,0,22.0
1,2,1,1,1,38.0
2,3,1,3,1,26.0
3,4,1,1,1,35.0
4,5,0,3,0,35.0
5,6,0,3,0,
6,7,0,1,0,54.0
7,8,0,3,0,2.0
8,9,1,3,1,27.0
9,10,1,2,1,14.0


## 欠損値処理

## リストワイズ

In [10]:
df_listwise = df.dropna()
df_listwise.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
0,1,0,3,0,22.0
1,2,1,1,1,38.0
2,3,1,3,1,26.0
3,4,1,1,1,35.0
4,5,0,3,0,35.0
6,7,0,1,0,54.0
7,8,0,3,0,2.0
8,9,1,3,1,27.0
9,10,1,2,1,14.0
10,11,1,3,1,4.0


In [11]:
# 欠損値の確認
df_listwise.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
dtype: int64

In [12]:
# 基本統計量を確認
df_listwise.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age
count,714.0,714.0,714.0,714.0
mean,448.582633,0.406162,2.236695,29.699118
std,259.119524,0.49146,0.83825,14.526497
min,1.0,0.0,1.0,0.42
25%,222.25,0.0,1.0,20.125
50%,445.0,0.0,2.0,28.0
75%,677.75,1.0,3.0,38.0
max,891.0,1.0,3.0,80.0


## 平均値補完

In [13]:
mean = df.mean()
df_mean = df.fillna(mean)
df_mean.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
0,1,0,3,0,22.0
1,2,1,1,1,38.0
2,3,1,3,1,26.0
3,4,1,1,1,35.0
4,5,0,3,0,35.0
5,6,0,3,0,29.699118
6,7,0,1,0,54.0
7,8,0,3,0,2.0
8,9,1,3,1,27.0
9,10,1,2,1,14.0


In [14]:
# 欠損値の確認
df_mean.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
dtype: int64

In [15]:
# 基本統計量を確認
df_mean.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
count,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,29.699118
std,257.353842,0.486592,0.836071,0.47799,13.002015
min,1.0,0.0,1.0,0.0,0.42
25%,223.5,0.0,2.0,0.0,22.0
50%,446.0,0.0,3.0,0.0,29.699118
75%,668.5,1.0,3.0,1.0,35.0
max,891.0,1.0,3.0,1.0,80.0


## 中央値補完

In [16]:
median = df.median()
df_median = df.fillna(median)
df_median.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
0,1,0,3,0,22.0
1,2,1,1,1,38.0
2,3,1,3,1,26.0
3,4,1,1,1,35.0
4,5,0,3,0,35.0
5,6,0,3,0,28.0
6,7,0,1,0,54.0
7,8,0,3,0,2.0
8,9,1,3,1,27.0
9,10,1,2,1,14.0


In [17]:
# 欠損値の確認
df_median.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
dtype: int64

In [18]:
# 基本統計量を確認
df_median.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
count,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,29.361582
std,257.353842,0.486592,0.836071,0.47799,13.019697
min,1.0,0.0,1.0,0.0,0.42
25%,223.5,0.0,2.0,0.0,22.0
50%,446.0,0.0,3.0,0.0,28.0
75%,668.5,1.0,3.0,1.0,35.0
max,891.0,1.0,3.0,1.0,80.0


## 回帰代入法（GBDT）

In [19]:
df_master = df.dropna()
df_miss = df[df.isnull().any(axis=1)]
model = GradientBoostingRegressor().fit(df_master.drop(["Age","Survived"], axis=1), df_master["Age"])
df_miss["Age"] = model.predict(df_miss.drop(["Age","Survived"], axis=1))

df_GBDT = pd.concat([df_master, df_miss]).sort_index()
df_GBDT.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
0,1,0,3,0,22.0
1,2,1,1,1,38.0
2,3,1,3,1,26.0
3,4,1,1,1,35.0
4,5,0,3,0,35.0
5,6,0,3,0,30.534716
6,7,0,1,0,54.0
7,8,0,3,0,2.0
8,9,1,3,1,27.0
9,10,1,2,1,14.0


In [20]:
# 欠損値の確認
df_GBDT.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
dtype: int64

In [21]:
# 基本統計量を確認
df_GBDT.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age
count,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.355368
std,257.353842,0.486592,0.836071,13.323389
min,1.0,0.0,1.0,0.42
25%,223.5,0.0,2.0,21.048114
50%,446.0,0.0,3.0,28.0
75%,668.5,1.0,3.0,36.0
max,891.0,1.0,3.0,80.0


## IterativeImputerによる代入法

In [22]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(max_iter=10, random_state=1)
df_IterativeImputer = pd.DataFrame(imp.fit_transform(df))
df_IterativeImputer.columns = df.columns
df_IterativeImputer.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
0,1.0,0.0,3.0,0.0,22.0
1,2.0,1.0,1.0,1.0,38.0
2,3.0,1.0,3.0,1.0,26.0
3,4.0,1.0,1.0,1.0,35.0
4,5.0,0.0,3.0,0.0,35.0
5,6.0,0.0,3.0,0.0,26.181867
6,7.0,0.0,1.0,0.0,54.0
7,8.0,0.0,3.0,0.0,2.0
8,9.0,1.0,3.0,1.0,27.0
9,10.0,1.0,2.0,1.0,14.0


In [23]:
# 欠損値の確認
df_IterativeImputer.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
dtype: int64

In [24]:
# 基本統計量を確認
df_IterativeImputer.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
count,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,29.296629
std,257.353842,0.486592,0.836071,0.47799,13.322095
min,1.0,0.0,1.0,0.0,0.42
25%,223.5,0.0,2.0,0.0,21.0
50%,446.0,0.0,3.0,0.0,27.029149
75%,668.5,1.0,3.0,1.0,36.0
max,891.0,1.0,3.0,1.0,80.0


## KNNImputerによる代入法（K近傍法）

In [25]:
from missingpy import KNNImputer
imp = KNNImputer(n_neighbors=2, weights='uniform')
df_KNN = pd.DataFrame(imp.fit_transform(df))
df_KNN.columns = df.columns
df_KNN.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
0,1.0,0.0,3.0,0.0,22.0
1,2.0,1.0,1.0,1.0,38.0
2,3.0,1.0,3.0,1.0,26.0
3,4.0,1.0,1.0,1.0,35.0
4,5.0,0.0,3.0,0.0,35.0
5,6.0,0.0,3.0,0.0,18.5
6,7.0,0.0,1.0,0.0,54.0
7,8.0,0.0,3.0,0.0,2.0
8,9.0,1.0,3.0,1.0,27.0
9,10.0,1.0,2.0,1.0,14.0


In [26]:
# 欠損値の確認
df_KNN.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
dtype: int64

In [27]:
# 基本統計量を確認
df_KNN.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
count,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,29.519085
std,257.353842,0.486592,0.836071,0.47799,13.883655
min,1.0,0.0,1.0,0.0,0.42
25%,223.5,0.0,2.0,0.0,21.0
50%,446.0,0.0,3.0,0.0,28.0
75%,668.5,1.0,3.0,1.0,38.0
max,891.0,1.0,3.0,1.0,80.0


## MissForestによる代入法（ランダムフォレスト）

In [28]:
from missingpy import MissForest
imp = MissForest(max_iter=10)
df_MF = pd.DataFrame(imp.fit_transform(df))
df_MF.columns = df.columns
df_MF.head(10)

Iteration: 0
Iteration: 1
Iteration: 2


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
0,1.0,0.0,3.0,0.0,22.0
1,2.0,1.0,1.0,1.0,38.0
2,3.0,1.0,3.0,1.0,26.0
3,4.0,1.0,1.0,1.0,35.0
4,5.0,0.0,3.0,0.0,35.0
5,6.0,0.0,3.0,0.0,24.9
6,7.0,0.0,1.0,0.0,54.0
7,8.0,0.0,3.0,0.0,2.0
8,9.0,1.0,3.0,1.0,27.0
9,10.0,1.0,2.0,1.0,14.0


In [29]:
# 欠損値の確認
df_MF.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
dtype: int64

In [30]:
# 基本統計量を確認
df_MF.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age
count,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,29.520606
std,257.353842,0.486592,0.836071,0.47799,13.808408
min,1.0,0.0,1.0,0.0,0.42
25%,223.5,0.0,2.0,0.0,21.0
50%,446.0,0.0,3.0,0.0,28.0
75%,668.5,1.0,3.0,1.0,37.185
max,891.0,1.0,3.0,1.0,80.0
