#### サンプルデータの作成



In [1]:
# サンプルとなるデータセットの作成
import numpy as np
from numpy import nan # 欠損値NaN（Not a Number）
import pandas as pd

In [2]:
# Build a small sample of product purchase-history records (10 rows).
# NaN entries in NECK / BODY stand for missing measurements.
column_order = ['DATE', 'NECK', 'BODY', 'SIZE', 'COLOR', 'class']
sample_values = {
    'DATE': [20170801, 20170801, 20170802, 20170802, 20170803,
             20170803, 20170803, 20170805, 20170805, 20170805],
    'NECK': [41, nan, 38, 46, nan, 37, nan, 38, nan, 42],
    'BODY': [84, 92, nan, 90, nan, 64, 78, 74, 82, 86],
    'SIZE': ['L', 'XL', 'L', 'XL', 'M', 'S', 'M', 'L', 'L', 'XL'],
    'COLOR': ['BL', 'RD', 'Y', 'GR', 'GR', 'RD', 'BL', 'Y', 'BL', 'GR'],
    'class': ['A', 'C', 'B', 'B', 'C', 'A', 'A', 'A', 'C', 'C'],
}
# Passing `columns` pins the column order explicitly.
df = pd.DataFrame(sample_values, columns=column_order)
df

Unnamed: 0,DATE,NECK,BODY,SIZE,COLOR,class
0,20170801,41.0,84.0,L,BL,A
1,20170801,,92.0,XL,RD,C
2,20170802,38.0,,L,Y,B
3,20170802,46.0,90.0,XL,GR,B
4,20170803,,,M,GR,C
5,20170803,37.0,64.0,S,RD,A
6,20170803,,78.0,M,BL,A
7,20170805,38.0,74.0,L,Y,A
8,20170805,,82.0,L,BL,C
9,20170805,42.0,86.0,XL,GR,C


### 10.2.3 欠損値処理の実装

まずは､データの欠損の傾向を確認します｡

In [78]:
# Type-along exercise [1]
# Inspect the data: evaluating `df` as the last expression of the cell
# displays the whole DataFrame.
df

Unnamed: 0,DATE,NECK,BODY,SIZE,COLOR,class
0,20170801,41.0,84.0,L,BL,A
1,20170801,,92.0,XL,RD,C
2,20170802,38.0,,L,Y,B
3,20170802,46.0,90.0,XL,GR,B
4,20170803,,,M,GR,C
5,20170803,37.0,64.0,S,RD,A
6,20170803,,78.0,M,BL,A
7,20170805,38.0,74.0,L,Y,A
8,20170805,,82.0,L,BL,C
9,20170805,42.0,86.0,XL,GR,C


次に､**欠損の発生傾向**を把握しましょう｡

In [79]:
# Type-along exercise [2]
# Count the missing values in each feature (column):
# flag every NaN cell, then sum the flags down each column.
df.isna().sum(axis=0)

DATE     0
NECK     4
BODY     2
SIZE     0
COLOR    0
class    0
dtype: int64

In [80]:
# Type-along exercise [3]
# Drop every row (axis=0) that contains at least one missing value.
# Returns a new DataFrame; `df` itself is left unchanged.
df.dropna(axis=0, how='any')

Unnamed: 0,DATE,NECK,BODY,SIZE,COLOR,class
0,20170801,41.0,84.0,L,BL,A
3,20170802,46.0,90.0,XL,GR,B
5,20170803,37.0,64.0,S,RD,A
7,20170805,38.0,74.0,L,Y,A
9,20170805,42.0,86.0,XL,GR,C


In [81]:
# Type-along exercise [4]
# Remove only the samples whose BODY value is missing
# (rows with a missing NECK but a valid BODY are kept).
df.loc[df['BODY'].notna()]

Unnamed: 0,DATE,NECK,BODY,SIZE,COLOR,class
0,20170801,41.0,84.0,L,BL,A
1,20170801,,92.0,XL,RD,C
3,20170802,46.0,90.0,XL,GR,B
5,20170803,37.0,64.0,S,RD,A
6,20170803,,78.0,M,BL,A
7,20170805,38.0,74.0,L,Y,A
8,20170805,,82.0,L,BL,C
9,20170805,42.0,86.0,XL,GR,C


また､各サンプルの持つ**欠損していない値の個数**(`thresh`)で絞ることもできます｡

In [82]:
# Type-along exercise [5]
# Keep only the samples that have at least 5 non-missing values;
# `count(axis=1)` gives the number of non-NaN cells in each row.
df[df.count(axis=1) >= 5]

Unnamed: 0,DATE,NECK,BODY,SIZE,COLOR,class
0,20170801,41.0,84.0,L,BL,A
1,20170801,,92.0,XL,RD,C
2,20170802,38.0,,L,Y,B
3,20170802,46.0,90.0,XL,GR,B
5,20170803,37.0,64.0,S,RD,A
6,20170803,,78.0,M,BL,A
7,20170805,38.0,74.0,L,Y,A
8,20170805,,82.0,L,BL,C
9,20170805,42.0,86.0,XL,GR,C


続いて**平均値補完**です｡

In [83]:
# Type-along exercise [6]
# Mean imputation: replace each missing value with the mean of its column.
# `numeric_only=True` restricts the mean to the numeric columns; without it,
# pandas >= 2.0 raises a TypeError because SIZE / COLOR / class hold strings.
df.fillna(df.mean(numeric_only=True))

Unnamed: 0,DATE,NECK,BODY,SIZE,COLOR,class
0,20170801,41.0,84.0,L,BL,A
1,20170801,40.333333,92.0,XL,RD,C
2,20170802,38.0,81.25,L,Y,B
3,20170802,46.0,90.0,XL,GR,B
4,20170803,40.333333,81.25,M,GR,C
5,20170803,37.0,64.0,S,RD,A
6,20170803,40.333333,78.0,M,BL,A
7,20170805,38.0,74.0,L,Y,A
8,20170805,40.333333,82.0,L,BL,C
9,20170805,42.0,86.0,XL,GR,C


また､**任意の値**で**補完**することもできます｡



In [84]:
# Type-along exercise [7]
# Impute with an arbitrary constant: every missing cell becomes 0.
df.fillna(value=0)

Unnamed: 0,DATE,NECK,BODY,SIZE,COLOR,class
0,20170801,41.0,84.0,L,BL,A
1,20170801,0.0,92.0,XL,RD,C
2,20170802,38.0,0.0,L,Y,B
3,20170802,46.0,90.0,XL,GR,B
4,20170803,0.0,0.0,M,GR,C
5,20170803,37.0,64.0,S,RD,A
6,20170803,0.0,78.0,M,BL,A
7,20170805,38.0,74.0,L,Y,A
8,20170805,0.0,82.0,L,BL,C
9,20170805,42.0,86.0,XL,GR,C


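続いて､**回帰補完**を行います｡**<font color='red'>首周りの長さ</font>**と**<font color='red'>着丈</font>**は相関が強いので､回帰補完が有効であると思われます｡なお､以下で使用する `interpolate(method='linear')` は､各列の欠損値を前後の行の値から線形補間するものであり､首周りと着丈の関係を用いた回帰モデルによる補完ではない点に注意してください｡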



In [85]:
# Type-along exercise [8]
# Fill the gaps by linear interpolation.
# NOTE: `interpolate(method='linear')` fills each missing value from the
# neighbouring rows of the same column (values treated as equally spaced);
# it does not fit a regression between NECK and BODY.
# Only the numeric columns are interpolated: calling `interpolate` on a frame
# that still contains object (string) columns is deprecated since pandas 2.1.
numeric_cols = df.select_dtypes(include='number').columns
ip = df.copy()
ip[numeric_cols] = df[numeric_cols].interpolate(method='linear')
ip

Unnamed: 0,DATE,NECK,BODY,SIZE,COLOR,class
0,20170801,41.0,84.0,L,BL,A
1,20170801,39.5,92.0,XL,RD,C
2,20170802,38.0,91.0,L,Y,B
3,20170802,46.0,90.0,XL,GR,B
4,20170803,41.5,77.0,M,GR,C
5,20170803,37.0,64.0,S,RD,A
6,20170803,37.5,78.0,M,BL,A
7,20170805,38.0,74.0,L,Y,A
8,20170805,40.0,82.0,L,BL,C
9,20170805,42.0,86.0,XL,GR,C


## 10.3 カテゴリデータの扱い



### 10.3.1 順序特徴量のマッピング



In [86]:
# Type-along exercise [9]
# Dictionary that maps each shirt size to an integer rank.
# The labels are listed from smallest to largest, so the rank (starting at 1)
# preserves the ordering S < M < L < XL.
size_mapping = {
    label: rank
    for rank, label in enumerate(('S', 'M', 'L', 'XL'), start=1)
}

In [87]:
# Type-along exercise [10]
# Apply the mapping: replace each SIZE label with its integer rank.
# NOTE: this overwrites the SIZE column of `df` in place. Running the cell a
# second time would look up the integers (which are not keys of
# `size_mapping`) and turn the whole column into NaN.
size_codes = df['SIZE'].map(size_mapping)
df['SIZE'] = size_codes
df

Unnamed: 0,DATE,NECK,BODY,SIZE,COLOR,class
0,20170801,41.0,84.0,3,BL,A
1,20170801,,92.0,4,RD,C
2,20170802,38.0,,3,Y,B
3,20170802,46.0,90.0,4,GR,B
4,20170803,,,2,GR,C
5,20170803,37.0,64.0,1,RD,A
6,20170803,,78.0,2,BL,A
7,20170805,38.0,74.0,3,Y,A
8,20170805,,82.0,3,BL,C
9,20170805,42.0,86.0,4,GR,C


### 10.3.2 名義特徴量のエンコーディング



In [88]:
# Type-along exercise [11]
# One-hot encode COLOR with pandas, creating one dummy variable per color.
# `dtype=int` makes the dummy columns 0/1 integers; pandas >= 2.0 would
# otherwise return boolean True/False columns.
pd.get_dummies(df['COLOR'], dtype=int)

Unnamed: 0,BL,GR,RD,Y
0,1,0,0,0
1,0,0,1,0
2,0,0,0,1
3,0,1,0,0
4,0,1,0,0
5,0,0,1,0
6,1,0,0,0
7,0,0,0,1
8,1,0,0,0
9,0,1,0,0
