## 第8章　数値型

In [31]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

### 8-1　数値型への変換

In [32]:
print(type(40000 / 3))

print(type(int(40000 / 3)))

print(type(float(40000 / 3)))

<class 'float'>
<class 'int'>
<class 'float'>


In [33]:
df = pd.DataFrame({'value': [40000 / 3]})

print(df.dtypes)
df.head()

value    float64
dtype: object


Unnamed: 0,value
0,13333.333333


In [34]:
df['value'].astype('int8')
df['value'].astype('int16')
df['value'].astype('int32')
df['value'].astype('int64')

0    13333
Name: value, dtype: int64

In [35]:
df['value'].astype('float16')
df['value'].astype('float32')
df['value'].astype('float64')

0    13333.333333
Name: value, dtype: float64

大まかなデータ型指定の場合、クォート（' '）は不要となる。

In [36]:
df['value'].astype(int)
df['value'].astype(float)

0    13333.333333
Name: value, dtype: float64

### 8-2　対数化による非線形な変化

対数化とは、値が大きくなるほど、値の差の意味を小さくしたいときに有効な手法である。<br>
例えば、10から11歳への身長の伸びと、50から51歳への身長の伸びは異なる場合などに必要となる。

In [37]:
reserve_tb = pd.read_csv('reserve.csv')

reserve_tb.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100


In [38]:
reserve_tb['total_price_log'] = reserve_tb['total_price'].apply(lambda x: np.log10(x / 1000 + 1))

reserve_tb.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,total_price_log
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,1.992111
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600,1.334454
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600,1.539076
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400,2.290925
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100,1.839478


上の表を見ての通り、値が大きいものほど、なるべく小さく対数変換されていることが分かる。

### 8-3　カテゴリ化による非線形な変化

In [39]:
customer_tb = pd.read_csv('customer.csv')

customer_tb.head()

Unnamed: 0,customer_id,age,sex,home_latitude,home_longitude
0,c_1,41,man,35.092193,136.512347
1,c_2,38,man,35.325076,139.410551
2,c_3,49,woman,35.120543,136.511179
3,c_4,43,man,43.034868,141.240314
4,c_5,31,man,35.102661,136.523797


In [40]:
customer_tb['age_rank'] = (np.floor(customer_tb['age'] / 10) * 10).astype('category')

print(customer_tb.dtypes)
print(customer_tb.shape)
customer_tb.head()

customer_id         object
age                  int64
sex                 object
home_latitude      float64
home_longitude     float64
age_rank          category
dtype: object
(1000, 6)


Unnamed: 0,customer_id,age,sex,home_latitude,home_longitude,age_rank
0,c_1,41,man,35.092193,136.512347,40.0
1,c_2,38,man,35.325076,139.410551,30.0
2,c_3,49,woman,35.120543,136.511179,40.0
3,c_4,43,man,43.034868,141.240314,40.0
4,c_5,31,man,35.102661,136.523797,30.0


np.floor関数は、引数の小数点以下を切り捨てることができる。（正確には、引数以下の最大整数を返す）

そのため、1行目の41歳であれば、41を10で割って4.1する。<br>
小数点を切り捨てして4.0にしてから、10倍して40にすることで、rank値を与えることができる。

### 8-4　正規化

In [41]:
reserve_tb = pd.read_csv('reserve.csv')

reserve_tb.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100


In [42]:
reserve_tb['people_num'] = reserve_tb['people_num'].astype(float)

ss = StandardScaler()
result = ss.fit_transform(reserve_tb[['people_num', 'total_price']])

reserve_tb['people_num_normalized'] = [x[0] for x in result]
reserve_tb['total_price_normalized'] = [x[1] for x in result]

reserve_tb.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,people_num_normalized,total_price_normalized
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4.0,97200,1.300709,-0.053194
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2.0,20600,-0.483753,-0.747822
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2.0,33600,-0.483753,-0.629935
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4.0,194400,1.300709,0.82824
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3.0,68100,0.408478,-0.31708


### 8-5　外れ値の除去

In [43]:
reserve_tb = pd.read_csv('reserve.csv')

reserve_tb.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100


In [44]:
reserve_tb = reserve_tb[(abs(reserve_tb['total_price']
                             - np.mean(reserve_tb['total_price'])) / np.std(reserve_tb['total_price']) <= 3)].reset_index()

reserve_tb.head()

Unnamed: 0,index,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
0,0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200
1,1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600
2,2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600
3,3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400
4,4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100


データから平均値を引いた値の絶対値を標準偏差で割ることによって、データが平均値から標準偏差の何倍離れているかを計算している。

つまり、平均値との差を、標準偏差で割っている（標準偏差の何倍かを計算している）。

### 8-6　主成分分析による次元圧縮

In [45]:
production_tb = pd.read_csv('production.csv')

print(production_tb.shape)
production_tb.head()

(1000, 4)


Unnamed: 0,type,length,thickness,fault_flg
0,E,274.027383,40.241131,False
1,D,86.319269,16.906715,False
2,E,123.940388,1.018462,False
3,B,175.554886,16.414924,False
4,B,244.93474,29.061081,False


In [46]:
pca = PCA(n_components=2)

pca_values = pca.fit_transform(production_tb[['length', 'thickness']])

print('累積寄与率: {0}'.format(sum(pca.explained_variance_ratio_)))
print('各次元の寄与率: {0}'.format(pca.explained_variance_ratio_))

pca_newvalues = pca.transform(production_tb[['length', 'thickness']])

累積寄与率: 1.0
各次元の寄与率: [0.97897794 0.02102206]


fitは回帰や分類と同様にオブジェクトの作成、transformは回帰や分類でいうpredictと同じ意味合いを持つ。

fit_transformは、fitとtransformを両方同時に実行することができる。

In [47]:
pd.DataFrame(pca_values).head()

Unnamed: 0,0,1
0,76.968382,-13.389069
1,-112.114693,-8.248848
2,-76.199434,11.190271
3,-23.341625,0.838485
4,46.933896,-5.064103


In [48]:
pd.DataFrame(pca_newvalues).head()

Unnamed: 0,0,1
0,76.968382,-13.389069
1,-112.114693,-8.248848
2,-76.199434,11.190271
3,-23.341625,0.838485
4,46.933896,-5.064103


### 8-7　数値型の補完

#### 欠損レコードの削除

In [49]:
production_miss_num = pd.read_csv('production_missing_num.csv')

print(production_miss_num.shape)
production_miss_num.head()

(1000, 4)


Unnamed: 0,type,length,thickness,fault_flg
0,E,274.027383,40.24113135955541,False
1,D,86.319269,16.906714630016268,False
2,E,123.940388,1.0184619943950777,False
3,B,175.554886,16.41492419553766,False
4,B,244.93474,29.061080805480326,False


dropnaはnanを認識しても、Noneを認識することはできない。<br>
そのため、欠損値の除去をする前に、Noneをnanに変換する作業が必要となる。

In [50]:
production_miss_num.replace('None', np.nan, inplace=True)
production_miss_num.dropna(subset=['thickness'], inplace=True)

print(production_miss_num['thickness'].isnull().sum())

0


subsetは、欠損値があるかどうかチェックする列を指定できる。

#### 定数補完

In [51]:
production_miss_num = pd.read_csv('production_missing_num.csv')

print(production_miss_num.shape)
production_miss_num.head()

(1000, 4)


Unnamed: 0,type,length,thickness,fault_flg
0,E,274.027383,40.24113135955541,False
1,D,86.319269,16.906714630016268,False
2,E,123.940388,1.0184619943950777,False
3,B,175.554886,16.41492419553766,False
4,B,244.93474,29.061080805480326,False


In [52]:
production_miss_num.replace('None', np.nan, inplace=True)
production_miss_num['thickness'].fillna(1, inplace=True)

print(production_miss_num['thickness'].isnull().sum())

0


#### 平均値補完

Noneが含まれているため、thicknessが数値型になっていないので、変換しておく。

In [53]:
production_miss_num.replace('None', np.nan, inplace=True)
production_miss_num['thickness'] = production_miss_num['thickness'].astype('float64')

In [54]:
thickness_mean = production_miss_num['thickness'].mean()
production_miss_num['thickness'].fillna(thickness_mean, inplace=True)

print(production_miss_num['thickness'].isnull().sum())

0


#### PMMによる多重代入

In [55]:
production_miss_num = pd.read_csv('production_missing_num.csv')

print(production_miss_num.shape)
production_miss_num.head()

(1000, 4)


Unnamed: 0,type,length,thickness,fault_flg
0,E,274.027383,40.24113135955541,False
1,D,86.319269,16.906714630016268,False
2,E,123.940388,1.0184619943950777,False
3,B,175.554886,16.41492419553766,False
4,B,244.93474,29.061080805480326,False


In [56]:
production_miss_num.replace('None', np.nan, inplace=True)

production_miss_num['thickness'] = production_miss_num['thickness'].astype('float64')
production_miss_num['type'] = production_miss_num['type'].astype('category')
production_miss_num['fault_flg'].astype('category')

production_miss_num.head()

Unnamed: 0,type,length,thickness,fault_flg
0,E,274.027383,40.241131,False
1,D,86.319269,16.906715,False
2,E,123.940388,1.018462,False
3,B,175.554886,16.414924,False
4,B,244.93474,29.061081,False


In [57]:
production_dummy_flg = pd.get_dummies(production_miss_num[['type', 'fault_flg']])

print(production_dummy_flg.shape)
production_dummy_flg.head()

(1000, 6)


Unnamed: 0,fault_flg,type_A,type_B,type_C,type_D,type_E
0,False,0,0,0,0,1
1,False,0,0,0,1,0
2,False,0,0,0,0,1
3,False,0,1,0,0,0
4,False,0,1,0,0,0


get_dummies()は `drop_first=True` を使用することで、ダミー変数（特徴量の数）を1つ減らすことができる。<br>
k-1個のダミー変数が分かれば、残りの1個も自動的に分かるので、特に問題はない。

In [58]:
production_dummy_flg = pd.get_dummies(production_miss_num[['type', 'fault_flg']], drop_first=True)

print(production_dummy_flg.shape)
production_dummy_flg.head()

(1000, 5)


Unnamed: 0,fault_flg,type_B,type_C,type_D,type_E
0,False,0,0,0,1
1,False,0,0,1,0
2,False,0,0,0,1
3,False,1,0,0,0
4,False,1,0,0,0


fancyimputeは廃止されてしまったようなので、様々な方法を試してみたが、上手くインストールできなかった。<br>
そのため、PMMによる多重代入はスキップとする。