## 欠損値への対応

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [2]:
# Toy frame for the missing-value examples: columns A and B each contain one
# NaN, column C is complete.
df01 = pd.DataFrame(
    dict(
        A=[1, np.nan, 3, 4],
        B=[5, 6, np.nan, 8],
        C=[9, 10, 11, 12],
    )
)
df01

Unnamed: 0,A,B,C
0,1.0,5.0,9
1,,6.0,10
2,3.0,,11
3,4.0,8.0,12


In [3]:
# Independent copies of the raw frame so that each imputation example below
# starts from the same data, including its NaN entries.
df02, df03 = df01.copy(), df01.copy()

In [4]:
# Boolean mask with True wherever a value is missing.
df01.isna()   # isnull() is an alias of isna()

Unnamed: 0,A,B,C
0,False,False,False
1,True,False,False
2,False,True,False
3,False,False,False


In [5]:
# Method 1: drop every row that contains at least one missing value.
# This rebinds df01; the copies df02 / df03 keep the original rows.
df01 = df01.dropna(axis=0, how="any")
df01

Unnamed: 0,A,B,C
0,1.0,5.0,9
3,4.0,8.0,12


In [6]:
# df02 was copied before the dropna call, so it still holds the NaN entries.
df02

Unnamed: 0,A,B,C
0,1.0,5.0,9
1,,6.0,10
2,3.0,,11
3,4.0,8.0,12


In [7]:
# Method 2: SimpleImputer. `strategy` selects how missing entries are filled:
# "mean", "median", "most_frequent" or "constant".
imputer01 = SimpleImputer(strategy="mean")
# Learn the per-column means and fill the NaN entries with them in one step.
df02_trans = imputer01.fit_transform(df02)
df02_trans

array([[ 1.        ,  5.        ,  9.        ],
       [ 2.66666667,  6.        , 10.        ],
       [ 3.        ,  6.33333333, 11.        ],
       [ 4.        ,  8.        , 12.        ]])

In [8]:
# With strategy="constant", every missing entry is replaced by `fill_value`.
imputer02 = SimpleImputer(fill_value=99, strategy="constant").fit(df03)
df03_trans = imputer02.transform(df03)
df03_trans

array([[ 1.,  5.,  9.],
       [99.,  6., 10.],
       [ 3., 99., 11.],
       [ 4.,  8., 12.]])

In [9]:
# The imputer takes a DataFrame but hands back a NumPy array.
print(type(df02), type(df02_trans), sep="\n")

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>


## カテゴリー変数のエンコーディング

In [10]:
# Toy frame for the encoding examples: A is numeric, B is a string category
# with three distinct values.
df01 = pd.DataFrame(
    dict(
        A=[1, 2, 3, 4, 5],
        B=["a", "b", "a", "b", "c"],
    )
)
df01

Unnamed: 0,A,B
0,1,a
1,2,b
2,3,a
3,4,b
4,5,c


In [11]:
# One independent copy per one-hot encoding method shown below.
df02, df03, df04 = (df01.copy() for _ in range(3))

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
# LabelEncoder: map each category in column B to an integer code.
le = LabelEncoder()
df01_trans = le.fit_transform(df01["B"])
# classes_[i] is the original label that was encoded as the integer i.
print(df01_trans, le.classes_, sep="\n")

[0 1 0 1 2]
['a' 'b' 'c']


In [14]:
# Attach the integer codes next to the original labels for comparison.
df01 = df01.assign(B_new=df01_trans)
df01

Unnamed: 0,A,B,B_new
0,1,a,0
1,2,b,1
2,3,a,0
3,4,b,1
4,5,c,2


In [15]:
# One-hot encoding, method 1: pandas.get_dummies.
# Only the string column B is expanded into indicator columns; the numeric
# column A is kept as-is.
# dtype=int pins the indicators to 0/1. Since pandas 2.0 the default indicator
# dtype is bool, which would display True/False instead of 0/1.
df02 = pd.get_dummies(df02, dtype=int)
df02

Unnamed: 0,A,B_a,B_b,B_c
0,1,1,0,0
1,2,0,1,0
2,3,1,0,0
3,4,0,1,0
4,5,0,0,1


In [16]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
# df03 is an unmodified copy of the categorical frame (A numeric, B string).
df03

Unnamed: 0,A,B
0,1,a
1,2,b
2,3,a
3,4,b
4,5,c


In [18]:
# One-hot encoding, method 2: scikit-learn's OneHotEncoder on the whole frame.
# Every column passed in is encoded, so the numeric column A is expanded too
# (5 indicator columns for A + 3 for B = 8 columns in total).
# Column B is passed as strings; no LabelEncoder step is applied first.
ohe = OneHotEncoder(categories="auto")
df03_trans = ohe.fit_transform(df03)

In [19]:
# Convert the encoder result to a dense NumPy array so it can be inspected.
df03_trans.toarray()

array([[1., 0., 0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0., 1.]])

In [20]:
# df04 is an unmodified copy of the categorical frame (A numeric, B string).
df04

Unnamed: 0,A,B
0,1,a
1,2,b
2,3,a
3,4,b
4,5,c


In [21]:
from sklearn.compose import ColumnTransformer

In [22]:
# One-hot encoding, method 3: ColumnTransformer + OneHotEncoder.
# Only the column at position 1 (B) is one-hot encoded. remainder="passthrough"
# keeps the columns that are not listed (A) unchanged, placed after the
# encoded ones.
ctf = ColumnTransformer(
    transformers=[("n1", OneHotEncoder(), [1])],
    remainder="passthrough",
)
df04_trans = ctf.fit_transform(df04)
df04_trans

array([[1., 0., 0., 1.],
       [0., 1., 0., 2.],
       [1., 0., 0., 3.],
       [0., 1., 0., 4.],
       [0., 0., 1., 5.]])

## 特徴量の正規化

### 分散正規化

In [23]:
# Toy frame for the scaling examples: column B is column A multiplied by 100,
# so the two features live on very different scales.
df = pd.DataFrame(
    dict(
        A=[1, 2, 3, 4, 5],
        B=[100, 200, 300, 400, 500],
    )
)
df

Unnamed: 0,A,B
0,1,100
1,2,200
2,3,300
3,4,400
4,5,500


In [24]:
# Separate copy of df for the min-max scaling example; this rebinds the name
# df01 that earlier sections used for other frames.
df01 = df.copy()

In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
# Standardization: rescale each feature to mean 0 and standard deviation 1.
# fit() learns the per-column mean and standard deviation; transform() applies
# them.
sd = StandardScaler().fit(df)
df_stand = sd.transform(df)

In [27]:
# Show the scaler's attributes: constructor options plus the fitted statistics.
vars(sd)

{'with_mean': True,
 'with_std': True,
 'copy': True,
 'n_samples_seen_': 5,
 'mean_': array([  3., 300.]),
 'var_': array([2.e+00, 2.e+04]),
 'scale_': array([  1.41421356, 141.42135624])}

In [28]:
# Sanity check on the standardized data. No `axis` is given, so each statistic
# is computed over all entries of the array rather than per column.
print(
    "mean:{},variance:{},standard:{}".format(
        df_stand.mean(),
        df_stand.var(),
        df_stand.std(),
    )
)

mean:0.0,variance:0.9999999999999998,standard:0.9999999999999999


### 最小最大正規化

In [29]:
#MinMaxScaler:特徴量の最小値：０；最大値：１
from sklearn.preprocessing import MinMaxScaler

In [30]:
# Min-max scaling: map each column linearly so its minimum becomes 0 and its
# maximum becomes 1.
mms = MinMaxScaler()
mms.fit_transform(df01)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [0.75, 0.75],
       [1.  , 1.  ]])