# 資料前處理(Label encoding、 One hot encoding)
這兩種編碼方式的目的，是將類別（categorical）或文字（text）資料轉換成數字，讓程式能夠更好地理解與運算。
> Label encoding : 把每個類別 mapping 到某個整數，不會增加新欄位

> One hot encoding : 為每個類別新增一個欄位，用 0/1 表示該筆資料是否屬於該類別

![](images/Encoder.PNG)


### Encoding Categorical features (or label)
![](images/Encoding.PNG)


In [1]:
import pandas as pd
import numpy as np


## 練習二：Keras - label encoder + to_categorical

In [2]:
def to_categorical(y, num_classes=None, dtype='float32'):
    """One-hot encode a vector of integer class labels.

    Typically used to prepare targets for ``categorical_crossentropy``.

    # Arguments
        y: class labels to encode; converted to an integer array.
            Values are expected to lie in ``0 .. num_classes - 1``.
        num_classes: number of columns in the encoding. When omitted
            (or otherwise falsy) it is taken as ``max(y) + 1``.
        dtype: dtype of the returned matrix, as a string
            (`float32`, `float64`, `int32`...).

    # Returns
        An array shaped like the input with one extra trailing axis of
        length ``num_classes``. A trailing input axis of length 1 is
        dropped first, so an ``(n, 1)`` column encodes like an ``(n,)``
        vector.

    # Example
    ```python
    > to_categorical([0, 2, 1, 2, 0])
    array([[ 1.,  0.,  0.],
           [ 0.,  0.,  1.],
           [ 0.,  1.,  0.],
           [ 0.,  0.,  1.],
           [ 1.,  0.,  0.]], dtype=float32)
    ```
    """
    labels = np.array(y, dtype='int')

    # Remember the input layout, minus a trailing singleton axis if the
    # input has more than one dimension.
    leading_shape = labels.shape
    if len(leading_shape) > 1 and leading_shape[-1] == 1:
        leading_shape = leading_shape[:-1]

    # Work on a flat view: one row of the encoding per label.
    flat_labels = labels.ravel()
    if not num_classes:
        # Class count not supplied: infer it from the largest label.
        num_classes = np.max(flat_labels) + 1

    row_count = flat_labels.shape[0]
    one_hot = np.zeros((row_count, num_classes), dtype=dtype)
    # Pair each row index with its label to set exactly one cell per row.
    one_hot[np.arange(row_count), flat_labels] = 1

    # Restore the original leading dimensions, classes axis last.
    return one_hot.reshape(leading_shape + (num_classes,))

In [3]:
# Toy dataset: one text (categorical) column and two numeric columns.
country = [
    'Taiwan', 'Australia', 'Ireland',
    'Australia', 'Ireland', 'Taiwan',
]
age = [25, 30, 45, 35, 22, 36]
salary = [20000, 32000, 59000, 60000, 43000, 52000]
# Column name -> column values, in the column order of the DataFrame.
dic = dict(Country=country, Age=age, Salary=salary)
data = pd.DataFrame(data=dic)


from sklearn.preprocessing import LabelEncoder
import np_utils

# label encoder 
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(data['Country'])
print(encoded_Y)
data['Country']=encoded_Y
data

# convert integers to one hot encoding
dummy_y = to_categorical(encoded_Y)  # one-hot encoding [0. 1. 0.] 
dummy_y


[2 0 1 0 1 2]


array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)

In [8]:
import numpy
import scipy
from scipy import sparse
import sklearn
import tensorflow as tf

# Report the version of each library imported above.
for _label, _module in (
    ("numpy :", numpy),
    ("scipy :", scipy),
    ("sklearn :", sklearn),
    ("tf :", tf),
):
    print(_label, _module.__version__)

ModuleNotFoundError: No module named 'scipy.sparse'