In [1]:
# Core data libraries used by every cell below.
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every expression statement in a cell, not only the last
# one; this is why cells below render several outputs each.
InteractiveShell.ast_node_interactivity = "all"

## 데이터 전처리
1. 인코딩
2. 스케일링


*인코딩 스케일링 둘다 ndarray형태로 돌려준다고 생각하자*

### 1.인코딩
기본적으로 sklearn의 알고리즘은 문자열 인식하지 못한다 => 숫자로 바꿔줘야한다\
대표적인 두가지 방법 존재
- label encoding 
    - 1d array로만 가능하다
    - 문제점 1: 모델에 따라서 ordering을 수치로 해석할수 있음
    - 문제점 2: unseen data 처리 불가능
- one-hot encoding
    - matrix형태로 데이터가 들어가야한다 => 즉 단일 컬럼은 reshape(-1, 1)로 2차원 형태로 만들어야 한다 
    - 두개 이상의 column도 가능하다 => 즉 Dataframe형태로도 가능하다
    - 문제점 : 차원의 저주
    - 장점 : unknown handle 가능 

**결론: 특별한 이유가 없으면 one-hot encoding을 쓴다**



#### Label Encoding

In [43]:
# Label encoding: map every distinct string to an integer code.
from sklearn.preprocessing import LabelEncoder

items = np.array([
    'TV', '냉장고', '세탁기', '선풍기', '선풍기', '믹서', '세탁기',
])

label_encoder = LabelEncoder()
label_encoder.fit(items)        # learn the classes from the 1-D array
label_encoder.transform(items)  # integer code for each element of items

# Learned classes; a label's position in this array is its integer code.
label_encoder.classes_

array([0, 1, 4, 3, 3, 2, 4])

array(['TV', '냉장고', '믹서', '선풍기', '세탁기'], dtype='<U3')

In [3]:
# LabelEncoder has no option for handling unknown categories, so a label that
# was not present during fit cannot be encoded at all.
# The error is the point of this cell: catch it and print it so the
# demonstration does not stop a "Restart Kernel -> Run All" execution.
try:
    label_encoder.transform(['핸드폰'])
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: y contains previously unseen labels: '핸드폰'

#### one-hot encoder

In [44]:
# OneHotEncoder rejects 1-D input. A single feature column has to be reshaped
# with reshape(-1, 1); reshape(1, -1) would instead describe a single sample.
# The error is the point of this cell: catch it and print it so the
# demonstration does not stop a "Restart Kernel -> Run All" execution.
from sklearn.preprocessing import OneHotEncoder
items = np.array(['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기'])

onehot_encoder = OneHotEncoder()
try:
    onehot_encoder.fit_transform(items)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")


ValueError: Expected 2D array, got 1D array instead:
array=['TV' '냉장고' '세탁기' '선풍기' '선풍기' '믹서' '세탁기'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [47]:
# One-hot encoder example on a single feature column.
from sklearn.preprocessing import OneHotEncoder

# reshape(-1, 1) turns the 1-D array of 7 labels into a 7x1 column.
items = np.array(
    ['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기']
).reshape(-1,1)

onehot_encoder = OneHotEncoder()
encoded = onehot_encoder.fit_transform(items)
encoded            # sparse matrix by default
encoded.toarray()  # dense ndarray view of the same result

# One array of learned categories per input column.
onehot_encoder.categories_


<7x5 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]])

[array(['TV', '냉장고', '믹서', '선풍기', '세탁기'], dtype='<U3')]

In [46]:
# To skip the .toarray() call, ask the encoder for a dense ndarray directly.
# scikit-learn 1.2 renamed the `sparse` parameter to `sparse_output` and the
# old name was removed in 1.4, so `sparse=False` fails on current releases.
# (On scikit-learn < 1.2 the parameter is still spelled `sparse`.)
from sklearn.preprocessing import OneHotEncoder
items = np.array(['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기']).reshape(-1,1)

onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoder.fit_transform(items)

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]])

In [50]:
# Collect the one-hot encoded result into a DataFrame.
from sklearn.preprocessing import OneHotEncoder
items = np.array(['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기']).reshape(-1,1)

# `sparse_output` replaces the `sparse` parameter (renamed in scikit-learn 1.2,
# old name removed in 1.4); it makes fit_transform return a dense ndarray.
onehot_encoder = OneHotEncoder(sparse_output=False)
result = onehot_encoder.fit_transform(items)
onehot_encoder.categories_

# categories_ is a list holding one array per input column. Passing the list
# itself as `columns` makes pandas build a one-level MultiIndex, so pick the
# single array to get plain, flat column labels.
pd.DataFrame(result, columns=onehot_encoder.categories_[0], dtype=int)

[array(['TV', '냉장고', '믹서', '선풍기', '세탁기'], dtype='<U3')]

Unnamed: 0,TV,냉장고,믹서,선풍기,세탁기
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,0,0,1
3,0,0,0,1,0
4,0,0,0,1,0
5,0,0,1,0,0
6,0,0,0,0,1


In [68]:
# A DataFrame can be passed to OneHotEncoder directly;
# the return value is still an ndarray.
from sklearn.preprocessing import OneHotEncoder  # imported here so the cell is self-contained

# Build a DataFrame that consists only of categorical columns.
items1 = np.array(['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기'])
items2 = np.array(['samsung','apple','samsung','apple','samsung','apple','samsung'])
cat_df = pd.DataFrame({'cat1': items1, 'cat2': items2})
cat_df

# `sparse_output` replaces the `sparse` parameter (renamed in scikit-learn 1.2,
# old name removed in 1.4); it makes fit_transform return a dense ndarray.
onehot_encoder = OneHotEncoder(sparse_output=False)
encoded = onehot_encoder.fit_transform(cat_df)  # fit once, reuse below
encoded

# categories_ holds one array per input column; concatenating all of them
# yields the output column labels for any number of input columns.
col_names = np.hstack(onehot_encoder.categories_)

pd.DataFrame(encoded, columns=col_names, dtype=int)

Unnamed: 0,cat1,cat2
0,TV,samsung
1,냉장고,apple
2,세탁기,samsung
3,선풍기,apple
4,선풍기,samsung
5,믹서,apple
6,세탁기,samsung


array([[1., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 1.]])

Unnamed: 0,TV,냉장고,믹서,선풍기,세탁기,apple,samsung
0,1,0,0,0,0,0,1
1,0,1,0,0,0,1,0
2,0,0,0,0,1,0,1
3,0,0,0,1,0,1,0
4,0,0,0,1,0,0,1
5,0,0,1,0,0,1,0
6,0,0,0,0,1,0,1


In [70]:
# pd.get_dummies() can one-hot encode as well.
# It looks much simpler, but it is discouraged for modelling (it is not
# formally deprecated): it has no separate fit/transform step, so it cannot
# handle unseen categories consistently and does not fit a cross-validation scheme.

items1 = np.array(['TV','냉장고','세탁기','선풍기','선풍기','믹서','세탁기'])
items2 = np.array(['samsung','apple','samsung','apple','samsung','apple','samsung'])
cat_df = pd.DataFrame({'cat1': items1, 'cat2': items2})

# dtype=int keeps the dummy columns as 0/1 integers on every pandas version;
# pandas >= 2.0 would otherwise return True/False columns.
dummies_cat1 = pd.get_dummies(cat_df, columns=['cat1'], dtype=int)
dummies_cat1

dummies_all = pd.get_dummies(cat_df, columns=['cat1', 'cat2'], dtype=int)
dummies_all

Unnamed: 0,cat2,cat1_TV,cat1_냉장고,cat1_믹서,cat1_선풍기,cat1_세탁기
0,samsung,1,0,0,0,0
1,apple,0,1,0,0,0
2,samsung,0,0,0,0,1
3,apple,0,0,0,1,0
4,samsung,0,0,0,1,0
5,apple,0,0,1,0,0
6,samsung,0,0,0,0,1


Unnamed: 0,cat1_TV,cat1_냉장고,cat1_믹서,cat1_선풍기,cat1_세탁기,cat2_apple,cat2_samsung
0,1,0,0,0,0,0,1
1,0,1,0,0,0,1,0
2,0,0,0,0,1,0,1
3,0,0,0,1,0,1,0
4,0,0,0,1,0,0,1
5,0,0,1,0,0,1,0
6,0,0,0,0,1,0,1


### 2.스케일링
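- 정규화(MinMaxScaler): 값을 0과 1 사이로 변환한다. $x' = \dfrac{x - x_{\min}}{x_{\max} - x_{\min}}$ (입력에 음수가 있어도 기본 설정 `feature_range=(0, 1)`에서는 결과가 0~1 범위가 된다)
- 표준화(StandardScaler): 평균이 0, 분산이 1(표준 정규분포와 같은 스케일)이 되도록 변환한다. $x' = \dfrac{x - \mu}{\sigma}$ ($\mu$: 평균, $\sigma$: 표준편차)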

In [73]:
# The scaling examples below use the iris dataset.
from sklearn.datasets import load_iris

iris = load_iris()

#### Standard Scaler

In [75]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the iris feature matrix and transform it in one step.
s_scaler = StandardScaler()
s_iris = s_scaler.fit_transform(iris.data)

# Wrap the scaled ndarray in a DataFrame labelled with the iris feature names.
s_iris_df = pd.DataFrame(s_iris, columns=iris.feature_names)
s_iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


#### MinMax Scaler

In [77]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the iris feature matrix and transform it in one step.
m_scaler = MinMaxScaler()
m_iris = m_scaler.fit_transform(iris.data)

# Wrap the scaled ndarray in a DataFrame labelled with the iris feature names.
m_iris_df = pd.DataFrame(m_iris, columns=iris.feature_names)
m_iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667


> #### **인코딩 스케일링 언제해야되냐**?

항상 굉장히 헷갈리는 질문이다.\
당연하게도 train 에서만 학습이 되어야하고, test에서는 변형만 시켜야한다
- for train set => fit() and transform()
- for test set => transform()

Scaling에서는 이게 별 문제가 없을수 있는데 Encoding에서는 문제가 발생할 수 있다.\
Why? train set에는 없던 category가 test set에서 튀어나올 수 있다 => 이를 대처하기 위해서는 unknown 처리가 가능한 one-hot encoding을 사용해줘야한다\
이에 대해 조금 후에 자세히 보겠다

검색해봐도 너무 의견이 다양하고 심지어 책에서도 의견이 다양하다. 그렇기 때문에 아래 official document를 참고하자
![picture 1](images/0050116989a1eef3ab1262cb317246918f21667daa2c0fa427db094bc5ec903e.png)  
![picture 2](images/54de8867cccc976d2fa7769a81a0c0cb9ea131a7e7f059d64bbc1a8bbb032f5f.png)  



> 직접 pipeline을 만들어보자 for data with both numeric and categorical \
> numerical -> standardscaler\
> categorical -> one-hot encoder 사용할거다 \
> 의도적으로 categorical data in test set에 unseen data 집어넣을거다

In [23]:
# Toy train/test frames with two numeric columns and one categorical column.
# A seeded generator makes the random integers identical on every run, so the
# cell is reproducible under "Restart Kernel -> Run All".
SEED = 42
rng = np.random.default_rng(SEED)

train_ex_df = pd.DataFrame({
    'num1': rng.integers(1, 10, size=10),   # integers in [1, 10)
    'num2': rng.integers(10, 20, size=10),  # integers in [10, 20)
    'cat': list('baabbbccbb'),
})
train_ex_df

# The test set deliberately contains 'd', a category absent from the train set.
test_ex_df = pd.DataFrame({
    'num1': rng.integers(1, 10, size=4),
    'num2': rng.integers(10, 20, size=4),
    'cat': list('abcd'),
})
test_ex_df

Unnamed: 0,num1,num2,cat
0,2,12,b
1,3,11,a
2,1,13,a
3,3,12,b
4,4,11,b
5,9,18,b
6,6,16,c
7,2,17,c
8,4,11,b
9,3,14,b


Unnamed: 0,num1,num2,cat
0,8,19,a
1,4,12,b
2,2,17,c
3,7,16,d


In [78]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Numeric columns are standardised.
numeric_features = ["num1", "num2"]
numeric_transformer = StandardScaler()

# The categorical column is one-hot encoded; handle_unknown="ignore" lets
# transform() accept categories that were not present during fit.
categorical_features = ["cat"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Further (name, transformer, columns) entries can be added to this list to
# apply a different scaling to specific numeric or categorical columns.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Fit on the training frame only; the test frame is transformed, never fitted.
preprocessor.fit_transform(train_ex_df)
# In the last test row 'd' is unseen, so its one-hot columns are all 0.
preprocessor.transform(test_ex_df)

array([[-0.77513328, -0.6       ,  0.        ,  1.        ,  0.        ],
       [-0.31917253, -1.        ,  1.        ,  0.        ,  0.        ],
       [-1.23109403, -0.2       ,  1.        ,  0.        ,  0.        ],
       [-0.31917253, -0.6       ,  0.        ,  1.        ,  0.        ],
       [ 0.13678823, -1.        ,  0.        ,  1.        ,  0.        ],
       [ 2.41659199,  1.8       ,  0.        ,  1.        ,  0.        ],
       [ 1.04870973,  1.        ,  0.        ,  0.        ,  1.        ],
       [-0.77513328,  1.4       ,  0.        ,  0.        ,  1.        ],
       [ 0.13678823, -1.        ,  0.        ,  1.        ,  0.        ],
       [-0.31917253,  0.2       ,  0.        ,  1.        ,  0.        ]])

array([[ 1.96063124,  2.2       ,  1.        ,  0.        ,  0.        ],
       [ 0.13678823, -0.6       ,  0.        ,  1.        ,  0.        ],
       [-0.77513328,  1.4       ,  0.        ,  0.        ,  1.        ],
       [ 1.50467048,  1.        ,  0.        ,  0.        ,  0.        ]])