# 데이터 인코딩

## 레이블 인코딩(Label Encoding)

In [3]:
from sklearn.preprocessing import LabelEncoder

items = ['TV', '냉장고', '책상', '의자', '선반']

# Create the LabelEncoder, fit it on the items, then encode those same items.
encoder = LabelEncoder()
labels = encoder.fit(items).transform(items)

# Show the learned classes followed by the integer code assigned to each item.
print(encoder.classes_)
print(labels)

# Decode a sequence of integer codes back into the original string values.
codes_to_decode = [1, 4, 3, 2, 0, 1, 1]
print('디코딩 변환: ', encoder.inverse_transform(codes_to_decode))

['TV' '냉장고' '선반' '의자' '책상']
[0 1 4 3 2]
디코딩 변환:  ['냉장고' '책상' '의자' '선반' 'TV' '냉장고' '냉장고']


## 원-핫 인코딩(One-Hot Encoding)

In [10]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Reshape the items into a 2-D, single-column array before encoding.
items_array = np.array(items).reshape(-1, 1)

# Fit the encoder on the column, then encode that same column.
oh_encoder = OneHotEncoder()
oh_encoder.fit(items_array)
oh_labels = oh_encoder.transform(items_array)

# toarray() yields the dense matrix that is printed.
print(oh_labels.toarray())

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]]


In [12]:
import pandas as pd

# Single-column frame of categorical item names (duplicates included on purpose).
df = pd.DataFrame({'items': ['TV', '냉장고', '책상', '선반', '냉장고', 'TV']})

# One-hot encode the 'items' column. dtype=int pins the indicator columns to
# 0/1 integers; without it, newer pandas releases return True/False columns.
items_dummies = pd.get_dummies(df, dtype=int)
items_dummies

Unnamed: 0,items_TV,items_냉장고,items_선반,items_책상
0,1,0,0,0
1,0,1,0,0
2,0,0,0,1
3,0,0,1,0
4,0,1,0,0
5,1,0,0,0


# 피처 스케일링과 정규화

## StandardScaler

In [18]:
from sklearn.datasets import load_iris
import pandas as pd

# Load the iris dataset and keep its feature matrix and target array.
iris = load_iris()
features, target = iris.data, iris.target

# Wrap the features in a DataFrame whose columns are the dataset's feature names,
# then preview the first five rows.
iris_df = pd.DataFrame(data=features, columns=iris.feature_names)
iris_df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [19]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the iris features, then transform those same features.
scaler = StandardScaler()
scaler.fit(iris_df)
iris_scaled = scaler.transform(iris_df)

# Put the scaled values back into a DataFrame labelled with the feature names.
iris_scaled_df = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)
iris_scaled_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


## MinMaxScaler

In [20]:
from sklearn.preprocessing import MinMaxScaler

# Fit a MinMaxScaler on the iris features, then transform those same features.
minmax = MinMaxScaler()
minmax.fit(iris_df)
iris_minmax = minmax.transform(iris_df)

# Put the rescaled values back into a DataFrame labelled with the feature names.
iris_minmax_df = pd.DataFrame(data=iris_minmax, columns=iris.feature_names)
iris_minmax_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667
