# 데이터 전처리 (preprocessing)

In [2]:
from sklearn.preprocessing import LabelEncoder
# Sample categorical data; note that it contains duplicate entries.
items=['TV','냉장고','전자렌지','컴퓨터','선풍기','선풍기','믹서','믹서']
# Create a LabelEncoder object, then label-encode with separate fit() and transform() calls.
encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)
print('인코딩 변환값:', labels)
# Decode a list of integer codes back into the original string labels.
encoder.inverse_transform([4, 5, 2, 0, 1, 1, 3, 3])

인코딩 변환값: [0 1 4 5 3 3 2 2]


array(['전자렌지', '컴퓨터', '믹서', 'TV', '냉장고', '냉장고', '선풍기', '선풍기'], dtype='<U4')

In [3]:
# fit_transform() fits the encoder and encodes `items` in a single call.
labels = encoder.fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2], dtype=int64)

In [4]:
encoder.classes_

array(['TV', '냉장고', '믹서', '선풍기', '전자렌지', '컴퓨터'], dtype='<U4')

In [5]:
encoder.inverse_transform([3,4,3,2,3,4,2,1,3,0])

array(['선풍기', '전자렌지', '선풍기', '믹서', '선풍기', '전자렌지', '믹서', '냉장고', '선풍기',
       'TV'], dtype='<U4')

In [10]:
encoder.inverse_transform([3])[0]

'선풍기'

In [11]:
def get_name(model, num):
    """Return the label that ``model`` decodes from the single code ``num``.

    ``model`` must provide ``inverse_transform``. It is called with a
    one-element list, and the first element of its result is returned.
    """
    decoded = model.inverse_transform([num])
    return decoded[0]

In [12]:
get_name(encoder, 4)

'전자렌지'

## one-hot encoding

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
# First convert the string items to integer codes with LabelEncoder.
encoder = LabelEncoder()
labels = encoder.fit_transform(items)
# The encoded result is one-dimensional at this point.
labels.shape

(8,)

In [16]:
# Reshape the 1-D codes into a single-column 2-D array (one row per sample),
# which is the layout the OneHotEncoder below is fitted on.
labels = labels.reshape(-1, 1)
labels.shape

(8, 1)

In [17]:
labels

array([[0],
       [1],
       [4],
       [5],
       [3],
       [3],
       [2],
       [2]], dtype=int64)

In [27]:
# Apply one-hot encoding to the integer codes.
oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(labels)
# One row per sample, one column per distinct category.
oh_labels.shape

(8, 6)

In [32]:
oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [36]:
# Condensed pipeline: label-encode, reshape to a single column, then one-hot encode.
labels = LabelEncoder().fit_transform(items).reshape(-1, 1)
oh_labels = OneHotEncoder().fit_transform(labels)
oh_labels.shape

(8, 6)

In [37]:
import pandas as pd 

In [39]:
# Wrap the item list in a DataFrame with a single column named 'items'.
df = pd.DataFrame({'items':items})
df

Unnamed: 0,items
0,TV
1,냉장고
2,전자렌지
3,컴퓨터
4,선풍기
5,선풍기
6,믹서
7,믹서


In [41]:
# One-hot encode the string column directly with pandas (no LabelEncoder step needed).
# dtype=int keeps the 0/1 indicator values on pandas >= 2.0, where the default
# dummy dtype became bool and the cell would otherwise display True/False.
pd.get_dummies(df, dtype=int)

Unnamed: 0,items_TV,items_냉장고,items_믹서,items_선풍기,items_전자렌지,items_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


# Feature scaling and normalization

In [43]:
from sklearn.datasets import load_iris

# Load the iris dataset and put its feature matrix into a DataFrame
# whose columns are labelled with the dataset's feature names.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [45]:
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


 - 1) 표준화: 평균 0, 분산 1로 변환 (분포 자체가 정규 분포로 바뀌는 것은 아님) - StandardScaler

In [50]:
from sklearn.preprocessing import StandardScaler

In [51]:
# Fit a StandardScaler on the iris features, then transform them with it.
scaler = StandardScaler()
scaler.fit(iris_df)
# transform() returns a NumPy array rather than a DataFrame.
iris_scaled = scaler.transform(iris_df)

In [52]:
type(iris_scaled)

numpy.ndarray

In [53]:
iris_scaled[-5, :]

array([ 1.03800476, -0.13197948,  0.8195957 ,  1.44883158])

In [54]:
import numpy as np 

In [56]:
np.mean(iris_scaled, axis=0)

array([-1.69031455e-15, -1.84297022e-15, -1.69864123e-15, -1.40924309e-15])

In [62]:
np.std(iris_scaled, axis=0)

array([1., 1., 1., 1.])

 - 2) 0~1 사이의 값으로 변환 - MinMaxScaler

In [63]:
from sklearn.preprocessing import MinMaxScaler

In [64]:
# Rescale every feature to the [0, 1] range; this rebinds `scaler` and `iris_scaled`.
scaler = MinMaxScaler()
iris_scaled = scaler.fit_transform(iris_df)

In [67]:
np.max(iris_scaled, axis=0)

array([1., 1., 1., 1.])

In [66]:
np.min(iris_scaled, axis=0)

array([0., 0., 0., 0.])

In [68]:
np.std(iris_scaled, axis=0)

array([0.22925036, 0.18100457, 0.29820408, 0.31653859])

In [69]:
np.mean(iris_scaled, axis=0)

array([0.4287037 , 0.44055556, 0.46745763, 0.45805556])