# 데이터 전처리

### 1. 레이블 인코딩

In [1]:
from sklearn.preprocessing import LabelEncoder

In [2]:
# Sample category strings to encode; two of the values appear twice.
items = [
    'TV', '냉장고', '전자렌지', '컴퓨터',
    '선풍기', '선풍기',
    '믹서', '믹서',
]

In [3]:
# Create the label encoder object.
le = LabelEncoder()

In [4]:
# Fit: scan the data so the encoder learns the distinct class values.
le.fit(items)

LabelEncoder()

In [5]:
# Run the encoding: map each item to its integer label using the fitted encoder.
labels = le.transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [6]:
# In practice, fitting and transforming are done in a single call.
le2 = LabelEncoder()
labels = le2.fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2], dtype=int64)

In [7]:
# Experienced users often write it this way:
# to avoid keeping the encoder in a named variable, create the object and
# call the method on it directly (method chaining).
labels = LabelEncoder().fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2], dtype=int64)

In [11]:
# To avoid intermediate variables, each method can be applied directly to the
# value returned by the previous one.
# method chaining 1: chain the methods and display the final value
s = '  A quick brown fox  '
s.lower().strip().split() 


['a', 'quick', 'brown', 'fox']

In [12]:
# method chaining 2: lstrip() followed by rstrip() trims both ends, then the
# replace() calls run on the lowercased string before it is split.
# Note: replace('a', ...) matches every 'a' character; this sentence happens
# to contain only one.
s = '  A quick brown fox  '
s.lower().lstrip().rstrip().replace('a', 'the').replace('fox','wolf').split() 


['the', 'quick', 'brown', 'wolf']

In [13]:
# inverse_transform maps integer labels back to the original class strings
# using the encoder fitted earlier.
le.inverse_transform([4,5,2,3,0,1])

array(['전자렌지', '컴퓨터', '믹서', '선풍기', 'TV', '냉장고'], dtype='<U4')

### 2. One-hot encoding
    - 사이킷런에서는 잘 사용하지 않지만, 신경망·딥러닝에서는 사용하므로 알아두는 것이 좋다.
    - 6 종류가 있다면, 사이즈가 6인 벡터를 만든다
    - 1개만 1로 남겨두고, 나머지는 0으로 설정한다.


In [14]:
from sklearn.preprocessing import OneHotEncoder
# Encoder with default settings; the result it produces below is a sparse matrix.
ohe = OneHotEncoder()

In [15]:
# reshape(-1, 1) turns the 1-D label array into a single-column 2-D array
# (8 labels -> 8 rows) before it is handed to the encoder.
oh_labels = ohe.fit_transform(labels.reshape(-1,1))
oh_labels

<8x6 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [17]:
# Convert the sparse result to a dense array so the one-hot rows can be displayed.
oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

- reshape 공부

In [22]:
import numpy as np

# Build the integers 0..23 and arrange them as a 4x6 grid (24 elements in total).
a = np.reshape(np.arange(24), (4, 6))
a

array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])

In [23]:
a.reshape(2,-1) # 2 rows; -1 lets NumPy infer the column count (24 / 2 = 12)

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]])

In [24]:
a.reshape(2, -1, 4) # one 3-D array: 2 blocks, each with 3 rows (inferred by -1) and 4 columns

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])

### 3. 표준화 (Standardization) - 평균 0, 표준편차 1로 스케일 변환

In [25]:
from sklearn.datasets import load_iris
# Load the iris dataset that ships with scikit-learn.
iris = load_iris()

In [26]:
import pandas as pd
# Wrap the raw feature matrix in a DataFrame and show per-column summary statistics.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [28]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on all iris features and transform them in one chained call.
iris_std = StandardScaler().fit_transform(iris.data)

In [30]:
# After standardization each feature has mean ~0 and standard deviation ~1.
# Only location and scale change; the shape of each feature's distribution
# is not turned into a normal distribution.
# describe() reports std as 1.00335 rather than exactly 1 -- presumably because
# pandas uses the sample (n-1) estimate while the scaler divides by n.
df2 = pd.DataFrame(iris_std, columns=iris.feature_names)
df2.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,-1.690315e-15,-1.84297e-15,-1.698641e-15,-1.409243e-15
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


- Logistic Regression으로 분류


In [37]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split of the raw (unscaled) features with a fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, test_size=0.2, random_state=2021
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [36]:
from sklearn.linear_model import LogisticRegression
# Fitting on the unscaled features emits the warning shown in the output:
# "Increase the number of iterations (max_iter) or scale the data" -- i.e. the
# features should be scaled (or max_iter raised).
lrc = LogisticRegression()
lrc.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [None]:
# Train on standardized features: unlike the fit on the raw features, the
# solver is expected to converge within the default iteration limit, so the
# convergence warning should no longer appear.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, test_size=0.2, random_state=2021
)

# Learn the mean/std from the training split only, then apply the same
# transform to the test split so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lrc = LogisticRegression()
lrc.fit(X_train, y_train)