## 학습목표
- 범주형 데이터 전처리를 할 수 있다.
- 수치형 데이터 전처리를 할 수 있다.
- 머신러닝을 통해 예측할 수 있다.

## 1. 데이터 전처리
### 1) 범주형 데이터
- 레이블 인코딩
- 원핫 인코딩

### 2) 수치형 데이터
- 표준화
- 정규화(Min-Max)

## 데이터 준비

In [44]:
# Build the sample chicken-menu dataset used throughout this notebook.
import pandas as pd
import numpy as np
# The last menu has no discount, so its '할인율' and '할인가' are NaN.
data = pd.DataFrame({
    '메뉴': ['[인기]아이펠치킨','닭강정','간장치킨','마늘치킨','파닭','승일양념치킨','양념반후라이드반','황금후라이드','[베스트]풀잎치킨'],
    '가격': [16000,15000,14000,14000,14000,13000,13000,12000,9900],
    '호수' : [11,12,9,9,11,10,10,10,10],
    '칼로리' : [1200.0,1500.0,1600.0,1800.0,1300.0,1400.0,1300.0,1000.0,1000.0],
    '할인율' : [0.5,0.2,0.2,0.2,0.2,0.2,0.2,0.2,np.nan],
    '할인가' : [8000.0,12000.0,11200.0,11200.0,11200.0,10400.0,10400.0,9600.0,np.nan],
    '원산지' : ['국내산','브라질','국내산','국내산','브라질','국내산','국내산','국내산','국내산'],
    '살찔까요' : ['no','yes','yes','yes','yes','yes','yes','no','no'],
    '고민' : ['무조건먹자','먹지말자','먹지말자','먹지말자','먹지말자','먹지말자','먹지말자','무조건먹자','무조건먹자']
})
# Save the raw data to CSV; index=False keeps the row index out of the file.
data.to_csv('final_modudak.csv', index=False)
data

Unnamed: 0,메뉴,가격,호수,칼로리,할인율,할인가,원산지,살찔까요,고민
0,[인기]아이펠치킨,16000,11,1200.0,0.5,8000.0,국내산,no,무조건먹자
1,닭강정,15000,12,1500.0,0.2,12000.0,브라질,yes,먹지말자
2,간장치킨,14000,9,1600.0,0.2,11200.0,국내산,yes,먹지말자
3,마늘치킨,14000,9,1800.0,0.2,11200.0,국내산,yes,먹지말자
4,파닭,14000,11,1300.0,0.2,11200.0,브라질,yes,먹지말자
5,승일양념치킨,13000,10,1400.0,0.2,10400.0,국내산,yes,먹지말자
6,양념반후라이드반,13000,10,1300.0,0.2,10400.0,국내산,yes,먹지말자
7,황금후라이드,12000,10,1000.0,0.2,9600.0,국내산,no,무조건먹자
8,[베스트]풀잎치킨,9900,10,1000.0,,,국내산,no,무조건먹자


In [45]:
# Modify the data: set the origin of row 2 to "미국" so the '원산지' column
# holds three distinct categories instead of two.
data.loc[2,"원산지"] = "미국"

In [46]:
# Take an independent copy of the columns to preprocess, so changes to `df`
# do not touch `data`.
df = data[["가격", "호수", "칼로리", "원산지","살찔까요"]].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


## 데이터 전처리
- 범주형 데이터(=문자형 데이터)
- 수치형 데이터

### 1) 범주형 데이터(원산지, 살찔까요)를 숫자 형태로 변경  
HOW? 
- 레이블 인코딩
  fit, transform 혹은 fit_transform
  ex) 국내산 0, 미국 1, 브라질 2 (LabelEncoder는 클래스를 정렬한 순서대로 번호를 부여)
    
- 원핫 인코딩
  
  ex) 범주마다 열이 하나씩 생성되어, 해당 범주의 열에는 1, 나머지 열에는 0이 들어감

In [47]:
df.info()  # check each column's dtype and non-null count

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   가격      9 non-null      int64  
 1   호수      9 non-null      int64  
 2   칼로리     9 non-null      float64
 3   원산지     9 non-null      object 
 4   살찔까요    9 non-null      object 
dtypes: float64(1), int64(2), object(2)
memory usage: 492.0+ bytes


### 레이블 인코딩

In [48]:
from sklearn.preprocessing import LabelEncoder

In [49]:
# Two-step usage: fit() learns the distinct labels of the column,
# transform() then maps every label to its integer code.
le = LabelEncoder()
le.fit(df['원산지'])
le.transform(df['원산지'])

array([0, 2, 1, 0, 2, 0, 0, 0, 0])

In [50]:
# Shortcut: fit and transform in a single call.
le.fit_transform(df['원산지'])

array([0, 2, 1, 0, 2, 0, 0, 0, 0])

In [51]:
# Apply it: overwrite the string column with its integer codes.
df['원산지'] = le.fit_transform(df['원산지'])
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,no
1,15000,12,1500.0,2,yes
2,14000,9,1600.0,1,yes
3,14000,9,1800.0,0,yes
4,14000,11,1300.0,2,yes
5,13000,10,1400.0,0,yes
6,13000,10,1300.0,0,yes
7,12000,10,1000.0,0,no
8,9900,10,1000.0,0,no


In [52]:
# Re-fitting the same `le` replaces the mapping it learned for '원산지';
# after this line `le` only knows the '살찔까요' labels.
df['살찔까요'] = le.fit_transform(df['살찔까요'])
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,0
1,15000,12,1500.0,2,1
2,14000,9,1600.0,1,1
3,14000,9,1800.0,0,1
4,14000,11,1300.0,2,1
5,13000,10,1400.0,0,1
6,13000,10,1300.0,0,1
7,12000,10,1000.0,0,0
8,9900,10,1000.0,0,0


In [53]:
# Label-encode several columns at once.
df = data[["가격", "호수", "칼로리", "원산지","살찔까요"]].copy()
cols = ['원산지','살찔까요']

from sklearn.preprocessing import LabelEncoder

# Use a fresh encoder per column, since each column has its own label set.
# NOTE: `le` is rebound on every iteration, so only the last column's encoder
# is still available after the loop.
for col in cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,0
1,15000,12,1500.0,2,1
2,14000,9,1600.0,1,1
3,14000,9,1800.0,0,1
4,14000,11,1300.0,2,1
5,13000,10,1400.0,0,1
6,13000,10,1300.0,0,1
7,12000,10,1000.0,0,0
8,9900,10,1000.0,0,0


In [54]:
# Pick every object-dtype (string) column automatically instead of listing the
# names by hand, then label-encode each of them.
df = data[["가격", "호수", "칼로리", "원산지","살찔까요"]].copy()
cols = df.select_dtypes(include='object').columns

from sklearn.preprocessing import LabelEncoder

# Keep each fitted encoder, keyed by column name. Without this the encoders are
# thrown away at the end of each iteration and the label <-> code mapping can
# no longer be inspected (`classes_`), applied to new data (`transform`) or
# reversed (`inverse_transform`).
encoders = {}
for col in cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,0
1,15000,12,1500.0,2,1
2,14000,9,1600.0,1,1
3,14000,9,1800.0,0,1
4,14000,11,1300.0,2,1
5,13000,10,1400.0,0,1
6,13000,10,1300.0,0,1
7,12000,10,1000.0,0,0
8,9900,10,1000.0,0,0


### 원핫(one-hot) 인코딩

In [55]:
# Start again from the raw columns so '원산지' and '살찔까요' are strings, not label codes.
df = data[["가격", "호수", "칼로리", "원산지","살찔까요"]].copy()

In [56]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False) # sparse_output defaults to True; set it to False to get a plain array back
cat = ohe.fit_transform(df[['원산지']]) # the input must be 2-D, hence the double brackets (a DataFrame, not a Series)
cat

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [57]:
# Column naming, option 1: the categories learned for each input column.
ohe.categories_

[array(['국내산', '미국', '브라질'], dtype=object)]

In [58]:
# Column naming, option 2: ready-made "<column>_<category>" names.
ohe.get_feature_names_out()

array(['원산지_국내산', '원산지_미국', '원산지_브라질'], dtype=object)

In [59]:
# Convert the encoded array to a DataFrame.
# Reuse df's index: pd.concat(axis=1) aligns rows on index labels, so a fresh
# 0..n-1 index would misalign rows (and introduce NaN) whenever df has been
# filtered, shuffled or split and no longer has a default index.
df_cat = pd.DataFrame(cat, columns=ohe.get_feature_names_out(), index=df.index)

In [60]:
# Merge the one-hot columns into the original DataFrame (side by side, axis=1).
df = pd.concat([df,df_cat], axis=1)
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요,원산지_국내산,원산지_미국,원산지_브라질
0,16000,11,1200.0,국내산,no,1.0,0.0,0.0
1,15000,12,1500.0,브라질,yes,0.0,0.0,1.0
2,14000,9,1600.0,미국,yes,0.0,1.0,0.0
3,14000,9,1800.0,국내산,yes,1.0,0.0,0.0
4,14000,11,1300.0,브라질,yes,0.0,0.0,1.0
5,13000,10,1400.0,국내산,yes,1.0,0.0,0.0
6,13000,10,1300.0,국내산,yes,1.0,0.0,0.0
7,12000,10,1000.0,국내산,no,1.0,0.0,0.0
8,9900,10,1000.0,국내산,no,1.0,0.0,0.0


In [62]:
# The original string column is now redundant with its one-hot columns; remove it.
df = df.drop(columns=['원산지'])
df

Unnamed: 0,가격,호수,칼로리,살찔까요,원산지_국내산,원산지_미국,원산지_브라질
0,16000,11,1200.0,no,1.0,0.0,0.0
1,15000,12,1500.0,yes,0.0,0.0,1.0
2,14000,9,1600.0,yes,0.0,1.0,0.0
3,14000,9,1800.0,yes,1.0,0.0,0.0
4,14000,11,1300.0,yes,0.0,0.0,1.0
5,13000,10,1400.0,yes,1.0,0.0,0.0
6,13000,10,1300.0,yes,1.0,0.0,0.0
7,12000,10,1000.0,no,1.0,0.0,0.0
8,9900,10,1000.0,no,1.0,0.0,0.0


In [69]:
# One-hot encode several columns at once.
# Label encoding needed a loop (one encoder per column), but OneHotEncoder
# accepts multiple columns in a single call.

from sklearn.preprocessing import OneHotEncoder
df = data[["가격", "호수", "칼로리", "원산지","살찔까요"]].copy()

cols = df.select_dtypes(include='object').columns
ohe = OneHotEncoder(sparse_output=False)
cat = ohe.fit_transform(df[cols])
# Reuse df's index: the later pd.concat(axis=1) aligns rows on index labels,
# so a fresh 0..n-1 index would misalign rows whenever df's index is not the default.
df_cat = pd.DataFrame(cat, columns=ohe.get_feature_names_out(), index=df.index)
df_cat

Unnamed: 0,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0
5,1.0,0.0,0.0,0.0,1.0
6,1.0,0.0,0.0,0.0,1.0
7,1.0,0.0,0.0,1.0,0.0
8,1.0,0.0,0.0,1.0,0.0


In [70]:
# Append the one-hot columns to df side by side (axis=1).
df = pd.concat([df, df_cat], axis=1)

In [71]:
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,국내산,no,1.0,0.0,0.0,1.0,0.0
1,15000,12,1500.0,브라질,yes,0.0,0.0,1.0,0.0,1.0
2,14000,9,1600.0,미국,yes,0.0,1.0,0.0,0.0,1.0
3,14000,9,1800.0,국내산,yes,1.0,0.0,0.0,0.0,1.0
4,14000,11,1300.0,브라질,yes,0.0,0.0,1.0,0.0,1.0
5,13000,10,1400.0,국내산,yes,1.0,0.0,0.0,0.0,1.0
6,13000,10,1300.0,국내산,yes,1.0,0.0,0.0,0.0,1.0
7,12000,10,1000.0,국내산,no,1.0,0.0,0.0,1.0,0.0
8,9900,10,1000.0,국내산,no,1.0,0.0,0.0,1.0,0.0


In [72]:
# Remove the original string columns held in `cols`
# (same effect as naming '원산지' and '살찔까요' explicitly).
df = df.drop(columns=cols)
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,1.0,0.0,0.0,1.0,0.0
1,15000,12,1500.0,0.0,0.0,1.0,0.0,1.0
2,14000,9,1600.0,0.0,1.0,0.0,0.0,1.0
3,14000,9,1800.0,1.0,0.0,0.0,0.0,1.0
4,14000,11,1300.0,0.0,0.0,1.0,0.0,1.0
5,13000,10,1400.0,1.0,0.0,0.0,0.0,1.0
6,13000,10,1300.0,1.0,0.0,0.0,0.0,1.0
7,12000,10,1000.0,1.0,0.0,0.0,1.0,0.0
8,9900,10,1000.0,1.0,0.0,0.0,1.0,0.0


### [Tips] OneHotEncoding - Pandas이용

In [74]:
# pandas shortcut: get_dummies one-hot encodes the string columns in one call
# and leaves the numeric columns unchanged.
df = data[["가격", "호수", "칼로리", "원산지","살찔까요"]].copy()
df = pd.get_dummies(df)
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,True,False,False,True,False
1,15000,12,1500.0,False,False,True,False,True
2,14000,9,1600.0,False,True,False,False,True
3,14000,9,1800.0,True,False,False,False,True
4,14000,11,1300.0,False,False,True,False,True
5,13000,10,1400.0,True,False,False,False,True
6,13000,10,1300.0,True,False,False,False,True
7,12000,10,1000.0,True,False,False,True,False
8,9900,10,1000.0,True,False,False,True,False


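`get_dummies`는 전달받은 데이터에 실제로 존재하는 범주만으로 컬럼을 만든다.  
따라서 train과 test에 등장하는 범주가 서로 다르면 두 데이터의 컬럼 구성이 달라지고,  
머신러닝 모델이 입력 차원 오류로 학습/예측을 하지 못하게 된다.  
이런 경우에는 train 데이터로 `fit`한 `OneHotEncoder`를 test 데이터에 `transform`으로 적용하여 일관된 컬럼 구성을 보장한다. (test에만 있는 범주는 `handle_unknown='ignore'` 옵션으로 처리할 수 있다.)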

### 2) 수치형 데이터
- 표준화  
각 컬럼의 평균이 0, 분산이 1이 되도록 변환 (값의 스케일만 바뀌며, 분포의 모양이 정규분포로 바뀌는 것은 아님)  
z = (x − μ) / σ  (μ: 평균, σ: 표준편차)
 
- 정규화(Min-Max)  
각 컬럼의 최솟값이 0, 최댓값이 1이 되도록 0과 1 사이의 값으로 변환  
x' = (x − min) / (max − min)

1) 표준화만
2) 정규화만
3) 표준화 -> 정규화

In [75]:
df.head()

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,True,False,False,True,False
1,15000,12,1500.0,False,False,True,False,True
2,14000,9,1600.0,False,True,False,False,True
3,14000,9,1800.0,True,False,False,False,True
4,14000,11,1300.0,False,False,True,False,True


In [76]:
# Standardization of a single column.
# The double brackets pass a one-column DataFrame (2-D) rather than a Series.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit_transform(df[['가격']])

array([[ 1.54246993],
       [ 0.94150762],
       [ 0.34054531],
       [ 0.34054531],
       [ 0.34054531],
       [-0.260417  ],
       [-0.260417  ],
       [-0.86137931],
       [-2.12340016]])

In [79]:
# Standardize several columns at once; each column is scaled independently.
from sklearn.preprocessing import StandardScaler
cols = ['가격','호수','칼로리']
scaler = StandardScaler()
# Overwrites the original values of these columns in df.
df[cols] = scaler.fit_transform(df[cols])
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.54247,0.848875,-0.57792,True,False,False,True,False
1,0.941508,1.940285,0.622376,False,False,True,False,True
2,0.340545,-1.333946,1.022475,False,True,False,False,True
3,0.340545,-1.333946,1.822672,True,False,False,False,True
4,0.340545,0.848875,-0.177822,False,False,True,False,True
5,-0.260417,-0.242536,0.222277,True,False,False,False,True
6,-0.260417,-0.242536,-0.177822,True,False,False,False,True
7,-0.861379,-0.242536,-1.378118,True,False,False,True,False
8,-2.1234,-0.242536,-1.378118,True,False,False,True,False


In [80]:
# Min-Max normalization (values between 0 and 1).
# NOTE: df[cols] was already standardized by the previous cell, so this is the
# "standardize -> normalize" case. Min-max scaling is unaffected by that prior
# shift/rescale, so the result equals min-max scaling of the raw values.
from sklearn.preprocessing import MinMaxScaler
cols = ['가격','호수','칼로리']
mm = MinMaxScaler()
df[cols] = mm.fit_transform(df[cols])
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.0,0.666667,0.25,True,False,False,True,False
1,0.836066,1.0,0.625,False,False,True,False,True
2,0.672131,0.0,0.75,False,True,False,False,True
3,0.672131,0.0,1.0,True,False,False,False,True
4,0.672131,0.666667,0.375,False,False,True,False,True
5,0.508197,0.333333,0.5,True,False,False,False,True
6,0.508197,0.333333,0.375,True,False,False,False,True
7,0.344262,0.333333,0.0,True,False,False,True,False
8,0.0,0.333333,0.0,True,False,False,True,False
