### 데이터 전처리

In [1]:
import pandas as pd
import numpy as np
# Sample chicken-menu table referenced by the cells below. The last row has no
# discount, so its discount rate and discounted price are NaN.
data = pd.DataFrame({
    '메뉴': ['[인기]아이펠치킨','닭강정','간장치킨','마늘치킨','파닭','승일양념치킨','양념반후라이드반','황금후라이드','[베스트]풀잎치킨'],
    '가격': [16000,15000,14000,14000,14000,13000,13000,12000,9900],
    '호수' : [11,12,9,9,11,10,10,10,10],
    '칼로리' : [1200.0,1500.0,1600.0,1800.0,1300.0,1400.0,1300.0,1000.0,1000.0],
    '할인율' : [0.5,0.2,0.2,0.2,0.2,0.2,0.2,0.2,np.nan],
    '할인가' : [8000.0,12000.0,11200.0,11200.0,11200.0,10400.0,10400.0,9600.0,np.nan],
    '원산지' : ['국내산','브라질','국내산','국내산','브라질','국내산','국내산','국내산','국내산'],
    '살찔까요' : ['no','yes','yes','yes','yes','yes','yes','no','no'],
    '고민' : ['무조건먹자','먹지말자','먹지말자','먹지말자','먹지말자','먹지말자','먹지말자','무조건먹자','무조건먹자']
})
# Snapshot of the raw table; note it is written before the origin correction below.
data.to_csv('final_modudak.csv', index=False)
data

# Q. In `data`, change the origin ('원산지') of '간장치킨' to '미국'.
# Select the row by menu name instead of a hard-coded index label, so the edit
# still hits the right row if the table is reordered or re-indexed.
data.loc[data["메뉴"] == "간장치킨", "원산지"] = "미국"

# Working copy restricted to the columns used in the encoding/scaling examples.
df = data[["가격", "호수", "칼로리", "원산지", "살찔까요"]].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


### 범주형 데이터

In [2]:
# Summarise column dtypes and non-null counts; object-dtype columns are the categorical ones.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   가격      9 non-null      int64  
 1   호수      9 non-null      int64  
 2   칼로리     9 non-null      float64
 3   원산지     9 non-null      object 
 4   살찔까요    9 non-null      object 
dtypes: float64(1), int64(2), object(2)
memory usage: 492.0+ bytes


### label encoding

In [3]:
from sklearn.preprocessing import LabelEncoder

# fit() returns the encoder itself, so construction and fitting are chained;
# transform() then maps each origin string to its integer code.
le = LabelEncoder().fit(df["원산지"])
le.transform(df["원산지"])

array([0, 2, 1, 0, 2, 0, 0, 0, 0])

In [4]:
# fit_transform() learns the classes and encodes in one call; the result equals the fit() + transform() pair.
le.fit_transform(df["원산지"])

array([0, 2, 1, 0, 2, 0, 0, 0, 0])

In [5]:
# Overwrite the string column with its integer codes.
df["원산지"] = le.fit_transform(df["원산지"])
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,no
1,15000,12,1500.0,2,yes
2,14000,9,1600.0,1,yes
3,14000,9,1800.0,0,yes
4,14000,11,1300.0,2,yes
5,13000,10,1400.0,0,yes
6,13000,10,1300.0,0,yes
7,12000,10,1000.0,0,no
8,9900,10,1000.0,0,no


In [6]:
# Use a fresh encoder for the second categorical column.
le = LabelEncoder()
df["살찔까요"] = le.fit_transform(df["살찔까요"])
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,0
1,15000,12,1500.0,2,1
2,14000,9,1600.0,1,1
3,14000,9,1800.0,0,1
4,14000,11,1300.0,2,1
5,13000,10,1400.0,0,1
6,13000,10,1300.0,0,1
7,12000,10,1000.0,0,0
8,9900,10,1000.0,0,0


### 여러 Feature 한번에 label encoding

In [7]:
# Start again from the un-encoded columns of `data`.
df = data.loc[:, ["가격", "호수", "칼로리", "원산지", "살찔까요"]].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


In [8]:
# Categorical columns to label-encode.
cols = ["원산지", "살찔까요"]

In [9]:
from sklearn.preprocessing import LabelEncoder

# Keep each fitted encoder, keyed by column name. Without this the mapping
# learned for every column but the last is lost, so the codes could not be
# turned back into labels (inverse_transform) or re-applied to new data.
encoders = {}
for col in cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,0
1,15000,12,1500.0,2,1
2,14000,9,1600.0,1,1
3,14000,9,1800.0,0,1
4,14000,11,1300.0,2,1
5,13000,10,1400.0,0,1
6,13000,10,1300.0,0,1
7,12000,10,1000.0,0,0
8,9900,10,1000.0,0,0


In [10]:
# Start again from the un-encoded columns of `data`.
df = data.loc[:, ["가격", "호수", "칼로리", "원산지", "살찔까요"]].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


In [11]:
# Pick up every object-dtype column automatically instead of listing names by hand.
cols = df.select_dtypes(include="object").columns

In [12]:
# Keep each fitted encoder, keyed by column name. Without this the mapping
# learned for every column but the last is lost, so the codes could not be
# turned back into labels (inverse_transform) or re-applied to new data.
encoders = {}
for col in cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,0,0
1,15000,12,1500.0,2,1
2,14000,9,1600.0,1,1
3,14000,9,1800.0,0,1
4,14000,11,1300.0,2,1
5,13000,10,1400.0,0,1
6,13000,10,1300.0,0,1
7,12000,10,1000.0,0,0
8,9900,10,1000.0,0,0


### one-hot encoding

In [13]:
# Start again from the un-encoded columns of `data`.
df = data.loc[:, ["가격", "호수", "칼로리", "원산지", "살찔까요"]].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


In [14]:
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False makes the encoder return a dense NumPy array.
ohe = OneHotEncoder(sparse_output=False)
# Double brackets keep the input two-dimensional (a one-column DataFrame).
ohe.fit(df[["원산지"]])
cat = ohe.transform(df[["원산지"]])
cat

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [15]:
# Categories learned by the encoder (one array per input column)
ohe.categories_

[array(['국내산', '미국', '브라질'], dtype=object)]

In [16]:
# Output feature names: input column name joined with each category
ohe.get_feature_names_out()

array(['원산지_국내산', '원산지_미국', '원산지_브라질'], dtype=object)

In [17]:
# Convert the encoded array to a DataFrame.
# Reuse df's index: a column-wise pd.concat aligns rows by index label, so a
# fresh 0..n-1 index here would misalign (and introduce NaN rows) whenever df
# does not carry the default index, e.g. after filtering or sorting.
df_cat = pd.DataFrame(cat, columns=ohe.get_feature_names_out(), index=df.index)
df_cat

Unnamed: 0,원산지_국내산,원산지_미국,원산지_브라질
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0
6,1.0,0.0,0.0
7,1.0,0.0,0.0
8,1.0,0.0,0.0


In [18]:
# Join the encoded columns onto the original frame, side by side
df = pd.concat(objs=[df, df_cat], axis=1)
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요,원산지_국내산,원산지_미국,원산지_브라질
0,16000,11,1200.0,국내산,no,1.0,0.0,0.0
1,15000,12,1500.0,브라질,yes,0.0,0.0,1.0
2,14000,9,1600.0,미국,yes,0.0,1.0,0.0
3,14000,9,1800.0,국내산,yes,1.0,0.0,0.0
4,14000,11,1300.0,브라질,yes,0.0,0.0,1.0
5,13000,10,1400.0,국내산,yes,1.0,0.0,0.0
6,13000,10,1300.0,국내산,yes,1.0,0.0,0.0
7,12000,10,1000.0,국내산,no,1.0,0.0,0.0
8,9900,10,1000.0,국내산,no,1.0,0.0,0.0


In [19]:
# Drop the original categorical column now that its one-hot columns exist
df = df.drop(columns=["원산지"])
df

Unnamed: 0,가격,호수,칼로리,살찔까요,원산지_국내산,원산지_미국,원산지_브라질
0,16000,11,1200.0,no,1.0,0.0,0.0
1,15000,12,1500.0,yes,0.0,0.0,1.0
2,14000,9,1600.0,yes,0.0,1.0,0.0
3,14000,9,1800.0,yes,1.0,0.0,0.0
4,14000,11,1300.0,yes,0.0,0.0,1.0
5,13000,10,1400.0,yes,1.0,0.0,0.0
6,13000,10,1300.0,yes,1.0,0.0,0.0
7,12000,10,1000.0,no,1.0,0.0,0.0
8,9900,10,1000.0,no,1.0,0.0,0.0


### 여러 컬럼 인코딩

In [20]:
df = data[["가격", "호수", "칼로리", "원산지", "살찔까요"]].copy()

# One-hot encode every object-dtype column with a single encoder.
cols = df.select_dtypes(include=["object"]).columns
ohe = OneHotEncoder(sparse_output=False)
cat = ohe.fit_transform(df[cols])
# Reuse df's index so the column-wise concat that follows aligns rows by label
# even when df does not carry the default 0..n-1 index.
df_cat = pd.DataFrame(data=cat, columns=ohe.get_feature_names_out(), index=df.index)
df_cat

Unnamed: 0,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,1.0
5,1.0,0.0,0.0,0.0,1.0
6,1.0,0.0,0.0,0.0,1.0
7,1.0,0.0,0.0,1.0,0.0
8,1.0,0.0,0.0,1.0,0.0


In [21]:
# Attach the one-hot columns to the right of the original columns.
df = pd.concat(objs=[df, df_cat], axis=1)
df.head()

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,국내산,no,1.0,0.0,0.0,1.0,0.0
1,15000,12,1500.0,브라질,yes,0.0,0.0,1.0,0.0,1.0
2,14000,9,1600.0,미국,yes,0.0,1.0,0.0,0.0,1.0
3,14000,9,1800.0,국내산,yes,1.0,0.0,0.0,0.0,1.0
4,14000,11,1300.0,브라질,yes,0.0,0.0,1.0,0.0,1.0


In [22]:
# Remove the original categorical columns; their one-hot versions remain.
df = df.drop(columns=cols)
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,1.0,0.0,0.0,1.0,0.0
1,15000,12,1500.0,0.0,0.0,1.0,0.0,1.0
2,14000,9,1600.0,0.0,1.0,0.0,0.0,1.0
3,14000,9,1800.0,1.0,0.0,0.0,0.0,1.0
4,14000,11,1300.0,0.0,0.0,1.0,0.0,1.0
5,13000,10,1400.0,1.0,0.0,0.0,0.0,1.0
6,13000,10,1300.0,1.0,0.0,0.0,0.0,1.0
7,12000,10,1000.0,1.0,0.0,0.0,1.0,0.0
8,9900,10,1000.0,1.0,0.0,0.0,1.0,0.0


### 판다스를 이용한 원핫 인코딩

In [23]:
# Start again from the un-encoded columns of `data`.
df = data.loc[:, ["가격", "호수", "칼로리", "원산지", "살찔까요"]].copy()
df

Unnamed: 0,가격,호수,칼로리,원산지,살찔까요
0,16000,11,1200.0,국내산,no
1,15000,12,1500.0,브라질,yes
2,14000,9,1600.0,미국,yes
3,14000,9,1800.0,국내산,yes
4,14000,11,1300.0,브라질,yes
5,13000,10,1400.0,국내산,yes
6,13000,10,1300.0,국내산,yes
7,12000,10,1000.0,국내산,no
8,9900,10,1000.0,국내산,no


In [24]:
# get_dummies() one-hot encodes the string columns here and leaves the numeric columns as they are.
df = pd.get_dummies(df)
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,True,False,False,True,False
1,15000,12,1500.0,False,False,True,False,True
2,14000,9,1600.0,False,True,False,False,True
3,14000,9,1800.0,True,False,False,False,True
4,14000,11,1300.0,False,False,True,False,True
5,13000,10,1400.0,True,False,False,False,True
6,13000,10,1300.0,True,False,False,False,True
7,12000,10,1000.0,True,False,False,True,False
8,9900,10,1000.0,True,False,False,True,False


In [30]:
# Q. One-hot encode every categorical column of the following df.
df = data[['가격', '호수', '칼로리', '원산지', '살찔까요']].copy()
# Assign the result back: get_dummies() returns a new frame, so without the
# assignment df would still hold the raw string columns for any later cell.
df = pd.get_dummies(df)
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,16000,11,1200.0,True,False,False,True,False
1,15000,12,1500.0,False,False,True,False,True
2,14000,9,1600.0,False,True,False,False,True
3,14000,9,1800.0,True,False,False,False,True
4,14000,11,1300.0,False,False,True,False,True
5,13000,10,1400.0,True,False,False,False,True
6,13000,10,1300.0,True,False,False,False,True
7,12000,10,1000.0,True,False,False,True,False
8,9900,10,1000.0,True,False,False,True,False


### 수치형 데이터

In [25]:
# StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit on the price column, then transform it; double brackets keep the input 2-D.
scaler.fit(df[["가격"]]).transform(df[["가격"]])

array([[ 1.54246993],
       [ 0.94150762],
       [ 0.34054531],
       [ 0.34054531],
       [ 0.34054531],
       [-0.260417  ],
       [-0.260417  ],
       [-0.86137931],
       [-2.12340016]])

In [27]:
# Apply to several columns at once
cols = ["가격", "호수", "칼로리"]
scaler = StandardScaler()
# Write the standardized values back into the same columns of df.
df[cols] = scaler.fit_transform(df[cols])
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.54247,0.848875,-0.57792,True,False,False,True,False
1,0.941508,1.940285,0.622376,False,False,True,False,True
2,0.340545,-1.333946,1.022475,False,True,False,False,True
3,0.340545,-1.333946,1.822672,True,False,False,False,True
4,0.340545,0.848875,-0.177822,False,False,True,False,True
5,-0.260417,-0.242536,0.222277,True,False,False,False,True
6,-0.260417,-0.242536,-0.177822,True,False,False,False,True
7,-0.861379,-0.242536,-1.378118,True,False,False,True,False
8,-2.1234,-0.242536,-1.378118,True,False,False,True,False


In [28]:
# MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Rebuild df from the raw data first: these columns were already overwritten
# with standardized values by the StandardScaler example, and stacking one
# scaler on top of another hides what MinMaxScaler does to the original units.
df = pd.get_dummies(data[["가격", "호수", "칼로리", "원산지", "살찔까요"]])
scaler = MinMaxScaler()
cols = ["가격", "호수", "칼로리"]
# Rescale each numeric column to the [0, 1] range and write it back.
df[cols] = scaler.fit_transform(df[cols])
df


Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.0,0.666667,0.25,True,False,False,True,False
1,0.836066,1.0,0.625,False,False,True,False,True
2,0.672131,0.0,0.75,False,True,False,False,True
3,0.672131,0.0,1.0,True,False,False,False,True
4,0.672131,0.666667,0.375,False,False,True,False,True
5,0.508197,0.333333,0.5,True,False,False,False,True
6,0.508197,0.333333,0.375,True,False,False,False,True
7,0.344262,0.333333,0.0,True,False,False,True,False
8,0.0,0.333333,0.0,True,False,False,True,False


In [32]:
# Q. Standardize all numeric columns ('가격', '호수', '칼로리') of the following df.
from sklearn.preprocessing import StandardScaler

# Take the working columns from the raw table and one-hot encode the categorical ones.
df = pd.get_dummies(data.loc[:, ["가격", "호수", "칼로리", "원산지", "살찔까요"]].copy())
cols = ["가격", "호수", "칼로리"]
scaler = StandardScaler()
# Replace the numeric columns with their standardized values.
df[cols] = scaler.fit_transform(df[cols])
df

Unnamed: 0,가격,호수,칼로리,원산지_국내산,원산지_미국,원산지_브라질,살찔까요_no,살찔까요_yes
0,1.54247,0.848875,-0.57792,True,False,False,True,False
1,0.941508,1.940285,0.622376,False,False,True,False,True
2,0.340545,-1.333946,1.022475,False,True,False,False,True
3,0.340545,-1.333946,1.822672,True,False,False,False,True
4,0.340545,0.848875,-0.177822,False,False,True,False,True
5,-0.260417,-0.242536,0.222277,True,False,False,False,True
6,-0.260417,-0.242536,-0.177822,True,False,False,False,True
7,-0.861379,-0.242536,-1.378118,True,False,False,True,False
8,-2.1234,-0.242536,-1.378118,True,False,False,True,False
