# 스케일링과 인코딩

In [None]:
import pandas as pd

In [None]:
# Load the CSV behind the short link into a DataFrame (needs network access).
# Presumably the Iris flower measurements, judging by the URL -- TODO confirm.
df = pd.read_csv('https://bit.ly/Iris_Data')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Print a summary of the columns: dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics of the numeric columns, rounded to 3 decimals.
df.describe().round(3)

In [None]:
# Summary of the object-dtype columns only (count, unique, top, freq).
df.describe(include=object)

In [None]:
# Count how many rows carry each Species value.
df['Species'].value_counts()

In [None]:
# Move Species into the index so that only the remaining columns are passed
# to the scalers. Re-running this cell raises KeyError, because the Species
# column no longer exists after the first run.
df = df.set_index('Species')

In [None]:
# Confirm that Species is now the index.
df.head()

## 데이터 표준화

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardizer with default settings: centers each column on its mean and
# divides by its standard deviation.
scaler = StandardScaler()

In [None]:
# Learn each column's mean/std and apply them in one step. By default the
# result is a NumPy array, so the column labels and the index are dropped.
scaled = scaler.fit_transform(X=df)

In [None]:
# Re-attach the column labels to the scaled array and inspect the result.
# Means should be ~0; describe() reports the sample std (ddof=1), so the
# std row comes out slightly above 1.
(
    pd.DataFrame(data=scaled, columns=df.columns)
    .describe()
    .round(3)
)

In [None]:
# With the population standard deviation (ddof=0), which is what
# StandardScaler divides by, every column comes out as exactly 1.
(
    pd.DataFrame(data=scaled, columns=df.columns)
    .std(ddof=0)
)
# Sepal.Length    1.0
# Sepal.Width     1.0
# Petal.Length    1.0
# Petal.Width     1.0
# dtype: float64

In [None]:
from scipy.stats import zscore

In [None]:
# Same standardization via SciPy, but dividing by the sample standard
# deviation (ddof=1). This rebinds `scaled` to a labelled DataFrame.
scaled = pd.DataFrame(
    data=zscore(a=df, ddof=1),
    columns=df.columns,
)

In [None]:
# Because the z-scores used ddof=1, the std row that describe() reports
# (also ddof=1) is 1 here.
scaled.describe().round(3)


## 최소-최대 정규화

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler with default settings: linearly maps each column onto the
# [0, 1] range. Rebinds `scaler`, replacing the StandardScaler instance.
scaler = MinMaxScaler()

In [None]:
# Learn each column's min/max and rescale in one step. Rebinds `scaled`
# to the transformer output (a NumPy array by default).
scaled = scaler.fit_transform(X=df)

In [None]:
# Re-attach the column labels and inspect the result: the min row should
# be 0 and the max row 1 for every column.
(
    pd.DataFrame(data=scaled, columns=df.columns)
    .describe()
    .round(3)
)

## 원-핫 인코딩

In [None]:
# Turn the index back into an ordinary column for the encoding examples,
# then preview the result.
df = df.reset_index()
df.head()

In [None]:
# One-hot encode the data. No `columns=` is given, so pandas chooses which
# columns to expand based on dtype (non-numeric ones); dtype=int makes the
# indicator columns 0/1 integers instead of booleans.
pd.get_dummies(data=df, dtype=int)

## 레이블 인코딩

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Label encoder: maps each distinct label to an integer code.
le = LabelEncoder()

In [None]:
# Fit on the Species column and encode it in one step; the integer codes
# follow the sorted order of the labels.
encoded = le.fit_transform(y=df['Species'])

In [None]:
# Distinct codes, listed in order of first appearance.
pd.Series(data=encoded).unique()
# array([0, 1, 2])

## 오디널 인코딩

In [None]:
# Distinct Species labels, listed in order of first appearance.
df['Species'].unique()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Ordinal encoder with an explicit, hand-picked category order: a label's
# position in the list becomes its code (versicolor -> 0, virginica -> 1,
# setosa -> 2). The outer list holds one category list per input column.
oe = OrdinalEncoder(categories=[['versicolor', 'virginica', 'setosa']])

In [None]:
# The double brackets keep Species as a one-column DataFrame, since the
# encoder works on 2-D input. Rebinds `encoded` to a 2-D float array.
encoded = oe.fit_transform(X=df[['Species']])

In [None]:
# Take the single encoded column and list its distinct codes in order of
# first appearance.
pd.Series(data=encoded[:, 0]).unique()
# array([2., 0., 1.])

## 타겟 인코딩

In [None]:
import category_encoders as ce

In [None]:
# Target encoder with default settings: replaces each category with a
# value derived from the target variable for that category.
te = ce.TargetEncoder()

In [None]:
# Encode Species using Sepal.Length as the target. Rebinds `encoded`; the
# result is indexed by column name below, i.e. it is DataFrame-like.
encoded = te.fit_transform(X=df['Species'], y=df['Sepal.Length'])

In [None]:
# Distinct encoded values, listed in order of first appearance.
encoded['Species'].unique()
# array([5.04571126, 5.9316052 , 6.55268353])

In [None]:
# Plain per-species means of Sepal.Length for comparison. The target-encoded
# values are close to, but not equal to, these means -- presumably because
# the encoder shrinks each group mean towards the overall mean.
(
    df
    .groupby(by='Species')['Sepal.Length']
    .mean()
)
# Species
# setosa        5.006
# versicolor    5.936
# virginica     6.588
# Name: Sepal.Length, dtype: float64