# 스케일링

In [None]:
import pandas as pd

In [None]:
# Load the Iris dataset from the shortened URL into a DataFrame
# (requires network access when the cell runs).
df = pd.read_csv(filepath_or_buffer='https://bit.ly/Iris_Data')

In [None]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics for the numeric columns, rounded for readability.
df.describe().round(decimals=3)

In [None]:
# Summary (count / unique / top / freq) restricted to object-dtype columns.
df.describe(include=[object])

In [None]:
# Number of rows per species, most frequent first.
df['Species'].value_counts(sort=True)

In [None]:
# Move the label column into the index so that only the numeric
# measurement columns remain as data for the scalers below.
df = df.set_index(keys='Species')

In [None]:
# Confirm that Species is now the index rather than a column.
df.head(n=5)

## 데이터 표준화

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Standardizer with its defaults spelled out: centre each column on its
# mean and scale it to unit variance.
scaler = StandardScaler(with_mean=True, with_std=True)

In [18]:
# Learn each column's mean and standard deviation, then apply the
# transformation to the same data (by default the result is a NumPy array).
scaled = scaler.fit(X=df).transform(X=df)

In [20]:
# Wrap the scaled array in a DataFrame to reuse the column names.
# describe() reports the sample standard deviation (ddof=1), which is why
# std is shown as 1.003 instead of exactly 1.
(
    pd.DataFrame(data=scaled, columns=df.columns)
    .describe()
    .round(decimals=3)
)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,-0.0,-0.0,-0.0,-0.0
std,1.003,1.003,1.003,1.003
min,-1.87,-2.434,-1.568,-1.447
25%,-0.901,-0.592,-1.227,-1.184
50%,-0.053,-0.132,0.336,0.133
75%,0.675,0.559,0.763,0.791
max,2.492,3.091,1.786,1.712


In [21]:
# With the population standard deviation (ddof=0) every column comes out as
# exactly 1, unlike the ddof=1 figure that describe() reports.
(
    pd.DataFrame(data=scaled, columns=df.columns)
    .std(ddof=0)
)

Sepal.Length    1.0
Sepal.Width     1.0
Petal.Length    1.0
Petal.Width     1.0
dtype: float64

In [22]:
from scipy.stats import zscore

In [27]:
# Standardize with SciPy instead; ddof=1 divides by the sample standard
# deviation, so describe() will report std == 1 for every column.
# The original index is passed explicitly so the Species row labels are kept
# rather than being replaced by a default 0..n-1 range index.
scaled = pd.DataFrame(
    data=zscore(a=df, ddof=1),
    index=df.index,
    columns=df.columns,
)

In [28]:
# Summary of the z-scored data; std is 1 here because the scores were
# computed with ddof=1, the same convention describe() uses.
scaled.describe().round(decimals=3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,-0.0,-0.0,-0.0,-0.0
std,1.0,1.0,1.0,1.0
min,-1.864,-2.426,-1.562,-1.442
25%,-0.898,-0.59,-1.222,-1.18
50%,-0.052,-0.132,0.335,0.132
75%,0.672,0.557,0.76,0.788
max,2.484,3.08,1.78,1.706



## 최소-최대 정규화

In [29]:
from sklearn.preprocessing import MinMaxScaler

In [30]:
# Min-max scaler with the default target range written out explicitly.
scaler = MinMaxScaler(feature_range=(0, 1))

In [31]:
# Learn each column's minimum and maximum, then rescale the same data.
scaled = scaler.fit(X=df).transform(X=df)

In [32]:
# After min-max scaling every column should show min 0 and max 1.
(
    pd.DataFrame(data=scaled, columns=df.columns)
    .describe()
    .round(decimals=3)
)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,0.429,0.441,0.467,0.458
std,0.23,0.182,0.299,0.318
min,0.0,0.0,0.0,0.0
25%,0.222,0.333,0.102,0.083
50%,0.417,0.417,0.568,0.5
75%,0.583,0.542,0.695,0.708
max,1.0,1.0,1.0,1.0


## 원-핫 인코딩

In [33]:
# Turn the Species index back into an ordinary column so that the
# categorical encoders below can work on it, then preview the result.
df = df.reset_index(drop=False)
df.head(n=5)

Unnamed: 0,Species,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


In [None]:
# One-hot encode the categorical column (here Species); dtype=int makes the
# indicator columns hold 0/1 integers.
pd.get_dummies(data=df, dtype=int)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0,0,1
146,6.3,2.5,5.0,1.9,0,0,1
147,6.5,3.0,5.2,2.0,0,0,1
148,6.2,3.4,5.4,2.3,0,0,1


## 레이블 인코딩

In [36]:
from sklearn.preprocessing import LabelEncoder

In [37]:
# Encoder that maps each distinct label to an integer code.
le = LabelEncoder()

In [41]:
# Fit on the species labels and convert them to integer codes in one step.
encoded = le.fit_transform(y=df.loc[:, 'Species'])

In [40]:
# Distinct integer codes, in order of first appearance.
pd.unique(encoded)

array([0, 1, 2])

## 오디널 인코딩

In [45]:
# Distinct species names, in order of first appearance.
pd.unique(df['Species'])

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [42]:
from sklearn.preprocessing import OrdinalEncoder

In [46]:
# Fix the category order explicitly: the position in the list becomes the
# code, i.e. versicolor -> 0, virginica -> 1, setosa -> 2.
oe = OrdinalEncoder(
    categories=[
        ['versicolor', 'virginica', 'setosa'],
    ],
)

In [47]:
# The encoder works on 2-D input, hence the single-column DataFrame.
encoded = oe.fit_transform(X=df.loc[:, ['Species']])

In [48]:
# Take the first (only) column of the 2-D result and list its distinct
# codes in order of first appearance; the codes are floats.
pd.unique(encoded[:, 0])

array([2., 0., 1.])

## 타겟 인코딩

In [49]:
import category_encoders as ce

In [50]:
# Target encoder from category_encoders, created with its default settings.
te = ce.TargetEncoder()

In [51]:
# Replace each species with a value derived from that group's Sepal.Length.
# NOTE(review): fitting and transforming on the same rows leaks the target;
# fine for a demo, but fit on training data only in a real pipeline.
encoded = te.fit_transform(
    X=df['Species'],
    y=df['Sepal.Length'],
)

In [53]:
# Distinct encoded values. They are close to, but not equal to, the raw
# per-species means of Sepal.Length — presumably because of the encoder's
# smoothing; confirm against the category_encoders documentation.
pd.unique(encoded['Species'])

array([5.04571126, 5.9316052 , 6.55268353])

In [54]:
# Raw per-species mean of Sepal.Length, for comparison with the
# target-encoded values.
(
    df
    .groupby(by='Species')['Sepal.Length']
    .mean()
)

Species
setosa        5.006
versicolor    5.936
virginica     6.588
Name: Sepal.Length, dtype: float64