In [0]:
import pandas as pd
import numpy as np

## 데이터 변환

In [0]:
# Seed NumPy's global RNG so every random draw in the notebook is
# reproducible under Restart & Run All (the value 0 is arbitrary).
np.random.seed(0)

# Number of synthetic people to generate.
n_samples = 10

# Heights centred on 170.
# NOTE(review): .round() binds to the standard-normal draw, i.e. rounding
# happens BEFORE scaling, so heights always land on 170 + 3*k for an
# integer k. Confirm this is intended rather than rounding the final value.
height = 3*np.random.randn(n_samples).round() + 170

# Integer nationality codes drawn from {0, 1, 2} (upper bound 3 is exclusive).
nationality = np.random.randint(0, 3, n_samples)

In [5]:
# Show both generated arrays together as a tuple.
height, nationality

(array([170., 170., 164., 173., 167., 170., 170., 176., 173., 173.]),
 array([2, 1, 0, 2, 0, 2, 2, 2, 2, 2]))

In [6]:
# Pair each height with the nationality code at the same position.
list(zip(height, nationality))

[(170.0, 2),
 (170.0, 1),
 (164.0, 0),
 (173.0, 2),
 (167.0, 0),
 (170.0, 2),
 (170.0, 2),
 (176.0, 2),
 (173.0, 2),
 (173.0, 2)]

In [7]:
# Assemble the (height, nationality) pairs into a two-column DataFrame
# and preview its first rows.
df = pd.DataFrame(
    data=list(zip(height, nationality)),
    columns=["height", "nationality"],
)
df.head()

Unnamed: 0,height,nationality
0,170.0,2
1,170.0,1
2,164.0,0
3,173.0,2
4,167.0,0


In [0]:
# Convert the nationality codes into one-hot (dummy) columns.
# get_dummies appends its default separator '_' after the prefix, which is
# why the columns come out as nat__0, nat__1, ... (double underscore).
# Cast to int so the columns are numeric 0/1 on every pandas version
# (pandas >= 2.0 returns bool columns by default, older versions uint8).
nat = pd.get_dummies(df['nationality'], prefix='nat_').astype(int)

In [9]:
# Display the one-hot encoded columns.
nat

Unnamed: 0,nat__0,nat__1,nat__2
0,0,0,1
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,0,1


In [10]:
# Place the one-hot columns next to the original columns (column-wise
# concatenation), then preview the first rows.
new_df = pd.concat([df, nat], axis="columns")
new_df.head()

Unnamed: 0,height,nationality,nat__0,nat__1,nat__2
0,170.0,2,0,0,1
1,170.0,1,0,1,0
2,164.0,0,1,0,0
3,173.0,2,0,0,1
4,167.0,0,1,0,0


In [0]:
# Remove the original label column now that it has been one-hot encoded.
# Rebinding the name (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError on the already-missing column.
new_df = new_df.drop('nationality', axis=1, errors='ignore')

In [12]:
# Display the full frame.
new_df

Unnamed: 0,height,nat__0,nat__1,nat__2
0,170.0,0,0,1
1,170.0,0,1,0
2,164.0,1,0,0
3,173.0,0,0,1
4,167.0,1,0,0
5,170.0,0,0,1
6,170.0,0,0,1
7,176.0,0,0,1
8,173.0,0,0,1
9,173.0,0,0,1


In [13]:
# Show the raw nationality codes again.
nationality

array([2, 1, 0, 2, 0, 2, 2, 2, 2, 2])

In [14]:
# Wrapping the array in pd.Categorical changes its data type to categorical.
nat_categ = pd.Categorical(nationality)
nat_categ

[2, 1, 0, 2, 0, 2, 2, 2, 2, 2]
Categories (3, int64): [0, 1, 2]

In [0]:
# Caution: once assigned as a DataFrame column, the data is stored as a Series.
df['categ'] = nat_categ

In [16]:
# Display the frame, including the 'categ' column.
df

Unnamed: 0,height,nationality,categ
0,170.0,2,2
1,170.0,1,1
2,164.0,0,0
3,173.0,2,2
4,167.0,0,0
5,170.0,2,2
6,170.0,2,2
7,176.0,2,2
8,173.0,2,2
9,173.0,2,2


In [17]:
# Check the type of the column as read back from the DataFrame.
type(df.categ)

pandas.core.series.Series

In [18]:
# Check the type of the standalone nat_categ object.
type(nat_categ)

pandas.core.categorical.Categorical

## 표준 스케일링

In [19]:
# Draw a fresh sample of heights and weights for the scaling example.
# Note: this rebinds `height` to a new array.
# .round() is applied to the standard-normal draws before scaling, so the
# values land on 170 + 3*k and 70 + 4*k for integer k.
height = 170 + 3 * np.random.randn(n_samples).round()
weight = 70 + 4 * np.random.randn(n_samples).round()

# Two unnamed columns (0 = height, 1 = weight); preview the first rows.
X = pd.DataFrame(list(zip(height, weight)))
X.head()

Unnamed: 0,0,1
0,167.0,74.0
1,167.0,58.0
2,173.0,70.0
3,167.0,78.0
4,173.0,70.0


In [20]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler (default settings) on X and transform it in one step,
# then display the scaled array.
X_std = StandardScaler().fit_transform(X)
X_std

array([[-1.22474487,  0.65465367],
       [-1.22474487, -2.2549182 ],
       [ 0.81649658, -0.0727393 ],
       [-1.22474487,  1.38204664],
       [ 0.81649658, -0.0727393 ],
       [-0.20412415,  0.65465367],
       [ 0.81649658, -0.80013226],
       [-0.20412415,  0.65465367],
       [-0.20412415,  0.65465367],
       [ 1.83711731, -0.80013226]])

In [21]:
# Pull the underlying NumPy array out of the DataFrame and display it.
x = X.values
x

array([[167.,  74.],
       [167.,  58.],
       [173.,  70.],
       [167.,  78.],
       [173.,  70.],
       [170.,  74.],
       [173.,  66.],
       [170.,  74.],
       [170.,  74.],
       [176.,  66.]])

In [22]:
# Scale the raw NumPy array the same way as the DataFrame and display the result.
x_std = StandardScaler().fit_transform(x)
x_std

array([[-1.22474487,  0.65465367],
       [-1.22474487, -2.2549182 ],
       [ 0.81649658, -0.0727393 ],
       [-1.22474487,  1.38204664],
       [ 0.81649658, -0.0727393 ],
       [-0.20412415,  0.65465367],
       [ 0.81649658, -0.80013226],
       [-0.20412415,  0.65465367],
       [-0.20412415,  0.65465367],
       [ 1.83711731, -0.80013226]])