In [3]:
#Scaling

import pandas as pd
from sklearn.preprocessing import scale, minmax_scale

x = pd.DataFrame({'col': [-3, -1, 1, 3, 5, 7, 9]})

# Standardize the column: centre on mean 0 and divide by the standard deviation.
# scale() works on floats, so the integer column is cast explicitly to avoid a
# dtype-conversion warning.
# Note: describe() reports std ~1.08 rather than 1 for the scaled column because
# pandas uses the sample std (ddof=1) while scale() uses the population std.
x['scale'] = scale(x['col'].astype(float))
print(x.describe())

# Min-max normalization: linearly rescale the values into the [0, 1] range.
x['minmax_scale'] = minmax_scale(x['col'].astype(float))

print(x)
print(x.describe())

            col     scale
count  7.000000  7.000000
mean   3.000000  0.000000
std    4.320494  1.080123
min   -3.000000 -1.500000
25%    0.000000 -0.750000
50%    3.000000  0.000000
75%    6.000000  0.750000
max    9.000000  1.500000
   col  scale  minmax_scale
0   -3   -1.5      0.000000
1   -1   -1.0      0.166667
2    1   -0.5      0.333333
3    3    0.0      0.500000
4    5    0.5      0.666667
5    7    1.0      0.833333
6    9    1.5      1.000000
            col     scale  minmax_scale
count  7.000000  7.000000      7.000000
mean   3.000000  0.000000      0.500000
std    4.320494  1.080123      0.360041
min   -3.000000 -1.500000      0.000000
25%    0.000000 -0.750000      0.250000
50%    3.000000  0.000000      0.500000
75%    6.000000  0.750000      0.750000
max    9.000000  1.500000      1.000000


In [6]:

#minmaxscaler 객체 이용

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

dfTest = pd.DataFrame({
    'A': [14.00, 90.20, 90.95, 96.27, 91.21],
    'B': [103.02, 107.26, 110.35, 114.23, 114.68],
    'C': ['big', 'small', 'big', 'small', 'small'],
})

# Rescale only the numeric columns to [0, 1]; the categorical column C is left
# untouched. fit_transform learns each column's min/max and applies the
# scaling in a single call.
numeric_cols = ['A', 'B']
scaler = MinMaxScaler()
dfTest[numeric_cols] = scaler.fit_transform(dfTest[numeric_cols])

print(dfTest)

          A         B      C
0  0.000000  0.000000    big
1  0.926219  0.363636  small
2  0.935335  0.628645    big
3  1.000000  0.961407  small
4  0.938495  1.000000  small


In [7]:
#nominal attributes

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# fit() learns the distinct labels; classes_ exposes them in sorted order,
# and a label's position in classes_ is its integer code.
le.fit(['paris','paris','tokyo','amsterdam'])
print(le.classes_)
print(type(le.classes_))

# transform() replaces each label with its integer code.
data = le.transform(['paris','paris','tokyo','amsterdam'])
print(data)
print(type(data))

# inverse_transform() maps integer codes back to the original labels.
original = le.inverse_transform([2,2,1])
print(original)
# Report the type of the decoded array itself (previously this printed
# type(data) again, a copy-paste slip).
print(type(original))

['amsterdam' 'paris' 'tokyo']
<class 'numpy.ndarray'>
[1 1 2 0]
<class 'numpy.ndarray'>
['tokyo' 'tokyo' 'paris']
<class 'numpy.ndarray'>


In [8]:
#transform 예제

from sklearn import preprocessing

le = preprocessing.LabelEncoder()

# Lowercase letters 'a'..'z'. The list is deliberately NOT named `str`: that
# would shadow the built-in str type for every cell executed afterwards.
letters = [chr(code) for code in range(ord('a'), ord('z') + 1)]

print(letters)

# Fit on the full alphabet, then encode a few letters; each code is the
# letter's position in the sorted class list.
le.fit(letters)
data = le.transform(['q','a','z'])
print(data)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
[16  0 25]


In [10]:
# Label-encode every column of a DataFrame

df = pd.DataFrame({
    'A': ['a', 'b', 'b', 'c', 'a'],
    'B': ['x', 'y', 'x', 'y', 'x'],
})
le = preprocessing.LabelEncoder()

# fit_transform performs fit and transform in one step.
# DataFrame.apply invokes the given function once per column.
# NOTE: the single encoder is refit for each column, so after this call `le`
# only remembers the classes of the last column it processed.
data = df.apply(lambda column: le.fit_transform(column))
print(data)
print(type(data))

   A  B
0  0  0
1  1  1
2  1  0
3  2  1
4  0  0
<class 'pandas.core.frame.DataFrame'>


In [11]:
# One-hot encoding with pandas.get_dummies
df = pd.DataFrame({'country': ['russia','germany','australia','korea','germany']})
# dtype=int pins the indicator columns to 0/1 integers; without it the result
# dtype depends on the installed pandas version (newer releases emit
# True/False), so the printed table would not be reproducible.
a = pd.get_dummies(df, prefix=['country'], dtype=int)
print(a)

   country_australia  country_germany  country_korea  country_russia
0                  0                0              0               1
1                  0                1              0               0
2                  1                0              0               0
3                  0                0              1               0
4                  0                1              0               0


In [12]:
# One-hot encoding 2: encode a single column and splice it back into the frame

df = pd.DataFrame({'A':['a','b','a'], 'B':['b','a','c']})

# One-hot encode column B only. dtype=int keeps the indicators as 0/1 integers
# regardless of pandas version (newer releases default to True/False).
one_hot = pd.get_dummies(df['B'], dtype=int)

# Column B is now represented by the indicator columns, so drop it ...
df = df.drop('B',axis=1)
# ... and attach the indicator columns (aligned on the row index).
df=df.join(one_hot)
print(df)

   A  a  b  c
0  a  0  1  0
1  b  1  0  0
2  a  0  0  1


In [14]:
# One-hot encode multiple columns at once
df = pd.DataFrame({'A': ['a','b','b','c','a'],
'B': ['x','y','x','y','x']})

# One prefix per encoded column. dtype=int pins the indicators to 0/1 integers
# so the output does not depend on the pandas version (newer releases default
# to True/False).
a = pd.get_dummies(df, prefix=['A','B'], dtype=int)
print(a)

   A_a  A_b  A_c  B_x  B_y
0    1    0    0    1    0
1    0    1    0    0    1
2    0    1    0    1    0
3    0    0    1    0    1
4    1    0    0    1    0


In [15]:
# Only the non-numeric columns are detected and encoded automatically

df = pd.DataFrame({'A':['a','b','b','c','a'],
'B':[3,4,7,2,5]})
# The numeric column B passes through unchanged; only the string column A is
# expanded. dtype=int pins the indicators to 0/1 integers so the output does
# not depend on the pandas version (newer releases default to True/False).
a = pd.get_dummies(df, prefix=['A'], dtype=int)
print(a)

   B  A_a  A_b  A_c
0  3    1    0    0
1  4    0    1    0
2  7    0    1    0
3  2    0    0    1
4  5    1    0    0


In [18]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# 'australia' was previously misspelled 'asutralia'; the sort order of the
# labels (and therefore every encoded value below) is unaffected by the fix.
x1 = pd.DataFrame({'country' : ['russia','germany','australia','korea', 'germany']})

# Step 1: label-encode the whole frame (strings -> integer codes), then
# one-hot encode those codes.
# NOTE(review): the ColumnTransformer cell in this notebook feeds a string
# column straight into OneHotEncoder, so this step appears to be optional with
# the installed scikit-learn -- kept here to show the intermediate codes.
le = LabelEncoder()
x2 = x1.apply(le.fit_transform)
print(x2)
print(type(x2))
print()

# Step 2: one-hot encode the integer codes.
encoder = OneHotEncoder()
x2 = encoder.fit_transform(x2) # the result is a sparse matrix
print(x2)
print(type(x2))
print()

# Step 3: densify into a NumPy array so it can be wrapped in a DataFrame.
x3 = x2.toarray()
print(x3)
print(type(x3))

# Step 4: convert back to a DataFrame (columns are positional: 0..n-1).
x4 = pd.DataFrame(x3)
print(x4)
print(type(x4))





   country
0        3
1        1
2        0
3        2
4        1
<class 'pandas.core.frame.DataFrame'>

  (0, 3)	1.0
  (1, 1)	1.0
  (2, 0)	1.0
  (3, 2)	1.0
  (4, 1)	1.0
<class 'scipy.sparse.csr.csr_matrix'>

[[0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]]
<class 'numpy.ndarray'>
     0    1    2    3
0  0.0  0.0  0.0  1.0
1  0.0  1.0  0.0  0.0
2  1.0  0.0  0.0  0.0
3  0.0  0.0  1.0  0.0
4  0.0  1.0  0.0  0.0
<class 'pandas.core.frame.DataFrame'>


In [19]:
# Onehot encoding with Scikit-learn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

x1 = pd.DataFrame({
    'A': ['a', 'b', 'b', 'c', 'a'],
    'B': ['x', 'y', 'x', 'y', 'x'],
})

# Step 1: turn every column of the frame into integer codes (label encoding);
# the one-hot encoder below is then applied to those codes.
le = LabelEncoder()
x2 = x1.apply(lambda column: le.fit_transform(column))
print(x2)
print(type(x2))
print()

# Step 2: one-hot encode the integer codes; the result is a sparse matrix.
encoder = OneHotEncoder()
x2 = encoder.fit_transform(x2)
print(x2)
print(type(x2))
print()

# Step 3: densify into a NumPy array so it can be wrapped in a DataFrame.
x3 = x2.toarray()
print(x3)
print(type(x3))

# Step 4: convert back to a DataFrame (columns are positional: 0..n-1).
x4 = pd.DataFrame(data=x3)
print(x4)
print(type(x4))





   A  B
0  0  0
1  1  1
2  1  0
3  2  1
4  0  0
<class 'pandas.core.frame.DataFrame'>

  (0, 0)	1.0
  (0, 3)	1.0
  (1, 1)	1.0
  (1, 4)	1.0
  (2, 1)	1.0
  (2, 3)	1.0
  (3, 2)	1.0
  (3, 4)	1.0
  (4, 0)	1.0
  (4, 3)	1.0
<class 'scipy.sparse.csr.csr_matrix'>

[[1. 0. 0. 1. 0.]
 [0. 1. 0. 0. 1.]
 [0. 1. 0. 1. 0.]
 [0. 0. 1. 0. 1.]
 [1. 0. 0. 1. 0.]]
<class 'numpy.ndarray'>
     0    1    2    3    4
0  1.0  0.0  0.0  1.0  0.0
1  0.0  1.0  0.0  0.0  1.0
2  0.0  1.0  0.0  1.0  0.0
3  0.0  0.0  1.0  0.0  1.0
4  1.0  0.0  0.0  1.0  0.0
<class 'pandas.core.frame.DataFrame'>


In [21]:
#ColumnTransformer 활용
from sklearn.compose import ColumnTransformer

x = pd.DataFrame({
    'A': ['a', 'b', 'b', 'c', 'a'],
    'B': [3, 4, 5, 1, 7],
})

# One-hot encode only the column at position 0 ('A'); remainder='passthrough'
# keeps the remaining column(s) unchanged, placed after the encoded ones.
ct = ColumnTransformer(
    transformers=[('one_hot_encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)

x = ct.fit_transform(x)

print(x)
print(type(x))

# Wrap the transformed array in a DataFrame again (columns are positional).
x = pd.DataFrame(data=x)

print(x)
print(type(x))

[[1. 0. 0. 3.]
 [0. 1. 0. 4.]
 [0. 1. 0. 5.]
 [0. 0. 1. 1.]
 [1. 0. 0. 7.]]
<class 'numpy.ndarray'>
     0    1    2    3
0  1.0  0.0  0.0  3.0
1  0.0  1.0  0.0  4.0
2  0.0  1.0  0.0  5.0
3  0.0  0.0  1.0  1.0
4  1.0  0.0  0.0  7.0
<class 'pandas.core.frame.DataFrame'>
