## 데이터의 변환
- 일반적으로 데이터를 표준화하면 분석 시 정확도가 상승함.
- GPU는 실수 연산에 최적화되어 있어, 대용량 데이터 분석 시 정수를 실수·표준화 값 등으로 변환하는 것을 권장함.

In [1]:
import warnings
warnings.filterwarnings(action='ignore')
    
import numpy as np 
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# Load the mtcars dataset from the local data directory and preview the first rows.
df = pd.read_csv('./data/mtcars.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6.0,160.0,110,3.9,2.62,16.46,0,manual,4,4
1,Mazda RX4 Wag,21.0,6.0,160.0,110,3.9,2.875,17.02,0,manual,4,4
2,Datsun 710,22.8,4.0,108.0,93,3.85,2.32,18.61,1,manual,4,1
3,Hornet 4 Drive,21.4,6.0,258.0,110,3.08,3.215,0.1,1,auto,3,1
4,Hornet Sportabout,18.7,8.0,360.0,175,3.15,3.44,17.02,0,auto,3,2


In [11]:
# Standardization (z-score): z = (x - mu) / sigma, where x is a data value,
# mu is the column mean and sigma is the column standard deviation.
# After scaling, the column has mean 0 and variance 1.
from sklearn.preprocessing import StandardScaler

qsec = df[['qsec']]  # double brackets keep a one-column DataFrame (2-D input for the scaler)
scaler = StandardScaler()  # instantiate the scaler (Python has no `new` keyword)
print(qsec)

# Fit and transform once, then reuse the result rather than refitting the
# scaler for every print call.
qsec_scaled = scaler.fit_transform(qsec)
print(qsec_scaled)  # standardized values; the missing row stays NaN
print(type(qsec_scaled))  # the result is a numpy.ndarray, not a DataFrame


      qsec
0    16.46
1    17.02
2    18.61
3     0.10
4    17.02
5    20.22
6    15.84
7    20.00
8    22.90
9    18.30
10     NaN
11   17.40
12   17.60
13   18.00
14   17.98
15   17.82
16   17.42
17   19.47
18   18.52
19   19.90
20   20.01
21   16.87
22   17.30
23   15.41
24  100.00
25   18.90
26   16.70
27   16.90
28   14.50
29   15.50
30   14.60
31   18.60
[[-2.26190871e-01]
 [-1.89009983e-01]
 [-8.34428210e-02]
 [-1.31240394e+00]
 [-1.89009983e-01]
 [ 2.34522302e-02]
 [-2.67355425e-01]
 [ 8.84545304e-03]
 [ 2.01389334e-01]
 [-1.04025098e-01]
 [            nan]
 [-1.63780095e-01]
 [-1.50501207e-01]
 [-1.23943430e-01]
 [-1.25271319e-01]
 [-1.35894430e-01]
 [-1.62452207e-01]
 [-2.63436011e-02]
 [-8.94183207e-02]
 [ 2.20600887e-03]
 [ 9.50939746e-03]
 [-1.98969150e-01]
 [-1.70419540e-01]
 [-2.95905035e-01]
 [ 5.32040079e+00]
 [-6.41884329e-02]
 [-2.10256205e-01]
 [-1.96977316e-01]
 [-3.56323977e-01]
 [-2.89929535e-01]
 [-3.49684532e-01]
 [-8.41067654e-02]]
<class 'numpy.ndarray'>


In [22]:
# Summary statistics of the raw qsec column; count is 31 because the NaN row is excluded.
qsec.describe()

Unnamed: 0,qsec
count,31.0
mean,19.866774
std,15.310469
min,0.1
25%,16.785
50%,17.6
75%,18.755
max,100.0


In [20]:
# Wrap the scaled ndarray in a DataFrame so describe() can summarize it.
qsec_df = DataFrame(scaler.fit_transform(qsec))
des = qsec_df.describe()  # summary table indexed by statistic name; the single column is labelled 0
print(des)
print(type(des))
# NOTE(review): std prints as ~1.017 rather than exactly 1 -- presumably
# describe() reports the sample std (ddof=1) while StandardScaler divides by
# the population std (ddof=0); sqrt(31/30) ~= 1.0165 matches. Confirm.
print('mean: {0:.3f}, std: {1:.3f}'.format(des.loc['mean',0],des.loc['std',0]))

                  0
count  3.100000e+01
mean  -2.663640e-16
std    1.016530e+00
min   -1.312404e+00
25%   -2.046127e-01
50%   -1.505012e-01
75%   -7.381563e-02
max    5.320401e+00
<class 'pandas.core.frame.DataFrame'>
mean: -0.000, std: 1.017


In [23]:
# Min-max scaling: MinMaxScaler
# x_scaled = (x - min(col)) / (max(col) - min(col)), where min/max are taken
# over the column x belongs to; the column minimum maps to 0 and the maximum to 1.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()  # rebinds `scaler`, replacing the StandardScaler instance
qsec_df = DataFrame(scaler.fit_transform(qsec))
print(qsec_df)

           0
0   0.163764
1   0.169369
2   0.185285
3   0.000000
4   0.169369
5   0.201401
6   0.157558
7   0.199199
8   0.228228
9   0.182182
10       NaN
11  0.173173
12  0.175175
13  0.179179
14  0.178979
15  0.177377
16  0.173373
17  0.193894
18  0.184384
19  0.198198
20  0.199299
21  0.167868
22  0.172172
23  0.153253
24  1.000000
25  0.188188
26  0.166166
27  0.168168
28  0.144144
29  0.154154
30  0.145145
31  0.185185


In [24]:
qsec_df.describe()  # descriptive statistics of the min-max scaled values

Unnamed: 0,0
count,31.0
mean,0.197866
std,0.153258
min,0.0
25%,0.167017
50%,0.175175
75%,0.186737
max,1.0


In [26]:
# Convert a categorical column to a numeric one
df.head()  # the `am` column holds text; converting it to numbers makes analysis easier

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6.0,160.0,110,3.9,2.62,16.46,0,manual,4,4
1,Mazda RX4 Wag,21.0,6.0,160.0,110,3.9,2.875,17.02,0,manual,4,4
2,Datsun 710,22.8,4.0,108.0,93,3.85,2.32,18.61,1,manual,4,1
3,Hornet 4 Drive,21.4,6.0,258.0,110,3.08,3.215,0.1,1,auto,3,1
4,Hornet Sportabout,18.7,8.0,360.0,175,3.15,3.44,17.02,0,auto,3,2


In [29]:
# Inspect the distinct categories in `am` and its first five values.
print(df['am'].unique())
print(df['am'].head(5))

['manual' 'auto']
0    manual
1    manual
2    manual
3      auto
4      auto
Name: am, dtype: object


In [28]:
pd.get_dummies(df['am'])  # dummy (one-hot) encoding: one indicator column per category

Unnamed: 0,auto,manual
0,0,1
1,0,1
2,0,1
3,1,0
4,1,0
5,1,0
6,1,0
7,1,0
8,1,0
9,1,0


In [31]:
pd.get_dummies(df['am'], drop_first=True)  # drops the first category (`auto`), leaving manual: 1, auto: 0

Unnamed: 0,manual
0,1
1,1
2,1
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [34]:
# Store the single remaining dummy column (manual -> 1, auto -> 0) as `am_manual`.
# NOTE(review): this assigns a one-column DataFrame to a column; presumably it
# only works because `am` has exactly two categories -- confirm before reusing
# on columns with more levels.
df['am_manual'] = pd.get_dummies(df['am'], drop_first=True)
df

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,am_manual
0,Mazda RX4,21.0,6.0,160.0,110,3.9,2.62,16.46,0,manual,4,4,1
1,Mazda RX4 Wag,21.0,6.0,160.0,110,3.9,2.875,17.02,0,manual,4,4,1
2,Datsun 710,22.8,4.0,108.0,93,3.85,2.32,18.61,1,manual,4,1,1
3,Hornet 4 Drive,21.4,6.0,258.0,110,3.08,3.215,0.1,1,auto,3,1,0
4,Hornet Sportabout,18.7,8.0,360.0,175,3.15,3.44,17.02,0,auto,3,2,0
5,Valiant,18.1,6.0,225.0,105,2.76,3.46,20.22,1,auto,3,1,0
6,Duster 360,14.3,8.0,360.0,245,3.21,3.57,15.84,0,auto,3,4,0
7,Merc 240D,24.4,,146.7,62,3.69,3.19,20.0,1,auto,4,2,0
8,Merc 230,22.8,4.0,140.8,95,3.92,3.15,22.9,1,auto,4,2,0
9,Merc 280,19.2,6.0,167.6,123,3.92,3.44,18.3,1,auto,4,4,0


In [35]:
# Drop the original text column `am`; `am_manual` now carries the same information.
df.drop(columns='am',inplace=True)  # inplace=True modifies df directly instead of returning a new frame
df

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,gear,carb,am_manual
0,Mazda RX4,21.0,6.0,160.0,110,3.9,2.62,16.46,0,4,4,1
1,Mazda RX4 Wag,21.0,6.0,160.0,110,3.9,2.875,17.02,0,4,4,1
2,Datsun 710,22.8,4.0,108.0,93,3.85,2.32,18.61,1,4,1,1
3,Hornet 4 Drive,21.4,6.0,258.0,110,3.08,3.215,0.1,1,3,1,0
4,Hornet Sportabout,18.7,8.0,360.0,175,3.15,3.44,17.02,0,3,2,0
5,Valiant,18.1,6.0,225.0,105,2.76,3.46,20.22,1,3,1,0
6,Duster 360,14.3,8.0,360.0,245,3.21,3.57,15.84,0,3,4,0
7,Merc 240D,24.4,,146.7,62,3.69,3.19,20.0,1,4,2,0
8,Merc 230,22.8,4.0,140.8,95,3.92,3.15,22.9,1,4,2,0
9,Merc 280,19.2,6.0,167.6,123,3.92,3.44,18.3,1,4,4,0


In [36]:
# Reload the CSV so `am` is back to its original text values for the next approach.
df = pd.read_csv('./data/mtcars.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6.0,160.0,110,3.9,2.62,16.46,0,manual,4,4
1,Mazda RX4 Wag,21.0,6.0,160.0,110,3.9,2.875,17.02,0,manual,4,4
2,Datsun 710,22.8,4.0,108.0,93,3.85,2.32,18.61,1,manual,4,1
3,Hornet 4 Drive,21.4,6.0,258.0,110,3.08,3.215,0.1,1,auto,3,1
4,Hornet Sportabout,18.7,8.0,360.0,175,3.15,3.44,17.02,0,auto,3,2


In [37]:
# Encode the two transmission labels as integers with one dict-based replace().
df['am'] = df['am'].replace({'manual': 1, 'auto': 0})
df.head()

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6.0,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6.0,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4.0,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6.0,258.0,110,3.08,3.215,0.1,1,0,3,1
4,Hornet Sportabout,18.7,8.0,360.0,175,3.15,3.44,17.02,0,0,3,2


In [43]:
# Reload the CSV so `am` is back to its original text values for the next approach.
df = pd.read_csv('./data/mtcars.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6.0,160.0,110,3.9,2.62,16.46,0,manual,4,4
1,Mazda RX4 Wag,21.0,6.0,160.0,110,3.9,2.875,17.02,0,manual,4,4
2,Datsun 710,22.8,4.0,108.0,93,3.85,2.32,18.61,1,manual,4,1
3,Hornet 4 Drive,21.4,6.0,258.0,110,3.08,3.215,0.1,1,auto,3,1
4,Hornet Sportabout,18.7,8.0,360.0,175,3.15,3.44,17.02,0,auto,3,2


In [44]:
def proc(x):
    """Encode a transmission label as an integer flag.

    Args:
        x: A single value from the ``am`` column, expected to be
            ``'manual'`` or ``'auto'``.

    Returns:
        ``1`` for ``'manual'``, ``0`` for ``'auto'``, and ``np.nan`` for any
        other value (including missing values).
    """
    if x == 'manual':
        return 1
    if x == 'auto':
        return 0
    # Comparing against NaN with `==` is always False, which would silently
    # read as 0 ('auto'); return NaN itself so unrecognized or missing values
    # stay visibly missing.
    return np.nan
# Run proc on every value of `am` to encode it as 1 (manual) / 0 (auto).
df['am'] = df['am'].apply(proc)
df

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6.0,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6.0,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4.0,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6.0,258.0,110,3.08,3.215,0.1,1,0,3,1
4,Hornet Sportabout,18.7,8.0,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6.0,225.0,105,2.76,3.46,20.22,1,0,3,1
6,Duster 360,14.3,8.0,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4.0,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6.0,167.6,123,3.92,3.44,18.3,1,0,4,4


In [45]:
# Reload the CSV so `am` is back to its original text values for the next approach.
df = pd.read_csv('./data/mtcars.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6.0,160.0,110,3.9,2.62,16.46,0,manual,4,4
1,Mazda RX4 Wag,21.0,6.0,160.0,110,3.9,2.875,17.02,0,manual,4,4
2,Datsun 710,22.8,4.0,108.0,93,3.85,2.32,18.61,1,manual,4,1
3,Hornet 4 Drive,21.4,6.0,258.0,110,3.08,3.215,0.1,1,auto,3,1
4,Hornet Sportabout,18.7,8.0,360.0,175,3.15,3.44,17.02,0,auto,3,2


In [49]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
# Preview the integer codes for `am`; per the printed result, 'auto' -> 0 and 'manual' -> 1.
print(encoder.fit_transform(df['am']))

[1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1]


In [50]:
# Overwrite `am` with its label-encoded integer codes.
df['am'] = encoder.fit_transform(df['am'])
df

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6.0,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6.0,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4.0,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6.0,258.0,110,3.08,3.215,0.1,1,0,3,1
4,Hornet Sportabout,18.7,8.0,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6.0,225.0,105,2.76,3.46,20.22,1,0,3,1
6,Duster 360,14.3,8.0,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4.0,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6.0,167.6,123,3.92,3.44,18.3,1,0,4,4


In [51]:
# Derived variable: build a boolean mask that is True for rows whose `wt` is at least 3.
condition = df['wt'] >= 3
print(condition)

0     False
1     False
2     False
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17    False
18    False
19    False
20    False
21     True
22     True
23     True
24     True
25    False
26    False
27    False
28     True
29    False
30     True
31    False
Name: wt, dtype: bool


In [55]:
# Rows that do not satisfy the mask are left as NaN in the new column.
df.loc[condition,'wt_class'] = 1  # wt_class is 1 when wt >= 3
df

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,wt_class
0,Mazda RX4,21.0,6.0,160.0,110,3.9,2.62,16.46,0,1,4,4,
1,Mazda RX4 Wag,21.0,6.0,160.0,110,3.9,2.875,17.02,0,1,4,4,
2,Datsun 710,22.8,4.0,108.0,93,3.85,2.32,18.61,1,1,4,1,
3,Hornet 4 Drive,21.4,6.0,258.0,110,3.08,3.215,0.1,1,0,3,1,1.0
4,Hornet Sportabout,18.7,8.0,360.0,175,3.15,3.44,17.02,0,0,3,2,1.0
5,Valiant,18.1,6.0,225.0,105,2.76,3.46,20.22,1,0,3,1,1.0
6,Duster 360,14.3,8.0,360.0,245,3.21,3.57,15.84,0,0,3,4,1.0
7,Merc 240D,24.4,,146.7,62,3.69,3.19,20.0,1,0,4,2,1.0
8,Merc 230,22.8,4.0,140.8,95,3.92,3.15,22.9,1,0,4,2,1.0
9,Merc 280,19.2,6.0,167.6,123,3.92,3.44,18.3,1,0,4,4,1.0
