## 데이타 전처리(Data Preprocessing) 
* 1) 결손 데이타(Missing Data) 처리
* 2) 피쳐 스케일링(Feature Scaling)과 정규화
* 3) 인코딩 - 레이블 인코딩(Label Encoding), 원-핫 인코딩(One-Hot Encoding)

In [1]:
import sklearn as sk

# 경고 무시
import warnings
warnings.filterwarnings(action='ignore')

### 데이타 로드

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# 데이타 로드 
path = "../data/DataPreprocess.csv"
df1 = pd.read_csv(path)
df1.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [5]:
df1.shape

(10, 4)

In [6]:
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### 데이타와 레이블 나누기 

In [7]:
# 데이타와 레이블 나누기 - 종속변수(반응변수)와 독립변수(설명변수) 나누기
x = df1.values[:, :-1]   # 데이타  
y = df1.values[:, -1]    # 레이블(정답)
x, y 

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

## 데이타 전처리- 결손 데이타(missing data) 처리, 누락 데이타 처리 

### (1) 판다스 데이타프레임의 메소드 이용하여 결손 데이타 처리
- isna(), fillna(), dropna()

In [8]:
df_new = pd.DataFrame(x)
df_new

Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [9]:
df_new.isna()

Unnamed: 0,0,1,2
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,True
5,False,False,False
6,False,True,False
7,False,False,False
8,False,False,False
9,False,False,False


In [10]:
df_new.isna().sum()

0    0
1    1
2    1
dtype: int64

In [11]:
df_new.isna().sum().sum()

2

- fillna()로 결측치 대체하기 : 0으로 대체

In [12]:
df_new = df_new.fillna(0)
df_new

Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,0.0
5,France,35.0,58000.0
6,Spain,0.0,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


- fillna()로 결측치 대체하기 : 각 컬럼의 mean() 값으로 대체

In [13]:
df_new = pd.DataFrame(x)
df_new

Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,Spain,,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [14]:
df_new.loc[:, 1].mean()

38.77777777777778

In [15]:
df_new.loc[:, 2].mean()

63777.77777777778

In [16]:
df_new.loc[:, 1] = df_new.loc[:, 1].fillna(df_new.loc[:, 1].mean())

In [17]:
df_new.loc[:, 2] = df_new.loc[:, 2].fillna(df_new.loc[:, 2].mean())

In [18]:
df_new

Unnamed: 0,0,1,2
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778
5,France,35.0,58000.0
6,Spain,38.777778,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


### (2) 사이킷런의 SimpleImputer 클래스 이용하여 결손 데이타 처리
(예) strategy : string, optional (default='mean')
#### str 클래스의 결측치 대치값
- strategy='mean' 평균값으로 대치 (디폴트)
- strategy='median' 중앙값으로 대치
- strategy='most_frequent' 최빈값 (mode)으로 대치
- strategy='constant', fill_value=1 특정값으로 대치, 예) transformer = SimpleImputer(strategy='constant', fill_value=1)

In [19]:
x[:, 1:3]

array([[44.0, 72000.0],
       [27.0, 48000.0],
       [30.0, 54000.0],
       [38.0, 61000.0],
       [40.0, nan],
       [35.0, 58000.0],
       [nan, 52000.0],
       [48.0, 79000.0],
       [50.0, 83000.0],
       [37.0, 67000.0]], dtype=object)

In [20]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
imputer = imputer.fit(x[:, 1:3]) 
x[:, 1:3] = imputer.transform(x[:, 1:3]) 
#트랜스포머의 transform() 함수는 결과를 넘파이 배열로 리턴
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [21]:
df2 = pd.DataFrame(x[:, 1:3])
df2.describe

<bound method NDFrame.describe of          0        1
0       44    72000
1       27    48000
2       30    54000
3       38    61000
4       40  63777.8
5       35    58000
6  38.7778    52000
7       48    79000
8       50    83000
9       37    67000>

## 데이타 전처리- 피처 스케일링(Feature Scaling)
- 1) 표준화 - 평균이 0이고 분산이 1인 가우시안 정규분포를 가진 값으로 변환 -> "StandardScaler"
       (xi – mean(x)) / stdev(x)
- 2) 정규화 - 서로 다른 피처의 크기를 통일하기 위해 크기를 변환해 주는 개념 -> "MinMaxScaler"
       (xi - min(x)) / (max(x)-min(x))

### 1) 표준화 - 사이킷런의 StandardScaler 클래스 사용

In [30]:
x# 데이타와 레이블 나누기 - 종속변수(반응변수)와 독립변수(설명변수) 나누기
x = df1.values[:, :-1]   # 데이타  
y = df1.values[:, -1]    # 레이블(정답)
x, y 

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

In [31]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
imputer = imputer.fit(x[:, 1:3]) 
x[:, 1:3] = imputer.transform(x[:, 1:3]) 
#트랜스포머의 transform() 함수는 결과를 넘파이 배열로 리턴
x # 결측치 처리를 한 데이타

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### 표준화

In [32]:
from sklearn.preprocessing import StandardScaler # 표준화 지원 클래스 

sc_x = StandardScaler()
sc_x.fit_transform(x[:, 1:3])
x[:, 1:3] = sc_x.transform(x[:, 1:3])
x

array([['France', 0.758874361590019, 0.7494732544921677],
       ['Spain', -1.7115038793306814, -1.4381784072687531],
       ['Germany', -1.2755547779917342, -0.8912654918285229],
       ['Spain', -0.1130238410878753, -0.253200423814921],
       ['Germany', 0.17760889313808945, 6.632191985654332e-16],
       ['France', -0.5489729424268225, -0.5266568815350361],
       ['Spain', 0.0, -1.0735697969752662],
       ['France', 1.3401398300419485, 1.3875383225057696],
       ['Germany', 1.6307725642679132, 1.7521469327992565],
       ['France', -0.2583402082008577, 0.29371249162530916]], dtype=object)

In [33]:
print("평균 : ", x[:, 1].mean(), x[:, 2].mean())
print("표준편차 : ", x[:, 1].var(), x[:, 2].var())

평균 :  -8.881784197001253e-17 4.274358644806853e-16
표준편차 :  1.0 1.0000000000000002


### 2) 정규화 - 사이킷런의 MinMaxScaler 사용

In [34]:
x# 데이타와 레이블 나누기 - 종속변수(반응변수)와 독립변수(설명변수) 나누기
x = df1.values[:, :-1]   # 데이타  
y = df1.values[:, -1]    # 레이블(정답)
x, y 

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

In [35]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
imputer = imputer.fit(x[:, 1:3]) 
x[:, 1:3] = imputer.transform(x[:, 1:3]) 
#트랜스포머의 transform() 함수는 결과를 넘파이 배열로 리턴
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### 정규화

In [36]:
from sklearn.preprocessing import MinMaxScaler # 정규화 지원 모듈 

mmsc_x = MinMaxScaler()
mmsc_x.fit_transform(x[:, 1:3])
x[:, 1:3] = mmsc_x.transform(x[:, 1:3])
x

array([['France', 0.7391304347826089, 0.6857142857142855],
       ['Spain', 0.0, 0.0],
       ['Germany', 0.1304347826086958, 0.17142857142857149],
       ['Spain', 0.4782608695652175, 0.37142857142857144],
       ['Germany', 0.5652173913043479, 0.45079365079365075],
       ['France', 0.34782608695652173, 0.2857142857142856],
       ['Spain', 0.5120772946859904, 0.11428571428571432],
       ['France', 0.9130434782608696, 0.8857142857142857],
       ['Germany', 1.0, 1.0],
       ['France', 0.43478260869565233, 0.5428571428571427]], dtype=object)

In [37]:
print("최소값 : ", x[:, 1].min(), x[:, 2].min())
print("최대값 : ", x[:, 1].max(), x[:, 2].max())

최소값 :  0.0 0.0
최대값 :  1.0 1.0


## 데이타 인코딩 - 레이블 인코딩, 원-핫 인코딩
- 카테고리 피쳐를 코드형 숫자 값을 변환하는 것
- (예) 상품 구분 – TV : 1, 선풍기 : 2, 냉장고 : 3, 전자레인지 : 4

### 데이타 로드

In [38]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# 데이타 로드 
path = "../data/DataPreprocess.csv"
df1 = pd.read_csv(path)
df1.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


### 데이타와 레이블 나누기 

In [39]:
# 데이타와 레이블 나누기 - 종속변수(반응변수)와 독립변수(설명변수) 나누기
x = df1.values[:, :-1]   # 데이타  
y = df1.values[:, -1]    # 레이블(정답)
x, y 

(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

### 1) 레이블 인코딩
- 'France': 0, 'Germany': 1, 'Spain':2

In [40]:
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### LabelEncoder

In [41]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(x[:, 0])
new_x = le.transform(x[:, 0])
new_x 

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

In [42]:
x[:, 0] = new_x
pd.DataFrame(x, columns=['Country', 'Age', 'Salary'])

Unnamed: 0,Country,Age,Salary
0,0,44.0,72000.0
1,2,27.0,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
4,1,40.0,
5,0,35.0,58000.0
6,2,,52000.0
7,0,48.0,79000.0
8,1,50.0,83000.0
9,0,37.0,67000.0


In [44]:
print('인코딩 클래스:',le.classes_)

인코딩 클래스: ['France' 'Germany' 'Spain']


In [47]:
print('디코딩 원본 값:',le.inverse_transform([0, 2, 1, 2, 1, 0, 2, 0, 1, 0]))

디코딩 원본 값: ['France' 'Spain' 'Germany' 'Spain' 'Germany' 'France' 'Spain' 'France'
 'Germany' 'France']


### 2) 원핫 인코딩(One-Hot Encoding)
- 카테고리형 데이타에 적용, 피처값의 유형에 따라 새로운 피쳐를 추가해 
- 고유 값에 해당하는 컬럼에만 1을 표시하고, 나머지 컬럼에는 0을 표시하는 방식 

### 원-핫 인코딩 순서
- 1) 레이블 인코딩
- 2) 2차원 데이타 형태로 변환
- 3) 원-핫 인코딩

In [53]:
x

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, nan],
       [0, 35.0, 58000.0],
       [2, nan, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [54]:
x[:, 0]

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0], dtype=object)

### OneHotEncoder

In [61]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# 1) 레이블 인코딩 진행
le = LabelEncoder()
le.fit(x[:, 0])
new_x = le.transform(x[:, 0])
new_x 

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0])

In [63]:
# # 2) 2차원 데이타 형태로 변환
new_x = new_x.reshape(-1,1)
new_x

array([[0],
       [2],
       [1],
       [2],
       [1],
       [0],
       [2],
       [0],
       [1],
       [0]])

In [65]:
# 3) 원-핫 인코딩 진행
ohe = OneHotEncoder()
ohe.fit(new_x)
new_ohe = ohe.transform(new_x)
new_ohe.toarray() 

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [66]:
new_ohe.toarray().shape

(10, 3)

In [67]:
pd.DataFrame(new_ohe.toarray(), columns = ['French', 'Germany', 'Spain'])

Unnamed: 0,French,Germany,Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,0.0,1.0,0.0
3,0.0,0.0,1.0
4,0.0,1.0,0.0
5,1.0,0.0,0.0
6,0.0,0.0,1.0
7,1.0,0.0,0.0
8,0.0,1.0,0.0
9,1.0,0.0,0.0


### 3) 판다스의 get_dummies() 함수로 원-핫 인코딩 구현
- 숫자형 값으로 변환 없이도 바로 원-핫 인코딩 가능

In [68]:
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [69]:
df1.iloc[:, 0]

0     France
1      Spain
2    Germany
3      Spain
4    Germany
5     France
6      Spain
7     France
8    Germany
9     France
Name: Country, dtype: object

In [70]:
import pandas as pd

pd.get_dummies(df1.iloc[:, 0])

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0
5,1,0,0
6,0,0,1
7,1,0,0
8,0,1,0
9,1,0,0


### [실습 문제]

### 타이타닉 데이타 전처리 

In [1]:
import pandas as pd

# 데이타 불러오기
path = "../data/TitanicData.csv"
df = pd.read_csv(path)
df.shape

(418, 11)

In [85]:
df.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### 1) 결손데이타 처리 

In [94]:
# 1) 결손데이타 처리  - null 합계를 구해봄다.
# isnull().sum()
df.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [88]:
# isna().sum()
df.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

### 2) 필요없는 컬럼 제거

In [101]:
# 2) 필요없는 컬럼은 버린다.
df = df.drop(['PassengerId','Ticket','Cabin' ], axis=1)

In [104]:
df.shape  # 11개 --> 8개 변수

(418, 8)

In [105]:
df.head(5)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q
1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S
2,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q
3,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S
4,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S


### pivot 테이블 생성

In [106]:
# 3) 'Embarked' 컬럼을 기준으로 pivot 테이블 
df['Embarked'].mode()

0    S
dtype: object

In [107]:
df.pivot_table(index = 'Embarked', aggfunc='count')

Unnamed: 0_level_0,Age,Fare,Name,Parch,Pclass,Sex,SibSp
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C,82,102,102,102,102,102,102
Q,22,46,46,46,46,46,46
S,228,269,270,270,270,270,270


In [111]:
df.pivot_table(index = 'Embarked', aggfunc='mean')

Unnamed: 0_level_0,Age,Fare,Parch,Pclass,SibSp
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C,34.737805,66.259765,0.382353,1.794118,0.421569
Q,29.318182,10.9577,0.021739,2.869565,0.195652
S,28.758772,28.230436,0.459259,2.340741,0.5
