## 결측치 처리

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/swkim01/DSAC1-2/blob/main/gg-25-범주형데이터코딩.ipynb"><img src="https://github.com/swkim01/DSAC1-2/raw/main/colab_logo_32px.png" />구글 코랩에서 실행</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/swkim01/DSAC1-2/blob/main/gg-25-범주형데이터코딩.ipynb"><img src="https://github.com/swkim01/DSAC1-2/raw/main/GitHub-Mark-32px.png" />깃헙에서 소스 보기</a>
  </td>
</table>

- NaN : Not a Number

In [1]:
from numpy import nan as NA
from pandas import DataFrame
import pandas as pd
import numpy as np

In [2]:
# Seed NumPy's global RNG so the sample frame is identical on every
# Restart & Run All; without a seed each run prints different numbers.
np.random.seed(0)
# 3 rows x 4 columns of standard-normal draws with default integer labels.
df = pd.DataFrame(np.random.randn(3, 4))
print(df)

          0         1         2         3
0  0.649261  1.199547  0.152862  0.317770
1 -1.062128  1.357210  0.235500 -1.114727
2  1.095125  0.285579  0.225166  1.081298


In [3]:
# Force the value at row label 2, column label 1 to be missing.
# A single .loc assignment writes into df itself. The chained form
# df[1][2] = ... assigns through an intermediate Series, which pandas warns
# about and which does not update df when Copy-on-Write is enabled.
df.loc[2, 1] = np.nan
print(df)

          0         1         2         3
0  0.649261  1.199547  0.152862  0.317770
1 -1.062128  1.357210  0.235500 -1.114727
2  1.095125       NaN  0.225166  1.081298


In [4]:
# dropna() with axis=0 and how="any" discards every row that holds at
# least one NaN and returns a new frame; df itself is left untouched.
cleaned = df.dropna(axis=0, how="any")
print(cleaned)

          0         1         2         3
0  0.649261  1.199547  0.152862  0.317770
1 -1.062128  1.357210  0.235500 -1.114727


In [8]:
# Replace every missing value with the constant 0 (returns a new frame).
df_2 = df.fillna(value=0)
print(df_2)

          0         1         2         3
0  0.649261  1.199547  0.152862  0.317770
1 -1.062128  1.357210  0.235500 -1.114727
2  1.095125  0.000000  0.225166  1.081298


In [9]:
# Replace each missing value with the mean of its own column.
# mean(axis=0) aggregates down each column, ignoring NaN entries.
column_means = df.mean(axis=0)
df_3 = df.fillna(value=column_means)
print(df_3)

          0         1         2         3
0  0.649261  1.199547  0.152862  0.317770
1 -1.062128  1.357210  0.235500 -1.114727
2  1.095125  1.278379  0.225166  1.081298


- ffill : forward fill, 같은 열에서 바로 앞(이전 행)의 값으로 결측치를 대체

In [10]:
# Forward fill: each NaN takes the last valid value above it in the same
# column. DataFrame.ffill() is used because the `method` argument of
# fillna() is deprecated in recent pandas releases.
df_4 = df.ffill()
print(df_4)

          0         1         2         3
0  0.649261  1.199547  0.152862  0.317770
1 -1.062128  1.357210  0.235500 -1.114727
2  1.095125  1.357210  0.225166  1.081298


## 데이터 변환

In [11]:
import pandas as pd
import numpy as np

In [12]:
# Number of synthetic people to generate.
n_samples = 10

# Heights: standard-normal draws are rounded first and only then scaled by 3,
# so every value lands on 170 plus or minus a multiple of 3.
height = 170 + 3 * np.round(np.random.randn(n_samples))

# Nationality codes drawn uniformly from {0, 1, 2} (upper bound 3 is exclusive).
nationality = np.random.randint(0, 3, size=n_samples)

In [13]:
# Display both sample arrays together as a tuple.
height, nationality

(array([170., 164., 173., 173., 170., 170., 164., 164., 170., 173.]),
 array([0, 2, 2, 1, 0, 1, 1, 0, 1, 0]))

In [14]:
# Pair the i-th height with the i-th nationality code as (height, nationality) tuples.
list(zip(height, nationality))

[(170.0, 0),
 (164.0, 2),
 (173.0, 2),
 (173.0, 1),
 (170.0, 0),
 (170.0, 1),
 (164.0, 1),
 (164.0, 0),
 (170.0, 1),
 (173.0, 0)]

#### DataFrame으로 변환

In [15]:
# Build a two-column frame from the (height, nationality) pairs.
records = list(zip(height, nationality))
column_names = ["height", "nationality"]
df = pd.DataFrame(records, columns=column_names)
df.head()

Unnamed: 0,height,nationality
0,170.0,0
1,164.0,2
2,173.0,2
3,173.0,1
4,170.0,0


- pd.get_dummies() : one-hot-encoding 수행

In [16]:
# One-hot encode the nationality codes: one indicator column per distinct value.
# dtype=int keeps the indicators as 0/1 integers; pandas 2.0+ would otherwise
# return booleans (True/False).
# The prefix 'nat_' combined with the default separator '_' yields the
# column names nat__0, nat__1, nat__2.
nat = pd.get_dummies(df['nationality'], prefix='nat_', dtype=int)

In [17]:
# Display the one-hot indicator columns.
nat

Unnamed: 0,nat__0,nat__1,nat__2
0,1,0,0
1,0,0,1
2,0,0,1
3,0,1,0
4,1,0,0
5,0,1,0
6,0,1,0
7,1,0,0
8,0,1,0
9,1,0,0


In [18]:
# Place the indicator columns to the right of the original columns;
# concatenating along the column axis aligns the two frames on their row index.
new_df = pd.concat([df, nat], axis="columns")
new_df.head()

Unnamed: 0,height,nationality,nat__0,nat__1,nat__2
0,170.0,0,1,0,0
1,164.0,2,0,0,1
2,173.0,2,0,0,1
3,173.0,1,0,1,0
4,170.0,0,1,0,0


In [19]:
# Remove the raw code column now that its one-hot columns are present.
# The result is rebound to new_df rather than mutating the frame with
# inplace=True, so the operation reads as an explicit transformation step.
new_df = new_df.drop(columns="nationality")

In [20]:
# Display the frame after the raw nationality column has been dropped.
new_df

Unnamed: 0,height,nat__0,nat__1,nat__2
0,170.0,1,0,0
1,164.0,0,0,1
2,173.0,0,0,1
3,173.0,0,1,0
4,170.0,1,0,0
5,170.0,0,1,0
6,164.0,0,1,0
7,164.0,1,0,0
8,170.0,0,1,0
9,173.0,1,0,0


In [21]:
# Display the raw integer nationality codes again for reference.
nationality

array([0, 2, 2, 1, 0, 1, 1, 0, 1, 0])

In [22]:
# Wrap the integer codes in a pandas Categorical; with no explicit category
# list, the categories are taken from the distinct values present.
nat_categ = pd.Categorical(values=nationality)
nat_categ

[0, 2, 2, 1, 0, 1, 1, 0, 1, 0]
Categories (3, int64): [0, 1, 2]

In [23]:
# Caution: once assigned as a column, the Categorical is held as a Series
# (accessing df.categ returns a pandas Series, not a Categorical).
df['categ'] = nat_categ

In [24]:
# Display the frame with the newly added 'categ' column.
df

Unnamed: 0,height,nationality,categ
0,170.0,0,0
1,164.0,2,2
2,173.0,2,2
3,173.0,1,1
4,170.0,0,0
5,170.0,1,1
6,164.0,1,1
7,164.0,0,0
8,170.0,1,1
9,173.0,0,0


In [24]:
# Check the type of the column as stored in the DataFrame.
type(df.categ)

pandas.core.series.Series

In [25]:
# Check the type of the standalone Categorical object for comparison.
type(nat_categ)

pandas.core.arrays.categorical.Categorical

## 표준 스케일링

In [26]:
# Fresh samples: standard-normal draws are rounded, then scaled by 3 (height)
# or 4 (weight) and shifted to centre on 170 / 70.
height = 170 + 3 * np.round(np.random.randn(n_samples))
weight = 70 + 4 * np.round(np.random.randn(n_samples))

# No column names are given, so the columns get the default labels
# 0 (height) and 1 (weight).
X = pd.DataFrame(list(zip(height, weight)))
X.head()

Unnamed: 0,0,1
0,173.0,66.0
1,170.0,70.0
2,164.0,74.0
3,170.0,70.0
4,170.0,70.0


In [27]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X and transform it in one step; each column is
# standardised independently and the result comes back as a NumPy array.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
X_std

array([[ 1.03209369, -2.        ],
       [ 0.29488391, -0.33333333],
       [-1.17953565,  1.33333333],
       [ 0.29488391, -0.33333333],
       [ 0.29488391, -0.33333333],
       [-1.91674543, -0.33333333],
       [ 0.29488391,  1.33333333],
       [ 1.03209369, -0.33333333],
       [-1.17953565,  1.33333333],
       [ 1.03209369, -0.33333333]])

### 교재 p.147 여기까지-----

In [28]:
# Pull the underlying NumPy array out of the DataFrame.
x = X.to_numpy()
x

array([[173.,  66.],
       [170.,  70.],
       [164.,  74.],
       [170.,  70.],
       [170.,  70.],
       [161.,  70.],
       [170.,  74.],
       [173.,  70.],
       [164.,  74.],
       [173.,  70.]])

In [29]:
# Scaling the raw array gives the same values as scaling the DataFrame.
array_scaler = StandardScaler()
x_std = array_scaler.fit_transform(x)
x_std

array([[ 1.03209369, -2.        ],
       [ 0.29488391, -0.33333333],
       [-1.17953565,  1.33333333],
       [ 0.29488391, -0.33333333],
       [ 0.29488391, -0.33333333],
       [-1.91674543, -0.33333333],
       [ 0.29488391,  1.33333333],
       [ 1.03209369, -0.33333333],
       [-1.17953565,  1.33333333],
       [ 1.03209369, -0.33333333]])