In [1]:
from numpy import nan as NA
from pandas import DataFrame
import pandas as pd
import numpy as np

## 결측치 처리 방법

In [2]:
# Build a 3-row x 4-column frame of draws from the standard normal distribution.
df = pd.DataFrame(np.random.standard_normal(size=(3, 4)))
print(df)

          0         1         2         3
0 -0.188847  1.166757  1.008105  0.161395
1  0.173704 -0.612143 -0.004967  0.105026
2 -0.089830  0.040599  1.448288  0.565700


In [3]:
# Inject a missing value at row label 2, column label 1.
# A single .loc assignment is used instead of chained indexing (df[1][2] = ...):
# chained assignment writes through an intermediate Series and does not update
# `df` when pandas Copy-on-Write is active.
df.loc[2, 1] = np.nan
print(df)

          0         1         2         3
0 -0.188847  1.166757  1.008105  0.161395
1  0.173704 -0.612143 -0.004967  0.105026
2 -0.089830       NaN  1.448288  0.565700


In [4]:
# Approach 1: discard every row that contains at least one NaN.
cleaned = df.dropna(axis=0, how="any")
print(cleaned)

          0         1         2         3
0 -0.188847  1.166757  1.008105  0.161395
1  0.173704 -0.612143 -0.004967  0.105026


In [5]:
# Approach 2-1: substitute the constant 0 for every NaN.
df_2 = df.fillna(value=0)
print(df_2)

          0         1         2         3
0 -0.188847  1.166757  1.008105  0.161395
1  0.173704 -0.612143 -0.004967  0.105026
2 -0.089830  0.000000  1.448288  0.565700


In [6]:
# Approach 2-2: substitute each column's mean for the NaNs in that column.
df_3 = df.fillna(value=df.mean(axis=0))
print(df_3)

          0         1         2         3
0 -0.188847  1.166757  1.008105  0.161395
1  0.173704 -0.612143 -0.004967  0.105026
2 -0.089830  0.277307  1.448288  0.565700


In [7]:
# Approach 2-3: replace each NaN with the neighbouring value, i.e. forward-fill
# from the previous row in the same column.
# DataFrame.ffill() is used because the `method=` keyword of fillna() is deprecated.
df_4 = df.ffill()
print(df_4)

          0         1         2         3
0 -0.188847  1.166757  1.008105  0.161395
1  0.173704 -0.612143 -0.004967  0.105026
2 -0.089830 -0.612143  1.448288  0.565700


## 데이터 변환

In [8]:
n_samples = 10
# Heights: rounded standard-normal draws, scaled by 3 and shifted to centre on 170.
height = 170 + 3 * np.random.randn(n_samples).round()
# Nationality codes: integers drawn from {0, 1, 2}.
nationality = np.random.randint(low=0, high=3, size=n_samples)
# Pair the two arrays element-wise so that each row is one (height, nationality) sample.
df = pd.DataFrame(
    list(zip(height, nationality)),
    columns=["height", "nationality"],
)
df.head(10)

Unnamed: 0,height,nationality
0,167.0,0
1,170.0,0
2,167.0,2
3,167.0,2
4,173.0,2
5,176.0,0
6,170.0,1
7,164.0,2
8,170.0,1
9,167.0,1


In [9]:
# One-hot encode the nationality codes: one indicator column per distinct code.
# dtype=int pins the indicators to 0/1; without it pandas >= 2.0 returns
# True/False columns instead.
nat = pd.get_dummies(df["nationality"], prefix="nat_", dtype=int)
nat

Unnamed: 0,nat__0,nat__1,nat__2
0,1,0,0
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
5,1,0,0
6,0,1,0
7,0,0,1
8,0,1,0
9,0,1,0


In [12]:
# Place the indicator columns side by side with the original columns.
new_df = pd.concat(objs=[df, nat], axis="columns")
new_df

Unnamed: 0,height,nationality,nat__0,nat__1,nat__2
0,167.0,0,1,0,0
1,170.0,0,1,0,0
2,167.0,2,0,0,1
3,167.0,2,0,0,1
4,173.0,2,0,0,1
5,176.0,0,1,0,0
6,170.0,1,0,1,0
7,164.0,2,0,0,1
8,170.0,1,0,1,0
9,167.0,1,0,1,0


In [13]:
# Remove the raw code column. inplace=True mutates new_df itself
# instead of returning a modified copy.
new_df.drop(columns="nationality", inplace=True)
new_df

Unnamed: 0,height,nat__0,nat__1,nat__2
0,167.0,1,0,0
1,170.0,1,0,0
2,167.0,0,0,1
3,167.0,0,0,1
4,173.0,0,0,1
5,176.0,1,0,0
6,170.0,0,1,0
7,164.0,0,0,1
8,170.0,0,1,0
9,167.0,0,1,0


In [14]:
# Display the `nationality` array of integer codes.
nationality

array([0, 0, 2, 2, 2, 0, 1, 2, 1, 1])

In [15]:
# Wrap the integer codes in a Categorical, which records the distinct category values.
nat_categ = pd.Categorical(values=nationality)
nat_categ

[0, 0, 2, 2, 2, 0, 1, 2, 1, 1]
Categories (3, int64): [0, 1, 2]

In [16]:
# Store the Categorical as a new 'categ' column of df, then display the frame.
df['categ'] = nat_categ
df

Unnamed: 0,height,nationality,categ
0,167.0,0,0
1,170.0,0,0
2,167.0,2,2
3,167.0,2,2
4,173.0,2,2
5,176.0,0,0
6,170.0,1,1
7,164.0,2,2
8,170.0,1,1
9,167.0,1,1


In [21]:
type(df["categ"])  # a column taken from the frame is a Series

pandas.core.series.Series

In [18]:
type(nat_categ) # the standalone object is a pandas Categorical

pandas.core.arrays.categorical.Categorical

## 스케일링
> from sklearn.preprocessing import 스케일링 이름    

> 변수명 = 스케일링 이름() -> 스케일링을 적용하겠다고 변수를 생성    

> print(변수명.fit(train_data))    

> 훈련 변수명 = 변수명.transform(train_data)     

In [None]:
# from sklearn.preprocessing import StandardScaler
# st = StandardScaler()
# st.fit(X)
# X_std = st.transform(X)
# # Shorthand for the two steps above: X_std = StandardScaler().fit_transform(X)
# X_std

In [24]:
# Synthetic measurements: rounded standard-normal draws, rescaled and shifted.
height = 170 + 3 * np.random.randn(n_samples).round()
weight = 70 + 4 * np.random.randn(n_samples).round()
# Two-column frame (column 0 = height, column 1 = weight) with default integer labels.
X = pd.DataFrame(np.column_stack((height, weight)))
X.head()

Unnamed: 0,0,1
0,170.0,70.0
1,173.0,70.0
2,170.0,70.0
3,173.0,74.0
4,170.0,70.0


In [26]:
# Import only the scaler that is used; a wildcard import hides where names come
# from and floods the notebook namespace.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X and transform X in one call; the result is a NumPy array.
X_std = StandardScaler().fit_transform(X)
X_std

array([[-0.33333333, -0.12038585],
       [ 0.77777778, -0.12038585],
       [-0.33333333, -0.12038585],
       [ 0.77777778,  1.08347268],
       [-0.33333333, -0.12038585],
       [ 0.77777778,  2.28733121],
       [-1.44444444, -1.32424438],
       [-1.44444444, -0.12038585],
       [-0.33333333, -1.32424438],
       [ 1.88888889, -0.12038585]])

In [28]:
# Pull the underlying NumPy array out of the frame and standardise that instead.
x = X.to_numpy()
print(x)
# fit() learns the per-column statistics; transform() then applies them to the same data.
x_std = StandardScaler().fit(x).transform(x)
print(x_std)

[[170.  70.]
 [173.  70.]
 [170.  70.]
 [173.  74.]
 [170.  70.]
 [173.  78.]
 [167.  66.]
 [167.  70.]
 [170.  66.]
 [176.  70.]]
[[-0.33333333 -0.12038585]
 [ 0.77777778 -0.12038585]
 [-0.33333333 -0.12038585]
 [ 0.77777778  1.08347268]
 [-0.33333333 -0.12038585]
 [ 0.77777778  2.28733121]
 [-1.44444444 -1.32424438]
 [-1.44444444 -0.12038585]
 [-0.33333333 -1.32424438]
 [ 1.88888889 -0.12038585]]
