### 결측값 처리
- 제거
- 대체
  - 자기참조 vs 다른 변수와의 관계 이용
  - deterministic imputation vs stochastic imputation
  - single imputation vs multiple imputation

### 행제거: 관측값 제거
- 행의 모든 값이 NA이면 제거
- 특정 변수의 값이 NA이면 제거
- 결측값이 몇 개 이상이면 제거

In [1]:
import pandas as pd
import numpy as np

In [2]:
## Load the sample data; the file is decoded with the cp949 encoding
결측자료 = pd.read_csv("Employee_missing.csv", encoding="cp949")
결측자료.head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
1,0.0,,,사무직,60000.0,,95.0,,YES
2,,,,,,,,,
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


#### 결측값 표시 변경: read_csv 옵션 중 na_values에서 변경 가능
- 아래의 문자는 모두 NaN으로 처리
- keep_default_na 옵션(default=True)와 연동
  - keep_default_na = True & na_values 미설정: 기존 NaN 표현만 사용
  - keep_default_na = True & na_values 설정: 기존 NaN표현에 설정된 값 추가
  - keep_default_na = False & na_values 미설정: NaN 처리 문자 없음
  - keep_default_na = False & na_values 설정: 설정된 값만 NaN으로 처리

In [3]:
## Number of missing values in each column
결측자료.isna().sum()

id          2
gender      2
educ        2
jobcat      2
salary      2
salbegin    3
jobtime     2
prevexp     3
minority    2
dtype: int64

In [4]:
## Drop rows (axis=0) that contain at least one missing value
결측자료.dropna(axis=0).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No
6,4.0,여성,8.0,사무직,21900.0,13200.0,98.0,190.0,No
7,5.0,남성,15.0,사무직,45000.0,21000.0,98.0,138.0,No


In [5]:
## how='all' drops a row only when every value in it is missing;
## the default how='any' drops a row as soon as at least one value is missing
결측자료.dropna(how='all').head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
1,0.0,,,사무직,60000.0,,95.0,,YES
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No


In [6]:
## Drop rows that have fewer than k non-missing values
## (thresh=k keeps a row only if it has at least k observed values)
k = 5
결측자료.dropna(thresh=k).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
1,0.0,,,사무직,60000.0,,95.0,,YES
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No
6,4.0,여성,8.0,사무직,21900.0,13200.0,98.0,190.0,No


In [17]:
## Drop rows whose value is missing in a specific variable (column)
결측자료.dropna(subset=["gender"]).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No
6,4.0,여성,8.0,사무직,21900.0,13200.0,98.0,190.0,No


In [18]:
## Drop rows with a missing value in any of the listed columns
결측자료.dropna(subset=['gender', 'jobcat']).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No
6,4.0,여성,8.0,사무직,21900.0,13200.0,98.0,190.0,No
7,5.0,남성,15.0,사무직,45000.0,21000.0,98.0,138.0,No


In [19]:
## Keep the result under the same name: inplace=True modifies the existing
## DataFrame instead of returning a new one
결측자료.dropna(subset=["gender"], inplace=True)
결측자료.head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No
6,4.0,여성,8.0,사무직,21900.0,13200.0,98.0,190.0,No


### 열제거: 변수제거

In [20]:
## Drop columns (axis=1) that contain at least one missing value
결측자료.dropna(axis=1).head()

Unnamed: 0,gender,educ
0,남성,12.0
3,남성,15.0
4,남성,16.0
5,여성,12.0
6,여성,8.0


In [21]:
## Same column-wise removal, with the axis given by name instead of by number
결측자료.dropna(axis='columns').head()

Unnamed: 0,gender,educ
0,남성,12.0
3,남성,15.0
4,남성,16.0
5,여성,12.0
6,여성,8.0


In [8]:
## Keep only the columns that have at least 474 non-missing values
결측자료.dropna(axis=1, thresh=474).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
1,0.0,,,사무직,60000.0,,95.0,,YES
2,,,,,,,,,
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


In [23]:
## Treat a specific value of a variable as missing.
## np.nan is used because the np.NaN alias was removed in NumPy 2.0.
결측자료['jobcat'].replace('경영자', np.nan)

0      NaN
3      NaN
4      사무직
5      사무직
6      사무직
      ... 
472    사무직
473    사무직
474    사무직
475    사무직
476    사무직
Name: jobcat, Length: 475, dtype: object

In [24]:
## Map several values to missing at once with a dict.
## np.nan is used because the np.NaN alias was removed in NumPy 2.0.
결측자료['jobcat'].replace({'경영자': np.nan, '사무직': np.nan})

0      NaN
3      NaN
4      NaN
5      NaN
6      NaN
      ... 
472    NaN
473    NaN
474    NaN
475    NaN
476    NaN
Name: jobcat, Length: 475, dtype: object

### 결측값 대체
- 자기참조 vs 다른 변수와의 관계 이용
- deterministic imputation vs stochastic imputation
- single imputation vs multiple imputation

In [25]:
## Read the file again so the rows dropped in place earlier are back
결측자료 = pd.read_csv("Employee_missing.csv", encoding="cp949")
결측자료.head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
1,0.0,,,사무직,60000.0,,95.0,,YES
2,,,,,,,,,
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


In [26]:
## Specify the replacement value directly: every missing cell becomes 0
결측자료.fillna(0).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,0.0,남성,12.0,0,0.0,0.0,0.0,0.0,0
1,0.0,0,0.0,사무직,60000.0,0.0,95.0,0.0,YES
2,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


In [27]:
## Per-column replacement values; columns not listed in the dict stay missing
## NOTE(review): the gender fill value '남자' differs from the label '남성'
## already present in the data, so it creates a separate category - confirm intended
결측자료.fillna({'id':0, 'gender':'남자', 'jobcat':'경영자'}).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,0.0,남성,12.0,경영자,,,,,
1,0.0,남자,,사무직,60000.0,,95.0,,YES
2,0.0,남자,,경영자,,,,,
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


In [28]:
## Replace with the previous/next observation (when the rows are in time order).
## Forward fill: DataFrame.ffill() is used because the `method` argument of
## fillna() is deprecated in recent pandas releases.
결측자료.ffill().head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
1,0.0,남성,12.0,사무직,60000.0,,95.0,,YES
2,0.0,남성,12.0,사무직,60000.0,,95.0,,YES
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


In [29]:
## Backward fill: each missing value takes the next observed value in its column.
## DataFrame.bfill() is used because the `method` argument of fillna() is
## deprecated in recent pandas releases.
결측자료.bfill().head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,0.0,남성,12.0,사무직,60000.0,27000.0,95.0,144.0,YES
1,0.0,남성,15.0,사무직,60000.0,27000.0,95.0,144.0,YES
2,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


#### 특정 통계값으로 대체
- 수치자료: 평균, 중앙값, ..
- 범주자료: 최빈값

In [30]:
## Gather only the columns that hold numeric values
수치결측자료 = 결측자료.select_dtypes(include=np.number)
수치변수명 = 수치결측자료.columns.tolist()
수치결측자료.head()

Unnamed: 0,id,educ,salary,salbegin,jobtime,prevexp
0,,12.0,,,,
1,0.0,,60000.0,,95.0,
2,,,,,,
3,1.0,15.0,57000.0,27000.0,98.0,144.0
4,2.0,16.0,40200.0,18750.0,98.0,36.0


In [31]:
## Fill each column's missing values with that column's mean
## NOTE(review): the identifier column `id` is numeric, so it is mean-imputed too
수치결측자료.fillna(수치결측자료.mean()).head()

Unnamed: 0,id,educ,salary,salbegin,jobtime,prevexp
0,237.0,12.0,34473.421053,17016.086498,81.138947,95.860759
1,0.0,13.488421,60000.0,17016.086498,95.0,95.860759
2,237.0,13.488421,34473.421053,17016.086498,81.138947,95.860759
3,1.0,15.0,57000.0,27000.0,98.0,144.0
4,2.0,16.0,40200.0,18750.0,98.0,36.0


In [32]:
## Mode of every column; a column with tied modes fills several rows,
## so row 0 holds the first mode of each column
최빈값 = 결측자료.mode()
최빈값

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,0.0,남성,12.0,사무직,30750.0,15000.0,81.0,0.0,No
1,1.0,,,,,,93.0,,
2,2.0,,,,,,,,
3,3.0,,,,,,,,
4,4.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...
470,470.0,,,,,,,,
471,471.0,,,,,,,,
472,472.0,,,,,,,,
473,473.0,,,,,,,,


In [33]:
## Fill each column's missing values with row 0 of the mode table (its first mode)
결측자료.fillna(최빈값.iloc[0,:]).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,0.0,남성,12.0,사무직,30750.0,15000.0,81.0,0.0,No
1,0.0,남성,12.0,사무직,60000.0,15000.0,95.0,0.0,YES
2,0.0,남성,12.0,사무직,30750.0,15000.0,81.0,0.0,No
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


#### 대체 패키지

In [35]:
from sklearn.impute import SimpleImputer
## strategy options: mean='mean', median='median', mode='most_frequent', fixed value='constant'
단순대체 = SimpleImputer(strategy = 'mean')
## fit_transform returns a NumPy array here, so the column names are lost
대체자료 = 단순대체.fit_transform(수치결측자료)
대체자료

array([[2.37000000e+02, 1.20000000e+01, 3.44734211e+04, 1.70160865e+04,
        8.11389474e+01, 9.58607595e+01],
       [0.00000000e+00, 1.34884211e+01, 6.00000000e+04, 1.70160865e+04,
        9.50000000e+01, 9.58607595e+01],
       [2.37000000e+02, 1.34884211e+01, 3.44734211e+04, 1.70160865e+04,
        8.11389474e+01, 9.58607595e+01],
       ...,
       [4.72000000e+02, 1.50000000e+01, 3.91500000e+04, 1.57500000e+04,
        6.30000000e+01, 4.60000000e+01],
       [4.73000000e+02, 1.20000000e+01, 2.14500000e+04, 1.27500000e+04,
        6.30000000e+01, 1.39000000e+02],
       [4.74000000e+02, 1.20000000e+01, 2.94000000e+04, 1.42500000e+04,
        6.30000000e+01, 9.00000000e+00]])

In [37]:
## Wrap the imputed array in a DataFrame again. Both the column names and the
## row labels are taken from the source frame, so the result stays aligned
## with it even when its index is not the default 0..n-1 range.
대체자료 = pd.DataFrame(
    대체자료,
    columns=수치결측자료.columns,
    index=수치결측자료.index,
)
대체자료.head()

Unnamed: 0,id,educ,salary,salbegin,jobtime,prevexp
0,237.0,12.0,34473.421053,17016.086498,81.138947,95.860759
1,0.0,13.488421,60000.0,17016.086498,95.0,95.860759
2,237.0,13.488421,34473.421053,17016.086498,81.138947,95.860759
3,1.0,15.0,57000.0,27000.0,98.0,144.0
4,2.0,16.0,40200.0,18750.0,98.0,36.0


In [39]:
#단순대체.set_output(transform='pandas')
#대체자료 = 단순대체.fit_transform(수치결측자료)
#대체자료.head()

In [45]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
## Impute with scikit-learn's IterativeImputer (imported in the cell above).
## The class name was misspelled with a lowercase "l" (lterativeImputer),
## which raised NameError; random_state is fixed so reruns give the same result.
확률대체 = IterativeImputer(random_state=0)
## Return a DataFrame instead of a NumPy array so .head() works on the result
확률대체.set_output(transform='pandas')
대체자료 = 확률대체.fit_transform(수치결측자료)
대체자료.head()

In [None]:
### KNN imputation: 5 neighbours with uniform weights
from sklearn.impute import KNNImputer
## set_output returns the estimator itself, so it can be chained onto the constructor
근접대체 = KNNImputer(n_neighbors=5, weights="uniform").set_output(transform='pandas')
근접대체.fit_transform(수치결측자료).head()

#### 회귀대체
- 결측값이 없는 설명변수와 반응변수 자료로 회귀모형 적합
- 반응변수에 결측값이 있는 관측값의 설명변수에 대해 예측값을 계산
  - t분포 난수와 표준 오차를 곱한 오차를 예측값에 더함
  - np.random.standard_t(자유도, 표본크기)
- 예측값을 반응변수에 대입
  - df['반응변수'] = df['반응변수'].fillna(pd.Series(예측값.flatten()))