### 결측값 처리
- 제거
- 대체
  - 자기참조 vs 다른 변수와의 관계 이용
  - deterministic imputation vs stochastic imputation
  - single imputation vs multiple imputation

### 행제거: 관측값 제거
- 행의 모든 값이 NA이면 제거
- 특정 변수의 값이 NA이면 제거
- 결측값이 몇 개 이상이면 제거

In [7]:
import pandas as pd
import numpy as np

In [8]:
# Load the employee data set that contains missing values and preview the first rows.
결측자료 = pd.read_csv("Employee_missing.csv", encoding="cp949")  # cp949: encoding used for the Korean text in this file
결측자료.head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
1,0.0,,,사무직,60000.0,,95.0,,YES
2,,,,,,,,,
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


#### 결측값 표시 변경: read_csv 옵션 중 na_values에서 변경 가능
- 아래의 문자는 모두 NaN으로 처리
- keep_default_na 옵션(default=True)와 연동
  - keep_default_na = True & na_values 미설정:기존 NaN표현만 사용
  - keep_default_na = True & na_values 설정: 기존 NaN표현에 설정된 값 추가
  - keep_default_na = False & na_values 미설정: NaN 처리 문자 없음
  - keep_default_na = False & na_values 설정: 설정된 값만 NaN으로 처리

In [9]:
## Number of missing values in each column
결측자료.isna().sum()

id          2
gender      2
educ        2
jobcat      2
salary      2
salbegin    3
jobtime     2
prevexp     3
minority    2
dtype: int64

In [10]:
## Drop every row that contains at least one missing value (row-wise deletion)
결측자료.dropna(axis=0).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No
6,4.0,여성,8.0,사무직,21900.0,13200.0,98.0,190.0,No
7,5.0,남성,15.0,사무직,45000.0,21000.0,98.0,138.0,No


In [11]:
## how='all' drops a row only when every value in it is missing;
## the default how='any' drops a row as soon as one value is missing.
결측자료.dropna(how='all').head()  # row 2 (all NaN) disappears

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
1,0.0,,,사무직,60000.0,,95.0,,YES
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No


In [12]:
## Drop rows that have k or more missing values.
# dropna(thresh=...) counts NON-missing values: a row is kept only when it has
# at least `thresh` of them. "k or more missing" therefore corresponds to
# thresh = (number of columns) - k + 1, not thresh = k.
# With this file's 9 columns and k = 5 both expressions equal 5, so the
# result is unchanged; the derived form stays correct for any other k.
k = 5
결측자료.dropna(thresh=결측자료.shape[1] - k + 1).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
1,0.0,,,사무직,60000.0,,95.0,,YES
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No
6,4.0,여성,8.0,사무직,21900.0,13200.0,98.0,190.0,No


In [13]:
## Drop rows whose value is missing in a specific variable (column)
결측자료.dropna(subset=["gender"]).head()  # rows are removed, columns are untouched

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No
6,4.0,여성,8.0,사무직,21900.0,13200.0,98.0,190.0,No


In [14]:
# Deletion restricted to two variables: a row is dropped when gender or jobcat
# is missing (how='any' is the default rule), then the first rows are shown.
결측자료.dropna(axis=0, how='any', subset=['gender', 'jobcat']).head(5)

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No
6,4.0,여성,8.0,사무직,21900.0,13200.0,98.0,190.0,No
7,5.0,남성,15.0,사무직,45000.0,21000.0,98.0,138.0,No


In [15]:
## Keep the result of the deletion in the DataFrame itself
# inplace=True modifies 결측자료 directly instead of returning a new frame,
# so every later cell sees the reduced data until the file is read again.
결측자료.dropna(subset=["gender"], inplace=True)
결측자료.head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No
6,4.0,여성,8.0,사무직,21900.0,13200.0,98.0,190.0,No


### 열제거: 변수제거

In [16]:
# Column-wise deletion: remove every variable that has at least one NaN
# (how='any' is the default), then preview what is left.
결측자료.dropna(axis=1, how='any').head(5)

Unnamed: 0,gender,educ
0,남성,12.0
3,남성,15.0
4,남성,16.0
5,여성,12.0
6,여성,8.0


In [17]:
# Same column-wise deletion, with the axis given by label instead of number.
결측자료.dropna(axis='columns', how='any').head(5)

Unnamed: 0,gender,educ
0,남성,12.0
3,남성,15.0
4,남성,16.0
5,여성,12.0
6,여성,8.0


In [18]:
# thresh=k keeps a column only if it has at least k non-missing values, i.e. a
# column is dropped when its non-missing count is below k — equivalently when
# its missing count is GREATER than (number of rows - k).
# The frame has 475 rows at this point, so thresh=474 tolerates one NaN per
# column; that is why id (one NaN, in row 0) is still present in the output.
결측자료.dropna(axis=1, thresh=474).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No
5,3.0,여성,12.0,사무직,21450.0,12000.0,98.0,381.0,No
6,4.0,여성,8.0,사무직,21900.0,13200.0,98.0,190.0,No


In [19]:
## Turn a specific value of one variable into NaN
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
# The result is not assigned, so 결측자료 itself is left unchanged.
결측자료['jobcat'].replace('경영자', np.nan)  # replace one particular value of the column

0      NaN
3      NaN
4      사무직
5      사무직
6      사무직
      ... 
472    사무직
473    사무직
474    사무직
475    사무직
476    사무직
Name: jobcat, Length: 475, dtype: object

In [20]:
# Several values can be mapped at once with a dict {old value: new value}.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
결측자료['jobcat'].replace({'경영자': np.nan, '사무직': np.nan})

0      NaN
3      NaN
4      NaN
5      NaN
6      NaN
      ... 
472    NaN
473    NaN
474    NaN
475    NaN
476    NaN
Name: jobcat, Length: 475, dtype: object

### 결측값 대체
- 자기참조 vs 다른 변수와의 관계 이용
- deterministic imputation(결정적인 값) vs stochastic imputation(난수에 따라 바뀔 수 있는 값)
- single imputation vs multiple imputation
- 자기참조 => 수치: 평균, 중앙값, 최빈값 or 핫덱(hot-deck)
- 다른 변수 관계 이용 => 회귀분석 등

In [21]:
# Read the raw file again so the imputation examples start from the original
# data rather than the frame that was reduced in place earlier.
결측자료 = pd.read_csv(filepath_or_buffer="Employee_missing.csv", encoding="cp949")
결측자료.head(n=5)

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
1,0.0,,,사무직,60000.0,,95.0,,YES
2,,,,,,,,,
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


In [22]:
## Specify the replacement value directly: every NaN, in every column, becomes 444
결측자료.fillna(444).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,444.0,남성,12.0,444,444.0,444.0,444.0,444.0,444
1,0.0,444,444.0,사무직,60000.0,444.0,95.0,444.0,YES
2,444.0,444,444.0,444,444.0,444.0,444.0,444.0,444
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


In [23]:
# Per-column replacement values are passed as a dict {column: fill value};
# columns that are not listed (educ, salary, ...) keep their NaN, as the output shows.
# NOTE(review): 'id' is numeric, so filling it with the string '미정' presumably
# turns the column into object dtype — confirm this is intended.
# NOTE(review): the data labels gender as '남성'/'여성'; filling with '여자'
# introduces a third label — verify whether '여성' was meant.
결측자료.fillna({'id':'미정', 'gender':'여자', 'jobcat':'무직'}).head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,미정,남성,12.0,무직,,,,,
1,0.0,여자,,사무직,60000.0,,95.0,,YES
2,미정,여자,,무직,,,,,
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


In [24]:
## Replace with the previous/next observation (sensible when rows are in time order)
# Forward fill: each NaN takes the last valid value above it in the same column.
# A NaN in the very first row cannot be filled because nothing precedes it.
# DataFrame.ffill() replaces fillna(method='ffill'), which pandas 2.1+ deprecates.
결측자료.ffill().head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,,남성,12.0,,,,,,
1,0.0,남성,12.0,사무직,60000.0,,95.0,,YES
2,0.0,남성,12.0,사무직,60000.0,,95.0,,YES
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


In [25]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() replaces fillna(method='bfill'), which pandas 2.1+ deprecates.
결측자료.bfill().head()

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,0.0,남성,12.0,사무직,60000.0,27000.0,95.0,144.0,YES
1,0.0,남성,15.0,사무직,60000.0,27000.0,95.0,144.0,YES
2,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


## 특정 통계값으로 대체
- 수치자료: 평균, 중앙값, ..
- 범주자료: 최빈값

In [26]:
## Quiz problem from this session — error-review note
# Collect the names of the numeric columns, then keep only those columns.
수치변수명 = 결측자료.select_dtypes(include=np.number).columns.tolist()
수치결측자료 = 결측자료[수치변수명]
수치결측자료.head()
## Only the columns that hold numeric values are gathered here

Unnamed: 0,id,educ,salary,salbegin,jobtime,prevexp
0,,12.0,,,,
1,0.0,,60000.0,,95.0,
2,,,,,,
3,1.0,15.0,57000.0,27000.0,98.0,144.0
4,2.0,16.0,40200.0,18750.0,98.0,36.0


In [27]:
# Keep only the numeric-dtype columns as a DataFrame and preview them.
수치자료 = 결측자료.select_dtypes(include=[np.number])
수치자료.head(n=5)

Unnamed: 0,id,educ,salary,salbegin,jobtime,prevexp
0,,12.0,,,,
1,0.0,,60000.0,,95.0,
2,,,,,,
3,1.0,15.0,57000.0,27000.0,98.0,144.0
4,2.0,16.0,40200.0,18750.0,98.0,36.0


In [28]:
# Mean of every numeric column; NaN entries are skipped (the default behaviour).
수치결측자료.mean(axis=0, skipna=True)

id            237.000000
educ           13.488421
salary      34473.421053
salbegin    17016.086498
jobtime        81.138947
prevexp        95.860759
dtype: float64

In [29]:
# Fill the missing values of each column with that column's own mean.
수치결측자료.fillna(수치결측자료.mean()).head()

Unnamed: 0,id,educ,salary,salbegin,jobtime,prevexp
0,237.0,12.0,34473.421053,17016.086498,81.138947,95.860759
1,0.0,13.488421,60000.0,17016.086498,95.0,95.860759
2,237.0,13.488421,34473.421053,17016.086498,81.138947,95.860759
3,1.0,15.0,57000.0,27000.0,98.0,144.0
4,2.0,16.0,40200.0,18750.0,98.0,36.0


In [30]:
# mode() returns a DataFrame: row 0 holds one mode for every column. Further
# rows exist only for columns with tied modes (id and jobtime in the output)
# and are NaN elsewhere.
최빈값 = 결측자료.mode()
최빈값                   ## use 최빈값.iloc[0, :] to get a single mode per column

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,0.0,남성,12.0,사무직,30750.0,15000.0,81.0,0.0,No
1,1.0,,,,,,93.0,,
2,2.0,,,,,,,,
3,3.0,,,,,,,,
4,4.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...
470,470.0,,,,,,,,
471,471.0,,,,,,,,
472,472.0,,,,,,,,
473,473.0,,,,,,,,


In [31]:
# Impute every column with its mode: the first row of the mode table is a
# Series indexed by column name, so each column is filled with its own value.
결측자료.fillna(value=최빈값.iloc[0]).head(n=5)

Unnamed: 0,id,gender,educ,jobcat,salary,salbegin,jobtime,prevexp,minority
0,0.0,남성,12.0,사무직,30750.0,15000.0,81.0,0.0,No
1,0.0,남성,12.0,사무직,60000.0,15000.0,95.0,0.0,YES
2,0.0,남성,12.0,사무직,30750.0,15000.0,81.0,0.0,No
3,1.0,남성,15.0,경영자,57000.0,27000.0,98.0,144.0,No
4,2.0,남성,16.0,사무직,40200.0,18750.0,98.0,36.0,No


## 대체 패키지

In [32]:
from sklearn.impute import SimpleImputer # the imputer is configured first; the data is supplied later
단순대체 = SimpleImputer(strategy = 'mean') # imputer that fills NaN with the column mean
대체자료 = 단순대체.fit_transform(수치결측자료)# fit on the numeric data and return it with NaN replaced
## strategy options: mean, median, most_frequent (mode), constant (a fixed value)
대체자료

array([[2.37000000e+02, 1.20000000e+01, 3.44734211e+04, 1.70160865e+04,
        8.11389474e+01, 9.58607595e+01],
       [0.00000000e+00, 1.34884211e+01, 6.00000000e+04, 1.70160865e+04,
        9.50000000e+01, 9.58607595e+01],
       [2.37000000e+02, 1.34884211e+01, 3.44734211e+04, 1.70160865e+04,
        8.11389474e+01, 9.58607595e+01],
       ...,
       [4.72000000e+02, 1.50000000e+01, 3.91500000e+04, 1.57500000e+04,
        6.30000000e+01, 4.60000000e+01],
       [4.73000000e+02, 1.20000000e+01, 2.14500000e+04, 1.27500000e+04,
        6.30000000e+01, 1.39000000e+02],
       [4.74000000e+02, 1.20000000e+01, 2.94000000e+04, 1.42500000e+04,
        6.30000000e+01, 9.00000000e+00]])

In [33]:
# The imputer returned a bare NumPy array, so the column labels are put back
# by wrapping it in a DataFrame.
대체자료 = pd.DataFrame(data=대체자료, columns=수치결측자료.columns)
대체자료.head(n=5)

Unnamed: 0,id,educ,salary,salbegin,jobtime,prevexp
0,237.0,12.0,34473.421053,17016.086498,81.138947,95.860759
1,0.0,13.488421,60000.0,17016.086498,95.0,95.860759
2,237.0,13.488421,34473.421053,17016.086498,81.138947,95.860759
3,1.0,15.0,57000.0,27000.0,98.0,144.0
4,2.0,16.0,40200.0,18750.0,98.0,36.0


In [34]:
단순대체.set_output(transform='pandas') ## for reference only
대체자료 = 단순대체.fit_transform(수치결측자료) ## the result now comes back as a DataFrame
대체자료.head()

Unnamed: 0,id,educ,salary,salbegin,jobtime,prevexp
0,237.0,12.0,34473.421053,17016.086498,81.138947,95.860759
1,0.0,13.488421,60000.0,17016.086498,95.0,95.860759
2,237.0,13.488421,34473.421053,17016.086498,81.138947,95.860759
3,1.0,15.0,57000.0,27000.0,98.0,144.0
4,2.0,16.0,40200.0,18750.0,98.0,36.0


In [35]:
# IterativeImputer lives behind sklearn's experimental flag: this enabling
# import has to run before IterativeImputer can be imported.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [36]:
# Model-based imputation with a fixed seed; set_output returns the estimator
# itself, so DataFrame output is configured in the same expression.
확률대체 = IterativeImputer(random_state=0).set_output(transform='pandas')
대체자료 = 확률대체.fit_transform(수치결측자료)
대체자료.head(n=5)

Unnamed: 0,id,educ,salary,salbegin,jobtime,prevexp
0,237.299536,12.0,34023.646203,17045.190569,81.132509,116.751862
1,0.0,16.290908,60000.0,26325.929668,95.0,88.290084
2,237.000629,13.494309,34472.476147,17035.706135,81.138934,95.888744
3,1.0,15.0,57000.0,27000.0,98.0,144.0
4,2.0,16.0,40200.0,18750.0,98.0,36.0


In [41]:
### KNN imputation
## Important!! Review for the quiz
from sklearn.impute import KNNImputer
# Nearest-neighbour imputation (not cluster analysis): each NaN is replaced by
# the mean of that variable over the n_neighbors closest rows;
# weights="uniform" gives every neighbour the same weight.
# NOTE(review): 190 equals the number of rows with educ == 12 in the groupby
# count, and row 0 (only educ = 12 observed) receives exactly that group's
# means — presumably why 190 was chosen; confirm.
근접대체 = KNNImputer(n_neighbors=190, weights="uniform")
근접대체.set_output(transform='pandas') # return a DataFrame instead of a NumPy array
근접대체.fit_transform(수치결측자료).head()

Unnamed: 0,id,educ,salary,salbegin,jobtime,prevexp
0,243.763158,12.0,25887.157895,13241.868421,80.768421,96.378947
1,0.0,15.2,60000.0,21198.526316,95.0,91.168421
2,237.0,13.488421,34473.421053,17016.086498,81.138947,95.860759
3,1.0,15.0,57000.0,27000.0,98.0,144.0
4,2.0,16.0,40200.0,18750.0,98.0,36.0


In [39]:
# Number of non-missing observations of every variable within each educ level.
수치결측자료.groupby(by="educ").count()

Unnamed: 0_level_0,id,salary,salbegin,jobtime,prevexp
educ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8.0,53,53,53,53,53
12.0,190,190,190,190,190
14.0,6,6,6,6,6
15.0,116,116,116,116,116
16.0,59,59,59,59,59
17.0,11,11,11,11,11
18.0,9,9,9,9,9
19.0,27,27,27,27,27
20.0,2,2,2,2,2
21.0,1,1,1,1,1


In [40]:
# Mean of every variable within each educ level.
수치결측자료.groupby(by="educ").mean()

Unnamed: 0_level_0,id,salary,salbegin,jobtime,prevexp
educ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8.0,256.528302,24399.056604,13064.150943,79.773585,183.245283
12.0,243.763158,25887.157895,13241.868421,80.768421,96.378947
14.0,257.833333,31625.0,15625.0,79.666667,57.333333
15.0,222.387931,31685.0,15610.603448,82.12069,77.784483
16.0,224.237288,48225.932203,22338.474576,82.016949,62.745763
17.0,218.454545,59527.272727,26904.545455,82.272727,104.454545
18.0,244.777778,65127.777778,32240.0,80.666667,82.444444
19.0,260.888889,72520.37037,34764.074074,79.111111,75.925926
20.0,151.5,64312.5,36240.0,87.0,70.0
21.0,137.0,65000.0,37500.0,88.0,264.0


#### 회귀대체
- 결측값이 없는 설명변수와 반응변수 자료로 회귀모형 적합
- 반응변수가 결측인 관측값의 설명변수로 예측값을 계산
  - t분포 난수와 표준 오차를 곱한 오차를 예측값에 더함
  - np.random.standard_t(자유도, 표본크기)
- 예측값을 반응변수에 대입
  - df['반응변수'] = df['반응변수'].fillna(pd.Series(예측값.flatten()))  (열을 선택한 뒤 inplace=True로 채우면 최신 pandas에서는 원본 df에 반영되지 않을 수 있으므로 재대입; Series는 인덱스 기준으로 맞춰져 채워짐)