## 결측치 (Missing value) 에 대한 처리는 어떻게?

- 데이터가 없으면 sample을 drop 한다
- 데이터가 없는 최소 개수를 정해서 sample을 drop 한다
- 데이터가 거의 없는 feature는 feature 자체를 drop 한다
- 최빈값, 평균값으로 비어있는 데이터를 채우기

In [2]:
import pandas as pd
import numpy as np

In [62]:
# Eaxmple from - https://chrisalbon.com/python/pandas_missing_data.html
raw_data = {'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
        'age': [42, np.nan, 36, 24, 73],
        'sex': ['m', np.nan, 'f', 'm', 'f'],
        'preTestScore': [4, np.nan, np.nan, 2, 3],
        'postTestScore': [25, np.nan, np.nan, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore'])
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


## Data drop examples

In [63]:
df.isnull().sum()

first_name       1
last_name        1
age              1
sex              1
preTestScore     2
postTestScore    2
dtype: int64

In [64]:
## 몇 퍼센트 비어있는지 확인

df.isnull().sum() / len(df)

first_name       0.2
last_name        0.2
age              0.2
sex              0.2
preTestScore     0.4
postTestScore    0.4
dtype: float64

#### dropna() : 하나라도 비어있다면 데이터들이 row level로 사라짐

- 너무 많은 데이터들을 지울 우려가 있음

In [65]:
df_no_missing = df.dropna()
df_no_missing

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


#### dropna(how='all') : 모든 데이터가 지워져 있는 경우에만 drop

In [66]:
df_cleaned = df.dropna(how='all')
df_cleaned

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [67]:
# 모든 row가 비어있는 column을 생성
df['location'] = np.nan
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [68]:
# column 기준으로 삭제
df.dropna(axis=1, how='all')

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


#### thresh 설정해서 최소 4개 이상 없을 때 drop

In [69]:
df.dropna(axis=1, thresh=3)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [70]:
# 5개 이상 데이터가 있지 않으면 drop
df.dropna(thresh=5)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


## 데이터 값 채우기

- 평균값, 중위값, 최빈값 활용
- 데이터의 분포가 고르면 평균값 활용, 아니면 보통 최빈값 활용
- 평균값 : 해당 column의 값을 평균을 내서 채우기
  - df['column'].mean()
- 중위값 : 값을 일렬로 나열했을 때 중간에 위치한 값
  - df['column'].median()
- 최빈값 : 가장 많이 나오는 값
  - df['column'].mode()

## Data Fill Examples

#### fillna(0)

- 데이터가 없는 곳은 0으로 집어 넣기

In [71]:
df.fillna(0)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,0.0
1,0,0,0.0,0,0.0,0.0,0.0
2,Tina,Ali,36.0,f,0.0,0.0,0.0
3,Jake,Milner,24.0,m,2.0,62.0,0.0
4,Amy,Cooze,73.0,f,3.0,70.0,0.0


#### 결측치에 평균값 넣기

In [72]:
df['preTestScore'].fillna(df['preTestScore'].mean())
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


#### 성별에 따라 평균값을 채워넣기

- fillna와 transform을 함께 사용하는 경우 많음
- 특정 category 별로 평균의 차이가 있는 경우가 있음

In [73]:
# 각 성별별로 값 채워넣기
df['preTestScore'].fillna(df.groupby('sex')['postTestScore'].transform("mean"))
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [74]:
# Age와 Sex가 모두 비어있는 경우만 출력
df[df['age'].notnull() & df['sex'].notnull()]

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [75]:
df.groupby('sex')['postTestScore'].transform('mean')

0    43.5
1     NaN
2    70.0
3    43.5
4    70.0
Name: postTestScore, dtype: float64

In [76]:
clean_column_df = df.dropna(axis=1, how='all')
clean_df = clean_column_df.dropna(axis=0, how='all')
clean_df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [85]:
clean_df['postTestScore'].fillna(clean_df.groupby('sex')['postTestScore'].transform('mean'), inplace=True)
clean_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['postTestScore'].fillna(clean_df.groupby('sex')['postTestScore'].transform('mean'), inplace=True)


Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,70.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [89]:
clean_df['preTestScore'].fillna(clean_df.groupby('sex')['preTestScore'].transform('mean'), inplace=True)
clean_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['preTestScore'].fillna(clean_df.groupby('sex')['preTestScore'].transform('mean'), inplace=True)


Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,3.0,70.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


## Titanic Missing Value Handling

#### Load dataset

In [91]:
import os
DATA_DIR = './titanic'
data_files = reversed([os.path.join(DATA_DIR, filename) for filename in os.listdir(DATA_DIR)])
df_list = []
for filename in data_files:
    df_list.append(pd.read_csv(filename))
df = pd.concat(df_list, sort=False)
df = df.reset_index(drop=True)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [94]:
number_of_train_dataset = df['Survived'].notnull().sum()  # train data의 개수
number_of_test_dataset = df['Survived'].isnull().sum()  # test data의 개수
y_true = df.pop('Survived')[:number_of_train_dataset]  # train data까지 슬라이싱

In [95]:
pd.options.display.float_format = '{:.2f}'.format
df.isnull().sum() / len(df) * 100

PassengerId    0.00
Pclass         0.00
Name           0.00
Sex            0.00
Age           20.09
SibSp          0.00
Parch          0.00
Ticket         0.00
Fare           0.08
Cabin         77.46
Embarked       0.15
dtype: float64

#### 성별에 따른 나이가 차이가 있는지 확인

- 정규분포 확인 등 통계적 검증 필요

In [105]:
# 성별에 따라 나이가 크게 차이가 나지 않는다는 것을 확인
df[df['Age'].notnull()].groupby('Sex')['Age'].mean()

Sex
female   28.69
male     30.59
Name: Age, dtype: float64

#### Pclass에 따른 나이의 차이가 있는지 확인

In [106]:
# Pclass가 낮아짐에 따라 나이도 어려지는 것을 확인
df[df['Age'].notnull()].groupby('Pclass')['Age'].mean()

Pclass
1   39.16
2   29.51
3   24.82
Name: Age, dtype: float64

#### Pclass별로 나이의 평균값 채워넣기

- 더이상 Age는 비어있지 않음

In [109]:
df['Age'].fillna(df[df['Age'].notnull()].groupby('Pclass')['Age'].transform('mean'), inplace=True)
df.isnull().sum() / len(df) * 100

PassengerId    0.00
Pclass         0.00
Name           0.00
Sex            0.00
Age           20.09
SibSp          0.00
Parch          0.00
Ticket         0.00
Fare           0.08
Cabin         77.46
Embarked       0.15
dtype: float64