# Missing Value (결측치 처리)

## Missing Value Strategy
- 데이터가 없으면 sample을 drop
- 존재해야 하는 데이터의 최소 개수를 정해서, 그보다 적게 채워진 sample을 drop
- 데이터가 거의 없는 feature는 feature 자체를 drop
- 최빈값, 평균값으로 비어있는 데이터를 채우기

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Sample data adapted from https://chrisalbon.com/python/pandas_missing_data.html
# Row 1 is entirely NaN; row 2 is missing only its two test scores.
raw_data = dict(
    first_name=['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
    age=[42, np.nan, 36, 24, 73],
    sex=['m', np.nan, 'f', 'm', 'f'],
    preTestScore=[4, np.nan, np.nan, 2, 3],
    postTestScore=[25, np.nan, np.nan, 62, 70],
)
# Passing the dict keys as `columns` pins the column order to the insertion order.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


## Drop - drop()

In [5]:
# Count the missing (NaN) values in each column.
# Tip: df.isna().sum() / len(df) gives the fraction of missing values per column instead.
df.isna().sum()

first_name       1
last_name        1
age              1
sex              1
preTestScore     2
postTestScore    2
dtype: int64

In [31]:
# Drop every row that contains at least one NaN (the dropna() defaults, spelled out).
df_no_missing = df.dropna(axis=0, how='any')
df_no_missing

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


- `dropna(thresh = )` : thresh는 행이 drop되지 않기 위해 최소로 존재해야 하는 (NaN이 아닌) data의 개수를 의미함.

In [30]:
# thresh is the minimum number of non-NaN values a row needs in order to be kept,
# so rows with fewer than 5 non-NaN values are dropped.
df.dropna(axis=0, thresh=5)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


* `dropna(how = 'all')` : 전체가 NaN이면 drop
* `dropna(how = 'any')` : 하나라도 NaN이면 drop

In [35]:
# Drop only the rows in which every value is NaN.
df_cleaned_1 = df.dropna(axis=0, how="all")
df_cleaned_1

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [37]:
# Drop every row that has at least one NaN value.
df_cleaned_2 = df.dropna(axis=0, how="any")
df_cleaned_2

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [42]:
# Add a column in which every value is NaN, to demonstrate column-wise dropping.
df['location'] = np.nan
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


In [41]:
# Drop the columns (axis=1) in which every value is NaN.
df.dropna(axis="columns", how="all")

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


## Data Fill - fillna()
- 평균값(mean), 중위값(median), 최빈값(mode)을 활용

In [49]:
# Remove the all-NaN rows first, then the all-NaN columns.
cleaned_df = df.dropna(axis="index", how="all").dropna(axis="columns", how="all")
cleaned_df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [59]:
# Build a new frame whose preTestScore NaNs are replaced by the column mean;
# cleaned_df itself is left untouched.
fillna_df = cleaned_df.assign(
    preTestScore=cleaned_df["preTestScore"].fillna(cleaned_df["preTestScore"].mean())
)
fillna_df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,3.0,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [56]:
# Scores may differ between the sexes, so compute the fill value per sex group.
# transform("mean") returns a Series aligned to the original index in which
# each row holds the mean of the group that row belongs to.
cleaned_df["postTestScore"].groupby(cleaned_df["sex"]).transform("mean")

0    43.5
2    70.0
3    43.5
4    70.0
Name: postTestScore, dtype: float64

In [61]:
# Only the NaN cells are replaced; each one receives the mean postTestScore
# of that row's sex group.
fillna_df["postTestScore"] = cleaned_df["postTestScore"].fillna(
    value=cleaned_df["postTestScore"].groupby(cleaned_df["sex"]).transform("mean")
)
fillna_df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,3.0,70.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


## Missing Value Handling

In [62]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

import seaborn as sns
# Apply the seaborn theme once: white background with grid lines ("whitegrid"),
# with matplotlib's single-letter color codes remapped to the seaborn palette.
# sns.set() re-applies the whole theme on every call, so a single call with the
# final style is sufficient.
sns.set(style="whitegrid", color_codes=True)

In [64]:
DATA_DIR = './titanic'
# os.listdir() returns entries in arbitrary order, so sort explicitly.
# Reverse-alphabetical order puts train.csv ahead of test.csv, which the
# positional slice used for y_true below relies on.
data_files = sorted(
    (os.path.join(DATA_DIR, filename) for filename in os.listdir(DATA_DIR)),
    reverse=True,
)
df_list = []
for filename in data_files:
    df_list.append(pd.read_csv(filename))

# Stack all files into one frame and renumber the rows 0..n-1.
df = pd.concat(df_list, sort=False)
df = df.reset_index(drop=True)

# Rows with a Survived value are counted as training rows; rows without one as test rows.
number_of_train_dataset = df.Survived.notnull().sum()
number_of_test_dataset = df.Survived.isnull().sum()
# Remove the label column from the features and keep the labels of the leading training rows.
y_true = df.pop("Survived")[:number_of_train_dataset]
df.tail()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1304,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S
1305,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9,C105,C
1306,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S
1307,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S
1308,1309,3,"Peter, Master. Michael J",male,,1,1,2668,22.3583,,C


In [65]:
# Show the percentage of missing values per column, formatted to two decimals.
pd.set_option("display.float_format", '{:.2f}'.format)
df.isna().sum().div(len(df)).mul(100)

PassengerId    0.00
Pclass         0.00
Name           0.00
Sex            0.00
Age           20.09
SibSp          0.00
Parch          0.00
Ticket         0.00
Fare           0.08
Cabin         77.46
Embarked       0.15
dtype: float64

- Age의 빈 칸을 채워 보자. 근데 무슨 값으로 채우나?

In [68]:
# Mean age per sex, computed over the rows whose Age is known.
# The two means are close, so sex does not look like a useful key for filling Age.
df.dropna(subset=["Age"]).groupby(["Sex"])["Age"].mean()

Sex
female   28.69
male     30.59
Name: Age, dtype: float64

In [69]:
# Mean age per Pclass (ticket class), computed over the rows whose Age is known.
# The mean differs noticeably between classes, so the per-class mean is used as the fill value.
df.dropna(subset=["Age"]).groupby(["Pclass"])["Age"].mean()

Pclass
1   39.16
2   29.51
3   24.82
Name: Age, dtype: float64

In [76]:
# Fill each missing Age with the mean Age of that passenger's Pclass,
# then re-check the percentage of missing values per column.
df["Age"] = df["Age"].fillna(
    value=df["Age"].groupby(df["Pclass"]).transform("mean")
)
df.isna().sum().div(len(df)).mul(100)

PassengerId    0.00
Pclass         0.00
Name           0.00
Sex            0.00
Age            0.00
SibSp          0.00
Parch          0.00
Ticket         0.00
Fare           0.08
Cabin         77.46
Embarked       0.15
dtype: float64

- 사실 이렇게 빈 칸을 채울 값을 정하는 과정에는 통계적 검증이 필요하지만, 머신러닝에서는 그 과정보다는 코딩을 빠르게 하는 방향에 집중해 값을 채우는 경우가 많음.