## Pandas Basic 4

In [1]:
from IPython.display import Image

from pandas import Series, DataFrame
import pandas as pd
import numpy as np

## 4. Handling missing data
누락된 데이터 처리하기

- pandas의 설계 목표 중 하나는 누락 데이터를 가능한 한 쉽게 처리할 수 있도록 하는 것이다.
- pandas는 누락된 데이터를 실수든 아니든 모두 NaN(Not a Number)으로 취급한다.
- 그래서 누락된 값을 쉽게 찾을 수 있다.

In [2]:
# Build a Series of strings in which position 2 is missing (np.nan).
string_data = Series(
    data=["apple", "pear", np.nan, "avocado"],
)
string_data

0      apple
1       pear
2        NaN
3    avocado
dtype: object

In [3]:
# isnull() returns a boolean mask that is True where an entry is missing
# (here only position 2, the NaN).
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
string_data[0] = None
string_data.isnull()
# Index 0 was replaced with None, so isnull() now reports it as missing too.

0     True
1    False
2     True
3    False
dtype: bool

### Filtering out missing data
누락된 데이터 골라내기

- Series에 dropna 메소드를 적용하면, 누락되지 않은(실제 값이 들어 있는) 데이터와 그 색인값만 담은 Series를 반환한다.

In [6]:
# Alias NaN as NA so that missing entries stand out in the literals.
from numpy import nan as NA

# Numeric Series with missing values at positions 1 and 3.
data = Series(data=[1, NA, 3.5, NA, 7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [7]:
# Weed out the missing values: dropna() keeps only the entries that hold
# data, together with their original index labels.
data = Series(data=[1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [9]:
# Same result as dropna(): boolean-index the Series with the notnull() mask.
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
# 4x3 frame: row 0 is complete, rows 1 and 3 are partly missing,
# and row 2 is missing entirely.
data = DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [11]:
# By default dropna() drops every row that contains at least one NaN.
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [12]:
# how='all' drops only the rows in which every value is NA.
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [14]:
# Add a column labelled 5 whose values are all NA.
# NOTE(review): the recorded output also shows an all-NA column 4, presumably
# added by a cell that is not included here (In [13]) — confirm.
data[5] = NA
data

Unnamed: 0,0,1,2,4,5
0,1.0,6.5,3.0,,
1,1.0,,,,
2,,,,,
3,,6.5,3.0,,


In [15]:
# axis=1 applies the rule to columns: drop the columns whose values are all NA.
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [16]:
# 7x3 frame of standard-normal random numbers (values differ on every run).
df = DataFrame(data=np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,0.230926,0.620068,1.954758
1,0.395055,0.557786,-0.810938
2,1.396638,1.848773,-0.165362
3,0.515624,1.755848,0.188746
4,0.348043,-0.006708,-0.015069
5,-1.390579,0.077908,1.00872
6,0.804866,-1.713002,0.404705


In [17]:
# Blank out the first four rows of column 1 and the first two rows of column 2.
df.iloc[:4, 1] = NA 
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.230926,,
1,0.395055,,
2,1.396638,,-0.165362
3,0.515624,,0.188746
4,0.348043,-0.006708,-0.015069
5,-1.390579,0.077908,1.00872
6,0.804866,-1.713002,0.404705


In [18]:
# thresh: keep only the rows that hold at least this many non-NA values.
df.dropna(thresh=3)
# Only the rows in which all 3 values are present remain.

Unnamed: 0,0,1,2
4,0.348043,-0.006708,-0.015069
5,-1.390579,0.077908,1.00872
6,0.804866,-1.713002,0.404705


In [19]:
df.dropna(thresh=2)
# Rows with at least 2 non-NA values remain (not exactly 2).

Unnamed: 0,0,1,2
2,1.396638,,-0.165362
3,0.515624,,0.188746
4,0.348043,-0.006708,-0.015069
5,-1.390579,0.077908,1.00872
6,0.804866,-1.713002,0.404705


In [20]:
df.dropna(thresh=1)
# Rows with at least 1 non-NA value remain, which here is every row.

Unnamed: 0,0,1,2
0,0.230926,,
1,0.395055,,
2,1.396638,,-0.165362
3,0.515624,,0.188746
4,0.348043,-0.006708,-0.015069
5,-1.390579,0.077908,1.00872
6,0.804866,-1.713002,0.404705


### Filling in missing data
누락된 값 채우기

In [22]:
# Show df again as the starting point for the fill examples.
df

Unnamed: 0,0,1,2
0,0.230926,,
1,0.395055,,
2,1.396638,,-0.165362
3,0.515624,,0.188746
4,0.348043,-0.006708,-0.015069
5,-1.390579,0.077908,1.00872
6,0.804866,-1.713002,0.404705


In [23]:
df.fillna(0)
# Fill every NaN with 0. This returns a new object; df itself is unchanged.

Unnamed: 0,0,1,2
0,0.230926,0.0,0.0
1,0.395055,0.0,0.0
2,1.396638,0.0,-0.165362
3,0.515624,0.0,0.188746
4,0.348043,-0.006708,-0.015069
5,-1.390579,0.077908,1.00872
6,0.804866,-1.713002,0.404705


In [25]:
# Passing a dict to fillna fills each column with its own value
# (column 1 -> 0.5, column 2 -> -1).
df.fillna({1: 0.5, 2: -1})

Unnamed: 0,0,1,2
0,0.230926,0.5,-1.0
1,0.395055,0.5,-1.0
2,1.396638,0.5,-0.165362
3,0.515624,0.5,0.188746
4,0.348043,-0.006708,-0.015069
5,-1.390579,0.077908,1.00872
6,0.804866,-1.713002,0.404705


In [26]:
# inplace defaults to False, in which case fillna returns a new, filled object
# and leaves df untouched.
# With inplace=True the existing df is modified directly, so the return value
# carries no useful information (it is None in pandas 1.x/2.x) and is not
# captured. Not assigning to `_` also avoids shadowing IPython's last-output
# variable.
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.230926,0.0,0.0
1,0.395055,0.0,0.0
2,1.396638,0.0,-0.165362
3,0.515624,0.0,0.188746
4,0.348043,-0.006708,-0.015069
5,-1.390579,0.077908,1.00872
6,0.804866,-1.713002,0.404705


In [27]:
# Fresh 6x3 frame of random normals; the tail of column 1 (rows 2 onward)
# and of column 2 (rows 4 onward) is then set to NA.
df = DataFrame(data=np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,0.050236,-1.39789,-0.449767
1,-1.363689,0.717781,0.756131
2,0.022658,,-0.087257
3,0.539615,,1.518329
4,0.905723,,
5,0.127411,,


In [28]:
# Forward fill: each NaN takes the last valid value above it in its column.
# ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,0.050236,-1.39789,-0.449767
1,-1.363689,0.717781,0.756131
2,0.022658,0.717781,-0.087257
3,0.539615,0.717781,1.518329
4,0.905723,0.717781,1.518329
5,0.127411,0.717781,1.518329


In [29]:
# Forward fill, but carry each value into at most 2 consecutive NaNs;
# any further NaNs in the same run stay missing.
# ffill(limit=...) replaces fillna(method='ffill', limit=...), whose `method`
# argument is deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.050236,-1.39789,-0.449767
1,-1.363689,0.717781,0.756131
2,0.022658,0.717781,-0.087257
3,0.539615,0.717781,1.518329
4,0.905723,,1.518329
5,0.127411,,1.518329


In [30]:
# Replace each missing entry with the mean of the three values that are
# present (mean() leaves the NaNs out of the average).
data = Series(data=[1.0, NA, 3.5, NA, 7])
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64