# 결측치(Missing Value) 처리하기 실습

- 결측치(Missing Value)란?
    : 값이 들어 있지 않은 것으로, NaN 또는 NA로 표기

- 관련 메서드
    - 결측치 여부 체크 : isnull(), isna() => 비어있으면 True
                         notnull(), notna() => 비어있으면 False
    - 결측치 삭제 : dropna()
    - 결측치 치환 : fillna()

### 처리 방법 (1) 삭제

In [1]:
# 모듈 로딩
import pandas as pd

# File location settings: the data directory and the weather CSV inside it
DIR_PATH = '../Data/'
FILE_NAME = f'{DIR_PATH}weather.csv'

In [2]:
# (1) Load the CSV file into a DataFrame
weather=pd.read_csv(FILE_NAME)

In [3]:
# (2) Inspect the data: preview the first rows
weather.head()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,


In [4]:
# Print the column summary: non-null count and dtype of every column
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 35 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       22 non-null     object 
 1   year     22 non-null     int64  
 2   month    22 non-null     int64  
 3   element  22 non-null     object 
 4   d1       2 non-null      float64
 5   d2       4 non-null      float64
 6   d3       4 non-null      float64
 7   d4       2 non-null      float64
 8   d5       8 non-null      float64
 9   d6       2 non-null      float64
 10  d7       2 non-null      float64
 11  d8       2 non-null      float64
 12  d9       0 non-null      float64
 13  d10      2 non-null      float64
 14  d11      2 non-null      float64
 15  d12      0 non-null      float64
 16  d13      2 non-null      float64
 17  d14      4 non-null      float64
 18  d15      2 non-null      float64
 19  d16      2 non-null      float64
 20  d17      2 non-null      float64
 21  d18      0 non-null      float64
 ...

In [5]:
# Count the missing values in each column
weather.isna().sum()

id          0
year        0
month       0
element     0
d1         20
d2         18
d3         18
d4         20
d5         14
d6         20
d7         20
d8         20
d9         22
d10        20
d11        20
d12        22
d13        20
d14        18
d15        20
d16        20
d17        20
d18        22
d19        22
d20        22
d21        22
d22        22
d23        18
d24        22
d25        20
d26        20
d27        16
d28        20
d29        18
d30        20
d31        20
dtype: int64

In [6]:
# Show the January rows (tmax and tmin); columns that are NaN in every row
# of the whole table are dropped first
weather.dropna(how='all', axis='columns').loc[weather['month'] == 1]
# Appending .iloc[:, 4:].max(axis=1)[0] would reduce this to the highest January reading

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d16,d17,d23,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,


In [7]:
# Lowest January temperature: keep the day columns (position 4 onward), take
# each row's minimum, then read the value for row label 1 (the tmin row)
(
    weather.dropna(axis='columns', how='all')
    .loc[weather['month'] == 1]
    .iloc[:, 4:]
    .min(axis='columns')
    .loc[1]
)

14.5

## 강사님 버전

In [8]:
# (1) Extract only the January data: the rows at positions 0 and 1 hold January's tmax/tmin
# NOTE(review): positional slicing assumes January occupies the first two rows of the file
oneMonth=weather.iloc[:2]

In [9]:
# Display the January subset
oneMonth

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,


In [10]:
# (2) Remove NaN values with dropna(); its defaults are spelled out here:
#     row-wise, dropping any row that contains at least one NaN
oneMonth.dropna(axis='index', how='any')

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31


In [36]:
# Work column-wise instead: drop only the columns whose tmax and tmin values
# are both NaN
oneMonth.dropna(how='all', axis='columns')

Unnamed: 0,id,year,month,element,d30
0,MX17004,2010,1,tmax,27.8
1,MX17004,2010,1,tmin,14.5


In [39]:
# February's highest/lowest temperatures: take the rows at positions 2 and 3,
# then keep only the columns that hold at least one reading
twoMonth = weather.iloc[[2, 3]]
twoMonth.dropna(how='all', axis='columns')

Unnamed: 0,id,year,month,element,d2,d3,d11,d23
2,MX17004,2010,2,tmax,27.3,24.1,29.7,29.9
3,MX17004,2010,2,tmin,14.4,14.4,13.4,10.7


In [41]:
# 과제 0704-(1) 2010년 1월~12월까지 최고온도/최저온도 추출


### 처리방법 (2) 치환

In [42]:
# (1) Reload the CSV file into a DataFrame and preview the first rows
weather=pd.read_csv(FILE_NAME)
weather.head()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,


In [44]:
# (2) Substitute NaN entries with another value via fillna()
# Here every NaN becomes 0
weather.fillna(value=0)

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.8,0.0
1,MX17004,2010,1,tmin,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.5,0.0
2,MX17004,2010,2,tmax,0.0,27.3,24.1,0.0,0.0,0.0,...,0.0,29.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,MX17004,2010,2,tmin,0.0,14.4,14.4,0.0,0.0,0.0,...,0.0,10.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MX17004,2010,3,tmax,0.0,0.0,0.0,0.0,32.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,MX17004,2010,3,tmin,0.0,0.0,0.0,0.0,14.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,MX17004,2010,4,tmax,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,36.3,0.0,0.0,0.0,0.0
7,MX17004,2010,4,tmin,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,16.7,0.0,0.0,0.0,0.0
8,MX17004,2010,5,tmax,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,33.2,0.0,0.0,0.0,0.0
9,MX17004,2010,5,tmin,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,18.2,0.0,0.0,0.0,0.0


In [46]:
# Back-fill: replace each NaN with the next non-NaN value further down the same column;
# columns with no value at all stay NaN.
# DataFrame.bfill() is used because the `method` keyword of fillna() is deprecated since pandas 2.1.
weather.bfill()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,29.9,27.3,24.1,27.2,32.1,27.8,...,,29.9,,29.7,28.1,36.3,31.2,30.1,27.8,25.4
1,MX17004,2010,1,tmin,29.9,27.3,24.1,27.2,32.1,27.8,...,,29.9,,29.7,28.1,36.3,31.2,30.1,14.5,25.4
2,MX17004,2010,2,tmax,29.9,27.3,24.1,27.2,32.1,27.8,...,,29.9,,29.7,28.1,36.3,31.2,30.1,,25.4
3,MX17004,2010,2,tmin,29.9,14.4,14.4,27.2,32.1,27.8,...,,10.7,,29.7,28.1,36.3,31.2,30.1,,25.4
4,MX17004,2010,3,tmax,29.9,31.3,28.6,27.2,32.1,27.8,...,,26.4,,29.7,28.1,36.3,31.2,30.1,,25.4
5,MX17004,2010,3,tmin,29.9,31.3,28.6,27.2,14.2,27.8,...,,26.4,,29.7,28.1,36.3,31.2,30.1,,25.4
6,MX17004,2010,4,tmax,29.9,31.3,28.6,27.2,29.6,27.8,...,,26.4,,29.7,28.1,36.3,31.2,30.1,,25.4
7,MX17004,2010,4,tmin,29.9,31.3,28.6,27.2,29.6,27.8,...,,26.4,,29.7,28.1,16.7,31.2,30.1,,25.4
8,MX17004,2010,5,tmax,29.9,31.3,28.6,27.2,29.6,27.8,...,,26.4,,29.7,28.1,33.2,31.2,30.1,,25.4
9,MX17004,2010,5,tmin,29.9,31.3,28.6,27.2,29.6,27.8,...,,26.4,,29.7,28.1,18.2,31.2,30.1,,25.4


In [None]:
# 