# 5. Data Types and Missing Values

## Data Type

In [23]:
import pandas as pd
from sklearn.datasets import load_iris

# Build a sample DataFrame from the iris dataset.
iris = load_iris()  # load the dataset once and reuse it for names and values

# Drop the trailing 5 characters of each feature name (presumably the
# " (cm)" unit suffix -- confirm against scikit-learn's load_iris docs) and
# replace spaces with underscores, giving names such as "sepal_length".
columns = [name[:-5].replace(' ', '_') for name in iris.feature_names]

df = pd.DataFrame(data=iris.data, columns=columns)
df.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [4]:
# dtype: returns the data type of the selected column
df['sepal_length'].dtype

dtype('float64')

In [5]:
# dtypes: returns the data type of every column
df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
dtype: object

In [24]:
# astype(): change the data type of a specific column

df['temp'] = 0                                # add a 'temp' column holding the int 0
df['temp'] = df['temp'].astype('float64')     # cast int -> float and store it back
df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
temp            float64
dtype: object

## Missing data
- NaN: "Not a Number"의 약자로, 결측값을 나타내는 부동소수점(float) 값이다. 따라서 NaN이 들어 있는 숫자 칼럼의 타입은 float64가 된다.

In [39]:
# Insert a missing value: assign None to row position 148, column position 0
# (sepal_length is the first column of the frame).
df.iloc[148, 0] = None
df  # display the frame to inspect the result

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,temp
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0.0
146,6.3,2.5,5.0,1.9,0.0
147,6.5,,5.2,2.0,0.0
148,,3.4,5.4,2.3,0.0


In [41]:
# Show the rows where sepal_length is NaN (1): Series.isnull() method
df.loc[df['sepal_length'].isnull()]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,temp
148,,3.4,5.4,2.3,0.0


In [42]:
# Show the rows where sepal_length is NaN (2): top-level pd.isnull() function
df.loc[pd.isnull(df['sepal_length'])]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,temp
148,,3.4,5.4,2.3,0.0


In [44]:
# NaN -> 'Unknown'
# Assign the filled column back rather than calling fillna(inplace=True) on
# df.sepal_length: an in-place call on a selected column is chained
# assignment, which warns on pandas 2.x and leaves df unchanged once
# Copy-on-Write is active.
# Note: filling a numeric column with a string turns it into object dtype.
df['sepal_length'] = df['sepal_length'].fillna('Unknown')
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,temp
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0.0
146,6.3,2.5,5.0,1.9,0.0
147,6.5,,5.2,2.0,0.0
148,Unknown,3.4,5.4,2.3,0.0


In [51]:
# replace: swap values inside a column (change 5.0 in sepal_length to 100)
df.head(10)     # before the change

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,temp
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
5,5.4,3.9,1.7,0.4,0.0
6,4.6,3.4,1.4,0.3,0.0
7,5.0,3.4,1.5,0.2,0.0
8,4.4,2.9,1.4,0.2,0.0
9,4.9,3.1,1.5,0.1,0.0


In [52]:
# Replace every 5.0 in sepal_length with 100.
# The result is assigned back to the column: replace(inplace=True) on
# df.sepal_length is chained assignment, which warns on pandas 2.x and does
# not update df once Copy-on-Write is active.
df['sepal_length'] = df['sepal_length'].replace(5.0, 100)
df.head(10)     # after the change

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,temp
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,100.0,3.6,1.4,0.2,0.0
5,5.4,3.9,1.7,0.4,0.0
6,4.6,3.4,1.4,0.3,0.0
7,100.0,3.4,1.5,0.2,0.0
8,4.4,2.9,1.4,0.2,0.0
9,4.9,3.1,1.5,0.1,0.0
