# 결측치(Missing Value) 처리 (2)

- 데이터 정보 확인에서 NaN 감지가 안 되는 경우에 대한 처리

- (1) NaN으로 처리해야 하는 데이터를 찾기
- (2) 데이터를 NaN으로 변경
- (3) NaN 데이터 처리

In [2]:
# 모듈 로딩
import pandas as pd

# 파일 관련 변수 선언
DIR_PATH='../Data/'
FILE_NAME=DIR_PATH+'auto_mpg.csv'

In [5]:
# (1) CSV FILE => DataFrame 로딩
mpgDF=pd.read_csv(FILE_NAME)

In [6]:
# (2) Data 정보 확인 - 요약 정보
mpgDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [8]:
# 일부 데이터 확인
mpgDF.head()

# horsepower가 살펴보면 정수인데 타입이 object로 나옴

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [9]:
# NaN 체크
mpgDF.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [10]:
# (3) 데이터 전처리
# 데이터 --- 데이터 타입 맞도록 설정
mpgDF.horsepower

0      130
1      165
2      150
3      150
4      140
      ... 
393     86
394     52
395     84
396     79
397     82
Name: horsepower, Length: 398, dtype: object

In [13]:
# 타입 변경 => astype(타입)
# (1) 데이터의 고유값 확인
mpgDF.horsepower.unique()
#mpgDF.horsepower.astype(int)    # 오류 남

# 그냥 바꾸기 전에 unique()로 값 확인하기

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [16]:
# (2) 문제가 되는 데이터를 NaN으로 변경하기 => replace(원래값, 새로운값)
# NaN 상수 사용을 위해서 numpy 모듈 불러오기
import numpy as np

mpgDF.horsepower.replace('?', np.nan, inplace=True)
mpgDF.horsepower.unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', nan, '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [17]:
mpgDF.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [20]:
# 0으로 채우거나 데이터를 살펴보고 값을 채우거나.
# NaN 값을 0으로 채우기
mpgDF.horsepower.fillna(0, inplace=True)
mpgDF.horsepower.isnull().sum()

0

In [23]:
# horsepower 컬럼을 object => int로 타입 변경 : astype()
mpgDF.horsepower.astype(int)

0      130
1      165
2      150
3      150
4      140
      ... 
393     86
394     52
395     84
396     79
397     82
Name: horsepower, Length: 398, dtype: int32

In [24]:
# 잘 바뀌었는지 확인
mpgDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB
