# 1. Handling Missing Values

In [35]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [36]:
DATA_PATH = '../../data/'

# Read the raw train and test splits from DATA_PATH (train first, then test).
train, test = (
    pd.read_csv(os.path.join(DATA_PATH, file_name))
    for file_name in ('train.csv', 'test.csv')
)

# Keep untouched copies of the freshly loaded frames.
train_origin, test_origin = train.copy(), test.copy()

train.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


## 1) 결측값 확인

In [37]:
# Count the missing values in each column, ordered from most to least missing.
n_missing_values = (
    train
    .isnull()  # alias of isna()
    .sum()
    .sort_values(ascending=False)
)
n_missing_values.head(10)

UvAerosolLayerHeight_aerosol_height                        78584
UvAerosolLayerHeight_solar_zenith_angle                    78584
UvAerosolLayerHeight_solar_azimuth_angle                   78584
UvAerosolLayerHeight_sensor_azimuth_angle                  78584
UvAerosolLayerHeight_aerosol_pressure                      78584
UvAerosolLayerHeight_aerosol_optical_depth                 78584
UvAerosolLayerHeight_sensor_zenith_angle                   78584
NitrogenDioxide_tropopause_pressure                        18320
NitrogenDioxide_stratospheric_NO2_column_number_density    18320
NitrogenDioxide_NO2_slant_column_number_density            18320
dtype: int64

In [38]:
# Share of rows that are missing in each column (same ordering as n_missing_values).
n_rows = len(train)
ratio_missing_values = n_missing_values.div(n_rows)
ratio_missing_values.head(20)

UvAerosolLayerHeight_aerosol_height                        0.994445
UvAerosolLayerHeight_solar_zenith_angle                    0.994445
UvAerosolLayerHeight_solar_azimuth_angle                   0.994445
UvAerosolLayerHeight_sensor_azimuth_angle                  0.994445
UvAerosolLayerHeight_aerosol_pressure                      0.994445
UvAerosolLayerHeight_aerosol_optical_depth                 0.994445
UvAerosolLayerHeight_sensor_zenith_angle                   0.994445
NitrogenDioxide_tropopause_pressure                        0.231831
NitrogenDioxide_stratospheric_NO2_column_number_density    0.231831
NitrogenDioxide_NO2_slant_column_number_density            0.231831
NitrogenDioxide_cloud_fraction                             0.231831
NitrogenDioxide_absorbing_aerosol_index                    0.231831
NitrogenDioxide_NO2_column_number_density                  0.231831
NitrogenDioxide_sensor_altitude                            0.231831
NitrogenDioxide_sensor_azimuth_angle            

## 2) 칼럼 제거
- 결측값 비율이 너무 큰(약 99.44%) `UvAerosolLayerHeight~` 관련 칼럼 모두 제거

In [39]:
# Names of every column in the 'UvAerosolLayerHeight' group, in their original
# column order. A comprehension replaces the lambda that was bound to a
# one-letter global and passed through filter().
drop_columns = [col for col in train.columns if col.startswith('UvAerosolLayerHeight')]
drop_columns

['UvAerosolLayerHeight_aerosol_height',
 'UvAerosolLayerHeight_aerosol_pressure',
 'UvAerosolLayerHeight_aerosol_optical_depth',
 'UvAerosolLayerHeight_sensor_zenith_angle',
 'UvAerosolLayerHeight_sensor_azimuth_angle',
 'UvAerosolLayerHeight_solar_azimuth_angle',
 'UvAerosolLayerHeight_solar_zenith_angle']

In [40]:
# Remove the selected columns. Rebinding instead of inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: columns that are already gone
# are skipped instead of raising KeyError.
# NOTE(review): `test` is not modified in the cells visible here — confirm the
# same columns are dropped from it before it is used.
train = train.drop(columns=drop_columns, errors='ignore')

# Verify the removal programmatically, then show the per-column mask.
assert not train.columns.str.startswith('UvAerosolLayerHeight').any(), \
    'UvAerosolLayerHeight columns are still present'
train.columns.str.startswith('UvAerosolLayerHeight')

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False])

## 3) 결측값 대체

In [41]:
# Extract every column that still contains at least one missing value,
# preserving the original column order. The boolean Series is used directly as
# a mask, so no `== True` comparison or throwaway `temp` variable is needed.
has_missing = train.isna().any()
na_columns = has_missing[has_missing].index.tolist()

na_columns[:10]

['SulphurDioxide_SO2_column_number_density',
 'SulphurDioxide_SO2_column_number_density_amf',
 'SulphurDioxide_SO2_slant_column_number_density',
 'SulphurDioxide_cloud_fraction',
 'SulphurDioxide_sensor_azimuth_angle',
 'SulphurDioxide_sensor_zenith_angle',
 'SulphurDioxide_solar_azimuth_angle',
 'SulphurDioxide_solar_zenith_angle',
 'SulphurDioxide_SO2_column_number_density_15km',
 'CarbonMonoxide_CO_column_number_density']

In [42]:
# Disabled cell: nothing below is executed. When uncommented, it draws one line
# plot per column in `na_columns`, over a 2000-row window starting at the middle
# row of `train`, and saves each as '<PLOT_PATH>/<column>.png', clearing the
# figure between columns.
# NOTE(review): presumably the PLOT_PATH directory must already exist before
# savefig is called — confirm before re-enabling.
# # Save a plot of each column
# PLOT_PATH = './plots'
# start = train.shape[0] // 2
# end = start + 2000

# plt.figure(figsize = (15, 4))

# for col in na_columns:
#     plt.title(col)
#     plt.plot(train.index[start:end], train.loc[start:end - 1, col])
#     img_name = os.path.join(PLOT_PATH, col)
#     plt.savefig(f'{img_name}.png')
#     plt.clf()

In [43]:
def interpolate_by_location(frame, group_cols=('latitude', 'longitude')):
    """Linearly interpolate missing numeric values within each location.

    A frame-wide ``interpolate`` runs down the row index, so a gap at the start
    or end of one location's series would be filled from the rows of the
    neighbouring location. Interpolating per ``group_cols`` group keeps every
    fill inside a single location. Only numeric columns that actually contain
    missing values are touched; the string ID column is never passed to
    ``interpolate`` (recent pandas versions deprecate interpolating
    object-dtype columns).

    Parameters
    ----------
    frame : pandas.DataFrame
        Data whose rows are ordered in time within each location.
    group_cols : sequence of str
        Columns that identify one location.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is left unmodified.
    """
    group_cols = list(group_cols)
    result = frame.copy()

    numeric_cols = frame.select_dtypes(include='number').columns
    target_cols = [
        col for col in numeric_cols
        if col not in group_cols and frame[col].isna().any()
    ]
    if not target_cols:
        return result

    # `part` may be a per-group DataFrame or a single column; both accept the
    # same interpolate() arguments.
    result[target_cols] = (
        frame.groupby(group_cols, sort=False)[target_cols]
        .transform(lambda part: part.interpolate(method='linear', limit_direction='both'))
    )

    # A column that is missing for an entire location cannot be filled from
    # within that location; fall back to frame-wide interpolation for whatever
    # is left so the no-missing-values result still holds.
    if result[target_cols].isna().any().any():
        result[target_cols] = result[target_cols].interpolate(
            method='linear', limit_direction='both'
        )
    return result


# Linear interpolation, applied separately per (latitude, longitude) location.
train = interpolate_by_location(train)

# Check the remaining missing values -> expected: none
train.isna().sum().sort_values(ascending=False)

ID_LAT_LON_YEAR_WEEK                    0
UvAerosolIndex_sensor_azimuth_angle     0
Ozone_O3_slant_column_number_density    0
Ozone_O3_column_number_density_amf      0
Ozone_O3_column_number_density          0
                                       ..
NitrogenDioxide_sensor_altitude         0
NitrogenDioxide_sensor_azimuth_angle    0
NitrogenDioxide_sensor_zenith_angle     0
NitrogenDioxide_solar_azimuth_angle     0
emission                                0
Length: 69, dtype: int64

In [44]:
# Write the cleaned training frame to DATA_PATH as train_1.csv.
# NOTE(review): to_csv writes the row index as the first CSV column by default;
# pass index=False if downstream readers do not expect that extra column.
data_name = os.path.join(DATA_PATH, 'train_1.csv')
train.to_csv(path_or_buf=data_name)