In [1]:
import os
import pandas as pd
from tools.dataset import score_dataset
from tools.preprocessing.missing_values import get_missing_raio, delete_columns, impute_missing_values
from tools.preprocessing.outliers import delete_outliers, impute_outliers, get_limits
from tools.preprocessing.scaling import minmax

# 1. 데이터 로드

In [2]:
DATA_PATH = './data'

train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))

# train_origin = train.copy()
# test_origin = test.copy()

print(f'train shape: {train.shape}')
print(f'test shape: {test.shape}')

train shape: (79023, 76)
test shape: (24353, 75)


In [3]:
# 데이터 일부 확인
train.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


# 2. 결측값 처리

결측값 비율이 높은 칼럼 삭제

In [4]:
train_deleted, deleted_columns = delete_columns(train, 0.2, target='emission')

print(f'삭제된 칼럼 개수: {len(deleted_columns)}')

삭제된 칼럼 개수: 19


In [5]:
# 테스트 데이터에도 적용
test_deleted = test.drop(deleted_columns, axis=1)

In [5]:
# 결측값 제거 결과
data_list = [['train', train], ['train_deleted', train_deleted]]
results = []

for name, data in data_list:
    score = score_dataset(data, 'emission')
    results.append([name, score])

results

KeyboardInterrupt: 

칼럼을 삭제한 데이터셋의 스코어가 더 좋다.

가장 이상적인 결측값 대체법 찾기 

In [7]:
methods = ['mean', 'median', 'linear']
results = []

for method in methods:
    train_imputed = impute_missing_values(train_deleted, method)
    score = score_dataset(train_imputed, 'emission')
    results.append([method, score])
    print(f'method "{method}" 계산 완료')

results

AttributeError: 'tuple' object has no attribute 'drop'

'mean'으로 대체할 때 가장 좋은 score 기록

In [6]:
train_imputed, tool = impute_missing_values(train_deleted, 'mean')

test_imputed = test_deleted.fillna(tool)

In [23]:
# score_dataset(train_imputed, 'emission')

8.75663387517149

# 3. 이상치 처리

numeric 칼럼에 대해 이상치 처리

In [7]:
outliers_columns = list(train_imputed.select_dtypes(exclude=['O']).columns)
outliers_columns.remove('emission')
outliers_columns[:10]

['latitude',
 'longitude',
 'year',
 'week_no',
 'SulphurDioxide_SO2_column_number_density',
 'SulphurDioxide_SO2_column_number_density_amf',
 'SulphurDioxide_SO2_slant_column_number_density',
 'SulphurDioxide_cloud_fraction',
 'SulphurDioxide_sensor_azimuth_angle',
 'SulphurDioxide_sensor_zenith_angle']

이상치 기준이 되는 상한, 하한 구하기

In [8]:
limits = get_limits(train_imputed, outliers_columns)
limits.items()

dict_items([('latitude', [0.41900000000000026, -4.173]), ('longitude', [32.2845, 27.448500000000003]), ('year', [2024.0, 2016.0]), ('week_no', [78.0, -26.0]), ('SulphurDioxide_SO2_column_number_density', [0.0003911345979763542, -0.00033896669728099033]), ('SulphurDioxide_SO2_column_number_density_amf', [1.1669939597287828, 0.4693074770428757]), ('SulphurDioxide_SO2_slant_column_number_density', [0.00031344770484945946, -0.0002734028406228577]), ('SulphurDioxide_cloud_fraction', [0.3137327257603556, 0.009091709129575035]), ('SulphurDioxide_sensor_azimuth_angle', [140.1330893219491, -150.97045989778013]), ('SulphurDioxide_sensor_zenith_angle', [65.45264699107074, 10.663640519012876]), ('SulphurDioxide_solar_azimuth_angle', [43.523391257981416, -208.53374808358552]), ('SulphurDioxide_solar_zenith_angle', [40.21798721033717, 15.809738889080622]), ('SulphurDioxide_SO2_column_number_density_15km', [0.0001327659731399024, -0.00011663270986185858]), ('CarbonMonoxide_CO_column_number_density', 

In [9]:
train_outliers_deleted, deleted_indices = delete_outliers(train_imputed, outliers_columns, limits)
train_impute_outliers = impute_outliers(train_imputed, outliers_columns, limits)

In [10]:
# 삭제된 행의 개수
len(train_outliers_deleted)

43388

이상치 처리 비교(이상치 처리 전: 8.75663387517149)

In [15]:
outliers_dfs = [['deleted', train_outliers_deleted], ['imputed', train_impute_outliers]]
results = []

for name, outliers_df in outliers_dfs:
    score = score_dataset(outliers_df, 'emission')
    results.append([name, score])

results

[['deleted', 8.49429524751788], ['imputed', 8.7388773949861]]

이상치가 있는 행을 제거했을 때가 가장 이상적인 score 기록

In [11]:
# 테스트 데이터의 이상치도 제거
test_outliers_deleted, deleted_indices = delete_outliers(test_imputed, outliers_columns, limits)

# 4. Scaling

In [12]:
scaling_columns = list(train_outliers_deleted.select_dtypes(exclude=['O']).columns)
scaling_columns.remove('emission')

In [13]:
train_scaled, scaler = minmax(train_outliers_deleted, scaling_columns)
train_scaled.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,1.0,0.321429,0.0,0.0,0.315841,0.190895,0.354435,0.847635,0.091993,...,0.254227,0.532466,0.18356,0.323598,0.500352,0.497417,0.365752,0.074577,0.692598,3.750994
1,ID_-0.510_29.290_2019_01,1.0,0.321429,0.0,0.019231,0.492357,0.370518,0.489026,0.419006,0.602869,...,0.252499,0.65263,0.262747,0.161423,0.436948,0.787278,0.480748,0.020072,0.541363,4.025176
3,ID_-0.510_29.290_2019_03,1.0,0.321429,0.0,0.057692,0.529828,0.523511,0.525578,0.513303,0.494124,...,0.458194,0.327798,0.405198,0.319315,0.458323,0.506422,0.036103,0.126723,0.609514,4.305286
4,ID_-0.510_29.290_2019_04,1.0,0.321429,0.0,0.076923,0.356349,0.296029,0.384624,0.385232,0.547555,...,0.295457,0.586901,0.288386,0.147844,0.350901,0.842386,0.417331,0.051374,0.323296,4.347317
5,ID_-0.510_29.290_2019_05,1.0,0.321429,0.0,0.096154,0.86741,0.576404,0.877833,0.751332,0.469607,...,0.497557,0.302911,0.449689,0.134674,0.409876,0.785721,0.406023,0.086485,0.278068,4.310819


In [15]:
# score = score_dataset(train_scaled, 'emission')
# score

8.453965506427712

scaling을 진행한 데이터셋의 스코어가 더 이상적이다.

In [14]:
# 테스트 데이터에도 scaling 수행
test_scaled, _ = minmax(test_outliers_deleted, scaling_columns)
test_scaled.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle
2,ID_-0.510_29.290_2022_02,1.0,0.321429,0.0,0.041667,0.684135,0.190866,0.648351,0.243553,0.292511,...,0.038981,0.94215,0.039142,0.941054,0.206972,0.487669,0.857512,0.664983,0.016634,0.465836
3,ID_-0.510_29.290_2022_03,1.0,0.321429,0.0,0.0625,0.94395,0.323135,0.885029,0.660914,0.83759,...,0.421315,0.5093,0.437601,0.500001,0.231017,0.644403,0.41994,0.552758,0.099467,0.601213
4,ID_-0.510_29.290_2022_04,1.0,0.321429,0.0,0.083333,0.029188,0.155454,0.14873,0.672365,0.85678,...,0.323906,0.630926,0.336068,0.623932,0.261942,0.559009,0.499514,0.197469,0.095458,0.443763
5,ID_-0.510_29.290_2022_05,1.0,0.321429,0.0,0.104167,0.663442,0.303172,0.637222,0.491319,0.84378,...,0.494767,0.415232,0.513971,0.40415,0.335627,0.332187,0.649331,0.538641,0.089926,0.332803
6,ID_-0.510_29.290_2022_06,1.0,0.321429,0.0,0.125,0.204186,0.488461,0.200959,0.942623,0.840409,...,0.421844,0.517844,0.429338,0.517812,0.355924,0.409996,0.246942,0.611278,0.184667,0.464801
