In [1]:
import os
import pandas as pd
from tools.dataset import score_dataset
from tools.preprocessing.missing_values import get_missing_raio, delete_columns, impute_missing_values
from tools.preprocessing.outliers import delete_outliers, impute_outliers, get_limits
from tools.preprocessing.scaling import minmax
from tools.engineering.mi import mi_score
from tools.engineering.encoding import one_hot
from tools.engineering.clustering import kmc


# 1. 데이터 로드

In [2]:
DATA_PATH = './data'

# Read the train and test CSV files that live under DATA_PATH.
# NOTE: later cells modify these frames, so keep a .copy() here if the raw
# data is needed again further down.
train, test = (
    pd.read_csv(os.path.join(DATA_PATH, file_name))
    for file_name in ('train.csv', 'test.csv')
)

print(f'train shape: {train.shape}')
print(f'test shape: {test.shape}')

train shape: (79023, 76)
test shape: (24353, 75)


In [3]:
# Preview the first few rows of the training data
train.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


In [4]:
# Drop the ID string column from both splits; its values only repeat
# latitude / longitude / year / week_no (see the head() preview above).
train, test = (
    frame.drop(columns=['ID_LAT_LON_YEAR_WEEK'])
    for frame in (train, test)
)

# 2. 결측값 처리

In [5]:
# Columns that must not be deleted
protected_columns = [
    'latitude',
    'longitude',
    'week_no',
    'year',
]

결측값 비율이 높은 칼럼 삭제

In [6]:
# Threshold handed to delete_columns -- presumably the missing-value ratio
# above which a column is dropped (per the section heading); confirm against
# tools.preprocessing.missing_values.
DROP_THRESHOLD = 0.3

# NOTE(review): protected_columns is defined just above but is not passed
# here -- confirm delete_columns does not need it.
train_deleted, deleted_columns = delete_columns(
    train,
    DROP_THRESHOLD,
    target='emission',
)

print(f'삭제된 칼럼 개수: {len(deleted_columns)}')

삭제된 칼럼 개수: 7


In [7]:
# Apply the same column removal to the test data
test_deleted = test.drop(columns=deleted_columns)

In [8]:
# # 결측값 제거 결과
# data_list = [['train', train], ['train_deleted', train_deleted]]
# results = []

# for name, data in data_list:
#     score = score_dataset(data, 'emission')
#     results.append([name, score])

# results

칼럼을 삭제한 데이터셋의 스코어가 더 좋다.

가장 이상적인 결측값 대체법 찾기 

In [9]:
# methods = ['mean', 'linear', 'fill']
# results = []

# for method in methods:
#     train_imputed, _ = impute_missing_values(train_deleted, method)
#     score = score_dataset(train_imputed, 'emission')
#     results.append([method, score])
#     print(f'method "{method}" 계산 완료')

# results

In [10]:
# best_method = sorted(results, key=lambda x: x[1])[0]
# print(f'best method: {best_method}')

In [11]:
# The imputation method is pinned to 'fill'; the method search that would
# populate best_method is commented out in the cells above.
IMPUTE_METHOD = 'fill'

# Only the first element of each returned pair is kept.
train_imputed, _ = impute_missing_values(train_deleted, IMPUTE_METHOD)
test_imputed, _ = impute_missing_values(test_deleted, IMPUTE_METHOD)

In [12]:
# Working handles for the rest of the notebook. These are plain aliases, not
# copies: in-place column assignments on train_now / test_now are also visible
# through train_imputed / test_imputed.
train_now, test_now = train_imputed, test_imputed

# 3. 칼럼 추가

month 및 covid 칼럼 추가

In [13]:
# Derive a calendar month from week_no. The year is pinned to 2021 purely to
# have a concrete calendar to parse against: '%W' is the week of the year
# (Monday as first day) and the trailing '0' is '%w' = Sunday of that week.
week_end = pd.to_datetime('2021' + train_now['week_no'].astype(str) + '0', format='%Y%W%w')

# The Sunday of week 52 is 2022-01-02, so a plain .dt.month would label the
# last week of the year as January (and the covid flag below would then be
# False for the final week of 2020). Dates that rolled over into the next
# year are clamped to December instead.
train_now['month_no'] = week_end.dt.month.where(week_end.dt.year == 2021, 12)

# covid is True for rows in 2020 from March onwards.
train_now['covid'] = (train_now.year == 2020) & (train_now.month_no > 2)

In [14]:
# Derive a calendar month from week_no. The year is pinned to 2021 purely to
# have a concrete calendar to parse against: '%W' is the week of the year
# (Monday as first day) and the trailing '0' is '%w' = Sunday of that week.
week_end = pd.to_datetime('2021' + test_now['week_no'].astype(str) + '0', format='%Y%W%w')

# The Sunday of week 52 is 2022-01-02; clamp any date that rolled over into
# the next year to December so the last week of a year is never labelled
# January.
test_now['month_no'] = week_end.dt.month.where(week_end.dt.year == 2021, 12)

# covid is True for rows in 2020 from March onwards.
test_now['covid'] = (test_now.year == 2020) & (test_now.month_no > 2)

In [15]:
# One-hot encode the boolean covid flag. The encoder returned for the training
# frame is reused for the test frame; the second return value of the test call
# is discarded.
train_now, encoder = one_hot(train_now, 'covid')
test_now, _ = one_hot(test_now, 'covid', encoder=encoder)



In [16]:
# Give the integer-labelled columns 0 and 1 (left behind by the covid one-hot
# step) readable names, identically on both frames.
covid_labels = {0: 'covid_false', 1: 'covid_true'}
train_now = train_now.rename(columns=covid_labels)
test_now = test_now.rename(columns=covid_labels)

season 칼럼 추가

In [17]:
# Season name -> list of week_no values belonging to it.
# 'summer' wraps around the year boundary (weeks 48-52 plus weeks 0-8).
SEASON = {
    'summer': [*range(0, 9), *range(48, 53)],
    'fall': list(range(9, 22)),
    'winter': list(range(22, 35)),
    'spring': list(range(35, 48)),
}

In [18]:
def insert_season(x, season):
    """Return the name of the season that week ``x`` belongs to.

    Parameters
    ----------
    x : week number to classify.
    season : mapping with the keys 'summer', 'fall', 'winter' and 'spring',
        each holding a container of week numbers.

    Returns
    -------
    str
        'summer', 'fall', 'winter' or 'spring'. Seasons are checked in that
        order, so the first match wins if the containers overlap.

    Raises
    ------
    ValueError
        If ``x`` is not listed under any of the four seasons.
    """
    for name in ('summer', 'fall', 'winter', 'spring'):
        if x in season[name]:
            return name
    # ValueError (still an Exception subclass) carrying the offending value
    # makes a bad week_no easy to trace.
    raise ValueError(f'unknown week: {x!r}')

In [19]:
# Label every row of both frames with its season, looked up from week_no via
# the SEASON table.
for frame in (train_now, test_now):
    frame['season'] = frame['week_no'].apply(insert_season, args=(SEASON,))

# Per-season row counts for train and test.
print(train_now['season'].value_counts(), test_now['season'].value_counts())

season
summer    20874
fall      19383
winter    19383
spring    19383
Name: count, dtype: int64 season
fall      6461
winter    6461
spring    6461
summer    4970
Name: count, dtype: int64


In [20]:
# Add the engineered columns to the protected list, which is passed as
# protected= to mi_score in the feature-selection step. Appending only the
# missing names keeps the cell idempotent: re-running it no longer duplicates
# entries in protected_columns.
for name in ('covid_false', 'covid_true', 'month_no'):
    if name not in protected_columns:
        protected_columns.append(name)

# 4. Feature Selection (by Mutual Information, Correlation)

In [25]:
# Feature selection against the 'emission' target. 0.2 and corr_threshold=0.3
# are passed straight through to mi_score -- presumably the mutual-information
# and correlation cut-offs; confirm against tools.engineering.mi.
train_now, columns, mi_scores = mi_score(
    train_now,
    'emission',
    0.2,
    corr=True,
    corr_threshold=0.3,
    protected=protected_columns,
)

# The target is taken out of the kept-column list so that the list can be
# used to select columns from the test frame, which has no 'emission'.
columns.remove('emission')
print(f'train_now의 칼럼 개수: {len(train_now.columns)}')

train_now의 칼럼 개수: 20


In [26]:
# Display the training frame returned by the mi_score selection step.
train_now

Unnamed: 0,covid_false,NitrogenDioxide_sensor_zenith_angle,NitrogenDioxide_NO2_column_number_density,NitrogenDioxide_NO2_slant_column_number_density,covid_true,NitrogenDioxide_sensor_altitude,year,NitrogenDioxide_absorbing_aerosol_index,NitrogenDioxide_cloud_fraction,longitude,NitrogenDioxide_solar_zenith_angle,latitude,NitrogenDioxide_solar_azimuth_angle,SulphurDioxide_sensor_zenith_angle,NitrogenDioxide_tropospheric_NO2_column_number_density,emission,month_no,week_no,NitrogenDioxide_stratospheric_NO2_column_number_density,season
0,1.0,35.265195,0.000047,0.000093,0.0,829859.960368,2019,-1.935386,0.067038,29.290,30.054262,-0.510,-138.343908,50.843559,1.639765e-05,3.750994,1,0,0.000030,summer
1,1.0,35.265195,0.000047,0.000093,0.0,829859.960368,2019,-1.935386,0.067038,29.290,30.054262,-0.510,-138.343908,39.137194,1.639765e-05,4.025176,1,1,0.000030,summer
2,1.0,52.868816,0.000031,0.000080,0.0,829527.125000,2019,-2.754374,0.072135,29.290,23.206415,-0.510,-150.191757,52.868816,4.267369e-07,4.231381,1,2,0.000030,summer
3,1.0,52.868816,0.000031,0.000080,0.0,829527.125000,2019,-2.754374,0.072135,29.290,23.206415,-0.510,-150.191757,52.868816,4.267369e-07,4.305286,1,3,0.000030,summer
4,1.0,35.078624,0.000051,0.000093,0.0,829744.843750,2019,-1.450563,0.049393,29.290,24.729026,-0.510,-136.257947,35.515587,2.056437e-05,4.347317,1,4,0.000030,summer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79018,1.0,57.593802,0.000048,0.000136,0.0,830208.833058,2021,-0.728791,0.152581,30.301,32.324131,-3.299,-130.881374,55.988022,8.024310e-06,29.404171,12,48,0.000040,summer
79019,1.0,30.917440,0.000049,0.000109,0.0,830354.500000,2021,-1.213713,0.101535,30.301,33.416246,-3.299,-128.938451,19.435339,8.476496e-06,29.186497,12,49,0.000040,summer
79020,1.0,39.123617,0.000043,0.000100,0.0,830419.335612,2021,-0.366031,0.187802,30.301,34.959241,-3.299,-128.051809,19.435339,2.979501e-06,29.131205,12,50,0.000039,summer
79021,1.0,32.599393,0.000049,0.000111,0.0,830375.349911,2021,-0.596050,0.045697,30.301,33.906037,-3.299,-129.573396,32.599393,1.014400e-05,28.125792,12,51,0.000039,summer


In [27]:
# Keep only the columns that survived feature selection on the training data
# (the kept-column list with 'emission' already removed).
test_now = test_now[columns]

# 5. Encoding

In [28]:
# One-hot encode 'season'. The encoder returned for the training frame is
# reused for the test frame so both get the same encoded columns.
# NOTE(review): the encoded columns come back labelled 0..3 (see the output
# below) and are not renamed afterwards.
train_now, encoder = one_hot(train_now, 'season')
test_now, _ = one_hot(test_now, 'season', encoder=encoder)



In [29]:
# Display the training frame after the season one-hot encoding.
train_now

Unnamed: 0,covid_false,NitrogenDioxide_sensor_zenith_angle,NitrogenDioxide_NO2_column_number_density,NitrogenDioxide_NO2_slant_column_number_density,covid_true,NitrogenDioxide_sensor_altitude,year,NitrogenDioxide_absorbing_aerosol_index,NitrogenDioxide_cloud_fraction,longitude,...,SulphurDioxide_sensor_zenith_angle,NitrogenDioxide_tropospheric_NO2_column_number_density,emission,month_no,week_no,NitrogenDioxide_stratospheric_NO2_column_number_density,0,1,2,3
0,1.0,35.265195,0.000047,0.000093,0.0,829859.960368,2019,-1.935386,0.067038,29.290,...,50.843559,1.639765e-05,3.750994,1,0,0.000030,0.0,0.0,1.0,0.0
1,1.0,35.265195,0.000047,0.000093,0.0,829859.960368,2019,-1.935386,0.067038,29.290,...,39.137194,1.639765e-05,4.025176,1,1,0.000030,0.0,0.0,1.0,0.0
2,1.0,52.868816,0.000031,0.000080,0.0,829527.125000,2019,-2.754374,0.072135,29.290,...,52.868816,4.267369e-07,4.231381,1,2,0.000030,0.0,0.0,1.0,0.0
3,1.0,52.868816,0.000031,0.000080,0.0,829527.125000,2019,-2.754374,0.072135,29.290,...,52.868816,4.267369e-07,4.305286,1,3,0.000030,0.0,0.0,1.0,0.0
4,1.0,35.078624,0.000051,0.000093,0.0,829744.843750,2019,-1.450563,0.049393,29.290,...,35.515587,2.056437e-05,4.347317,1,4,0.000030,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79018,1.0,57.593802,0.000048,0.000136,0.0,830208.833058,2021,-0.728791,0.152581,30.301,...,55.988022,8.024310e-06,29.404171,12,48,0.000040,0.0,0.0,1.0,0.0
79019,1.0,30.917440,0.000049,0.000109,0.0,830354.500000,2021,-1.213713,0.101535,30.301,...,19.435339,8.476496e-06,29.186497,12,49,0.000040,0.0,0.0,1.0,0.0
79020,1.0,39.123617,0.000043,0.000100,0.0,830419.335612,2021,-0.366031,0.187802,30.301,...,19.435339,2.979501e-06,29.131205,12,50,0.000039,0.0,0.0,1.0,0.0
79021,1.0,32.599393,0.000049,0.000111,0.0,830375.349911,2021,-0.596050,0.045697,30.301,...,32.599393,1.014400e-05,28.125792,12,51,0.000039,0.0,0.0,1.0,0.0


# 6. Clustering

In [26]:
# features = ['latitude', 'longitude']

In [27]:
# clusters = []
# encoders = []
# for idx, train in enumerate(train_sets):
#     df, cluster, encoder, n_clusters = kmc(train, features, n_clusters=5, elbow=True, encoding=True)
#     encoders.append(encoder)
#     train_sets[idx] = df
#     clusters.append(cluster)

The elbow method is excecuting




The elbow method is excecuting




The elbow method is excecuting




The elbow method is excecuting




In [28]:
# for idx, test in enumerate(test_sets):
#     df, _, _, _ = kmc(test, features, cluster=clusters[idx], encoding=True, encoder=encoders[idx])
#     test_sets[idx] = df

# 7. 최종 데이터셋

In [32]:
# Cast every column label to str so the integer-labelled one-hot columns can
# be sorted together with the named ones, then order the columns
# alphabetically.
col = [str(label) for label in train_now.columns]
train_now.columns = col
col.sort()
train_now_reordered = train_now[col]

data_name = os.path.join(DATA_PATH, '0808/train_one.csv')
# to_csv does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(data_name), exist_ok=True)
train_now_reordered.to_csv(data_name)

In [34]:
# Cast the test column labels to str, matching what was done for train.
col_t = [str(label) for label in test_now.columns]
test_now.columns = col_t

# Reuse the sorted training column order minus the target. Rebuilding the
# list (instead of col.remove('emission')) keeps the cell re-runnable:
# remove() raises ValueError once 'emission' is already gone.
col = [name for name in col if name != 'emission']
test_now_reordered = test_now[col]

data_name = os.path.join(DATA_PATH, '0808/test_one.csv')
# to_csv does not create missing directories; make sure the target folder
# exists so the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(data_name), exist_ok=True)
test_now_reordered.to_csv(data_name)

In [36]:
# Display the reordered test frame that was written to test_one.csv.
test_now_reordered

Unnamed: 0,0,1,2,3,NitrogenDioxide_NO2_column_number_density,NitrogenDioxide_NO2_slant_column_number_density,NitrogenDioxide_absorbing_aerosol_index,NitrogenDioxide_cloud_fraction,NitrogenDioxide_sensor_altitude,NitrogenDioxide_sensor_zenith_angle,...,NitrogenDioxide_stratospheric_NO2_column_number_density,NitrogenDioxide_tropospheric_NO2_column_number_density,SulphurDioxide_sensor_zenith_angle,covid_false,covid_true,latitude,longitude,month_no,week_no,year
0,0.0,0.0,1.0,0.0,0.000048,0.000093,-1.416309,0.036769,829736.125000,15.600607,...,0.000037,1.117653e-05,15.600607,1.0,0.0,-0.510,29.290,1,0,2022
1,0.0,0.0,1.0,0.0,0.000048,0.000093,-1.416309,0.036769,829736.125000,15.600607,...,0.000037,1.117653e-05,15.600607,1.0,0.0,-0.510,29.290,1,1,2022
2,0.0,0.0,1.0,0.0,0.000055,0.000109,-0.008104,0.036490,829922.562500,36.693165,...,0.000038,1.679787e-05,39.889060,1.0,0.0,-0.510,29.290,1,2,2022
3,0.0,0.0,1.0,0.0,0.000034,0.000109,-1.279531,0.134641,829375.749671,58.862543,...,0.000037,-2.399639e-07,58.862543,1.0,0.0,-0.510,29.290,1,3,2022
4,0.0,0.0,1.0,0.0,0.000034,0.000109,-1.279531,0.134641,829375.749671,58.862543,...,0.000037,-2.399639e-07,15.646016,1.0,0.0,-0.510,29.290,1,4,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24348,0.0,1.0,0.0,0.0,0.000066,0.000115,-1.233932,0.170029,830220.705687,9.122010,...,0.000042,2.902176e-05,61.114494,1.0,0.0,-3.299,30.301,11,44,2022
24349,0.0,1.0,0.0,0.0,0.000047,0.000120,-1.289134,0.226918,830507.625000,54.188267,...,0.000037,9.427952e-06,61.114494,1.0,0.0,-3.299,30.301,11,45,2022
24350,0.0,1.0,0.0,0.0,0.000047,0.000120,-1.289134,0.226918,830507.625000,54.188267,...,0.000037,9.427952e-06,61.114494,1.0,0.0,-3.299,30.301,11,46,2022
24351,0.0,1.0,0.0,0.0,0.000050,0.000108,-1.786733,0.138160,830046.343750,38.215228,...,0.000037,1.926275e-05,38.215228,1.0,0.0,-3.299,30.301,11,47,2022
