In [1]:
import os
import pandas as pd
from tools.dataset import score_dataset
from tools.preprocessing.missing_values import get_missing_raio, delete_columns, impute_missing_values
from tools.preprocessing.outliers import delete_outliers, impute_outliers, get_limits
from tools.preprocessing.scaling import minmax
from tools.engineering.mi import mi_score
from tools.engineering.encoding import one_hot
from tools.engineering.clustering import kmc


# 1. 데이터 로드

In [2]:
DATA_PATH = './data'

train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))

# train_origin = train.copy()
# test_origin = test.copy()

print(f'train shape: {train.shape}')
print(f'test shape: {test.shape}')

train shape: (79023, 76)
test shape: (24353, 75)


In [3]:
# 데이터 일부 확인
train.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


In [4]:
# train.drop(['ID_LAT_LON_YEAR_WEEK'], axis=1, inplace=True)
# test.drop(['ID_LAT_LON_YEAR_WEEK'], axis=1, inplace=True)

# 2. 결측값 처리

In [28]:
# 삭제하지 않을 칼럼
protected_columns = ['latitude', 'longitude', 'week_no', 'year']

결측값 비율이 높은 칼럼 삭제

In [6]:
train_deleted, deleted_columns = delete_columns(train, 0.3, target='emission')

print(f'삭제된 칼럼 개수: {len(deleted_columns)}')

삭제된 칼럼 개수: 7


In [7]:
# 테스트 데이터에도 적용
test_deleted = test.drop(deleted_columns, axis=1)

In [8]:
# # 결측값 제거 결과
# data_list = [['train', train], ['train_deleted', train_deleted]]
# results = []

# for name, data in data_list:
#     score = score_dataset(data, 'emission')
#     results.append([name, score])

# results

칼럼을 삭제한 데이터셋의 스코어가 더 좋다.

가장 이상적인 결측값 대체법 찾기 

In [9]:
# methods = ['mean', 'linear', 'fill']
# results = []

# for method in methods:
#     train_imputed, _ = impute_missing_values(train_deleted, method)
#     score = score_dataset(train_imputed, 'emission')
#     results.append([method, score])
#     print(f'method "{method}" 계산 완료')

# results

In [10]:
# best_method = sorted(results, key=lambda x: x[1])[0]
# print(f'best method: {best_method}')

In [11]:
# train_imputed, _ = impute_missing_values(train_deleted, best_method[0])
# test_imputed, _ = impute_missing_values(test_deleted, best_method[0])

train_imputed, _ = impute_missing_values(train_deleted, 'fill')
test_imputed, _ = impute_missing_values(test_deleted, 'fill')

In [12]:
train_now = train_imputed
test_now = test_imputed

In [13]:
# score_dataset(train_now, 'emission')

# 3. 칼럼 추가

month 및 covid 칼럼 추가

In [14]:
train_now['date'] = pd.to_datetime('2021' + train_now['week_no'].astype(str) + '0', format='%Y%W%w')
train_now['month_no'] = train_now['date'].dt.month
train_now.drop(columns=['date'], inplace=True)

train_now['covid'] = (train_now.year == 2020) & (train_now.month_no > 2)

In [15]:
test_now['date'] = pd.to_datetime('2021' + test_now['week_no'].astype(str) + '0', format='%Y%W%w')
test_now['month_no'] = test_now['date'].dt.month
test_now.drop(columns=['date'], inplace=True)

test_now['covid'] = (test_now.year == 2020) & (test_now.month_no > 2)

In [16]:
train_now, encoder = one_hot(train_now, 'covid')
test_now, _ = one_hot(test_now, 'covid', encoder)



In [17]:
train_now.rename(columns={0: 'covid_false', 1: 'covid_true'}, inplace=True)
test_now.rename(columns={0: 'covid_false', 1: 'covid_true'}, inplace=True)

In [29]:
protected_columns.extend(['covid_False', 'covid_True'])

월별 데이터셋 따로 구성

In [19]:
train_sets = []
test_sets = []

for i in range(1, 13):
    train_month = train_now.loc[train_now['month_no'] == i]
    train_month = train_month.drop('month_no', axis=1)
    train_sets.append(train_month)

    test_month = test_now.loc[test_now['month_no'] == i]
    test_month = test_month.drop('month_no', axis=1)
    test_sets.append(test_month)

In [20]:
len(train_sets), len(test_sets)

(12, 12)

In [26]:
train_sets[11]

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission,covid_False,covid_True
48,ID_-0.510_29.290_2019_48,-0.510,29.290,2019,48,-0.000048,1.458318,-0.000063,0.246529,-12.885389,...,3440.042912,11.203506,0.324199,-12.896338,30.172556,-134.022436,32.437827,4.441943,1.0,0.0
49,ID_-0.510_29.290_2019_49,-0.510,29.290,2019,49,0.000252,0.866398,0.000228,0.145260,73.240585,...,4967.352908,33.751743,0.274405,15.586374,49.483856,-140.201218,30.735814,4.351813,1.0,0.0
50,ID_-0.510_29.290_2019_50,-0.510,29.290,2019,50,-0.000055,1.008103,-0.000057,0.081973,-12.900699,...,6025.432331,19.281656,0.339029,-99.362143,41.805481,-130.304616,36.869808,4.439641,1.0,0.0
51,ID_-0.510_29.290_2019_51,-0.510,29.290,2019,51,0.000723,1.003793,0.000622,0.172089,-13.285339,...,4691.211910,14.043937,0.291180,-0.026138,34.530260,-139.225956,31.530672,4.008589,1.0,0.0
101,ID_-0.510_29.290_2020_48,-0.510,29.290,2020,48,-0.000032,0.844771,-0.000027,0.023310,72.170433,...,7454.968820,25.529804,0.366663,15.521822,38.571108,-137.222735,30.416937,4.173133,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78968,ID_-3.299_30.301_2020_51,-3.299,30.301,2020,51,-0.000235,0.633192,-0.000149,0.257000,-99.141518,...,3858.187453,14.519789,0.248484,30.840922,39.529722,-138.964016,28.574091,25.591976,0.0,1.0
79018,ID_-3.299_30.301_2021_48,-3.299,30.301,2021,48,0.000284,1.195643,0.000340,0.191313,72.820518,...,4590.879504,20.245954,0.304797,-35.140368,40.113533,-129.935508,32.095214,29.404171,1.0,0.0
79019,ID_-3.299_30.301_2021_49,-3.299,30.301,2021,49,0.000083,1.130868,0.000063,0.177222,-12.856753,...,4659.130378,6.104610,0.314015,4.667058,47.528435,-134.252871,30.771469,29.186497,1.0,0.0
79020,ID_-3.299_30.301_2021_50,-3.299,30.301,2021,50,0.000083,1.130868,0.000063,0.177222,-12.856753,...,5222.646823,14.817885,0.288058,-0.340922,35.328098,-134.731723,30.716166,29.131205,1.0,0.0


# 4. Feature Selection(by Mutual Information, Correlation)

In [30]:
selected_columns = []

for idx, train in enumerate(train_sets):
    df, columns, _ = mi_score(train, 'emission', 0.4, corr=True, corr_threshold=0.4, protected=protected_columns)
    columns.remove('emission')
    selected_columns.append(columns)
    train_sets[idx] = df
    print(f'{idx} train의 칼럼 개수: {len(train_sets[idx].columns)}')

0 train의 칼럼 개수: 17
1 train의 칼럼 개수: 8
2 train의 칼럼 개수: 8
3 train의 칼럼 개수: 8
4 train의 칼럼 개수: 8
5 train의 칼럼 개수: 8
6 train의 칼럼 개수: 8
7 train의 칼럼 개수: 8
8 train의 칼럼 개수: 8
9 train의 칼럼 개수: 8
10 train의 칼럼 개수: 8
11 train의 칼럼 개수: 8


In [36]:
for train in train_sets:
    print(score_dataset(train, 'emission'))

8.62455871289881
9.119747537430815
8.416802220257834
9.750885651367195
7.794948428798791
8.09821217590328
8.15032913657319
7.911346516400593
8.222657767182522
10.030696144839013
8.89252907599103
8.85778518199263


In [31]:
for idx, test in enumerate(test_sets):
    test_sets[idx] = test.loc[:, selected_columns[idx]]

# 5. Clustering

In [28]:
# features = ['latitude', 'longitude']

In [29]:
# clusters = []
# encoders = []
# for idx, train in enumerate(train_sets):
#     df, cluster, encoder, n_clusters = kmc(train, features, n_clusters=20, elbow=True, encoding=True)
#     encoders.append(encoder)
#     train_sets[idx] = df
#     clusters.append(cluster)

The elbow method is excecuting




The elbow method is excecuting




The elbow method is excecuting




The elbow method is excecuting




In [30]:
# for idx, test in enumerate(test_sets):
#     df, _, _, _ = kmc(test, features, cluster=clusters[idx], encoding=True, encoder=encoders[idx])
#     test_sets[idx] = df

# 7. 최종 데이터셋

In [32]:
months = [i for i in range(1, 13)]
months

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [33]:
train_columns = []

for idx, train in enumerate(train_sets):
    col = list(map(str, list(train.columns)))
    train.columns = col
    col.sort()
    train_reordered = train[col]
    col.remove('emission')
    train_columns.append(col)
    data_name = os.path.join(DATA_PATH, '0808/train_{}_4.csv'.format(months[idx]))
    train_reordered.to_csv(data_name)

In [35]:
for idx, test in enumerate(test_sets):
    col = list(map(str, list(test.columns)))
    test.columns = col
    test_reordered = test[train_columns[idx]]
    data_name = os.path.join(DATA_PATH, '0808/test_{}_4.csv'.format(months[idx]))
    test_reordered.to_csv(data_name)