In [33]:
import os
import pandas as pd
from tools.dataset import score_dataset
from tools.preprocessing.missing_values import get_missing_raio, delete_columns, impute_missing_values
from tools.preprocessing.outliers import delete_outliers, impute_outliers, get_limits
from tools.preprocessing.scaling import minmax
from tools.engineering.mi import mi_score
from tools.engineering.encoding import one_hot
from tools.engineering.clustering import kmc


# 1. 데이터 로드

In [34]:
DATA_PATH = './data'

train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))

# train_origin = train.copy()
# test_origin = test.copy()

print(f'train shape: {train.shape}')
print(f'test shape: {test.shape}')

train shape: (79023, 76)
test shape: (24353, 75)


In [35]:
# 데이터 일부 확인
train.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


In [36]:
# train.drop(['ID_LAT_LON_YEAR_WEEK'], axis=1, inplace=True)
# test.drop(['ID_LAT_LON_YEAR_WEEK'], axis=1, inplace=True)

# 2. 결측값 처리

In [37]:
# 삭제하지 않을 칼럼
protected_columns = ['latitude', 'longitude', 'week_no', 'year']

결측값 비율이 높은 칼럼 삭제

In [38]:
train_deleted, deleted_columns = delete_columns(train, 0.3, target='emission')

print(f'삭제된 칼럼 개수: {len(deleted_columns)}')

삭제된 칼럼 개수: 7


In [39]:
# 테스트 데이터에도 적용
test_deleted = test.drop(deleted_columns, axis=1)

In [40]:
# # 결측값 제거 결과
# data_list = [['train', train], ['train_deleted', train_deleted]]
# results = []

# for name, data in data_list:
#     score = score_dataset(data, 'emission')
#     results.append([name, score])

# results

칼럼을 삭제한 데이터셋의 스코어가 더 좋다.

가장 이상적인 결측값 대체법 찾기 

In [41]:
# methods = ['mean', 'linear', 'fill']
# results = []

# for method in methods:
#     train_imputed, _ = impute_missing_values(train_deleted, method)
#     score = score_dataset(train_imputed, 'emission')
#     results.append([method, score])
#     print(f'method "{method}" 계산 완료')

# results

In [42]:
# best_method = sorted(results, key=lambda x: x[1])[0]
# print(f'best method: {best_method}')

In [43]:
# train_imputed, _ = impute_missing_values(train_deleted, best_method[0])
# test_imputed, _ = impute_missing_values(test_deleted, best_method[0])

train_imputed, _ = impute_missing_values(train_deleted, 'fill')
test_imputed, _ = impute_missing_values(test_deleted, 'fill')

In [44]:
train_now = train_imputed
test_now = test_imputed

In [45]:
# score_dataset(train_now, 'emission')

# 3. 칼럼 추가

month 및 covid 칼럼 추가

In [46]:
train_now['date'] = pd.to_datetime('2021' + train_now['week_no'].astype(str) + '0', format='%Y%W%w')
train_now['month_no'] = train_now['date'].dt.month
train_now.drop(columns=['date'], inplace=True)

train_now['covid'] = (train_now.year == 2020) & (train_now.month_no > 2)

In [47]:
test_now['date'] = pd.to_datetime('2021' + test_now['week_no'].astype(str) + '0', format='%Y%W%w')
test_now['month_no'] = test_now['date'].dt.month
test_now.drop(columns=['date'], inplace=True)

test_now['covid'] = (test_now.year == 2020) & (test_now.month_no > 2)

In [48]:
train_now, encoder = one_hot(train_now, 'covid')
test_now, _ = one_hot(test_now, 'covid', encoder)



In [49]:
train_now.rename(columns={0: 'covid_false', 1: 'covid_true'}, inplace=True)
test_now.rename(columns={0: 'covid_false', 1: 'covid_true'}, inplace=True)

In [50]:
protected_columns.extend(['covid_False', 'covid_True', 'month_no'])

season 칼럼 추가

In [51]:
SEASON = {
    'summer': [12, 1, 2],
    'fall': [3, 4, 5],
    'winter': [6, 7, 8],
    'spring': [9, 10, 11],
}

In [52]:
def insert_season(x, season):
    if x in season['summer']:
        return 'summer'
    elif x in season['fall']:
        return 'fall'
    elif x in season['winter']:
        return 'winter'
    elif x in season['spring']:
        return 'spring'
    else:
        raise Exception('unknown week')

In [53]:
train_now['season'] = train_now['month_no'].apply(insert_season, args=[SEASON])

test_now['season'] = test_now['month_no'].apply(insert_season, args=[SEASON])

print(train_now['season'].value_counts(), test_now['season'].value_counts())

season
summer    20874
fall      19383
winter    19383
spring    19383
Name: count, dtype: int64 season
fall      6461
winter    6461
spring    6461
summer    4970
Name: count, dtype: int64


계절별 데이터셋 따로 구성

In [59]:
train_summer = train_now.loc[train_now['season'] == 'summer']
train_fall = train_now.loc[train_now['season'] == 'fall']
train_winter = train_now.loc[train_now['season'] == 'winter']
train_spring = train_now.loc[train_now['season'] == 'spring']

test_summer = test_now.loc[test_now['season'] == 'summer']
test_fall = test_now.loc[test_now['season'] == 'fall']
test_winter = test_now.loc[test_now['season'] == 'winter']
test_spring = test_now.loc[test_now['season'] == 'spring']

In [60]:
train_sets = [train_summer, train_fall, train_winter, train_spring]
test_sets = [test_summer, test_fall, test_winter, test_spring]

In [61]:
for i in range(len(train_sets)):
    train_sets[i] = train_sets[i].drop('season', axis=1)

for i in range(len(test_sets)):
    test_sets[i] = test_sets[i].drop('season', axis=1)

# 4. Feature Selection(by Mutual Information, Correlation)

In [62]:
selected_columns = []

for idx, train in enumerate(train_sets):
    df, columns, _ = mi_score(train, 'emission', 0.4, corr=True, corr_threshold=0.4, protected=protected_columns)
    columns.remove('emission')
    selected_columns.append(columns)
    train_sets[idx] = df
    print(f'{idx} train의 칼럼 개수: {len(train_sets[idx].columns)}')

0 train의 칼럼 개수: 9
1 train의 칼럼 개수: 9
2 train의 칼럼 개수: 9
3 train의 칼럼 개수: 9


In [63]:
for train in train_sets:
    print(score_dataset(train, 'emission'))

8.652852165161999
8.735763951839694
8.427803324867021
8.95630990981607


In [64]:
for idx, test in enumerate(test_sets):
    test_sets[idx] = test.loc[:, selected_columns[idx]]

# 5. Clustering

In [27]:
# features = ['latitude', 'longitude', 'emission']

In [28]:
# clusters = []
# encoders = []

# for idx, train in enumerate(train_sets):
#     df_km = train.groupby(by=['latitude', 'longitude'], as_index=False)['emission'].mean()
#     df_cluster, cluster, _, n_clusters = kmc(df_km, features, n_clusters=5, elbow=True, encoding=False)
#     train_sets[idx] = train.merge(df_cluster[['latitude', 'longitude', 'cluster']], on=['latitude', 'longitude'])
    
#     train_sets[idx], encoder = one_hot(train_sets[idx], 'cluster')
    
#     encoders.append(encoder)
#     clusters.append(df_cluster)

The elbow method is excecuting




The elbow method is excecuting
The elbow method is excecuting
The elbow method is excecuting




In [29]:
# for idx, test in enumerate(test_sets):
#     # df_km = test.groupby(by=['latitude', 'longitude'], as_index=False)['emission'].mean()
#     # df_cluster, _, _, _ = kmc(test, features, cluster=clusters[idx], encoding=False)
#     test_sets[idx] = test.merge(clusters[idx][['latitude', 'longitude', 'cluster']], on=['latitude', 'longitude'])
#     test_sets[idx], _ = one_hot(test_sets[idx], 'cluster', encoders[idx])

# 7. 최종 데이터셋

In [65]:
season_order = list(SEASON.keys())
season_order

['summer', 'fall', 'winter', 'spring']

In [66]:
train_columns = []

for idx, train in enumerate(train_sets):
    col = list(map(str, list(train.columns)))
    train.columns = col
    col.sort()
    train_reordered = train[col]
    col.remove('emission')
    train_columns.append(col)
    data_name = os.path.join(DATA_PATH, '0808/train_{}_3.csv'.format(season_order[idx]))
    train_reordered.to_csv(data_name)

In [68]:
for idx, test in enumerate(test_sets):
    col = list(map(str, list(test.columns)))
    test.columns = col
    test_reordered = test[train_columns[idx]]
    data_name = os.path.join(DATA_PATH, '0808/test_{}_3.csv'.format(season_order[idx]))
    test_reordered.to_csv(data_name)