In [1]:
import os
import pandas as pd
from tools.dataset import score_dataset
from tools.preprocessing.missing_values import get_missing_raio, delete_columns, impute_missing_values
from tools.preprocessing.outliers import delete_outliers, impute_outliers, get_limits
from tools.preprocessing.scaling import minmax
from tools.engineering.mi import mi_score


# 1. 데이터 로드

In [10]:
DATA_PATH = './data'

train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))

# train_origin = train.copy()
# test_origin = test.copy()

print(f'train shape: {train.shape}')
print(f'test shape: {test.shape}')

train shape: (79023, 76)
test shape: (24353, 75)


In [11]:
# 데이터 일부 확인
train.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,-0.000108,0.603019,-6.5e-05,0.255668,-98.593887,...,3664.436218,61085.80957,2615.120483,15.568533,0.272292,-12.628986,35.632416,-138.786423,30.75214,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,2.1e-05,0.728214,1.4e-05,0.130988,16.592861,...,3651.190311,66969.478735,3174.572424,8.690601,0.25683,30.359375,39.557633,-145.18393,27.251779,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,0.000514,0.748199,0.000385,0.110018,72.795837,...,4216.986492,60068.894448,3516.282669,21.10341,0.251101,15.377883,30.401823,-142.519545,26.193296,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,,,,,,...,5228.507736,51064.547339,4180.973322,15.386899,0.262043,-11.293399,24.380357,-132.665828,28.829155,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,-7.9e-05,0.676296,-4.8e-05,0.121164,4.121269,...,3980.59812,63751.125781,3355.710107,8.114694,0.235847,38.532263,37.392979,-141.509805,22.204612,4.347317


In [12]:
test_id = test['ID_LAT_LON_YEAR_WEEK']

train.drop(['ID_LAT_LON_YEAR_WEEK', 'year'], axis=1, inplace=True)
test.drop(['ID_LAT_LON_YEAR_WEEK', 'year'], axis=1, inplace=True)

# 2. 결측값 처리

In [None]:
# 삭제하지 않을 칼럼
PROTECTED_COLUMNS = ['latitude', 'longitude', 'week_no']

결측값 비율이 높은 칼럼 삭제

In [13]:
train_deleted, deleted_columns = delete_columns(train, 0.3, target='emission')

print(f'삭제된 칼럼 개수: {len(deleted_columns)}')

삭제된 칼럼 개수: 7


In [14]:
# 테스트 데이터에도 적용
test_deleted = test.drop(deleted_columns, axis=1)

In [7]:
# 결측값 제거 결과
data_list = [['train', train], ['train_deleted', train_deleted]]
results = []

for name, data in data_list:
    score = score_dataset(data, 'emission')
    results.append([name, score])

results

[['train', 9.103228178514371], ['train_deleted', 9.081446014439852]]

칼럼을 삭제한 데이터셋의 스코어가 더 좋다.

가장 이상적인 결측값 대체법 찾기 

In [15]:
methods = ['mean', 'linear', 'fill']
results = []

for method in methods:
    train_imputed, _ = impute_missing_values(train_deleted, method)
    score = score_dataset(train_imputed, 'emission')
    results.append([method, score])
    print(f'method "{method}" 계산 완료')

results

method "mean" 계산 완료
method "linear" 계산 완료
method "fill" 계산 완료


[['mean', 8.956733358720077],
 ['linear', 8.898703621611462],
 ['fill', 8.84700168290635]]

In [19]:
best_method = sorted(results, key=lambda x: x[1])[0]
print(f'best method: {best_method}')

best method: ['fill', 8.84700168290635]


In [20]:
train_imputed, _ = impute_missing_values(train_deleted, best_method[0])
test_imputed, _ = impute_missing_values(test_deleted, best_method[0])
# test_imputed = test_deleted.fillna(tool)

In [21]:
train_now = train_imputed
test_now = test_imputed

# 3. 이상치 처리

numeric 칼럼에 대해 이상치 처리

In [21]:
outliers_columns = list(train_now.select_dtypes(exclude=['O']).columns)
outliers_columns.remove('emission')
outliers_columns[:10]

['latitude',
 'longitude',
 'year',
 'week_no',
 'SulphurDioxide_SO2_column_number_density',
 'SulphurDioxide_SO2_column_number_density_amf',
 'SulphurDioxide_SO2_slant_column_number_density',
 'SulphurDioxide_cloud_fraction',
 'SulphurDioxide_sensor_azimuth_angle',
 'SulphurDioxide_sensor_zenith_angle']

이상치 기준이 되는 상한, 하한 구하기

In [22]:
limits = get_limits(train_now, outliers_columns)
limits.items()

dict_items([('latitude', [0.41900000000000026, -4.173]), ('longitude', [32.2845, 27.448500000000003]), ('year', [2024.0, 2016.0]), ('week_no', [78.0, -26.0]), ('SulphurDioxide_SO2_column_number_density', [0.000513877984210504, -0.0004540995530549583]), ('SulphurDioxide_SO2_column_number_density_amf', [1.2902691189624758, 0.36975534250679815]), ('SulphurDioxide_SO2_slant_column_number_density', [0.00041078522838457806, -0.00036556129722763]), ('SulphurDioxide_cloud_fraction', [0.3579012532719073, -0.031659258299414794]), ('SulphurDioxide_sensor_azimuth_angle', [241.72112203980677, -234.71471377764252]), ('SulphurDioxide_sensor_zenith_angle', [74.43875067040783, 1.1427080050441525]), ('SulphurDioxide_solar_azimuth_angle', [65.9132108688354, -239.83539295196528]), ('SulphurDioxide_solar_zenith_angle', [43.08536106871162, 12.65385854363208]), ('SulphurDioxide_SO2_column_number_density_15km', [0.00017373174944799763, -0.00015564230224563433]), ('CarbonMonoxide_CO_column_number_density', [0.

In [23]:
train_outliers_deleted, deleted_indices = delete_outliers(train_now, outliers_columns, limits)
train_impute_outliers = impute_outliers(train_now, outliers_columns, limits)

In [24]:
# 삭제된 행의 개수
len(train_outliers_deleted)

45397

이상치 처리 비교(이상치 처리 전: 8.75663387517149)

In [15]:
outliers_dfs = [['deleted', train_outliers_deleted], ['imputed', train_impute_outliers]]
results = []

for name, outliers_df in outliers_dfs:
    score = score_dataset(outliers_df, 'emission')
    results.append([name, score])

results

[['deleted', 8.49429524751788], ['imputed', 8.7388773949861]]

이상치가 있는 행을 제거했을 때가 가장 이상적인 score 기록
그러나 너무 많은 행이 삭제되므로 값을 대체

In [25]:
test_impute_outliers = impute_outliers(test_now, outliers_columns, limits)

In [26]:
train_now = train_impute_outliers
test_now = test_impute_outliers

# 4. Scaling

In [27]:
scaling_columns = set(train_outliers_deleted.select_dtypes(exclude=['O']).columns)
no_scaling_columns = set(['emission', 'latitude', 'longitude', 'year', 'week_no'])

scaling_columns = list(scaling_columns - no_scaling_columns)

In [28]:
train_scaled, scaler = minmax(train_now, scaling_columns)
train_scaled.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle,emission
0,ID_-0.510_29.290_2019_00,-0.51,29.29,2019,0,0.357198,0.253406,0.386722,0.852234,0.268351,...,0.256252,0.533547,0.212175,0.319321,0.500316,0.48667,0.368867,0.112206,0.596886,3.750994
1,ID_-0.510_29.290_2019_01,-0.51,29.29,2019,1,0.490328,0.389411,0.488405,0.436631,0.650229,...,0.254557,0.65114,0.288041,0.15929,0.437434,0.758456,0.483222,0.0633,0.46826,4.025176
2,ID_-0.510_29.290_2019_02,-0.51,29.29,2019,2,1.0,0.411122,0.966374,0.36673,0.836558,...,0.326947,0.513223,0.33438,0.448103,0.414133,0.663738,0.216482,0.083668,0.429365,4.231381
3,ID_-0.510_29.290_2019_03,-0.51,29.29,2019,3,0.694011,0.372066,0.687952,0.385307,0.72272,...,0.456366,0.333259,0.424517,0.315095,0.458633,0.495114,0.041056,0.158996,0.526223,4.305286
4,ID_-0.510_29.290_2019_04,-0.51,29.29,2019,4,0.38775,0.33301,0.40953,0.403884,0.608882,...,0.296703,0.586817,0.312605,0.14589,0.352094,0.810127,0.420158,0.091387,0.282795,4.347317


In [29]:
score = score_dataset(train_scaled, 'emission')
score

8.848195284542431

scaling이 큰 의미가 없다.

In [30]:
# 테스트 데이터에도 scaling 수행
test_scaled, _ = minmax(test_now, scaling_columns)
test_scaled.head()

Unnamed: 0,ID_LAT_LON_YEAR_WEEK,latitude,longitude,year,week_no,SulphurDioxide_SO2_column_number_density,SulphurDioxide_SO2_column_number_density_amf,SulphurDioxide_SO2_slant_column_number_density,SulphurDioxide_cloud_fraction,SulphurDioxide_sensor_azimuth_angle,...,Cloud_cloud_top_pressure,Cloud_cloud_top_height,Cloud_cloud_base_pressure,Cloud_cloud_base_height,Cloud_cloud_optical_depth,Cloud_surface_albedo,Cloud_sensor_azimuth_angle,Cloud_sensor_zenith_angle,Cloud_solar_azimuth_angle,Cloud_solar_zenith_angle
0,ID_-0.510_29.290_2022_00,1.0,0.321429,0.0,0.0,0.201496,0.364783,0.232919,0.528106,0.374117,...,0.217114,0.666251,0.233233,0.653909,0.050704,0.457539,0.035528,0.516122,0.121612,0.76338
1,ID_-0.510_29.290_2022_01,1.0,0.321429,0.0,0.020833,0.281705,0.258898,0.313628,0.0,0.692547,...,0.425313,0.468456,0.457115,0.4488,0.079539,0.563887,0.405666,0.650183,0.077314,0.654085
2,ID_-0.510_29.290_2022_02,1.0,0.321429,0.0,0.041667,0.223644,0.195479,0.25328,0.266256,0.244992,...,0.185696,0.717032,0.200269,0.706567,0.073832,0.511086,0.775774,0.704603,0.028521,0.485473
3,ID_-0.510_29.290_2022_03,1.0,0.321429,0.0,0.0625,0.26088,0.263137,0.29278,0.670153,0.67715,...,0.463821,0.422735,0.5012,0.401388,0.082134,0.587373,0.437813,0.646146,0.105905,0.598489
4,ID_-0.510_29.290_2022_04,1.0,0.321429,0.0,0.083333,0.129779,0.177366,0.169897,0.681235,0.692364,...,0.392962,0.505429,0.424518,0.48714,0.092811,0.545809,0.499273,0.461078,0.10216,0.467045


In [30]:
train_now = train_impute_outliers
test_now = test_impute_outliers

# 5. Mutual Information

In [45]:
import pandas as pd
from sklearn.feature_selection import mutual_info_regression


def make_mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores


def mi_score(df, target, threshold, corr=False, corr_threshold=None):
    df_columns = list(df.columns)
    df_mi = df.copy()
    
    X = df_mi.drop(target, axis=1)
    y = df_mi.pop(target)
    
    X_numeric = X.select_dtypes(exclude=['O'])
    object_columns = list(X.select_dtypes(include=['O']).columns)
    
    discrete_features = X_numeric.dtypes == int

    mi_scores = make_mi_scores(X_numeric, y, discrete_features)

    mi_selected_columns = list(mi_scores.loc[mi_scores >= threshold].index)


    # 상관관계가 일정 수준 이상일 경우에도 칼럼 포함
    corr_selected_columns = []
    if corr:
        df_numeric = df.select_dtypes(exclude=['O'])
        df_corr = df_numeric.corr()

        corr = abs(df_corr[target])
        corr_selected_columns = list(corr[corr >= corr_threshold].index)

    selected_columns_all = list(set(mi_selected_columns + corr_selected_columns)) + object_columns
    df_selected = df.loc[:, selected_columns_all]

    return df_selected, selected_columns_all

In [46]:
train_mi, mi_selected_columns = mi_score(train_now, 'emission', 0.01, corr=True, corr_threshold=0.1)

['NitrogenDioxide_sensor_azimuth_angle', 'Ozone_solar_azimuth_angle', 'emission', 'week_no', 'SulphurDioxide_SO2_column_number_density_amf', 'NitrogenDioxide_sensor_zenith_angle', 'NitrogenDioxide_stratospheric_NO2_column_number_density', 'UvAerosolIndex_sensor_azimuth_angle', 'Cloud_solar_azimuth_angle', 'Cloud_surface_albedo', 'Formaldehyde_solar_azimuth_angle', 'Cloud_cloud_top_height', 'Formaldehyde_sensor_zenith_angle', 'longitude', 'NitrogenDioxide_absorbing_aerosol_index', 'NitrogenDioxide_tropopause_pressure', 'UvAerosolIndex_solar_azimuth_angle', 'UvAerosolIndex_sensor_zenith_angle', 'CarbonMonoxide_sensor_altitude', 'SulphurDioxide_solar_azimuth_angle', 'Ozone_sensor_zenith_angle', 'Cloud_cloud_base_pressure', 'Formaldehyde_sensor_azimuth_angle', 'UvAerosolIndex_solar_zenith_angle', 'SulphurDioxide_sensor_azimuth_angle', 'NitrogenDioxide_solar_azimuth_angle', 'NitrogenDioxide_sensor_altitude', 'SulphurDioxide_SO2_column_number_density', 'SulphurDioxide_sensor_zenith_angle', '

In [47]:
train_mi.head()

Unnamed: 0,NitrogenDioxide_sensor_azimuth_angle,Ozone_solar_azimuth_angle,emission,week_no,SulphurDioxide_SO2_column_number_density_amf,NitrogenDioxide_sensor_zenith_angle,NitrogenDioxide_stratospheric_NO2_column_number_density,UvAerosolIndex_sensor_azimuth_angle,Cloud_solar_azimuth_angle,Cloud_surface_albedo,...,Formaldehyde_tropospheric_HCHO_column_number_density_amf,CarbonMonoxide_sensor_zenith_angle,Cloud_cloud_base_height,NitrogenDioxide_NO2_column_number_density,SulphurDioxide_SO2_column_number_density_15km,CarbonMonoxide_solar_azimuth_angle,UvAerosolIndex_sensor_altitude,CarbonMonoxide_sensor_azimuth_angle,Ozone_sensor_azimuth_angle,ID_LAT_LON_YEAR_WEEK
0,5.471037,-138.786446,3.750994,0,0.603019,35.265195,3e-05,-12.628979,-138.786423,0.272292,...,0.86323,52.775928,2615.120483,4.7e-05,-2.7e-05,-149.875565,829864.546875,71.111977,-12.628979,ID_-0.510_29.290_2019_00
1,5.471037,-143.097868,4.025176,1,0.728214,35.265195,3e-05,16.152492,-145.18393,0.25683,...,1.172826,38.982368,3174.572424,4.7e-05,1.2e-05,-140.158048,829747.856973,-1.019594,24.464335,ID_-0.510_29.290_2019_01
2,72.795837,-135.364627,4.231381,2,0.748199,52.868816,3e-05,-41.557633,-142.519545,0.251101,...,1.175467,52.344378,3516.282669,3.1e-05,0.000154,-133.683714,829892.960629,-54.801144,-41.557633,ID_-0.510_29.290_2019_02
3,29.68202,-137.489602,4.305286,3,0.712247,43.97372,3e-05,-0.00127,-132.665828,0.262043,...,1.022274,39.676184,4180.973322,4.1e-05,6.3e-05,-142.575915,829794.848214,28.916541,-0.00127,ID_-0.510_29.290_2019_03
4,-13.431798,-136.448518,4.347317,4,0.676296,35.078624,3e-05,-0.604325,-141.509805,0.235847,...,0.869081,33.703073,3355.710107,5.1e-05,-2.8e-05,-134.854258,829736.142857,-12.501663,-0.604325,ID_-0.510_29.290_2019_04


In [48]:
score = score_dataset(train_mi, 'emission')
score

8.688779766088285

MI 스코어 적용 후 score 개선

In [51]:
mi_selected_columns.remove('emission')

test_mi = test_now.loc[:, mi_selected_columns]

In [52]:
train_now = train_mi
test_now = test_mi

# 6. Clustering

In [77]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder


def kmc(df_origin, features, cluster=None, n_clusters=10, encoder=None):
    df = df_origin.copy()
    X = df_origin.copy()
    X = X.loc[:, features]
    X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)

    if not cluster:
        cluster = KMeans(n_clusters=n_clusters, random_state=0)

    X["cluster"] = cluster.fit_predict(X_scaled)
    
    df['cluster'] = X["cluster"]

    ## one hot encoding ##
    # sparse=False: 인코딩된 칼럼이 numpy array type을 갖도록
    if not encoder:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        encoder.fit(df[['cluster']])

    df['cluster'] = df['cluster'].astype('O')
    
    df_encoded = pd.DataFrame(encoder.transform(df[['cluster']]))
    df_encoded.index = df.index
    df = df.drop('cluster', axis=1)

    df = pd.concat([df, df_encoded], axis=1)

    return df, cluster

In [78]:
features = ['latitude', 'longitude', 'week_no']

In [85]:
train_clustered, cluster = kmc(train_scaled, features)

  super()._check_params_vs_input(X, default_n_init=10)


In [86]:
score = score_dataset(train_clustered, 'emission')
score

8.464154460464664

clustering 진행 후 스코어가 오히려 악화되었으므로 clustering 적용 안 함

# 7. 최종 데이터셋

In [53]:
train_final = train_now.copy()
test_final = test_now.copy()

In [54]:
print(train_final.shape)
print(test_final.shape)

(79023, 44)
(24353, 43)


In [55]:
# 데이터 저장
data_name = os.path.join(DATA_PATH, 'train_final_0806_2.csv')
train_final.to_csv(data_name)

data_name = os.path.join(DATA_PATH, 'test_final_0806_2.csv')
test_final.to_csv(data_name)