In [9]:
import lightgbm as lgb

from sklearn.decomposition import PCA
import math
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

import os
from os.path import join

# 1. 데이터 불러오기

- LGBM으로 여러 테스트를 거친 결과, 최종 전처리 데이터 중 robust 기법을 적용한 데이터(파일명 접두어는 `lgbm_roberst`)에서 일부 컬럼을 제외하고 PCA를 적용한 데이터의 성능이 좋아 해당 데이터를 사용하였음.

In [10]:
# Locations of the pre-processed train/test pickles.
# NOTE: 'lgbm_roberst' (sic) is the on-disk file prefix and must match the saved files.
dpath = './data'
fname = 'lgbm_roberst'
ftrain = fname + '_train.pickle'
ftest = fname + '_test.pickle'

# NOTE(review): unpickling can execute arbitrary code; only load pickles
# that were produced by this project.
df_train = pd.read_pickle(join(dpath, ftrain))
df_test = pd.read_pickle(join(dpath, ftest))

# Stack train on top of test (row order preserved, index renumbered) and
# separate the feature columns from the 'item_cnt_month' target column.
merge = pd.concat([df_train, df_test], axis=0, ignore_index=True)
merge_x = merge.drop(columns='item_cnt_month')
merge_y = merge[['item_cnt_month']]

# Split sizes are derived from the data rather than hard-coded, so the
# printed ranges cannot drift from the frames that were actually loaded.
n_valid_rows = len(df_train.loc[df_train.date_block_num == 33])
n_test_rows = len(df_test)

print(n_valid_rows, n_test_rows)
print('valid range:', n_valid_rows + n_test_rows)
print('test range:', n_test_rows)

28680 214200
valid range: 242880
test range: 214200


- 데이터에 사용된 컬럼은 다음과 같음.

In [16]:
# Display the column labels of the training frame (includes the 'item_cnt_month' column).
df_train.columns

Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'item_price',
       'item_avg_item_price', 'date_item_avg_item_price',
       'date_item_avg_item_price_lag_1', 'date_item_avg_item_price_lag_2',
       'date_item_avg_item_price_lag_3', 'delta_price_lag_1',
       'delta_price_lag_2', 'delta_price_lag_3', 'month',
       'item_cnt_month_lag_1', 'item_cnt_month_lag_2', 'item_cnt_month_lag_3',
       'item_price_lag_1', 'item_price_lag_2', 'item_price_lag_3', 'capital',
       'non_capital', 'city_etc', 'shopping_mall', 'online_mall',
       'computer_mall', 'type_etc'],
      dtype='object')

# 2. PCA

- 해당 데이터의 차원 축소를 위해 주성분 수(n_components)는 5로 지정하였음.

In [12]:
## PCA (principal component analysis)
# Project the feature matrix onto its first `n_components` principal components.
# random_state pins the result whenever scikit-learn selects a randomized SVD
# solver, so a fresh "Restart & Run All" reproduces the same components.
# NOTE(review): PCA is fitted on train and test rows together (merge_x) and is
# sensitive to feature scale — confirm both are intended.
n_components = 5
pca = PCA(n_components=n_components, random_state=42)

# The misspelt name `printcipalComponents` is kept so that any other cell
# referring to it keeps working.
printcipalComponents = pca.fit_transform(merge_x)

# Component columns are labelled '1'..'n_components'.
principalDf = pd.DataFrame(
    data=printcipalComponents,
    columns=[str(i) for i in range(1, n_components + 1)],
)
principalDf

Unnamed: 0,1,2,3,4,5
0,-10260.207724,29.469094,20.718112,-2.044290,1.136570
1,-9827.207345,29.487248,20.520595,-4.665327,0.072656
2,-9809.209699,29.518942,21.534277,13.547727,-1.216789
3,-9456.209600,29.537576,21.501721,12.752132,8.849699
4,-9452.209187,29.529120,21.305177,9.604630,-6.000755
...,...,...,...,...,...
788548,8162.796666,-3.605847,-8.825965,-0.842984,-0.192800
788549,5896.796619,-3.697196,-9.069161,-0.726519,0.277450
788550,5465.796702,-3.718351,-9.090306,-1.172446,-0.202179
788551,9356.796656,-3.556303,-8.707196,-0.729408,-0.227849


- 첫 번째 주성분이 분산의 99% 이상을 설명하지만, 한 차원만으로 학습하기에는 무리가 있어 보여 기존대로 주성분 5개로 진행함.

In [13]:
# explained_variance_ratio_ gives, for each component, the fraction of the total variance it explains
# (it is a ratio, not the raw eigenvalue).
# Confirms that the 5 principal components together explain more than 95% of the variance.
print(pca.explained_variance_ratio_)

[9.99989112e-01 7.53213319e-06 2.18049348e-06 7.37853527e-07
 1.30977326e-07]


# 3. 모델링

In [14]:
# Positional split of the PCA features into [train | validation | test].
# The test rows are the tail appended by pd.concat; the validation rows are the
# last `n_valid` rows of the training frame. Boundaries are derived from the
# data (instead of the literals 242880 / 214200) and expressed as non-negative
# indices, so an empty test set cannot turn a `[-0:]` slice into "everything".
# NOTE(review): assumes df_train is ordered so that its date_block_num == 33
# rows come last — confirm.
n_test = len(df_test)
n_valid = int((df_train['date_block_num'] == 33).sum())
valid_end = len(principalDf) - n_test
train_end = valid_end - n_valid

pca_features = principalDf.values
x_train = pca_features[:train_end]
x_valid = pca_features[train_end:valid_end]
x_test = pca_features[valid_end:]
y_train = merge_y.iloc[:train_end]
y_valid = merge_y.iloc[train_end:valid_end]

train_ds = lgb.Dataset(x_train, y_train)
valid_ds = lgb.Dataset(x_valid, y_valid)

# LightGBM hyper-parameters: RMSE regression with gradient-boosted trees,
# row/feature subsampling, and L1/L2 regularisation.
params = {
    'learning_rate': 0.01,
    'max_depth': 50,
    'boosting': 'gbdt',
    'objective': 'rmse',
    'metric': 'rmse',
    'is_training_metric': True,
    'num_leaves': 600,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'lambda_l1': 5,
    'lambda_l2': 10,
    'seed': 42,
}

# Train for up to 5000 rounds, stopping once the validation RMSE has not
# improved for 100 rounds; log the validation metric every 50 rounds.
# NOTE: `early_stopping_rounds` / `verbose_eval` are LightGBM 3.x arguments.
# LightGBM 4.0 removed them in favour of
# callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)].
model = lgb.train(
    params,
    train_ds,
    num_boost_round=5000,
    valid_sets=[valid_ds],
    early_stopping_rounds=100,
    verbose_eval=50,
)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 545673, number of used features: 5
[LightGBM] [Info] Start training from score 1.019563
Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 0.401176
[100]	valid_0's rmse: 0.377232
[150]	valid_0's rmse: 0.363501
[200]	valid_0's rmse: 0.356305
[250]	valid_0's rmse: 0.349351
[300]	valid_0's rmse: 0.344069
[350]	valid_0's rmse: 0.339207
[400]	valid_0's rmse: 0.336692
[450]	valid_0's rmse: 0.335068
[500]	valid_0's rmse: 0.333849
[550]	valid_0's rmse: 0.333677
[600]	valid_0's rmse: 0.333299
[650]	valid_0's rmse: 0.333305
[700]	valid_0's rmse: 0.3333
[750]	valid_0's rmse: 0.333132
[800]	valid_0's rmse: 0.333219
[850]	valid_0's rmse: 0.333436
Early stopping, best iteration is:
[770]	valid_0's rmse: 0.333037


- 모델 예측 결과를 아래 데이터프레임으로 정리하여 제출하였음. 제출 결과 Score는 1.31835였음. (검증 RMSE 0.333037과 차이가 크므로 검증 방식 점검이 필요함.)

In [15]:
# Predict the test rows and clip the predictions to the range [0, 20].
test_pred = model.predict(x_test).clip(0, 20)

# Build the submission frame in one step. IDs are 0..n-1 taken from the number
# of predictions, so the frame stays consistent if the test set size changes
# (the previous hard-coded range(0, 214200) would raise on a length mismatch).
# NOTE(review): IDs are assigned by row position; assumes the test rows are in
# submission ID order — confirm.
pred = pd.DataFrame({
    'ID': range(len(test_pred)),
    'item_cnt_month': test_pred,
})
pred.to_csv('./robust_submission_true.csv', index=False)

pred

Unnamed: 0,ID,item_cnt_month
0,0,0.846637
1,1,0.810977
2,2,0.849618
3,3,0.915049
4,4,0.775672
...,...,...
214195,214195,0.723623
214196,214196,0.852511
214197,214197,0.730829
214198,214198,0.726474
