### 使用するライラリ

In [1]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

### 作成済みDataFrameの読み込み

In [28]:
DF_FILE_PATH = '../input/pickle/df_baseline_ver1.pickle.gz'
SAMPLE_SUBMIT_PATH = '../input/csv/sample_submission.csv.gz'

In [3]:
df = pd.read_pickle(DF_FILE_PATH)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31681090 entries, 0 to 31681089
Data columns (total 68 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   id                         object        
 1   item_id                    int16         
 2   dept_id                    int8          
 3   cat_id                     int8          
 4   store_id                   int8          
 5   state_id                   int8          
 6   d                          int16         
 7   demand                     int16         
 8   part                       object        
 9   date                       datetime64[ns]
 10  wm_yr_wk                   int16         
 11  event_name_1               float16       
 12  event_type_1               float16       
 13  event_name_2               float16       
 14  event_type_2               float16       
 15  snap_CA                    int8          
 16  snap_TX                    int8   

### カテゴリ型コラムと未使用コラムを指定

In [4]:
cat_feats = ['item_id', 'dept_id','store_id', 'cat_id', 'state_id']\
            + ['event_name_1', 'event_name_2', 'event_type_1', 'event_type_2']\
            + ['quarter', 'month', 'week', 'dayofweek']
useless_cols = ['id', 'date', 'demand','d', 'part', 'wm_yr_wk', 'weekday', 'day']
train_cols = df.columns[~df.columns.isin(useless_cols)]
X_train = df[df['part']=='train'][train_cols]
y_train = df[df['part']=='train']['demand']
X_test = df[df['part']=='validation'][train_cols]
test = df[df['part']=='validation'][['id', 'd']].reset_index(drop=True)

In [7]:
gc.collect()

40

### 不用データ削除

In [8]:
df['rolling_mean_t7'].dropna(inplace=True)

### 学習時のvalid_dataをランダムチョイスで作成

In [9]:
%%time

np.random.seed(777)

fake_valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                         categorical_feature=cat_feats, free_raw_data=False)
fake_valid_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                              categorical_feature=cat_feats,
                 free_raw_data=False)# This is a random sample, we're not gonna apply any time series train-test-split tricks here!

Wall time: 26.4 s


In [10]:
# train_data = lgb.Dataset(X_train , label = y_train, 
#                          categorical_feature=cat_feats, free_raw_data=False)
# fake_valid_data = lgb.Dataset(X_test, label = y_test,
#                               categorical_feature=cat_feats, free_raw_data=False)

In [11]:
del df, X_train, y_train, fake_valid_inds,train_inds
gc.collect()

40

In [12]:
params = {
        "objective" : "poisson",
        "metric" :"mse",
        "force_row_wise" : True,
        "learning_rate" : 0.075,
#         "sub_feature" : 0.8,
        "sub_row" : 0.75,
        "bagging_freq" : 1,
        "lambda_l2" : 0.1,
        "nthread" : 3,
        "metric": ["mse"],
        "verbosity": 1,
        "num_iterations" : 5000,
        "num_leaves": 128,
        "min_data_in_leaf": 100,
}

In [13]:
%%time
m_lgb = lgb.train(params, 
                  train_data, 
                  valid_sets = [fake_valid_data], 
                  verbose_eval=10, 
                  early_stopping_rounds=100,
                 ) 

Training until validation scores don't improve for 100 rounds
[10]	valid_0's l2: 10.22
[20]	valid_0's l2: 7.95245
[30]	valid_0's l2: 6.90587
[40]	valid_0's l2: 6.39137
[50]	valid_0's l2: 6.16852
[60]	valid_0's l2: 6.04062
[70]	valid_0's l2: 5.9529
[80]	valid_0's l2: 5.89756
[90]	valid_0's l2: 5.86631
[100]	valid_0's l2: 5.83417
[110]	valid_0's l2: 5.80337
[120]	valid_0's l2: 5.77679
[130]	valid_0's l2: 5.74296
[140]	valid_0's l2: 5.72329
[150]	valid_0's l2: 5.70753
[160]	valid_0's l2: 5.67426
[170]	valid_0's l2: 5.65068
[180]	valid_0's l2: 5.62731
[190]	valid_0's l2: 5.60493
[200]	valid_0's l2: 5.5951
[210]	valid_0's l2: 5.58198
[220]	valid_0's l2: 5.57189
[230]	valid_0's l2: 5.55677
[240]	valid_0's l2: 5.54461
[250]	valid_0's l2: 5.53322
[260]	valid_0's l2: 5.52208
[270]	valid_0's l2: 5.50861
[280]	valid_0's l2: 5.49478
[290]	valid_0's l2: 5.48026
[300]	valid_0's l2: 5.46103
[310]	valid_0's l2: 5.44407
[320]	valid_0's l2: 5.42693
[330]	valid_0's l2: 5.4154
[340]	valid_0's l2: 5.39811


In [14]:
m_lgb.save_model("model_v1.lgb")

<lightgbm.basic.Booster at 0x28a885c5bc8>

In [5]:
m_lgb = lgb.Booster(model_file="model_v1.lgb")

### 学習ここまで

### 予測ここから

In [7]:
y_pred = m_lgb.predict(X_test)

In [26]:
test = pd.concat([test, pd.Series(y_pred, name='demand')], axis = 1)

In [24]:
# idがvalidationとかなければつける
#test = test.assign(id=test.id + "_" + np.where(test.d <= 1941, "validation", "evaluation"),
#                   F="F" + (test.d - 1913 - 28 * (test.d > 1941)).astype("str"))

In [29]:
sample_submission = pd.read_csv(SAMPLE_SUBMIT_PATH)

In [31]:
test = test.assign(F="F" + (test.d - 1913 - 28 * (test.d > 1941)).astype("str"))
submission = test.pivot(index="id", columns="F", values="demand").reset_index()[sample_submission.columns]
submission = sample_submission[["id"]].merge(submission, how="left", on="id")
submission.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.81334,0.718993,0.704529,0.648331,0.775682,0.992471,1.008007,0.710761,0.704353,...,0.855882,1.07245,1.053544,0.786684,0.723608,0.727034,0.732214,0.891831,1.140633,1.065064
1,HOBBIES_1_002_CA_1_validation,0.286431,0.269616,0.272273,0.277852,0.328231,0.350539,0.315933,0.242888,0.194041,...,0.256483,0.311692,0.298974,0.215887,0.20781,0.27365,0.266588,0.299794,0.334512,0.336518
2,HOBBIES_1_003_CA_1_validation,0.365714,0.34051,0.351393,0.368052,0.468741,0.584589,0.50227,0.320419,0.319222,...,0.423254,0.617693,0.552785,0.438575,0.411388,0.432698,0.440525,0.514527,0.690679,0.660371
3,HOBBIES_1_004_CA_1_validation,1.969896,1.559717,1.566354,1.520647,1.820497,2.426325,2.930362,1.924078,1.548568,...,1.982726,2.371639,2.570169,1.99188,1.68305,1.544717,1.541905,1.918762,2.591923,2.742036
4,HOBBIES_1_005_CA_1_validation,0.925106,0.786058,0.857674,0.993413,1.033872,1.467519,1.650921,1.180856,1.148202,...,1.19512,1.531255,1.446043,0.936965,0.863706,0.868228,0.878532,1.027789,1.358957,1.496262


In [32]:
submission.to_csv('team_sun_submission1_lightgbm_model_v1.0.csv', index=False)