## Prediction Model
#### Import package

In [16]:
import datetime
import h5py
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestRegressor

from catboost import CatBoostRegressor
from hyperopt import fmin, tpe, hp
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

#### Read dataset

In [2]:
dataset_folder = os.getcwd() + '/../dataset/'
train = pd.read_csv('../dataset/2019-02-27_19-31_processed_train.csv', index_col=0)
test = pd.read_csv('../dataset/2019-02-27_19-31_processed_test.csv', index_col=0)

In [26]:
X = train.drop(['revenue'], axis=1).values
y = np.log(train.revenue.values + 1)
X_test = test.drop(['revenue'], axis=1).values

#### Set parameters

In [112]:
# RandomForestRegressor
params = {'n_jobs': 4,
          'random_state': 0,
          'n_estimators': hp.choice('n_estimators', [100, 250, 500, 750, 1000]),
          'max_depth': hp.choice('max_depth', np.arange(3, 15, dtype=int)),
          'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2']),
          'min_samples_split': hp.uniform('min_samples_split', 0, 0.5),
          'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5)}

In [101]:
# XGBRegressor
params = {'silent': True,
          'objective': 'reg:linear',
          'seed': 0,
          'n_estimators': hp.choice('n_estimators', [100, 250, 500, 750, 1000]),
          'max_depth': hp.choice('max_depth', np.arange(5, 15, dtype=int)),
          'learning_rate': hp.uniform('learning_rate', 0, 0.75),  
          'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
          'subsample': hp.uniform('subsample', 0.5, 1)}

In [109]:
# CatBoostRegressor
params = {'random_seed': 0,
          'early_stopping_rounds': 500,
          'eval_metric': 'RMSE',
          'logging_level': 'Silent',
          'n_estimators': 1000,
          'depth': hp.choice('depth', np.arange(3, 15, dtype=int)),
          'learning_rate': hp.uniform('learning_rate', 0, 0.75),
          'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1)}

In [97]:
# LGBMRegressor
params = {'seed': 0,
          'tree_learner_type': hp.choice('tree_learner_type', ['serial', 'feature', 'data', 'voting']),
          'n_estimators': hp.choice('n_estimators', [100, 250, 500, 750, 1000]),
          'num_leaves': hp.choice('num_leaves', [50, 100, 150, 200, 250, 500, 750, 1000]),
          'min_data_in_leaf' : hp.choice('min_data_in_leaf', [10, 20, 30, 40, 50, 100]),
          'max_depth': hp.choice('max_depth', np.arange(3, 15, dtype=int)),
          'learning_rate': hp.uniform('learning_rate', 0, 0.75),}

#### Define optimization function

In [115]:
def KfoldCV(args): 
    
    rmlse_score = list()
    kf = KFold(n_splits=4)
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model = RandomForestRegressor(**args)
        #model = CatBoostRegressor(**args)
        #model = LGBMRegressor(**args)
        #model = XGBRegressor(**args)
        model.fit(X_train, y_train)
        #model.fit(X_train, y_train, verbose=False)
        
        pred = model.predict(X_test)
        pred[pred < 0] = 0        
        
        rmlse_score.append(np.sqrt(mean_squared_error(pred, y_test)))
    
    return np.mean(rmlse_score)

#### Parameters of best prediction model

In [116]:
best = fmin(KfoldCV, params, algo=tpe.suggest, max_evals=10)
print("Best estimate parameters: ", best)

100%|██████████| 10/10 [01:05<00:00,  6.54s/it, best loss: 2.426852888889002]
Best estimate parameters:  {'max_features': 2, 'min_samples_leaf': 0.007496219809590587, 'n_estimators': 0, 'max_depth': 2, 'min_samples_split': 0.19018878860212296}


#### Define model based on the parameters found

In [None]:
# Submission #006
model = LGBMRegressor(n_estimators=500,
                      min_data_in_leaf=10, 
                      learning_rate=0.0100,
                      bagging_fraction=0.427,
                      max_depth=7,
                      num_leaves=250)

In [18]:
# Submission #007
model = CatBoostRegressor(iterations=1000,
                          eval_metric='RMSE',
                          early_stopping_rounds=200,
                          metric_period=None,
                          depth=8,
                          learning_rate=0.043,
                          colsample_bylevel=0.153,
                          bagging_temperature=0.063)

In [22]:
# Submission #008
model = XGBRegressor(silent=True,
                     objective='reg:linear', 
                     n_estimators=500,
                     learning_rate=0.021,  
                     colsample_bytree = 0.394,
                     subsample = 0.466,
                     max_depth=7)

In [26]:
# Submission #009
model_lgb = LGBMRegressor(n_estimators=500, min_data_in_leaf=10, learning_rate=0.0100,
                          bagging_fraction=0.427, max_depth=7, num_leaves=250)
model_cat = CatBoostRegressor(iterations=1000, eval_metric='RMSE', early_stopping_rounds=200,
                          metric_period=None, depth=8, learning_rate=0.043,
                          colsample_bylevel=0.153, bagging_temperature=0.063)
model_xgb = XGBRegressor(silent=True, objective='reg:linear', n_estimators=500,
                         learning_rate=0.021, colsample_bytree = 0.394, subsample = 0.466,
                         max_depth=7)

In [28]:
model_lgb.fit(X, y)
model_cat.fit(X, y)
model_xgb.fit(X, y)

pred_lgb = model_lgb.predict(X_test)
pred_cat = model_cat.predict(X_test)
pred_xgb = model_xgb.predict(X_test)
pred = (pred_lgb + pred_cat + pred_xgb) / 3
submission = (np.e ** pred - 1)

0:	learn: 15.5916733	total: 18ms	remaining: 18s
1:	learn: 14.9647462	total: 37.5ms	remaining: 18.7s
2:	learn: 14.3510315	total: 48.7ms	remaining: 16.2s
3:	learn: 13.7635742	total: 61ms	remaining: 15.2s
4:	learn: 13.2116722	total: 80ms	remaining: 15.9s
5:	learn: 12.6847103	total: 88.7ms	remaining: 14.7s
6:	learn: 12.1769509	total: 94.2ms	remaining: 13.4s
7:	learn: 11.6989610	total: 106ms	remaining: 13.2s
8:	learn: 11.2285461	total: 116ms	remaining: 12.7s
9:	learn: 10.7815237	total: 124ms	remaining: 12.3s
10:	learn: 10.3605104	total: 138ms	remaining: 12.4s
11:	learn: 9.9514694	total: 143ms	remaining: 11.8s
12:	learn: 9.5623842	total: 155ms	remaining: 11.8s
13:	learn: 9.1919346	total: 167ms	remaining: 11.7s
14:	learn: 8.8330996	total: 176ms	remaining: 11.6s
15:	learn: 8.4917577	total: 180ms	remaining: 11s
16:	learn: 8.1822391	total: 187ms	remaining: 10.8s
17:	learn: 7.8724586	total: 190ms	remaining: 10.4s
18:	learn: 7.5794074	total: 198ms	remaining: 10.2s
19:	learn: 7.3023183	total: 210ms

In [117]:
# Submission #010, #11
# weight 1/2.02331
model_rf = RandomForestRegressor(n_estimators=500, max_features='auto', max_depth=8, 
                                 min_samples_leaf=0.00033, min_samples_split=0.06752)
# weight 1/1.91323
model_lgb = LGBMRegressor(n_estimators=250, min_data_in_leaf=10, learning_rate=0.10550,
                          tree_learner_type='data', max_depth=4, num_leaves=50)
# weight 1/1.86258
model_xgb = XGBRegressor(silent=True, objective='reg:linear', n_estimators=250, learning_rate=0.04639,  
                         colsample_bytree=0.63095, subsample=0.77854, max_depth=7)
# weight 1/1.92542
model_cat = CatBoostRegressor(iterations=1000, eval_metric='RMSE', early_stopping_rounds=200,
                              logging_level='Silent', depth=8, learning_rate=0.043, colsample_bylevel=0.153,
                              bagging_temperature=0.063)

In [123]:
model_lgb.fit(X, y)
model_cat.fit(X, y)
model_xgb.fit(X, y)

pred_lgb = model_lgb.predict(X_test)
pred_cat = model_cat.predict(X_test)
pred_xgb = model_xgb.predict(X_test)

pred = (1/1.91323 * pred_lgb + 1/1.92542 * pred_cat + 1/1.86258 * pred_xgb) / (1/1.91323 + 1/1.86258 + 1/1.92542)
submission = (np.e ** pred_xgb - 1)

In [5]:
# Submission #012
models = [LGBMRegressor(n_estimators=500, min_data_in_leaf=10, learning_rate=0.0100,
                        bagging_fraction=0.427, max_depth=7, num_leaves=250), 
          CatBoostRegressor(iterations=1000, eval_metric='RMSE', early_stopping_rounds=200,
                          logging_level='Silent', depth=8, learning_rate=0.043, colsample_bylevel=0.153,
                          bagging_temperature=0.063),
          XGBRegressor(silent=True, objective='reg:linear', n_estimators=500, learning_rate=0.021,  
                       colsample_bytree = 0.394, subsample = 0.466, max_depth=7)]

In [27]:
pred = np.zeros((X.shape[0], len(models)))
    
for i, model in enumerate(models):
    model.fit(X, y)
    pred[:, i] = model.predict(X)

pred[pred < 0] = 0
stacking = LinearRegression()
stacking.fit(pred, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [28]:
pred = np.zeros((X_test.shape[0], len(models)))
for i, model in enumerate(models):
    pred[:, i] = model.predict(X_test)

pred[pred < 0] = 0
submission = (np.e ** stacking.predict(pred) - 1)

#### Fit the model to all training dataset and make prediction

In [23]:
model.fit(X, y)
print(np.sqrt(mean_squared_error(model.predict(X), y)))
submission = (np.e ** model.predict(X_test) - 1)

0.90695914568


#### Output submission file

In [29]:
time = datetime.datetime.now()
time = '{:4d}-{:02d}-{:02d}_{:02d}-{:02d}'.format(time.year, time.month, time.day, time.hour, time.minute)

submission = pd.DataFrame({'id': np.arange(3001, 7399), 'revenue': submission})
submission.to_csv(dataset_folder + time + '_submission.csv', index=None)

In [30]:
with open(dataset_folder + time + '_model.pickle', 'wb') as f:
    pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)