## Prediction Model
#### Import package

In [1]:
import datetime
import h5py
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestRegressor

from catboost import CatBoostRegressor
from hyperopt import fmin, tpe, hp
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you won't need to install the gcc compiler anymore.
Instead of that, you'll need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


#### Read dataset

In [2]:
dataset_folder = os.getcwd() + '/../dataset/'
train = pd.read_csv('../dataset/2019-02-27_19-31_processed_train.csv', index_col=0)
test = pd.read_csv('../dataset/2019-02-27_19-31_processed_test.csv', index_col=0)

In [3]:
X = train.drop(['revenue'], axis=1).values
y = np.log(train.revenue.values + 1)
X_test = test.drop(['revenue'], axis=1).values

#### Set parameters

In [None]:
# RandomForestRegressor
params = {'n_jobs': 4,
          'n_estimators': 500,
          'criterion': hp.choice('criterion', ['mse']), 
          'max_depth': hp.choice('max_depth', np.arange(5, 15, dtype=int)),
          'min_samples_split': hp.uniform('min_samples_split', 0, 0.5),
          'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5)}

In [10]:
# XGBRegressor
params = {'silent': True,
          'n_estimators': 500, 
          'objective': 'reg:linear',
          'learning_rate': hp.uniform('learning_rate', 0, 0.1),  
          'colsample_bytree': hp.uniform('colsample_bytree', 0, 0.5),
          'subsample': hp.uniform('subsample', 0, 0.5),
          'max_depth': hp.choice('max_depth', np.arange(5, 15, dtype=int))}

In [None]:
# CatBoostRegressor
params = {'iterations': 1000,
          'eval_metric': 'RMSE',
          'early_stopping_rounds': 200,
          'metric_period': None,
          'depth': hp.choice('depth', np.arange(5, 15, dtype=int)),
          'learning_rate': hp.uniform('learning_rate', 0, 0.1),
          'colsample_bylevel': hp.uniform('colsample_bylevel', 0, 0.75),
          'bagging_temperature': hp.uniform('bagging_temperature', 0, 0.25)}

In [4]:
# LGBMRegressor
params = {'n_estimators': 500,
          'max_depth': hp.choice('max_depth', np.arange(5, 15, dtype=int)),
          'learning_rate': hp.uniform('learning_rate', 0, 0.1),
          'bagging_fraction': hp.uniform('bagging_fraction', 0.25, 0.75),
          'num_leaves': hp.choice('num_leaves', np.array([50, 100, 150, 200, 250, 500, 750, 1000])),
          'min_data_in_leaf' : hp.choice('min_data_in_leaf', np.array([10, 20, 30, 40, 50, 100]))}

#### Define optimization function

In [6]:
def KfoldCV(args): 
    
    rmlse_score = list()
    kf = KFold(n_splits=4)
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        #model = RandomForestRegressor(**args)
        #model = CatBoostRegressor(**args)
        model = LGBMRegressor(**args)
        #model = XGBRegressor(**args)
        model.fit(X_train, y_train, verbose=False)
        
        pred = model.predict(X_test)
        pred[pred < 0] = 0        
        
        rmlse_score.append(np.sqrt(mean_squared_error(pred, y_test)))
    
    return np.mean(rmlse_score)

#### Parameters of best prediction model

In [7]:
best = fmin(KfoldCV, params, algo=tpe.suggest, max_evals=50)
print("Best estimate parameters: ", best)

100%|██████████| 50/50 [02:55<00:00,  3.14s/it, best loss: 1.914983419072453]
Best estimate parameters:  {'min_data_in_leaf': 0, 'learning_rate': 0.010147824037812225, 'bagging_fraction': 0.42683766871913165, 'max_depth': 2, 'num_leaves': 4}


#### Define model based on the parameters found

In [10]:
# Submission #006
model = LGBMRegressor(n_estimators=500,
                      min_data_in_leaf=10, 
                      learning_rate=0.0100,
                      bagging_fraction=0.427,
                      max_depth=7,
                      num_leaves=250)

#### Fit the model to all training dataset and make prediction

In [11]:
model.fit(X, y)
print(np.sqrt(mean_squared_error(model.predict(X), y)))
submission = (np.e ** model.predict(X_test) - 1)

1.22689101586


#### Output submission file

In [12]:
time = datetime.datetime.now()
time = '{:4d}-{:02d}-{:02d}_{:02d}-{:02d}'.format(time.year, time.month, time.day, time.hour, time.minute)

submission = pd.DataFrame({'id': np.arange(3001, 7399), 'revenue': submission})
submission.to_csv(dataset_folder + time + '_submission.csv', index=None)

In [13]:
with open(dataset_folder + time + '_model.pickle', 'wb') as f:
    pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)