## Prediction Model
#### Import package

In [10]:
import datetime
import h5py
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestRegressor


from catboost import CatBoostRegressor
from hyperopt import fmin, tpe, hp
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

#### Read dataset

In [2]:
dataset_folder = os.getcwd() + '/../dataset/'
train = pd.read_csv('../dataset/2019-02-27_19-31_processed_train.csv', index_col=0)
test = pd.read_csv('../dataset/2019-02-27_19-31_processed_test.csv', index_col=0)

In [3]:
X = train.drop(['revenue'], axis=1).values
y = np.log(train.revenue.values + 1)
X_test = test.drop(['revenue'], axis=1).values

#### Set parameters

In [None]:
# RandomForestRegressor
params = {'n_jobs': 4,
          'n_estimators': 500,
          'criterion': hp.choice('criterion', ['mse']), 
          'max_depth': hp.choice('max_depth', np.arange(5, 15, dtype=int)),
          'min_samples_split': hp.uniform('min_samples_split', 0, 0.5),
          'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5)}

In [None]:
# XGBRegressor
params = {'silent': False,
          'n_estimators': 500, 
          'objective': 'reg:linear',
          'learning_rate': hp.uniform('learning_rate', 0, 0.1),  
          'colsample_bytree': hp.uniform('colsample_bytree', 0, 0.5),
          'subsample': hp.uniform('subsample', 0, 0.5),
          'max_depth': hp.choice('max_depth', np.arange(5, 15, dtype=int))}

In [15]:
# CatBoostRegressor
params = {'iterations': 1000,
          'eval_metric': 'RMSE',
          'early_stopping_rounds': 200,
          'metric_period': None,
          'depth': hp.choice('depth', np.arange(5, 15, dtype=int)),
          'learning_rate': hp.uniform('learning_rate', 0, 0.1),
          'colsample_bylevel': hp.uniform('colsample_bylevel', 0, 0.75),
          'bagging_temperature': hp.uniform('bagging_temperature', 0, 0.25)}

#### Define optimization function

In [13]:
def KfoldCV(args): 
    
    rmlse_score = list()
    kf = KFold(n_splits=4)
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        #model = RandomForestRegressor(**args)
        model = CatBoostRegressor(**args)
        model.fit(X_train, y_train, verbose=False)
        
        pred = model.predict(X_test)
        pred[pred < 0] = 0        
        
        rmlse_score.append(np.sqrt(mean_squared_error(pred, y_test)))
    
    return np.mean(rmlse_score)

#### Parameters of best prediction model

In [17]:
best = fmin(KfoldCV, params, algo=tpe.suggest, max_evals=10)
print("Best estimate parameters: ", best)

100%|██████████| 10/10 [49:02<00:00, 271.17s/it, best loss: 1.8919829765472338]
Best estimate parameters:  {'colsample_bylevel': 0.1529669182250617, 'learning_rate': 0.043451218382335344, 'depth': 3, 'bagging_temperature': 0.06268638193907206}


#### Define model based on the parameters found

In [18]:
model = CatBoostRegressor(iterations=1000,
                          eval_metric='RMSE',
                          early_stopping_rounds=200,
                          metric_period=None,
                          depth=3,
                          learning_rate=0.043,
                          colsample_bylevel=0.153,
                          bagging_temperature=0.063)

#### Fit the model to all training dataset and make prediction

In [22]:
model.fit(X, y)
print(np.sqrt(mean_squared_error(model.predict(X), y)))
submission = (np.e ** model.predict(X_test) - 1)

0:	learn: 15.5886335	total: 3.73ms	remaining: 3.72s
1:	learn: 14.9468115	total: 15.8ms	remaining: 7.9s
2:	learn: 14.3335596	total: 25ms	remaining: 8.31s
3:	learn: 13.7477938	total: 31.6ms	remaining: 7.87s
4:	learn: 13.1891013	total: 36.4ms	remaining: 7.24s
5:	learn: 12.6554102	total: 40.4ms	remaining: 6.69s
6:	learn: 12.1447890	total: 44.6ms	remaining: 6.33s
7:	learn: 11.6551951	total: 48.2ms	remaining: 5.98s
8:	learn: 11.1885802	total: 52.1ms	remaining: 5.74s
9:	learn: 10.7442501	total: 56.7ms	remaining: 5.62s
10:	learn: 10.3208377	total: 62.3ms	remaining: 5.6s
11:	learn: 9.9163339	total: 66.3ms	remaining: 5.46s
12:	learn: 9.5312756	total: 80.1ms	remaining: 6.08s
13:	learn: 9.1603126	total: 87.3ms	remaining: 6.15s
14:	learn: 8.8062471	total: 92.1ms	remaining: 6.05s
15:	learn: 8.4729178	total: 97.1ms	remaining: 5.97s
16:	learn: 8.1492710	total: 101ms	remaining: 5.84s
17:	learn: 7.8468371	total: 106ms	remaining: 5.76s
18:	learn: 7.5561712	total: 110ms	remaining: 5.69s
19:	learn: 7.28050

#### Output submission file

In [23]:
time = datetime.datetime.now()
time = '{:4d}-{:02d}-{:02d}_{:02d}-{:02d}'.format(time.year, time.month, time.day, time.hour, time.minute)

submission = pd.DataFrame({'id': np.arange(3001, 7399), 'revenue': submission})
submission.to_csv(dataset_folder + time + '_submission.csv', index=None)

In [24]:
with open(dataset_folder + time + '_model.pickle', 'wb') as f:
    pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)