## Prediction Model
#### Import package

In [1]:
import datetime
import numpy as np
import os
import pandas as pd

from sklearn.ensemble import RandomForestRegressor

from hyperopt import fmin, tpe, hp
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import KFold

#### Read dataset

In [2]:
dataset_folder = os.getcwd() + '/../dataset/'
train = pd.read_csv('../dataset/2019-02-25_18-47_processed_train.csv', index_col=0)
test = pd.read_csv('../dataset/2019-02-25_18-47_processed_test.csv', index_col=0)

In [3]:
X = train.drop(['revenue'], axis=1).values
y = np.log(train.revenue.values + 1)
X_test = test.drop(['revenue'], axis=1).values

#### Set parameters

In [4]:
params = {'n_jobs': 4,
          'n_estimators': 500,
          'criterion': hp.choice('criterion', ['mse']), 
          'max_depth': hp.uniform('max_depth', 5, 15),
          'min_samples_split': hp.uniform('min_samples_split', 0, 0.5),
          'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5)}

#### Define optimization function

In [5]:
def KfoldCV(args): 
    
    rmlse_score = list()
    kf = KFold(n_splits=4)
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model = RandomForestRegressor(**args)
        model.fit(X_train, y_train)
        
        pred = model.predict(X_test)
        pred[pred < 0] = 0
        
        rmlse_score.append(np.sqrt(mean_squared_error(pred, y_test)))
    
    return np.mean(rmlse_score)

#### Parameters of best prediction model

In [6]:
best = fmin(KfoldCV, params, algo=tpe.suggest, max_evals=100)
print("Best estimate parameters: ", best)

100%|██████████| 100/100 [12:52<00:00,  6.67s/it, best loss: 1.9809493359185326]
Best estimate parameters:  {'criterion': 0, 'min_samples_leaf': 0.0014014519693034888, 'max_depth': 12.758616222654084, 'min_samples_split': 0.0095203830105519}


#### Define model based on the parameters found

In [7]:
model = RandomForestRegressor(n_estimators=500,
                              max_depth=9, 
                              min_samples_leaf=0.002,
                              min_samples_split=0.027)

#### Fit the model to all training dataset and make prediction

In [8]:
model.fit(X, y)
submission = (np.e ** model.predict(X_test) - 1)

#### Output submission file

In [9]:
time = datetime.datetime.now()
time = '{:4d}-{:02d}-{:02d}_{:02d}-{:02d}'.format(time.year, time.month, time.day, time.hour, time.minute)

submission = pd.DataFrame({'id': np.arange(3001, 7399), 'revenue': submission})
submission.to_csv(dataset_folder + time + '_submission.csv', index=None)