## Prediction Model
#### Import packages

In [1]:
import datetime
import h5py
import numpy as np
import os
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestRegressor


from hyperopt import fmin, tpe, hp
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

#### Read dataset

In [2]:
# Directory that holds the processed inputs. Later cells build output paths
# with plain string concatenation, so the trailing slash is kept.
dataset_folder = os.getcwd() + '/../dataset/'

# Timestamp prefix shared by the processed train/test files loaded below.
processed_run = '2019-02-27_19-31'

# Both reads go through dataset_folder so the data location is defined once.
# The first CSV column is used as the DataFrame index.
train = pd.read_csv(dataset_folder + processed_run + '_processed_train.csv', index_col=0)
test = pd.read_csv(dataset_folder + processed_run + '_processed_test.csv', index_col=0)

In [3]:
# Feature matrix: every column of the training frame except the target.
X = train.drop(['revenue'], axis=1).values

# Target on a log scale: log(1 + revenue). np.log1p computes this directly
# and stays accurate when revenue is close to zero.
y = np.log1p(train.revenue.values)

# Test features are built the same way (the 'revenue' column is dropped).
X_test = test.drop(['revenue'], axis=1).values

#### Set parameters

In [None]:
# Hyperopt search space for the random-forest variant of the model.
# NOTE: the next cell assigns `params` again, so running both cells in order
# leaves only the XGBoost space bound to this name.
rf_depth_grid = np.arange(5, 15, dtype=int)  # candidate depths 5..14

params = {
    # fixed settings
    'n_jobs': 4,
    'n_estimators': 500,
    # sampled settings
    'criterion': hp.choice('criterion', ['mse']),
    'max_depth': hp.choice('max_depth', rf_depth_grid),
    'min_samples_split': hp.uniform('min_samples_split', 0, 0.5),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0, 0.5),
}

In [None]:
# Hyperopt search space for the XGBoost regressor (rebinds `params`).
# NOTE(review): the sampled ranges for learning_rate, colsample_bytree and
# subsample all start at 0, and subsample is capped at 0.5 while the final
# model below uses 0.8 -- confirm these bounds are the intended ones.
xgb_depth_grid = np.arange(5, 15, dtype=int)  # candidate depths 5..14

params = {
    # fixed settings
    'silent': False,
    'n_estimators': 100,
    'objective': 'reg:linear',
    # sampled settings
    'learning_rate': hp.uniform('learning_rate', 0, 0.1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0, 0.5),
    'subsample': hp.uniform('subsample', 0, 0.5),
    'max_depth': hp.choice('max_depth', xgb_depth_grid),
}

#### Define optimization function

In [9]:
#### Old Model
def KfoldCV(args):
    """Score one set of XGBoost hyper-parameters with 4-fold cross-validation.

    Reads the notebook-level arrays ``X`` (features) and ``y`` (target) and
    splits them with ``KFold(n_splits=4)``. A fresh ``XGBRegressor`` is fitted
    on each training split and evaluated on the matching held-out split.

    Parameters
    ----------
    args : dict
        Keyword arguments forwarded unchanged to ``XGBRegressor``.

    Returns
    -------
    float
        Mean over the four folds of the RMSE between the held-out targets and
        the predictions, after negative predictions are clipped to zero.
        ``y`` is built on a log(revenue + 1) scale in this notebook, so the
        value is an RMSLE in revenue terms.
    """
    fold_scores = []
    kf = KFold(n_splits=4)

    for train_index, valid_index in kf.split(X):
        # Fold-local names: X_test at notebook level is the real test matrix,
        # so the held-out split gets its own name to avoid confusion.
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        fold_model = XGBRegressor(**args)

        # RMSE is reported on both splits during fitting (verbose=True).
        fold_model.fit(X_train, y_train,
                       eval_metric=['rmse'],
                       eval_set=[(X_train, y_train), (X_valid, y_valid)],
                       verbose=True)

        pred = fold_model.predict(X_valid)
        pred[pred < 0] = 0  # clip negative predictions to zero before scoring

        # Arguments in the documented (y_true, y_pred) order.
        fold_scores.append(np.sqrt(mean_squared_error(y_valid, pred)))

    return np.mean(fold_scores)

#### Parameters of best prediction model

#### Define model based on the parameters found

In [4]:
# Final regressor with fixed hyper-parameters (no search is run here).
model = XGBRegressor(
    # objective and boosting schedule
    objective='reg:linear',
    n_estimators=500,
    learning_rate=0.01,
    # tree construction
    max_depth=4,
    gamma=10,
    # row / column sampling
    subsample=0.8,
    colsample_bytree=0.4,
    # regularisation and weighting
    reg_alpha=0.3,
    scale_pos_weight=1,
    # keep the library's training messages enabled
    silent=False,
)

#### Fit the model to the full training dataset and make predictions

In [7]:
# Train on the complete training set.
model.fit(X, y)

# Predictions are on the log(1 + revenue) scale used for the target.
log_pred = model.predict(X_test)

# Clip negative log-predictions to zero, as the cross-validation routine does,
# so that no negative revenue is produced.
log_pred[log_pred < 0] = 0

# Back to revenue: expm1 is the exact inverse of log(1 + x).
submission = np.expm1(log_pred)

[22:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 2 pruned nodes, max_depth=4
[22:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 4 pruned nodes, max_depth=4
[22:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=4
[22:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 16 extra nodes, 2 pruned nodes, max_depth=4
[22:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 2 pruned nodes, max_depth=4
[22:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 6 pruned nodes, max_depth=4
[22:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 2 pruned nodes, max_depth=4
[22:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 6 pruned nodes, max_depth=4
[22:57:30] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 16 extra nodes, 0 pruned nodes, max_

#### Output submission file

In [8]:
# Timestamp (minute resolution) used as the prefix of the output file names;
# `time` ends up holding the formatted string.
now = datetime.datetime.now()
time = '{:4d}-{:02d}-{:02d}_{:02d}-{:02d}'.format(
    now.year, now.month, now.day, now.hour, now.minute)

# Pair each predicted revenue with its id (3001..7398) and rebind `submission`
# to the resulting two-column frame.
submission_ids = np.arange(3001, 7399)
submission = pd.DataFrame({'id': submission_ids, 'revenue': submission})

# Write the frame next to the input data, without the index column.
submission_path = dataset_folder + time + '_submission.csv'
submission.to_csv(submission_path, index=None)

In [None]:
# Persist the fitted model beside the submission, under the same timestamp.
model_path = dataset_folder + time + '_model.pickle'
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file, protocol=pickle.HIGHEST_PROTOCOL)