In [1]:
import numpy as np
import pandas as pd 
import xgboost as xgb
import sklearn

import matplotlib.pyplot as plt

# Record the library versions this notebook was run with.
for module in (np, pd, xgb, sklearn):
    print(module.__name__, module.__version__)

numpy 1.14.3
pandas 0.23.0
xgboost 0.72
sklearn 0.19.1


In [58]:
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import PredefinedSplit, cross_val_score, GridSearchCV

from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, make_scorer

In [3]:
# Seed passed as `random_state` to the Lasso and Ridge estimators further down.
RANDOM_STATE = 71

In [4]:
# Evaluation criterion
def smape(pred, actual):
    """
    Symmetric mean absolute percentage error (SMAPE), expressed in percent.

    pred: array-like of predictions (numpy array, list, tuple, pandas Series)
    actual: array-like of actual values, same length as pred

    Each observation contributes |pred - actual| / ((|pred| + |actual|) / 2).
    For a perfectly predicted zero observation, the smape is defined to be 0;
    such observations contribute nothing to the sum but still count in the
    number of observations the sum is divided by.

    Returns 100 * mean contribution, or nan when the input is empty.
    """
    # Coerce to ndarrays so lists/tuples work and so pandas inputs are compared
    # position by position instead of being aligned on their index.
    pred = np.asarray(pred)
    actual = np.asarray(actual)

    n_obs = pred.shape[0]
    if n_obs == 0:
        # No observations: the mean is undefined.
        return float('nan')

    # Mask out the 0/0 cases before dividing so no division by zero happens.
    selector = ~((pred == 0) & (actual == 0))
    numerator = np.abs(pred - actual)
    denom = (np.abs(pred) + np.abs(actual)) / 2
    return 100 * np.sum(numerator[selector] / denom[selector]) / n_obs

# greater_is_better=False makes the scorer report the negated SMAPE, so that
# "larger is better" holds during model selection.
# make_scorer calls the metric as (y_true, y_pred) while smape is declared as
# (pred, actual); this is harmless because the formula is symmetric in its two arguments.
smape_scorer = make_scorer(smape, greater_is_better=False)

In [5]:
# Sanity checks: score three hand-made prediction vectors against one fixed
# vector of actual values (which contains a zero) and print each SMAPE.
reference_actual = np.array([1,4,0,5])
candidate_preds = (np.array([1,3,0,5]), np.array([0.5,4,1,6]), np.array([2,7,-1,4]))
for candidate in candidate_preds:
    print(smape(candidate, reference_actual))

7.142857142857142
71.2121212121212
85.85858585858585


In [6]:
# Load the train / test / sample-submission CSVs (zipped files, located relative
# to the notebook's working directory).
train = pd.read_csv("../input/train.csv.zip")
test = pd.read_csv("../input/test.csv.zip")
sample_submission = pd.read_csv("../input/sample_submission.csv.zip")

In [7]:
# Convert the date field to datetime64.
# Plain column assignment replaces the column together with its dtype, whereas
# assigning through `.loc[:, 'date']` writes the values into the existing
# object-dtype column on recent pandas versions.
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

In [8]:
# Stack train and test into a single frame. Columns that exist in only one of
# the two frames come out of concat as NaN and are zero-filled here.
data = pd.concat([train, test], sort=False).fillna(0)   # test data has id column

In [9]:
def downcast_dtypes(df):
    '''
        Narrow 64-bit numeric columns of the dataframe, modifying `df` in place:

                `float64` columns become `float32`
                `int64`   columns become `int32`

        Columns of any other dtype are left untouched. The same dataframe
        object is returned.
    '''

    # One pass per dtype family: floats first, then ints.
    for wide_dtype, narrow_dtype in (("float64", np.float32), ("int64", np.int32)):
        wide_cols = [name for name in df if df[name].dtype == wide_dtype]
        df[wide_cols] = df[wide_cols].astype(narrow_dtype)

    return df

In [10]:
data = downcast_dtypes(data)

In [11]:
# Lag featurizer
class Lag_Featurizer(TransformerMixin):
    """
    Stateless transformer that adds one lagged, aggregated column to a dataframe.

    For every combination of the `index_col` grouping columns and every time
    bucket of width `freq`, `value_col` is summed. The resulting series is then
    moved forward in time by `shift` periods of `freq` within each group and
    left-merged back onto the input under the name `output_col`. Rows with no
    matching lagged bucket receive NaN from the left merge.

    Parameters
    ----------
    index_col : list of str (a single str is also accepted)
        Grouping columns, e.g. ['store', 'item'].
    time_col : str
        Name of the time axis used when re-indexing and merging.
    value_col : str
        Column to aggregate (summed per group and time bucket).
    output_col : str
        Name of the lag column added to the output.
    output_time_index : bool, default False
        If True the result is indexed by `time_col`; otherwise the merged
        frame is returned as produced by `pd.merge`.
    shift : int, default 0
        Number of `freq` periods to lag by.
    freq : str, default '1D'
        Bucket width / shift unit, as a pandas frequency string.

    Notes
    -----
    The time grouper is built without a `key`, so it buckets on the index of X;
    X is therefore assumed to carry a DatetimeIndex named `time_col` -- TODO
    confirm for any new caller.
    NOTE(review): `transform` drops the grouping columns from the groupby-shift
    result, which presumes they are present in it; this was written against
    pandas 0.23 -- verify when upgrading pandas.
    """

    def __init__(self, index_col, time_col, value_col, output_col, output_time_index=False, shift=0, freq='1D'):
        self.index_col = index_col
        self.time_col = time_col
        self.value_col = value_col
        self.output_col = output_col
        self.output_time_index = output_time_index
        self.shift = shift
        self.freq = freq

    def fit(self, X, y=None):
        """Nothing is learned; return self so fit(...).transform(...), fit_transform and Pipeline.fit work."""
        return self

    def transform(self, X):
        """Return X with the lag column `output_col` merged in (see class docstring)."""
        # Normalise the grouping columns to a fresh list; a bare column name is
        # wrapped rather than rejected.
        if isinstance(self.index_col, str):
            index_col = [self.index_col]
        else:
            index_col = list(self.index_col)

        # Sum value_col per (group, time bucket), then index the result by time.
        time_key = pd.Grouper(freq=self.freq)
        time_index = index_col + [time_key]
        resampled = X.groupby(time_index)[self.value_col].sum().reset_index().set_index(self.time_col)

        # Move each group's timestamps forward by shift * freq and rename the
        # aggregated column to the requested output name.
        shifted = (
            resampled.groupby(index_col)
            .shift(self.shift, freq=self.freq)
            .drop(index_col, axis=1)
            .reset_index()
            .rename(columns={self.value_col: self.output_col})
        )

        # Left merge keeps every row of X.
        merged = pd.merge(
            X,
            shifted,
            how='left',
            left_on=index_col + [self.time_col],
            right_on=index_col + [self.time_col],
        )
        if self.output_time_index:
            return merged.set_index(self.time_col)
        else:
            return merged

#### 

#### Add lag features
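- Store-item lag sales
- Store lag sales
- Item lag sales

Lag periods first listed (days): 1, 2, 3, 4, 7, 14, 21, 28, 84, 168, 336. The pipeline below actually uses lags of 98–104, 112, 140, 168, 252 and 336 days.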

In [12]:
# Lag_Featurizer buckets on the index (its pd.Grouper has no key), so the date
# has to become the index before the lag pipeline runs.
data = data.set_index('date')

#### Add lag features

In [13]:
# The test period is 2018-01 ~ 2018-03, a total of 90 days; every lag used
# below is at least 98 days.
# (name suffix, lag in days) -- shared by all three aggregation levels.
lag_shifts_in_days = [
    ('3m', 98),
    ('3m_1', 99),
    ('3m_2', 100),
    ('3m_3', 101),
    ('3m_4', 102),
    ('3m_5', 103),
    ('3m_6', 104),
    ('4m', 112),
    ('5m', 140),
    ('6m', 168),
    ('9m', 252),
    ('1y', 336),
]

# (pipeline step prefix, output column prefix, grouping columns)
lag_levels = [
    ('store_item_lag', 'sales', ['store', 'item']),   # lag store, item sales
    ('store_lag', 'store_sales', ['store']),          # lag store sales
    ('item_lag', 'item_sales', ['item']),             # lag item sales
]

# One Lag_Featurizer step per (level, lag): all store-item lags first, then
# store lags, then item lags, each in the order of lag_shifts_in_days.
lag_feature_pipeline = Pipeline(
    [
        (
            '{0}_{1}'.format(step_prefix, suffix),
            Lag_Featurizer(
                index_col=list(group_cols),
                time_col='date',
                value_col='sales',
                output_col='{0}_{1}'.format(col_prefix, suffix),
                output_time_index=True,
                shift=days,
            ),
        )
        for step_prefix, col_prefix, group_cols in lag_levels
        for suffix, days in lag_shifts_in_days
    ]
)

In [14]:
%time data = lag_feature_pipeline.transform(data)

Wall time: 46.5 s


In [15]:
# Discard every row that has a null in any column. The longest lag is 336 days,
# so part of the 2013 data still survives this step.
data.dropna(inplace=True)
# 1 for Saturday/Sunday (weekday 5 or 6), 0 otherwise.
is_weekend = data.index.weekday >= 5
data.loc[:,'weekend'] = is_weekend.astype(int)

In [16]:
# Model columns: the target first, then the 12 lag features for each of the
# three aggregation levels, then the weekend flag.
lag_suffixes = ['3m', '3m_1', '3m_2', '3m_3', '3m_4', '3m_5', '3m_6', '4m', '5m', '6m', '9m', '1y']
lag_col_prefixes = ['sales', 'store_sales', 'item_sales']

cols = (
    ['sales']
    + ['{0}_{1}'.format(prefix, suffix) for prefix in lag_col_prefixes for suffix in lag_suffixes]
    + ['weekend']
)

In [17]:
# Modelling window: January 2016 through March 2017.
training = data.loc['2016-01':'2017-03',cols]
# Fold labels for PredefinedSplit: 0 marks the hold-out rows (2017-01-01 to
# 2017-03-31 inclusive), -1 marks every other row.
holdout_start = pd.Timestamp(2017,1,1)
holdout_end = pd.Timestamp(2017,3,31)
in_holdout = (training.index >= holdout_start) & (training.index <= holdout_end)
validation_split = np.where(in_holdout, 0, -1)

In [18]:
# Validation data: the rows flagged 0 in validation_split; features are every
# entry of `cols` after the leading target column.
X_validation = training.loc[validation_split == 0, cols[1:]]
y_validation = training.loc[validation_split == 0, 'sales']
# The label previously read "training instances" although these are the validation rows.
print('Number of validation instances = {0:d}'.format(X_validation.shape[0]))
print('Number of features             = {0:d}'.format(X_validation.shape[1]))
print('Date range = {0} to {1}'.format(X_validation.index[0].strftime('%Y-%m-%d'), X_validation.index[-1].strftime('%Y-%m-%d')))

Number of training instances = 45000
Number of features           = 37
Date range = 2017-01-01 to 2017-03-31


In [19]:
# training matrices: all rows of `training` (the validation rows included);
# features are every entry of `cols` after the leading target column.
X_training = training[cols[1:]]
y_training = training['sales']
n_instances, n_features = X_training.shape
print('Number of training instances = {0:d}'.format(n_instances))
print('Number of features           = {0:d}'.format(n_features))
date_fmt = '%Y-%m-%d'
print('Date range = {0} to {1}'.format(training.index[0].strftime(date_fmt), training.index[-1].strftime(date_fmt)))

Number of training instances = 228000
Number of features           = 37
Date range = 2016-01-01 to 2017-03-31


In [20]:
# Test matrices: every row dated 2018-01 or later.
testing = data.loc['2018-01':,cols]
X_testing = testing[cols[1:]]
y_testing = testing['sales']
n_instances, n_features = X_testing.shape
print('Number of test instances = {0:d}'.format(n_instances))
print('Number of features       = {0:d}'.format(n_features))
date_fmt = '%Y-%m-%d'
print('Date range = {0} to {1}'.format(testing.index[0].strftime(date_fmt), testing.index[-1].strftime(date_fmt)))

Number of test instances = 45000
Number of features       = 37
Date range = 2018-01-01 to 2018-03-31


In [21]:
# Full training matrices: every remaining row up to and including December 2017.
training_full = data.loc[:'2017-12',cols]
X_training_full = training_full[cols[1:]]
y_training_full = training_full['sales']
n_instances, n_features = X_training_full.shape
print('Number of training instances = {0:d}'.format(n_instances))
print('Number of features           = {0:d}'.format(n_features))
date_fmt = '%Y-%m-%d'
print('Date range = {0} to {1}'.format(training_full.index[0].strftime(date_fmt), training_full.index[-1].strftime(date_fmt)))

Number of training instances = 745000
Number of features           = 37
Date range = 2013-12-03 to 2017-12-31


In [46]:
# Lasso
lasso = Lasso(random_state=RANDOM_STATE, max_iter=2000)
# alpha grid: 7 log-spaced values, one per decade from 1e-3 to 1e3
lasso_params = {'alpha': np.logspace(-3,3,7)}

In [47]:
# Grid search over a single predefined fold: rows flagged 0 in validation_split
# are scored, rows flagged -1 are only ever trained on. Candidates are ranked
# by negated mean absolute error.
reg = GridSearchCV(lasso, lasso_params, scoring='neg_mean_absolute_error', n_jobs=8, cv=PredefinedSplit(validation_split), verbose=20)

In [48]:
reg.fit(X_training,y_training)

Fitting 1 folds for each of 7 candidates, totalling 7 fits


[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:   21.9s
[Parallel(n_jobs=8)]: Done   2 out of   7 | elapsed:   24.7s remaining:  1.0min
[Parallel(n_jobs=8)]: Done   3 out of   7 | elapsed:   30.2s remaining:   40.3s
[Parallel(n_jobs=8)]: Done   4 out of   7 | elapsed:   33.1s remaining:   24.8s
[Parallel(n_jobs=8)]: Done   5 out of   7 | elapsed:   54.2s remaining:   21.6s
[Parallel(n_jobs=8)]: Done   7 out of   7 | elapsed:   58.2s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   7 out of   7 | elapsed:   58.2s finished


GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=2000,
   normalize=False, positive=False, precompute=False, random_state=71,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'alpha': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=20)

In [49]:
print('Best score = {0:.4f}; Best Parameter = {1}'.format(-reg.best_score_, reg.best_params_)) # Score is the negative because large score indicates better fit

Best score = 9.0651; Best Parameter = {'alpha': 0.1}


In [52]:
# NOTE(review): `reg` predicts with the estimator that GridSearchCV refit on all
# of X_training, which contains these validation rows, so this SMAPE is an
# in-sample figure rather than a hold-out estimate.
pred_validation_lasso = reg.predict(X_validation)
print(smape(pred_validation_lasso, y_validation.values))

16.94179893432121


In [62]:
# Best Lasso found by the grid search, as refit by GridSearchCV on X_training.
# The refit on the full training history below is left disabled.
reg_best = reg.best_estimator_
# reg_best.fit(X_training_full, y_training_full)

In [63]:
pred_reg_best = reg_best.predict(X_testing)

In [84]:
# Make predictions and a submission: 18.15; Use smape_scorer = 18.65; somehow best score = 19.39
# NOTE(review): this file is written to the working directory, whereas the
# xgboost and stacked submissions are written under ../output/.
# NOTE(review): predictions are paired with sample_submission ids by position;
# this assumes X_testing rows are in the same order as the sample submission -- confirm.
submission_lasso = pd.DataFrame({'Id': sample_submission.id, 'sales': pred_reg_best})
submission_lasso.to_csv('submission_lasso.csv', index=False)

#### Random Forest Model

In [22]:
# rf = RandomForestRegressor(random_state=RANDOM_STATE, criterion='mae')

In [24]:
# rf_params = {"n_estimators": np.arange(100, 510, 100),
#               "max_depth": np.arange(4, 10, 1)
#               "min_samples_split": np.arange(10,110,10),
#               "min_samples_leaf": np.arange(5,10,1),
#               "max_leaf_nodes": np.arange(5,15,1)
#             }

In [25]:
# rf = GridSearchCV(rf, rf_params, scoring=smape_scorer, n_jobs=1, cv=PredefinedSplit(validation_split), verbose=50)

In [None]:
# %time rf.fit(X_training,y_training)

In [89]:
# print('Best score = {0:.4f}; Best Parameter = {1}'.format(-rf.best_score_, rf.best_params_))

In [90]:
# rf_best = rf.best_estimator_
# rf_best.fit(X_training_full, y_training_full)
# pred_rf_best = rf_best.predict(X_testing)
# submission_rf = pd.DataFrame({'Id': sample_submission.id, 'sales': pred_rf_best})
# submission_rf.to_csv('../output/submission_rf.csv', index=False)

#### xgboost Model

In [36]:
# Base xgboost regressor (tree booster, 'reg:linear' objective); the remaining
# hyper-parameters are supplied by the grid search.
reg_xgb = xgb.XGBRegressor(objective='reg:linear', booster='gbtree', n_jobs=8)

In [37]:
# Hyper-parameter grid: 3 n_estimators x 9 min_child_weight x 2 colsample_bytree
# x 5 max_depth (the other entries are single values) = 270 candidates.
# np.arange excludes its stop value, so colsample_bytree is 0.8 and 0.9 here.
xgb_params = {
                'n_estimators': np.arange(50,200,50),
                'min_child_weight': np.arange(10,100,10), 
                'learning_rate': [0.1], 
                'colsample_bytree': np.arange(0.8,1.0,0.1), 
                'max_depth': np.arange(1,10,2),
                'subsample': [0.9], 
                'reg_lambda': [1.]
}

# Extra arguments forwarded to XGBRegressor.fit: stop early once the validation
# MAE has not improved for 50 rounds.
fit_params={"early_stopping_rounds":50, 
            "eval_metric" : "mae", 
            "eval_set" : [[X_validation.values, y_validation.values]]}


In [38]:
# Grid search for xgboost over the same single predefined fold.
# `verbose` used to be passed twice (30 and 50), which Python rejects as a
# repeated keyword argument; only one value is kept.
# scoring=None means candidates are ranked with the estimator's own score method.
# NOTE(review): passing fit_params to the constructor is deprecated in
# scikit-learn 0.19 in favour of passing them to fit(); revisit when upgrading.
grid_xgb = GridSearchCV(reg_xgb, xgb_params, scoring=None, n_jobs=8, cv=PredefinedSplit(validation_split), fit_params=fit_params, verbose=50)

In [40]:
%time grid_xgb.fit(X_training.values,y_training.values)



Fitting 1 folds for each of 270 candidates, totalling 270 fits


[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:   17.4s
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   19.2s
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed:   21.1s
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed:   25.5s
[Parallel(n_jobs=8)]: Done   5 tasks      | elapsed:   28.4s
[Parallel(n_jobs=8)]: Done   6 tasks      | elapsed:   32.4s
[Parallel(n_jobs=8)]: Done   7 tasks      | elapsed:   35.1s
[Parallel(n_jobs=8)]: Done   8 tasks      | elapsed:   38.8s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:   44.8s
[Parallel(n_jobs=8)]: Done  10 tasks      | elapsed:   48.0s
[Parallel(n_jobs=8)]: Done  11 tasks      | elapsed:   56.8s
[Parallel(n_jobs=8)]: Done  12 tasks      | elapsed:   58.8s
[Parallel(n_jobs=8)]: Done  13 tasks      | elapsed:  1.1min
[Parallel(n_jobs=8)]: Done  14 tasks      | elapsed:  1.2min
[Parallel(n_jobs=8)]: Done  15 tasks      | elapsed:  1.2min
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 136 tasks      | elapsed: 40.9min
[Parallel(n_jobs=8)]: Done 137 tasks      | elapsed: 40.9min
[Parallel(n_jobs=8)]: Done 138 tasks      | elapsed: 41.2min
[Parallel(n_jobs=8)]: Done 139 tasks      | elapsed: 41.3min
[Parallel(n_jobs=8)]: Done 140 tasks      | elapsed: 41.4min
[Parallel(n_jobs=8)]: Done 141 tasks      | elapsed: 41.6min
[Parallel(n_jobs=8)]: Done 142 tasks      | elapsed: 41.8min
[Parallel(n_jobs=8)]: Done 143 tasks      | elapsed: 41.8min
[Parallel(n_jobs=8)]: Done 144 tasks      | elapsed: 42.0min
[Parallel(n_jobs=8)]: Done 145 tasks      | elapsed: 42.2min
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed: 42.2min
[Parallel(n_jobs=8)]: Done 147 tasks      | elapsed: 42.3min
[Parallel(n_jobs=8)]: Done 148 tasks      | elapsed: 42.4min
[Parallel(n_jobs=8)]: Done 149 tasks      | elapsed: 42.4min
[Parallel(n_jobs=8)]: Done 150 tasks      | elapsed: 42.6min
[Parallel(n_jobs=8)]: Done 151 tasks      | elapsed: 42.7min
[Parallel(n_jobs=8)]: Do

[0]	validation_0-mae:40.524
Will train until validation_0-mae hasn't improved in 50 rounds.
[1]	validation_0-mae:36.1721
[2]	validation_0-mae:32.2473
[3]	validation_0-mae:28.7213
[4]	validation_0-mae:25.5823
[5]	validation_0-mae:22.7058
[6]	validation_0-mae:20.1798
[7]	validation_0-mae:17.9916
[8]	validation_0-mae:15.9889
[9]	validation_0-mae:14.2558
[10]	validation_0-mae:12.7256
[11]	validation_0-mae:11.4146
[12]	validation_0-mae:10.3287
[13]	validation_0-mae:9.38572
[14]	validation_0-mae:8.58973
[15]	validation_0-mae:7.92707
[16]	validation_0-mae:7.38957
[17]	validation_0-mae:6.96254
[18]	validation_0-mae:6.61195
[19]	validation_0-mae:6.33682
[20]	validation_0-mae:6.11907
[21]	validation_0-mae:5.9473
[22]	validation_0-mae:5.8181
[23]	validation_0-mae:5.7175
[24]	validation_0-mae:5.64256
[25]	validation_0-mae:5.58735
[26]	validation_0-mae:5.54644
[27]	validation_0-mae:5.51849
[28]	validation_0-mae:5.4985
[29]	validation_0-mae:5.48571
[30]	validation_0-mae:5.47692
[31]	validation_0-mae

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=8, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={'early_stopping_rounds': 50, 'eval_metric': 'mae', 'eval_set': [[array([[ 27.,  26., ..., 161.,   1.],
       [ 17.,  27., ..., 130.,   0.],
       ...,
       [ 56.,  64., ..., 736.,   0.],
       [ 64.,  56., ..., 790.,   0.]]), array([19., 15., ..., 68., 69.], dtype=float32)]]},
       iid=True, n_jobs=8,
       param_grid={'n_estimators': array([ 50, 100, 150]), 'min_child_weight': array([10, 20, 30, 40, 50, 60, 70, 80, 90]), 'learning_rate': [0.1], 'colsample_bytree': ar

In [44]:
# PB score = 15.29464
# Predict the test period with the best xgboost model from the grid search and
# write the submission file.
xgb_best = grid_xgb.best_estimator_
# NOTE(review): the disabled refit below refers to rf_best, not xgb_best -- it
# looks copied from the random-forest cell.
# rf_best.fit(X_training_full, y_training_full)
pred_xgb_best = xgb_best.predict(X_testing.values)
submission_xgb = pd.DataFrame({'Id': sample_submission.id, 'sales': pred_xgb_best})
submission_xgb.to_csv('../output/submission_xgb.csv', index=False)

In [54]:
# NOTE(review): xgb_best was fit on X_training (which contains the validation
# rows) and early-stopped against this same validation set, so this SMAPE is
# optimistic.
pred_validation_xgb = xgb_best.predict(X_validation.values)
print(smape(pred_validation_xgb, y_validation.values))

13.823542751736111


#### Stacking
Using ridge regression with validation data

In [57]:
# Stacking design matrix: one row per validation instance, one column per base
# model (Lasso prediction, xgboost prediction); the target is the actual sales.
X_stack = np.stack((pred_validation_lasso, pred_validation_xgb), axis=-1)
y_stack = y_validation.values
print(X_stack.shape)

(45000, 2)


In [59]:
# Meta-model: ridge regression combining the two base-model predictions.
reg_stack = Ridge(random_state=RANDOM_STATE, alpha=0.1)
reg_stack.fit(X_stack, y_stack)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=71, solver='auto', tol=0.001)

In [61]:
# Scored on the very rows reg_stack was fit on, so this is a training-set figure.
pred_validation_stack = reg_stack.predict(X_stack)
print(smape(pred_validation_stack, y_validation.values))

13.503561026036206


In [64]:
# Prediction using the combined predictor
# Columns must be in the same order as in X_stack: Lasso first, then xgboost.
X_stack_test = np.stack((pred_reg_best, pred_xgb_best), axis=-1)
pred_test_stack = reg_stack.predict(X_stack_test)
submission_stack = pd.DataFrame({'Id': sample_submission.id, 'sales': pred_test_stack})
submission_stack.to_csv('../output/submission_stack.csv', index=False)