In [2]:
import numpy as np
import pandas as pd 
import xgboost as xgb
import sklearn

import matplotlib.pyplot as plt

# Record the version of each core library next to the results for reproducibility.
for lib in (np, pd, xgb, sklearn):
    print (lib.__name__, lib.__version__)

numpy 1.14.3
pandas 0.23.0
xgboost 0.72
sklearn 0.19.1


In [3]:
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import PredefinedSplit, cross_val_score, GridSearchCV

from sklearn.linear_model import Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, make_scorer

In [4]:
# Seed passed to the Lasso and Ridge estimators in this notebook.
RANDOM_STATE = 71

In [5]:
# Evaluation criterion
def smape(pred, actual):
    """
    Symmetric mean absolute percentage error (SMAPE), expressed in percent.

    pred: array-like of predictions (list, numpy array or pandas Series)
    actual: array-like of actual values, same length as pred

    Each observation contributes |pred - actual| / ((|pred| + |actual|) / 2).
    For a perfectly predicted zero observation, the smape is defined to be 0:
    such pairs are left out of the sum but still count in the divisor, which
    is the total number of observations.

    Returns 100 * sum(contributions) / number of observations.
    """

    # Coerce to arrays so plain Python sequences work and values are paired by position.
    pred = np.asarray(pred)
    actual = np.asarray(actual)

    # Pairs where both values are zero would produce 0/0; they are excluded from the sum.
    selector = ~((pred == 0) & (actual == 0))
    numerator = np.abs(pred - actual)
    denom = (np.abs(pred) + np.abs(actual)) / 2
    return 100*np.sum((numerator[selector] / denom[selector])) / pred.shape[0]

# Wrap smape as a scikit-learn scorer; greater_is_better=False marks it as a loss (lower is better).
# NOTE(review): neither GridSearchCV in this notebook is given this scorer — one uses
# 'neg_mean_absolute_error', the other scoring=None.
smape_scorer = make_scorer(smape, greater_is_better=False)

In [6]:
# Load the training data, the test data and the sample submission from ../input.
train = pd.read_csv("../input/train.csv.zip")
test = pd.read_csv("../input/test.csv.zip")
sample_submission = pd.read_csv("../input/sample_submission.csv.zip")

In [29]:
# Last date whose rows are used when computing the per-store/item mean-encoding averages.
TRAINING_END_DATE = '2017-12-31'

In [30]:
# Convert the date field.
# Assign the whole column rather than writing through .loc[:, 'date']: newer pandas
# treats the .loc form as an in-place write that can keep the column's original
# object dtype, whereas column assignment always replaces it with datetime64 values
# (which the DatetimeIndex-based features later in the notebook require).
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

In [41]:
# Stack train and test rows into one frame so the feature cells below run once over both;
# sort=False leaves the columns unsorted.
data = pd.concat([train, test], sort=False)   # test data has id column

In [42]:
def downcast_dtypes(df):
    '''
        Shrinks 64-bit numeric columns of the dataframe to 32 bits:

                `float64` columns become `float32`
                `int64`   columns become `int32`

        The dataframe is modified in place and is also returned.
    '''

    for wide, narrow in (("float64", np.float32), ("int64", np.int32)):
        # Labels of the columns currently stored with the wide dtype
        targets = [name for name in df if df[name].dtype == wide]
        df[targets] = df[targets].astype(narrow)

    return df

In [43]:
# Downcast the combined frame's float64/int64 columns to 32 bits.
data = downcast_dtypes(data)

In [44]:
# Lag featurizer
class Lag_Featurizer(TransformerMixin):
    """
    Stateless transformer that appends one lagged, aggregated column to a frame.

    transform() sums `value_col` per `index_col` group and per `freq` time
    bucket, shifts those sums by `shift` periods of `freq`, and left-joins the
    shifted values back onto the input rows under the name `output_col`.

    Parameters
    ----------
    index_col : list of str
        Columns identifying a series (e.g. ['store', 'item']). Must be a list;
        this is asserted in transform().
    time_col : str
        Name of the time axis used for bucketing and for the join.
    value_col : str
        Column that is summed within each group and time bucket.
    output_col : str
        Name given to the lagged column in the result.
    output_time_index : bool, default False
        If True the result is indexed by `time_col`; otherwise the merged
        frame is returned as produced by the join.
    shift : int, default 0
        Number of `freq` periods to shift the aggregated values by.
    freq : str, default '1D'
        Pandas offset alias used both as bucket size and as shift unit.
    """
    def __init__(self, index_col, time_col, value_col, output_col, output_time_index=False, shift=0, freq='1D'):
        self.index_col = index_col
        self.time_col = time_col
        self.value_col = value_col
        self.output_col = output_col
        self.output_time_index=output_time_index
        self.shift = shift
        self.freq = freq
        
    def fit(self, X, y=None):
        """
        Nothing is learned from the data.

        Returns self (and accepts an optional y) so that fit(...).transform(...),
        TransformerMixin.fit_transform and Pipeline.fit can chain on the result.
        """
        return self
    
    def transform(self, X):
        """
        Return X with the lagged `output_col` joined on.

        Assumes X is indexed by a DatetimeIndex named `time_col`: the Grouper
        below is given no key, and set_index(self.time_col) relies on that
        name after reset_index() — TODO confirm for other callers.
        """
        assert isinstance(self.index_col, list)
        
        # Bucket rows by group columns plus a time bucket of width `freq`.
        time_key = pd.Grouper(freq=self.freq)      
        time_index = self.index_col + [time_key]
        # One summed value per (group, time bucket), re-indexed by time.
        resampled = X.groupby(time_index)[self.value_col].sum().reset_index().set_index(self.time_col)
        # Move each group's timestamps by `shift` periods so that, for a positive shift,
        # a row dated d lines up with the sum from d - shift; then rename to output_col.
        shifted= resampled.groupby(self.index_col).shift(self.shift, freq=self.freq).drop(self.index_col, axis=1).reset_index().rename(columns={self.value_col:self.output_col})
        # Left join keeps every input row; rows without a lagged value get NaN.
        merged = pd.merge(X, shifted, how='left',left_on=self.index_col + [self.time_col], right_on=self.index_col + [self.time_col])
        if self.output_time_index:
            return merged.set_index(self.time_col)
        else:
            return merged

#### Add mean encodings for weekday, week and month

In [45]:
# Index by date so the calendar attributes and the date-based .loc slicing below can use the index.
data.set_index('date', inplace=True)

In [46]:
# Calendar features read from the date index; they are the grouping keys of the mean encodings.
# NOTE(review): DatetimeIndex.week works on the pandas 0.23 used here but is deprecated in
# later pandas releases (isocalendar().week) — revisit when upgrading.
data['weekday'] = data.index.weekday
data['week'] = data.index.week
data['month'] = data.index.month

In [47]:
# Mean sales per (store, item) within each calendar bucket, using only rows dated
# up to TRAINING_END_DATE.
train_period = data.loc[:TRAINING_END_DATE].reset_index()
sales_monthly_avg = (
    train_period.groupby(['store', 'item', 'month'])['sales']
    .mean()
    .reset_index(name='avg_monthly_sales')
)
sales_weekly_avg = (
    train_period.groupby(['store', 'item', 'week'])['sales']
    .mean()
    .reset_index(name='avg_weekly_sales')
)
sales_weekday_avg = (
    train_period.groupby(['store', 'item', 'weekday'])['sales']
    .mean()
    .reset_index(name='avg_weekday_sales')
)

In [49]:
# join to the data
# reset_index() is called before every merge: the first call turns the date index into a
# column, the second and third add the 'index' and 'level_0' helper columns that are
# dropped in the following cell.
for averages, bucket in (
    (sales_monthly_avg, 'month'),
    (sales_weekly_avg, 'week'),
    (sales_weekday_avg, 'weekday'),
):
    data = data.reset_index().merge(averages, how='left', on=['store', 'item', bucket])

In [51]:
# Remove the helper columns created by the repeated reset_index() calls in the merge cell.
data.drop(['level_0', 'index'], axis=1, inplace=True)

#### Add lag features
Store-item lag sales

store lag sales

item lag sales

lag periods (days): 98, 99, 100, 101, 102, 103, 104, 112, 140, 168, 252, 336

#### Add lag features

In [56]:
# The merges left 'date' as an ordinary column; put it back as the index, which the
# lag featurizers' time grouping and the date-range slicing below operate on.
data.set_index('date', inplace=True)

In [57]:
# The test period is 2018-01-01 to 2018-03-31 (90 days); every lag used here is at
# least 98 days.

# (suffix, shift in days) shared by all three aggregation levels
_LAG_SHIFTS = [
    ('3m', 98),
    ('3m_1', 99),
    ('3m_2', 100),
    ('3m_3', 101),
    ('3m_4', 102),
    ('3m_5', 103),
    ('3m_6', 104),
    ('4m', 112),
    ('5m', 140),
    ('6m', 168),
    ('9m', 252),
    ('1y', 336),
]

# (pipeline step prefix, output column prefix, grouping columns)
_LAG_LEVELS = [
    ('store_item_lag', 'sales', ['store', 'item']),   # lag store, item sales
    ('store_lag', 'store_sales', ['store']),          # lag store sales
    ('item_lag', 'item_sales', ['item']),             # lag item sales
]

lag_feature_pipeline = Pipeline([
    (
        '{0}_{1}'.format(step_prefix, suffix),
        Lag_Featurizer(
            index_col=list(group_cols),
            time_col='date',
            value_col='sales',
            output_col='{0}_{1}'.format(column_prefix, suffix),
            output_time_index=True,
            shift=days,
        ),
    )
    for step_prefix, column_prefix, group_cols in _LAG_LEVELS
    for suffix, days in _LAG_SHIFTS
])

In [58]:
# Apply every lag step in sequence; the pipeline is only ever used through transform()
# and is not fitted anywhere in this notebook.
%time data = lag_feature_pipeline.transform(data)

Wall time: 54.3 s


In [59]:
# Target first, then the lag columns of the three aggregation levels, then the mean encodings.
_lag_suffixes = ['3m', '3m_1', '3m_2', '3m_3', '3m_4', '3m_5', '3m_6',
                 '4m', '5m', '6m', '9m', '1y']

cols = (
    ['sales']
    + ['{0}_{1}'.format(prefix, suffix)
       for prefix in ('sales', 'store_sales', 'item_sales')
       for suffix in _lag_suffixes]
    + ['avg_monthly_sales', 'avg_weekly_sales', 'avg_weekday_sales']
)

In [70]:
# Modelling window: 2015-01 through 2017-12.
training = data.loc['2015-01':'2017-12',cols] # 2017-09 to 2017-12 as validation set
# Fold labels for PredefinedSplit: 0 marks rows of the single validation fold,
# -1 marks rows that are only ever used for training.
validation_split = np.where((training.index >= pd.Timestamp(2017,9,1)) & (training.index <= pd.Timestamp(2017,12,31)), 0, -1)

In [71]:
# Validation data: the rows labelled 0 in validation_split.
is_validation = validation_split == 0
X_validation = training.loc[is_validation, cols[1:]]
y_validation = training.loc[is_validation, 'sales']
# The first label previously read "training instances" although it counts validation rows.
print('Number of validation instances = {0:d}'.format(X_validation.shape[0]))
print('Number of features             = {0:d}'.format(X_validation.shape[1]))
print('Date range = {0} to {1}'.format(X_validation.index[0].strftime('%Y-%m-%d'), X_validation.index[-1].strftime('%Y-%m-%d')))

Number of training instances = 61000
Number of features           = 39
Date range = 2017-09-01 to 2017-12-31


In [72]:
# training matrices
# All rows of `training` are kept, including the validation period; the grid searches
# separate the two with PredefinedSplit(validation_split).
X_training = training[cols[1:]]
y_training = training['sales']
n_rows, n_features = X_training.shape
first_day, last_day = training.index[0], training.index[-1]
print('Number of training instances = {0:d}'.format(n_rows))
print('Number of features           = {0:d}'.format(n_features))
print('Date range = {0} to {1}'.format(first_day.strftime('%Y-%m-%d'), last_day.strftime('%Y-%m-%d')))

Number of training instances = 548000
Number of features           = 39
Date range = 2015-01-01 to 2017-12-31


In [73]:
# Test matrices: every row dated 2018-01 or later.
testing = data.loc['2018-01':,cols]
X_testing = testing[cols[1:]]
y_testing = testing['sales']
n_rows, n_features = X_testing.shape
first_day, last_day = testing.index[0], testing.index[-1]
print('Number of test instances = {0:d}'.format(n_rows))
print('Number of features       = {0:d}'.format(n_features))
print('Date range = {0} to {1}'.format(first_day.strftime('%Y-%m-%d'), last_day.strftime('%Y-%m-%d')))

Number of test instances = 45000
Number of features       = 39
Date range = 2018-01-01 to 2018-03-31


In [74]:
# Full-history matrices: every row dated up to 2017-12, with no lower date bound.
training_full = data.loc[:'2017-12',cols]
X_training_full = training_full[cols[1:]]
y_training_full = training_full['sales']
n_rows, n_features = X_training_full.shape
first_day, last_day = training_full.index[0], training_full.index[-1]
print('Number of training instances = {0:d}'.format(n_rows))
print('Number of features           = {0:d}'.format(n_features))
print('Date range = {0} to {1}'.format(first_day.strftime('%Y-%m-%d'), last_day.strftime('%Y-%m-%d')))

Number of training instances = 913000
Number of features           = 39
Date range = 2013-01-01 to 2017-12-31


In [75]:
# Lasso baseline and its search grid: seven alpha values, one per decade from 1e-3 to 1e3.
lasso_params = {'alpha': np.logspace(-3, 3, num=7)}
lasso = Lasso(max_iter=2000, random_state=RANDOM_STATE)

In [76]:
# Single-fold grid search: PredefinedSplit trains on rows labelled -1 and scores on rows
# labelled 0 (the validation period), ranking candidates by negated mean absolute error.
reg = GridSearchCV(lasso, lasso_params, scoring='neg_mean_absolute_error', n_jobs=8, cv=PredefinedSplit(validation_split), verbose=20)

In [77]:
# Run the search. The recorded repr shows refit=True, so the best candidate is refit on
# all rows passed here, validation period included.
reg.fit(X_training,y_training)

Fitting 1 folds for each of 7 candidates, totalling 7 fits


[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:   28.2s
[Parallel(n_jobs=8)]: Done   2 out of   7 | elapsed:   54.6s remaining:  2.3min
[Parallel(n_jobs=8)]: Done   3 out of   7 | elapsed:  1.3min remaining:  1.7min
[Parallel(n_jobs=8)]: Done   4 out of   7 | elapsed:  2.5min remaining:  1.8min
[Parallel(n_jobs=8)]: Done   5 out of   7 | elapsed:  3.2min remaining:  1.3min
[Parallel(n_jobs=8)]: Done   7 out of   7 | elapsed:  3.6min remaining:    0.0s
[Parallel(n_jobs=8)]: Done   7 out of   7 | elapsed:  3.6min finished


GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=2000,
   normalize=False, positive=False, precompute=False, random_state=71,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'alpha': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=20)

In [78]:
# best_score_ is the negated MAE on the validation fold, so flip the sign to print the MAE itself.
print('Best score = {0:.4f}; Best Parameter = {1}'.format(-reg.best_score_, reg.best_params_)) # the scorer is negated because scikit-learn treats a larger score as a better fit

Best score = 6.5443; Best Parameter = {'alpha': 0.001}


In [79]:
# SMAPE of the refit Lasso on the validation period.
# NOTE(review): reg was refit on X_training, which contains these validation rows, so this
# figure is measured on data the model has already seen.
pred_validation_lasso = reg.predict(X_validation)
print(smape(pred_validation_lasso, y_validation.values))

13.516891494863843


In [80]:
# Keep a direct handle on the refit best Lasso model.
reg_best = reg.best_estimator_

In [81]:
# Lasso predictions for the test period.
pred_reg_best = reg_best.predict(X_testing)

In [82]:
# Make predictions and a submission: 18.15; Use smape_scorer = 18.65; somehow best score = 19.39
# Ids come from sample_submission and the predictions are attached by position, so this
# assumes X_testing is in the same row order as the sample file — TODO confirm.
submission_lasso = pd.DataFrame({'Id': sample_submission.id, 'sales': pred_reg_best})
submission_lasso.to_csv('../output/submission_lasso.csv', index=False)

#### xgboost Model

In [83]:
# Base XGBoost regressor (tree booster, 'reg:linear' objective); the remaining
# hyper-parameters are supplied by the grid search.
reg_xgb = xgb.XGBRegressor(objective='reg:linear', booster='gbtree', n_jobs=8)

In [84]:
# Hyper-parameter grid: 3 x 9 x 1 x 2 x 5 x 1 x 1 = 270 candidates (matches the
# "270 candidates" line in the recorded fit output).
xgb_params = {
                'n_estimators': np.arange(50,200,50),
                'min_child_weight': np.arange(10,100,10), 
                'learning_rate': [0.1], 
                'colsample_bytree': np.arange(0.8,1.0,0.1), 
                'max_depth': np.arange(1,10,2),
                'subsample': [0.9], 
                'reg_lambda': [1.]
}

# Extra keyword arguments forwarded to XGBRegressor.fit: early stopping on MAE.
# NOTE(review): the eval_set is the same validation period that serves as the grid
# search's test fold, so early stopping and candidate scoring share the same rows.
fit_params={"early_stopping_rounds":50, 
            "eval_metric" : "mae", 
            "eval_set" : [[X_validation.values, y_validation.values]]}


In [88]:
# Same single-fold PredefinedSplit as the Lasso search.
# NOTE(review): scoring=None falls back to the estimator's own score method, whereas the
# Lasso search ranks by 'neg_mean_absolute_error' — confirm this difference is intended.
# NOTE(review): fit_params is passed through the constructor, as accepted by the
# scikit-learn 0.19 used here; later releases expect these in fit() — revisit when upgrading.
grid_xgb = GridSearchCV(reg_xgb, xgb_params, scoring=None, n_jobs=8, cv=PredefinedSplit(validation_split), verbose=50, fit_params=fit_params)

In [89]:
# Run the XGBoost search on plain numpy arrays; the recorded log shows this taking hours.
%time grid_xgb.fit(X_training.values,y_training.values)



Fitting 1 folds for each of 270 candidates, totalling 270 fits
[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:   37.1s
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   43.1s
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed:   50.0s
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed:  1.3min
[Parallel(n_jobs=8)]: Done   5 tasks      | elapsed:  1.5min
[Parallel(n_jobs=8)]: Done   6 tasks      | elapsed:  1.5min
[Parallel(n_jobs=8)]: Done   7 tasks      | elapsed:  1.5min
[Parallel(n_jobs=8)]: Done   8 tasks      | elapsed:  1.6min
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:  2.1min
[Parallel(n_jobs=8)]: Done  10 tasks      | elapsed:  2.7min
[Parallel(n_jobs=8)]: Done  11 tasks      | elapsed:  2.9min
[Parallel(n_jobs=8)]: Done  12 tasks      | elapsed:  2.9min
[Parallel(n_jobs=8)]: Done  13 tasks      | elapsed:  3.7min
[Parallel(n_jobs=8)]: Done  14 tasks      | elapsed:  4.0min
[Parallel(n_jobs=8)]: Done  15 tasks      | elapsed:  4.2min
[Parallel(n_jobs=8)]: 

[Parallel(n_jobs=8)]: Done 134 tasks      | elapsed: 153.0min
[Parallel(n_jobs=8)]: Done 135 tasks      | elapsed: 153.8min
[Parallel(n_jobs=8)]: Done 136 tasks      | elapsed: 154.1min
[Parallel(n_jobs=8)]: Done 137 tasks      | elapsed: 155.2min
[Parallel(n_jobs=8)]: Done 138 tasks      | elapsed: 155.9min
[Parallel(n_jobs=8)]: Done 139 tasks      | elapsed: 156.3min
[Parallel(n_jobs=8)]: Done 140 tasks      | elapsed: 157.1min
[Parallel(n_jobs=8)]: Done 141 tasks      | elapsed: 157.9min
[Parallel(n_jobs=8)]: Done 142 tasks      | elapsed: 158.1min
[Parallel(n_jobs=8)]: Done 143 tasks      | elapsed: 159.1min
[Parallel(n_jobs=8)]: Done 144 tasks      | elapsed: 159.3min
[Parallel(n_jobs=8)]: Done 145 tasks      | elapsed: 160.1min
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed: 160.2min
[Parallel(n_jobs=8)]: Done 147 tasks      | elapsed: 161.2min
[Parallel(n_jobs=8)]: Done 148 tasks      | elapsed: 161.3min
[Parallel(n_jobs=8)]: Done 149 tasks      | elapsed: 161.5min
[Paralle

[12]	validation_0-mae:15.2923
[13]	validation_0-mae:13.973
[14]	validation_0-mae:12.8128
[15]	validation_0-mae:11.7837
[16]	validation_0-mae:10.8893
[17]	validation_0-mae:10.117
[18]	validation_0-mae:9.45354
[19]	validation_0-mae:8.88722
[20]	validation_0-mae:8.40548
[21]	validation_0-mae:7.99694
[22]	validation_0-mae:7.65801
[23]	validation_0-mae:7.3671
[24]	validation_0-mae:7.11682
[25]	validation_0-mae:6.90746
[26]	validation_0-mae:6.73296
[27]	validation_0-mae:6.59137
[28]	validation_0-mae:6.46491
[29]	validation_0-mae:6.36373
[30]	validation_0-mae:6.27198
[31]	validation_0-mae:6.196
[32]	validation_0-mae:6.13186
[33]	validation_0-mae:6.07471
[34]	validation_0-mae:6.02799
[35]	validation_0-mae:5.99042
[36]	validation_0-mae:5.95782
[37]	validation_0-mae:5.92792
[38]	validation_0-mae:5.9025
[39]	validation_0-mae:5.88025
[40]	validation_0-mae:5.8609
[41]	validation_0-mae:5.8447
[42]	validation_0-mae:5.83024
[43]	validation_0-mae:5.81706
[44]	validation_0-mae:5.80586
[45]	validation_0-

GridSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
       error_score='raise',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=8, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params={'early_stopping_rounds': 50, 'eval_metric': 'mae', 'eval_set': [[array([[24.     , 22.     , ..., 21.97143, 21.01533],
       [35.     , 24.     , ..., 21.97143, 22.97318],
       ...,
       [97.     , 97.     , ..., 55.97143, 82.85441],
       [99.     , 97.     , ..., 55.97143, 87.98467]], dtype=float32), array([38., 26., ..., 62., 82.], dtype=float32)]]},
       iid=True, n_jobs=8,
       param_grid={'n_estimators': array([ 50, 100, 150]), 'min_child_weight': array([10, 2

In [91]:
# SMAPE of the refit best XGBoost model on the validation period.
# NOTE(review): the search was fit on X_training, which contains the validation rows, so
# with the default refit this figure is measured on data the model has already seen.
xgb_best = grid_xgb.best_estimator_
pred_validation_xgb = xgb_best.predict(X_validation.values)
print(smape(pred_validation_xgb, y_validation.values))

11.48302222079918


In [92]:
# PB score = 14.09
# Predict the test period with the best XGBoost model and write the submission file;
# predictions are attached to sample_submission ids by position.
pred_xgb_best = xgb_best.predict(X_testing.values)
submission_xgb = pd.DataFrame({'Id': sample_submission.id, 'sales': pred_xgb_best})
submission_xgb.to_csv('../output/submission_xgb.csv', index=False)

In [106]:
# Feature importance of the best XGBoost model, largest weight first; saved and displayed.
feature_importance = (
    pd.DataFrame({'feature': cols[1:]})
    .assign(weight=xgb_best.feature_importances_)
    .sort_values('weight', ascending=False)
)
feature_importance.to_csv('../output/feature_importance.csv', index=False)
feature_importance

Unnamed: 0,feature,weight
23,store_sales_1y,0.073849
37,avg_weekly_sales,0.050204
35,item_sales_1y,0.049792
38,avg_weekday_sales,0.047701
36,avg_monthly_sales,0.045926
12,store_sales_3m,0.042883
22,store_sales_9m,0.03778
34,item_sales_9m,0.036164
19,store_sales_4m,0.031473
31,item_sales_4m,0.030237


#### Xgboost with select features

In [None]:
# Placeholder cell: it only re-displays the importance table; no model on a reduced
# feature set is trained in this section.
feature_importance

#### Stacking
Using ridge regression with validation data

In [93]:
# Stacking inputs: one column per base model (Lasso first, XGBoost second), one row per
# validation observation; the target is the actual validation sales.
y_stack = y_validation.values
X_stack = np.column_stack((pred_validation_lasso, pred_validation_xgb))
print(X_stack.shape)

(61000, 2)


In [94]:
# Meta-model: ridge regression from the two base-model predictions to actual sales,
# fitted on the validation period.
reg_stack = Ridge(random_state=RANDOM_STATE, alpha=0.1)
reg_stack.fit(X_stack, y_stack)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=71, solver='auto', tol=0.001)

In [95]:
# SMAPE of the stacked model on the validation period.
# NOTE(review): reg_stack is evaluated on the same X_stack rows it was fitted on, so this
# is an in-sample figure.
pred_validation_stack = reg_stack.predict(X_stack)
print(smape(pred_validation_stack, y_validation.values))

11.458029038385781


In [96]:
# Prediction using the combined predictor
# The column order (Lasso, then XGBoost) must match the order used when fitting reg_stack.
base_test_predictions = (pred_reg_best, pred_xgb_best)
X_stack_test = np.stack(base_test_predictions, axis=-1)
pred_test_stack = reg_stack.predict(X_stack_test)
submission_stack = pd.DataFrame({'Id': sample_submission.id, 'sales': pred_test_stack})
submission_stack.to_csv('../output/submission_stack.csv', index=False)