In [180]:
import sys
import os
import pathlib
import pandas as pd
import numpy as np
# import sklearn

In [181]:
ROOT = pathlib.Path().absolute().parent.parent
RAW_DATA_PATH = ROOT / 'data' / 'raw'
INTERIM_DATA_PATH = ROOT / 'data' / 'interim'
PROCESSED_DATA_PATH = ROOT / 'data' / 'processed'
DAYS_PRED = 28

# endure this project is in the path
sys.path.insert(0, ROOT.absolute().as_posix())
from src.data.process_data import reduce_memory_usage

In [182]:
# we will work with just one department in one store
df = \
(pd
 .read_parquet(PROCESSED_DATA_PATH / 'train_validation' /'FOODS_3_CA_1.parquet')
 .pipe(reduce_memory_usage)
 .astype({c:'category' for c in ['wday','month','year']})
 .drop(columns=['id','dept_id','cat_id','store_id',
                'state_id','date','wday','d']))

# get rid of categories from aother parts of the dataset
for col, dtype in zip(df.columns, df.dtypes):
    if str(dtype) == 'category':
        df[col] = df[col].cat.remove_unused_categories()

# I want to undo get_dummies on event_names and event_types
# for simplicity only consider the first event if there is more than 1
event_name = \
(df
 .filter(like='event_name_')
 .assign(event_name_none=lambda df: ~df.any(axis=1))
 .stack()
 .reset_index()
 .rename(columns={'level_0':'index','level_1':'event_name',0:'is_event'})
 .query("""is_event == True""")
 .groupby('index', as_index=True)
 .first()
 .drop(columns=['is_event']))

event_type = \
(df
 .filter(like='event_type_')
 .assign(event_type_none=lambda df: ~df.any(axis=1))
 .stack()
 .reset_index()
 .rename(columns={'level_0':'index','level_1':'event_type',0:'is_event'})
 .query("""is_event == True""")
 .groupby('index', as_index=True)
 .first()
 .drop(columns=['is_event']))

# join in these 'un-dummied' features and drop the dummies
df = \
(df
 .join(event_type)
 .join(event_name)
 .filter(regex=r'^((?!event_name_).)*$')
 .filter(regex=r'^((?!event_type_).)*$')
#  .replace(
#      {'event_type':{'event_type_none':'none'},
#       'event_name':{'event_name_none':'none'}})
 .astype({c:'category' for c in ['event_type','event_name',
                                 'quarter','weekofyear',
                                 'day','dayofyear']}))

# separate out a test set
train = df[df.part=='train'].drop(columns=['part'])
train_labels = train.pop('demand')

test = df[df.part=='validation'].drop(columns=['part'])
test_labels = test.pop('demand')
del df, event_name, event_type

In [90]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1257138 entries, 0 to 1280153
Data columns (total 44 columns):
 #   Column                           Non-Null Count    Dtype   
---  ------                           --------------    -----   
 0   item_id                          1257138 non-null  category
 1   weekday                          1257138 non-null  category
 2   month                            1257138 non-null  category
 3   year                             1257138 non-null  category
 4   is_weekend                       1257138 non-null  bool    
 5   event                            1257138 non-null  bool    
 6   snap                             1257138 non-null  bool    
 7   sell_price_cent                  1257138 non-null  uint16  
 8   shift_t28                        1257138 non-null  uint16  
 9   shift_t29                        1257138 non-null  uint16  
 10  shift_t30                        1257138 non-null  uint16  
 11  rolling_std_t7                   1257

In [183]:
train.groupby('event_type').size()

event_type
event_type_cultural       24680
event_type_national       33819
event_type_none         1156336
event_type_religious      32820
event_type_sporting        9483
dtype: int64

In [184]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# select features
to_drop = ['day','weekofyear','dayofyear','item_id','quarter']

num_attribs = [col for col, dtype in zip(train.columns, train.dtypes)
               if ('float' in str(dtype) or 'int' in str(dtype))
               and col not in to_drop]

cat_attribs = [col for col, dtype in zip(train.columns, train.dtypes)
               if str(dtype)=='category'
               and col not in to_drop]

# define pipelines
num_pipeline = Pipeline([
    ('std_scalar', StandardScaler())])

cat_pipeline = Pipeline([
    ('one_hot_enc', OneHotEncoder())])

# combine pipelines
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
    ('drop', 'drop', to_drop)],
    remainder='passthrough')

train_prepared = full_pipeline.fit_transform(train, train_labels)
test_prepared = full_pipeline.transform(test)

In [189]:
# try permutation importance as an alternative of feature importance

# I need to split out a validation set for CV
# need to set up GridSearch for a selection of models and hyperparams
# the grid search should include feature selection

# experiment witrh XGBoost, CatBoost, LightGBM

# package all this into a class for more convenient training
# when a good model is chosen using CV train on the full dataset and predict

# for completion, build backwards so I start with the raw data

# finally, recreate using dask, then GPU

from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

# fit
m = SGDRegressor(random_state=42, verbose=1)
m.fit(train_prepared, train_labels)

# scores
train_preds = m.predict(train_prepared)
mse = mean_squared_error(train_labels, m.predict(train_prepared))
print(f'train rmse: {mse**0.5}')

test_preds = m.predict(test_prepared)
mse = mean_squared_error(test_labels, m.predict(test_prepared))
print(f'test rmse: {mse**0.5}')

-- Epoch 1
Norm: 4.76, NNZs: 95, Bias: 1.276247, T: 1257138, Avg. loss: 13.818281
Total training time: 0.75 seconds.
-- Epoch 2
Norm: 4.87, NNZs: 95, Bias: 1.318003, T: 2514276, Avg. loss: 12.218457
Total training time: 1.49 seconds.
-- Epoch 3
Norm: 4.99, NNZs: 95, Bias: 1.385635, T: 3771414, Avg. loss: 11.965164
Total training time: 2.24 seconds.
-- Epoch 4
Norm: 4.49, NNZs: 95, Bias: 1.470842, T: 5028552, Avg. loss: 11.995972
Total training time: 2.98 seconds.
-- Epoch 5
Norm: 4.85, NNZs: 95, Bias: 1.378741, T: 6285690, Avg. loss: 11.863051
Total training time: 3.73 seconds.
-- Epoch 6
Norm: 4.88, NNZs: 95, Bias: 1.429363, T: 7542828, Avg. loss: 11.714852
Total training time: 4.48 seconds.
-- Epoch 7
Norm: 4.90, NNZs: 95, Bias: 1.405134, T: 8799966, Avg. loss: 11.709570
Total training time: 5.22 seconds.
-- Epoch 8
Norm: 4.85, NNZs: 95, Bias: 1.400551, T: 10057104, Avg. loss: 11.666442
Total training time: 5.97 seconds.
-- Epoch 9
Norm: 5.01, NNZs: 95, Bias: 1.469039, T: 11314242, A

In [228]:
from sklearn.inspection import permutation_importance
import plotly.express as px

result = permutation_importance(
    m, train_prepared, train_labels, n_repeats=5,
    random_state=42, n_jobs=-1)

# sorted_idx = result.importances_mean.argsort()
result.importances_mean.sort()
# to_plot = pd.DataFrame(
#     result.importances[sorted_idx].T,
# #     columns=test.columns[sorted_idx]
# ).melt()

# px.box(to_plot, y='value', color='variable')

In [229]:
result.importances_mean

array([-9.95512861e-05, -8.26393101e-07, -2.46331754e-07, -1.44200609e-07,
       -4.26519501e-08, -2.44073391e-08, -1.79787167e-08,  1.57038993e-09,
        3.43931643e-08,  4.37970619e-08,  2.64177412e-07,  4.78101706e-07,
        4.80736763e-07,  7.93511370e-07,  8.17189930e-07,  1.11375458e-06,
        1.14608368e-06,  1.43053383e-06,  1.69891973e-06,  1.93078520e-06,
        2.13799012e-06,  2.16312937e-06,  2.85643631e-06,  3.37419511e-06,
        3.69594740e-06,  4.87860717e-06,  5.11642148e-06,  6.97536753e-06,
        7.06619659e-06,  7.86721484e-06,  8.64868904e-06,  8.77004068e-06,
        8.95686502e-06,  8.98378367e-06,  1.04406825e-05,  1.18264714e-05,
        1.56378462e-05,  1.57169063e-05,  1.58189364e-05,  1.68894597e-05,
        1.89793775e-05,  1.89793775e-05,  2.07666939e-05,  2.54774114e-05,
        3.09247891e-05,  3.15750376e-05,  3.30662231e-05,  3.38298823e-05,
        3.79562110e-05,  5.11035417e-05,  5.64881785e-05,  5.71239552e-05,
        6.65541103e-05,  