Read the combined dataset of sales, sell prices, and calendar information. The sell price is scaled and the categorical variables are one-hot encoded.

Then train a linear model and an Extra Trees Regressor to predict the sales of a product given the store, sell price, and calendar information. The last 28 days are held out for validation.

# Inputs

In [2]:
%reload_ext autoreload
%autoreload 2

import sys
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# The project root is the parent of the current working directory.
ROOT = pathlib.Path.cwd().parent
RAW_DATA_PATH = ROOT.joinpath('data', 'raw')
PROCESSED_DATA_PATH = ROOT.joinpath('data', 'processed')

# ensure this project is in the path (ROOT is already absolute)
sys.path.insert(0, ROOT.as_posix())

# Data Pipeline

In [3]:
# --- Load the combined dataset ------------------------------------------
print('loading dataset')
df = pd.read_feather(PROCESSED_DATA_PATH / 'combined_dataset.feather')

# Optional sub-sampling, currently disabled:
# just select 1 in 100 products
# df = df[df.item_id.str.match(r'.*(1|3|5|7|9)$')]
# df = df[df.item_id.str.match(r'.*11$')]
# just use a sample of the dataset for now
# sample_df = df.copy()#.sample(frac=0.001, random_state=42)

# --- Time-based split ---------------------------------------------------
# Rows dated after the cut-off (latest date minus 28 days) are held out
# for validation; everything up to and including the cut-off is training.
print('separating train and valid data')
training_end_date = df['date'].max() - pd.DateOffset(28)
train_df = df.loc[df['date'] <= training_end_date].copy()
print('train_df created')
valid_df = df.loc[df['date'] > training_end_date].copy()
print('valid_df created')

# both splits are independent copies, so the full frame can be released
del df

# --- Separate the target column ('sales') from the features -------------
print('separating target and feature columns')
y_train = train_df['sales'].copy()
X_train = train_df.drop(columns='sales')
print('train done')

del train_df

y_valid = valid_df['sales'].copy()
X_valid = valid_df.drop(columns='sales')
print('valid done')

del valid_df

# --- Column roles for the transformer -----------------------------------
num_attribs = ['sell_price_cent']
cat_attribs = ['dept_id', 'store_id', 'weekday', 'month', 'year', 'item_id']
# every 'event_name*' column is dropped, together with a few fixed columns
# ('item_id' is deliberately kept as a categorical feature)
to_drop = [col for col in X_train.columns if col.startswith('event_name')]
to_drop.extend(['cat_id', 'state_id', 'date', 'event'])
# Columns not listed above. The transformer below does not reference this
# list; remainder='passthrough' forwards those columns on its own.
to_pass = list(set(X_train.columns) - set(num_attribs + cat_attribs + to_drop))

full_pipeline = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_attribs),
        ('cat', OneHotEncoder(), cat_attribs),
        ('drop', 'drop', to_drop),
    ],
    remainder='passthrough',
)

# --- Fit on the training features, then transform both splits -----------
print('transforming features')
# separate fit and transform so we can parallelize
full_pipeline.fit(X_train)
print('transformation fitted')

X_train_prepared = full_pipeline.transform(X_train)
print('x train done')
del X_train  # raw training features are no longer needed

X_valid_prepared = full_pipeline.transform(X_valid)
print('x valid done')
del X_valid  # raw validation features are no longer needed

loading dataset
separating train and valid data
train_df created
valid_df created
separating target and feature columns
train done
valid done
transforming features
x train done
x valid done


<45174237x3097 sparse matrix of type '<class 'numpy.float64'>'
	with 334822378 stored elements in Compressed Sparse Row format>

# Simple Models

In [23]:
# Baseline model: ordinary least squares on the prepared feature matrix.
lin_reg = LinearRegression(n_jobs=-2)
lin_reg.fit(X_train_prepared, y_train)

# calculate rmse (not weighted)
# sklearn metrics take (y_true, y_pred); MSE is symmetric, so the values are
# unchanged, but the conventional order stays correct if sample weights or
# an asymmetric metric are introduced later.
print('train', mean_squared_error(y_train, lin_reg.predict(X_train_prepared))**0.5)
print('valid', mean_squared_error(y_valid, lin_reg.predict(X_valid_prepared))**0.5)

train 4.19061487111886
valid 3.487933995902688

train 1.3290738854407762
valid 1.276752570298425


In [5]:
# Extra Trees ensemble: 500 shallow trees (max_depth=4), each fitted on a
# bootstrap sample of 1,000,000 rows and considering 4 features per split.
# NOTE(review): the variable is named `rf` although the estimator is an
# ExtraTreesRegressor; the name is kept so other cells keep working.
# NOTE(review): no random_state is passed, so results can differ between runs.
rf = ExtraTreesRegressor(n_jobs=-3, max_features=4, bootstrap=True, max_samples=1_000_000, max_depth=4, n_estimators=500)
rf.fit(X_train_prepared, y_train)

# calculate rmse (not weighted)
# sklearn metrics take (y_true, y_pred); MSE is symmetric, so the values are
# unchanged, but the conventional order stays correct if sample weights or
# an asymmetric metric are introduced later.
print('train', mean_squared_error(y_train, rf.predict(X_train_prepared))**0.5)
print('valid', mean_squared_error(y_valid, rf.predict(X_valid_prepared))**0.5)

train 4.314117283051862
valid 3.585445652764168
