In [2]:
# imports for notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [3]:
# read in the dates and their recorded unique characteristics
dates = pd.read_csv('C:/Users/TWood/Downloads/m5-forecasting-accuracy/calendar.csv', parse_dates=[0])

In [4]:
# take a look at the dates df
dates

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2016-06-15,11620,Wednesday,5,6,2016,d_1965,,,,,0,1,1
1965,2016-06-16,11620,Thursday,6,6,2016,d_1966,,,,,0,0,0
1966,2016-06-17,11620,Friday,7,6,2016,d_1967,,,,,0,0,0
1967,2016-06-18,11621,Saturday,1,6,2016,d_1968,,,,,0,0,0


In [5]:
# most days have no event, replace NaN with "None"
dates.replace(np.NaN, 'None', inplace=True)

In [6]:
# read in the training data
val = pd.read_csv('C:/Users/TWood/Downloads/m5-forecasting-accuracy/sales_train_validation.csv')

In [7]:
val

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,2,0,0,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,2,1,0,2,0,1,0,0,1,0
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,1,0,0,1,0,3,1,3


In [8]:
CA1_F1 = val[(val['store_id'] == 'CA_1')&(val['dept_id'] == 'FOODS_1')]

In [9]:
# reducing the unnecessary columns to make the melt faster
CA1_F1.drop(columns=['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [10]:
items = CA1_F1['id'].unique()

In [11]:
CA1_F1_ts = CA1_F1.melt(id_vars=['id'], var_name='d', value_name='sales')

In [12]:
prices = pd.read_csv('C:/Users/TWood/Downloads/m5-forecasting-accuracy/sell_prices.csv')

In [13]:
prices['id'] = prices['item_id'] + '_' + prices['store_id'] + '_validation'

In [14]:
# most expensive item is $107.32
prices['sell_price'].max()

107.32

In [None]:
prices

In [17]:
# the sell prices are present for all 30490 items for the final week, matches length of val dataframe
(prices['wm_yr_wk'] == 11621).sum()

30490

In [18]:
CA1_F1_price = CA1_F1_ts.merge(dates, on='d').merge(prices.drop(columns=['store_id', 'item_id']), on=['id', 'wm_yr_wk'], how='left')

In [20]:
CA1_F1_price

Unnamed: 0,id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,FOODS_1_001_CA_1_validation,d_1,3,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0,2.00
1,FOODS_1_002_CA_1_validation,d_1,0,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0,7.88
2,FOODS_1_003_CA_1_validation,d_1,0,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0,2.88
3,FOODS_1_004_CA_1_validation,d_1,0,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0,
4,FOODS_1_005_CA_1_validation,d_1,3,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0,2.94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413203,FOODS_1_215_CA_1_validation,d_1913,0,2016-04-24,11613,Sunday,2,4,2016,,,,,0,0,0,1.84
413204,FOODS_1_216_CA_1_validation,d_1913,1,2016-04-24,11613,Sunday,2,4,2016,,,,,0,0,0,5.28
413205,FOODS_1_217_CA_1_validation,d_1913,3,2016-04-24,11613,Sunday,2,4,2016,,,,,0,0,0,3.54
413206,FOODS_1_218_CA_1_validation,d_1913,6,2016-04-24,11613,Sunday,2,4,2016,,,,,0,0,0,0.98


In [28]:
# 61292 missing price values
CA1_F1_price['sell_price'].isna().sum()

61292

In [29]:
# Every single time the price is missing, there are no sales
((CA1_F1_price['sell_price'].isna())&(CA1_F1_price['sales'] == 0)).sum()

61292

In [None]:
CA1

In [45]:
CA1_F1_price

KeyError: "['id' 'd' 'sales' 'date' 'wm_yr_wk' 'weekday' 'wday' 'month' 'year'\n 'event_name_1' 'event_type_1' 'event_name_2' 'event_type_2' 'snap_CA'\n 'snap_TX' 'snap_WI' 'sell_price'] not found in axis"

In [None]:
CA1_F1_price

In [None]:
mlb = MultiLabelBinarizer()

In [None]:
ohe = OneHotEncoder()
ohe.fit_transform()

In [None]:
preds = []
trues = []
for item in items:
    ts = CA1_F1_prices[CA1_F1_prices['id'] == item].drop(columns=['d', 'id'])
    train = ts['2014-03-28':'2016-03-28']
    test = ts['2016-03-28':]
    X_train = train.drop('sales', axis=1)
    X_test = test.drop('sales', axis=1)
    y_train = train['sales']
    y_test = test['sales']
    lgbm = LGBMRegressor().fit(X_train, y_train)
    preds.append(lgbm.predict(X_test))
    trues.append(y_test)

In [None]:
CA1_F1_ts = CA1_F1_ts.merge(dates, how='inner', on='d')

In [None]:
CA1_F1_ts['lag_1'] = CA1_F1_ts['sales'].shift(periods=216*1)

In [None]:
CA1_F1_ts['lag_2'] = CA1_F1_ts['sales'].shift(periods=216*2)

In [None]:
CA1_F1_ts['lag_3'] = CA1_F1_ts['sales'].shift(periods=216*3)

In [None]:
CA1_F1_ts['lag_4'] = CA1_F1_ts['sales'].shift(periods=216*4)

In [None]:
CA1_F1_ts['lag_5'] = CA1_F1_ts['sales'].shift(periods=216*5)

In [None]:
CA1_F1_ts['lag_6'] = CA1_F1_ts['sales'].shift(periods=216*6)

In [None]:
CA1_F1_ts['lag_7'] = CA1_F1_ts['sales'].shift(periods=216*7)

In [None]:
CA1_F1_ts['lag_14'] = CA1_F1_ts['sales'].shift(periods=216*14)

In [None]:
CA1_F1_ts['lag_21'] = CA1_F1_ts['sales'].shift(periods=216*21)

In [None]:
CA1_F1_ts['lag_28'] = CA1_F1_ts['sales'].shift(periods=216*28)

In [None]:
CA1_F1_ts.set_index('date', inplace=True)

In [None]:
items[0]

In [None]:
for col in ['event_name_1', 'event_name_2']:
    label = LabelEncoder().fit(CA1_F1_price[col])
    CA1_F1_price[col] = label.transform(CA1_F1_price[col])

In [None]:
CA1_F1_ts

In [None]:
preds = []
trues = []
for item in items:
    ts = CA1_F1_ts[CA1_F1_ts['id'] == item].drop(columns=['d', 'id'])
    train = ts['2014-03-28':'2016-03-28']
    test = ts['2016-03-28':]
    X_train = train.drop('sales', axis=1)
    X_test = test.drop('sales', axis=1)
    y_train = train['sales']
    y_test = test['sales']
    lgbm = LGBMRegressor().fit(X_train, y_train)
    forecast = []
    for X in X_test.index:
        pred = lgbm.predict(X_test[X])
        forecast.append(pred)
        
    preds.append(forecast)
    trues.append(y_test)

In [None]:
preds

In [None]:
trues

In [None]:
RMSEs = []
for i in range(216):
    RMSEs.append(mean_squared_error(trues[i], preds[i], squared=False))

In [None]:
np.mean(RMSEs)

In [None]:
CA1_F1_ts

In [None]:
trial.drop(columns=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], inplace=True)

In [None]:
trial_ts = trial.melt(var_name='d', value_name='sales')

In [None]:
trial_ts = trial_ts.merge(dates, how='inner', on='d')

In [None]:
trial_ts.set_index('date', inplace=True)

In [None]:
trial_ts['lag_1'] = trial_ts['sales'].shift(periods=216*1)

In [None]:
trial_ts['lag_2'] = trial_ts['sales'].shift(periods=216*2)

In [None]:
trial_ts['lag_3'] = trial_ts['sales'].shift(periods=216*3)

In [None]:
trial_ts['lag_4'] = trial_ts['sales'].shift(periods=216*4)

In [None]:
trial_ts['lag_5'] = trial_ts['sales'].shift(periods=216*5)

In [None]:
trial_ts['lag_6'] = trial_ts['sales'].shift(periods=216*6)

In [None]:
trial_ts['lag_7'] = trial_ts['sales'].shift(periods=216*7)

In [None]:
trial_ts['lag_14'] = trial_ts['sales'].shift(periods=216*14)

In [None]:
trial_ts['lag_21'] = trial_ts['sales'].shift(periods=216*21)

In [None]:
trial_ts['lag_28'] = trial_ts['sales'].shift(periods=216*28)

In [None]:
train = trial_ts['2014-03-28':'2016-03-28']

In [None]:
test = trial_ts['2016-03-28':]

In [None]:
X_train = train.drop(['sales', 'd', 'wm_yr_wk', 'event_name_1', 'event_name_2'], axis=1)
X_test = test.drop(['sales', 'd', 'wm_yr_wk', 'event_name_1', 'event_name_2'], axis=1)
y_train = train['sales']
y_test = test['sales']

In [None]:
y_test

In [None]:
label = LabelEncoder()
X_train['id'] = label.fit_transform(X_train['id'])
X_test['id'] = label.transform(X_test['id'])

In [None]:
trial_ts

In [None]:
X_train

In [None]:
rf2 = RandomForestRegressor(max_depth=20)
rf2.fit(X_train, y_train)

In [None]:
rf2.score(X_train, y_train)

In [None]:
rf2.score(X_test, y_test)

In [None]:
rf2_preds = rf2.predict(X_test)

In [None]:
mean_squared_error(y_test, rf2_preds)

In [None]:
val['dept_id'].unique()

In [None]:
CA1_f1 = val[(val['store_id'] == 'CA_1')&(val['dept_id'] == 'FOODS_1')]

In [None]:
CA1_f1_melt = CA1_f1.drop(columns=['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])

In [None]:
CA1_f1_ts = CA1_f1_melt.melt(id_vars=['id'], var_name='d', value_name='sales')

In [None]:
CA1_f1_ts

In [None]:
CA1_f1_ts = CA1_f1_ts.merge(dates, how='inner', on='d')

In [None]:
CA1_f1_ts

In [None]:
CA1_f1_ts.loc['2011-01-30']

In [None]:
CA1_f1_ts.set_index('date', inplace=True)

In [None]:
CA1_f1_ts['lag_1'] = CA1_f1_ts['sales'].shift(periods=216)

In [None]:
CA1_f1_ts['lag_2'] = CA1_f1_ts['sales'].shift(periods=(216*2))

In [None]:
CA1_f1_ts['lag_3'] = CA1_f1_ts['sales'].shift(periods=(216*3))

In [None]:
CA1_f1_ts['lag_4'] = CA1_f1_ts['sales'].shift(periods=(216*4))

In [None]:
CA1_f1_ts['lag_5'] = CA1_f1_ts['sales'].shift(periods=(216*5))

In [None]:
CA1_f1_ts['lag_6'] = CA1_f1_ts['sales'].shift(periods=(216*6))

In [None]:
CA1_f1_ts['lag_7'] = CA1_f1_ts['sales'].shift(periods=(216*7))

In [None]:
CA1_f1_ts['lag_14'] = CA1_f1_ts['sales'].shift(periods=(216*14))

In [None]:
CA1_f1_ts['lag_21'] = CA1_f1_ts['sales'].shift(periods=(216*21))

In [None]:
CA1_f1_ts['lag_28'] = CA1_f1_ts['sales'].shift(periods=(216*28))

In [None]:
CA1_f1_ts

In [None]:
train = CA1_f1_ts['2014-03-28':'2016-03-28']

In [None]:
test = CA1_f1_ts['2016-03-28':]

In [None]:
# item 7, 100, and 165 are missing, thus 216 per day
test['id'].unique()

In [None]:
train

In [None]:
test

In [None]:
X_train = train.drop(['sales', 'd', 'wm_yr_wk', 'event_name_1', 'event_name_2'], axis=1)
X_test = test.drop(['sales', 'd', 'wm_yr_wk', 'event_name_1', 'event_name_2'], axis=1)
y_train = train['sales']
y_test = test['sales']

In [None]:
y_test

In [None]:
label = LabelEncoder()
X_train['id'] = label.fit_transform(X_train['id'])
X_test['id'] = label.transform(X_test['id'])

In [None]:
rf1 = RandomForestRegressor(max_depth=20)
rf1.fit(X_train, y_train)

In [None]:
rf1.score(X_train, y_train)

In [None]:
rf1.score(X_test, y_test)

In [None]:
rf1_preds = rf1.predict(X_test)

In [None]:
rf1_preds = pd.Series(rf1_preds).set_axis(y_test.index)

In [None]:
fig, ax = plt.subplots()
ax.plot(rf1_preds)
ax.plot(y_test)

In [None]:
mean_squared_error(rf1_preds, y_test, squared=False)

In [None]:
lgbm_1 = LGBMRegressor()
lgbm_1.fit(X_train, y_train)

In [None]:
lgbm_1.score(X_train, y_train)

In [None]:
lgbm_1.score(X_test, y_test)

In [None]:
xgb_1 = XGBRegressor()
xgb_1.fit(X_train, y_train)

In [None]:
xgb_1.score(X_train, y_train)

In [None]:
xgb_1.score(X_test, y_test)

In [None]:
CA1_join = CA_1[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']]

In [None]:
#to_melt = train.drop(columns=['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])

In [None]:
#train_ts = to_melt.melt(id_vars=['id'], var_name='d', value_name='sales')

In [None]:
#to_join = train[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']]

In [None]:
train_ts = train_ts.merge(to_join, on='id', how='inner')

In [None]:
train_ts.to_csv('./timeseries.csv')

In [None]:
train_ts.merge(dates, on='d', how='inner')

In [None]:
train_ts['value'].max()

In [None]:
train_ts = pd.melt(train, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='day')

In [None]:
train_ts

In [None]:
train = pd.read_csv('C:/Users/TWood/Downloads/m5-forecasting-accuracy/sales_train_evaluation.csv')

In [None]:
val_ts = pd.melt(val.head(), id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='d')

In [None]:
val_ts = val_ts.merge(dates, on='d')

In [None]:
val_ts.set_index('date')

In [None]:
df.describe()

In [None]:
pd.read_csv('C:/Users/TWood/Downloads/m5-forecasting-accuracy/sample_submission.csv')

In [None]:
pd.read_csv('C:/Users/TWood/Downloads/m5-forecasting-accuracy/sell_prices.csv')