In [1]:
# imports for notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [2]:
# read in the dates and their recorded unique characteristics
dates = pd.read_csv('C:/Users/TWood/Downloads/m5-forecasting-accuracy/calendar.csv', parse_dates=[0])

In [3]:
# take a look at the dates df
dates

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2016-06-15,11620,Wednesday,5,6,2016,d_1965,,,,,0,1,1
1965,2016-06-16,11620,Thursday,6,6,2016,d_1966,,,,,0,0,0
1966,2016-06-17,11620,Friday,7,6,2016,d_1967,,,,,0,0,0
1967,2016-06-18,11621,Saturday,1,6,2016,d_1968,,,,,0,0,0


In [4]:
# look at the data types, only event columns have missing values
dates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          1969 non-null   datetime64[ns]
 1   wm_yr_wk      1969 non-null   int64         
 2   weekday       1969 non-null   object        
 3   wday          1969 non-null   int64         
 4   month         1969 non-null   int64         
 5   year          1969 non-null   int64         
 6   d             1969 non-null   object        
 7   event_name_1  162 non-null    object        
 8   event_type_1  162 non-null    object        
 9   event_name_2  5 non-null      object        
 10  event_type_2  5 non-null      object        
 11  snap_CA       1969 non-null   int64         
 12  snap_TX       1969 non-null   int64         
 13  snap_WI       1969 non-null   int64         
dtypes: datetime64[ns](1), int64(7), object(6)
memory usage: 215.5+ KB


In [5]:
# most days have no event, replace NaN with "None"
dates.replace(np.NaN, 'None', inplace=True)

In [6]:
dates['event_name_1'].value_counts()

None                   1807
StPatricksDay             6
Ramadan starts            6
ValentinesDay             6
Mother's day              6
NBAFinalsEnd              6
LentWeek2                 6
SuperBowl                 6
Purim End                 6
NBAFinalsStart            6
MemorialDay               6
PresidentsDay             6
Pesach End                6
LentStart                 6
OrthodoxEaster            5
Eid al-Fitr               5
OrthodoxChristmas         5
MartinLutherKingDay       5
Halloween                 5
EidAlAdha                 5
IndependenceDay           5
Easter                    5
Christmas                 5
NewYear                   5
Thanksgiving              5
LaborDay                  5
Chanukah End              5
Cinco De Mayo             5
VeteransDay               5
ColumbusDay               5
Father's day              4
Name: event_name_1, dtype: int64

In [7]:
dates['event_name_2'].value_counts()

None              1964
Father's day         2
Cinco De Mayo        1
OrthodoxEaster       1
Easter               1
Name: event_name_2, dtype: int64

In [8]:
len(dates['event_name_1'].unique())

31

In [9]:
event_dict = dict(zip(dates['event_name_1'].unique(), range(0,31)))

In [10]:
dates.drop(columns=['weekday', 'year', 'event_type_1', 'event_type_2', 'snap_TX', 'snap_WI'], inplace=True)

In [11]:
# read in the training data
val = pd.read_csv('C:/Users/TWood/Downloads/m5-forecasting-accuracy/sales_train_validation.csv')

In [12]:
val

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,2,0,0,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,2,1,0,2,0,1,0,0,1,0
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,1,0,0,1,0,3,1,3


In [13]:
items = val['id'].unique()

In [14]:
items[0]

'HOBBIES_1_001_CA_1_validation'

In [15]:
trial = val[val['id'] == items[0]]

In [16]:
trial.drop(columns=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [17]:
trial_ts = trial.melt(var_name='d', value_name='sales')

In [18]:
trial_ts = trial_ts.merge(dates, how='inner', on='d')

In [19]:
trial_ts.set_index('date', inplace=True)

In [20]:
trial_ts['lag_1'] = trial_ts['sales'].shift(periods=1)

In [21]:
trial_ts['lag_2'] = trial_ts['sales'].shift(periods=2)

In [22]:
trial_ts['lag_3'] = trial_ts['sales'].shift(periods=3)

In [23]:
trial_ts['lag_4'] = trial_ts['sales'].shift(periods=4)

In [24]:
trial_ts['lag_5'] = trial_ts['sales'].shift(periods=5)

In [25]:
trial_ts['lag_6'] = trial_ts['sales'].shift(periods=6)

In [26]:
trial_ts['lag_7'] = trial_ts['sales'].shift(periods=7)

In [27]:
trial_ts['lag_14'] = trial_ts['sales'].shift(periods=14)

In [28]:
trial_ts['lag_21'] = trial_ts['sales'].shift(periods=21)

In [29]:
trial_ts['lag_28'] = trial_ts['sales'].shift(periods=28)

In [30]:
train = trial_ts['2014-03-28':'2016-03-28']

In [31]:
test = trial_ts['2016-03-28':]

In [32]:
X_train = train.drop(['sales', 'd', 'wm_yr_wk', 'event_name_1', 'event_name_2'], axis=1)
X_test = test.drop(['sales', 'd', 'wm_yr_wk', 'event_name_1', 'event_name_2'], axis=1)
y_train = train['sales']
y_test = test['sales']

In [33]:
y_test

date
2016-03-28    1
2016-03-29    0
2016-03-30    0
2016-03-31    0
2016-04-01    0
2016-04-02    0
2016-04-03    1
2016-04-04    0
2016-04-05    4
2016-04-06    2
2016-04-07    3
2016-04-08    0
2016-04-09    1
2016-04-10    2
2016-04-11    0
2016-04-12    0
2016-04-13    0
2016-04-14    1
2016-04-15    1
2016-04-16    3
2016-04-17    0
2016-04-18    1
2016-04-19    1
2016-04-20    1
2016-04-21    3
2016-04-22    0
2016-04-23    1
2016-04-24    1
Name: sales, dtype: int64

In [34]:
label = LabelEncoder()
X_train['id'] = label.fit_transform(X_train['id'])
X_test['id'] = label.transform(X_test['id'])

KeyError: 'id'

In [None]:
trial_ts

In [None]:
X_train

In [None]:
rf2 = RandomForestRegressor(max_depth=20)
rf2.fit(X_train, y_train)

In [None]:
rf2.score(X_train, y_train)

In [None]:
rf2.score(X_test, y_test)

In [None]:
rf2_preds = rf2.predict(X_test)

In [None]:
mean_squared_error(y_test, rf2_preds)

In [None]:
val['dept_id'].unique()

In [35]:
CA1_f1 = val[(val['store_id'] == 'CA_1')&(val['dept_id'] == 'FOODS_1')]

In [36]:
CA1_f1_melt = CA1_f1.drop(columns=['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])

In [37]:
CA1_f1_ts = CA1_f1_melt.melt(id_vars=['id'], var_name='d', value_name='sales')

In [38]:
CA1_f1_ts

Unnamed: 0,id,d,sales
0,FOODS_1_001_CA_1_validation,d_1,3
1,FOODS_1_002_CA_1_validation,d_1,0
2,FOODS_1_003_CA_1_validation,d_1,0
3,FOODS_1_004_CA_1_validation,d_1,0
4,FOODS_1_005_CA_1_validation,d_1,3
...,...,...,...
413203,FOODS_1_215_CA_1_validation,d_1913,0
413204,FOODS_1_216_CA_1_validation,d_1913,1
413205,FOODS_1_217_CA_1_validation,d_1913,3
413206,FOODS_1_218_CA_1_validation,d_1913,6


In [39]:
CA1_f1_ts = CA1_f1_ts.merge(dates, how='inner', on='d')

In [40]:
CA1_f1_ts

Unnamed: 0,id,d,sales,date,wm_yr_wk,wday,month,event_name_1,event_name_2,snap_CA
0,FOODS_1_001_CA_1_validation,d_1,3,2011-01-29,11101,1,1,,,0
1,FOODS_1_002_CA_1_validation,d_1,0,2011-01-29,11101,1,1,,,0
2,FOODS_1_003_CA_1_validation,d_1,0,2011-01-29,11101,1,1,,,0
3,FOODS_1_004_CA_1_validation,d_1,0,2011-01-29,11101,1,1,,,0
4,FOODS_1_005_CA_1_validation,d_1,3,2011-01-29,11101,1,1,,,0
...,...,...,...,...,...,...,...,...,...,...
413203,FOODS_1_215_CA_1_validation,d_1913,0,2016-04-24,11613,2,4,,,0
413204,FOODS_1_216_CA_1_validation,d_1913,1,2016-04-24,11613,2,4,,,0
413205,FOODS_1_217_CA_1_validation,d_1913,3,2016-04-24,11613,2,4,,,0
413206,FOODS_1_218_CA_1_validation,d_1913,6,2016-04-24,11613,2,4,,,0


In [41]:
CA1_f1_ts.loc['2011-01-30']

KeyError: '2011-01-30'

In [42]:
CA1_f1_ts.set_index('date', inplace=True)

In [43]:
CA1_f1_ts['lag_1'] = CA1_f1_ts['sales'].shift(periods=216)

In [44]:
CA1_f1_ts['lag_2'] = CA1_f1_ts['sales'].shift(periods=(216*2))

In [45]:
CA1_f1_ts['lag_3'] = CA1_f1_ts['sales'].shift(periods=(216*3))

In [46]:
CA1_f1_ts['lag_4'] = CA1_f1_ts['sales'].shift(periods=(216*4))

In [47]:
CA1_f1_ts['lag_5'] = CA1_f1_ts['sales'].shift(periods=(216*5))

In [48]:
CA1_f1_ts['lag_6'] = CA1_f1_ts['sales'].shift(periods=(216*6))

In [49]:
CA1_f1_ts['lag_7'] = CA1_f1_ts['sales'].shift(periods=(216*7))

In [50]:
CA1_f1_ts['lag_14'] = CA1_f1_ts['sales'].shift(periods=(216*14))

In [51]:
CA1_f1_ts['lag_21'] = CA1_f1_ts['sales'].shift(periods=(216*21))

In [52]:
CA1_f1_ts['lag_28'] = CA1_f1_ts['sales'].shift(periods=(216*28))

In [None]:
CA1_f1_ts

In [53]:
train = CA1_f1_ts['2014-03-28':'2016-03-28']

In [54]:
test = CA1_f1_ts['2016-03-28':]

In [None]:
# item 7, 100, and 165 are missing, thus 216 per day
test['id'].unique()

In [None]:
train

In [None]:
test

In [55]:
X_train = train.drop(['sales', 'd', 'wm_yr_wk', 'event_name_1', 'event_name_2'], axis=1)
X_test = test.drop(['sales', 'd', 'wm_yr_wk', 'event_name_1', 'event_name_2'], axis=1)
y_train = train['sales']
y_test = test['sales']

In [None]:
y_test

In [56]:
label = LabelEncoder()
X_train['id'] = label.fit_transform(X_train['id'])
X_test['id'] = label.transform(X_test['id'])

In [None]:
rf1 = RandomForestRegressor(max_depth=20)
rf1.fit(X_train, y_train)

In [None]:
rf1.score(X_train, y_train)

In [None]:
rf1.score(X_test, y_test)

In [None]:
rf1_preds = rf1.predict(X_test)

In [None]:
rf1_preds = pd.Series(rf1_preds).set_axis(y_test.index)

In [None]:
fig, ax = plt.subplots()
ax.plot(rf1_preds)
ax.plot(y_test)

In [None]:
mean_squared_error(rf1_preds, y_test, squared=False)

In [57]:
lgbm_1 = LGBMRegressor()
lgbm_1.fit(X_train, y_train)

LGBMRegressor()

In [58]:
lgbm_1.score(X_train, y_train)

0.6203307535794378

In [59]:
lgbm_1.score(X_test, y_test)

0.42213660347836546

In [60]:
preds = lgbm_1.predict(X_test)

In [61]:
mean_squared_error(y_test, preds)

3.4174933079315295

In [None]:
xgb_1 = XGBRegressor()
xgb_1.fit(X_train, y_train)

In [None]:
xgb_1.score(X_train, y_train)

In [None]:
xgb_1.score(X_test, y_test)

In [None]:
CA1_join = CA_1[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']]

In [None]:
#to_melt = train.drop(columns=['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])

In [None]:
#train_ts = to_melt.melt(id_vars=['id'], var_name='d', value_name='sales')

In [None]:
#to_join = train[['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']]

In [None]:
train_ts = train_ts.merge(to_join, on='id', how='inner')

In [None]:
train_ts.to_csv('./timeseries.csv')

In [None]:
train_ts.merge(dates, on='d', how='inner')

In [None]:
train_ts['value'].max()

In [None]:
train_ts = pd.melt(train, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='day')

In [None]:
train_ts

In [None]:
train = pd.read_csv('C:/Users/TWood/Downloads/m5-forecasting-accuracy/sales_train_evaluation.csv')

In [None]:
val_ts = pd.melt(val.head(), id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], var_name='d')

In [None]:
val_ts = val_ts.merge(dates, on='d')

In [None]:
val_ts.set_index('date')

In [None]:
df.describe()

In [None]:
pd.read_csv('C:/Users/TWood/Downloads/m5-forecasting-accuracy/sample_submission.csv')

In [None]:
pd.read_csv('C:/Users/TWood/Downloads/m5-forecasting-accuracy/sell_prices.csv')