# Getting Data

In [1]:
#!kaggle competitions download -c store-sales-time-series-forecasting

In [None]:
# Setup feedback system
from learntools.core import binder
binder.bind(globals())
from learntools.time_series.ex5 import *

# Setup notebook
from pathlib import Path
from learntools.time_series.style import *  # plot style settings
from learntools.time_series.utils import plot_periodogram, seasonal_plot

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from xgboost import XGBRegressor

In [None]:
# Load the training sales. Only the four columns used below are read, with
# categorical keys and float32 sales to keep the frame small.
# `infer_datetime_format` is no longer passed: it is deprecated since
# pandas 2.0, where format inference is already the default behaviour.
store_sales = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/train.csv",
    usecols=['store_nbr', 'family', 'date', 'sales'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
    },
    parse_dates=['date'],
)
# Represent each date as a daily Period rather than a timestamp.
store_sales['date'] = store_sales.date.dt.to_period('D')
display(store_sales)

In [None]:
# Test set, indexed by its parsed date column; shown here for a quick look.
submit = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/test.csv",
    index_col=['date'],
    parse_dates=['date'],
)
submit.head()

In [None]:
# Sample submission file, loaded to inspect the expected output layout.
submit_sample = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv"
)
submit_sample.head()

In [None]:
# Store metadata: print its (rows, columns) shape, then preview the first rows.
stores = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/stores.csv"
)
print(stores.shape)
stores.head()

In [None]:
# Load the daily oil price series (`dcoilwtico`) as float32.
# `infer_datetime_format` is no longer passed: it is deprecated since
# pandas 2.0, where format inference is already the default behaviour.
oil = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/oil.csv",
    dtype={
        'dcoilwtico': 'float32',
    },
    parse_dates=['date'],
)
# Index by date and convert the index to daily Periods.
oil = oil.set_index('date').to_period('D')
display(oil)

In [None]:
# Load the holiday/event calendar with categorical text columns and a
# boolean `transferred` flag.
# `infer_datetime_format` is no longer passed: it is deprecated since
# pandas 2.0, where format inference is already the default behaviour.
holidays_events = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv",
    dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    },
    parse_dates=['date'],
)
# Index by date and convert the index to daily Periods.
holidays_events = holidays_events.set_index('date').to_period('D')
# Keep only the National and Regional rows, and drop description categories
# that no longer occur after the filter.
holidays = (
    holidays_events
    .query("locale in ['National', 'Regional']")
    .assign(description=lambda x: x.description.cat.remove_unused_categories())
)
display(holidays)

In [None]:
# Transactions table, indexed by its parsed date column.
transactions = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/transactions.csv",
    index_col=['date'],
    parse_dates=['date'],
)
transactions.head()

# Missing Values

In [None]:
# Number of missing values in each column of the training sales frame.
store_sales.isna().sum()

In [None]:
# Number of missing values in each column of the store metadata.
stores.isna().sum()

In [None]:
# Number of missing oil prices.
oil.isna().sum()

In [None]:
# List the dates on which the oil price is missing.
oil[oil['dcoilwtico'].isna()]

The missing values are scattered: no run of consecutive days is missing. It is therefore reasonable to replace each missing value with the previous day's value (forward fill). The very first row has no previous value, so it is filled from the row after it (backward fill).

In [None]:
# Forward-fill each gap from the previous observation, then backward-fill
# whatever is still missing at the very start of the series (the first row
# has no predecessor). `DataFrame.ffill()` / `.bfill()` replace
# `fillna(method=...)`, which is deprecated in recent pandas releases.
oil = oil.ffill().bfill()

In [None]:
# Re-count missing oil prices after filling.
oil.isna().sum()

In [None]:
# Number of missing values in each column of the holiday/event calendar.
holidays_events.isna().sum()

# Trend

In [None]:
# Move the three key columns into a sorted MultiIndex, leaving `sales` as the
# only data column.
index_levels = ['store_nbr', 'family', 'date']
store_sales = store_sales.set_index(index_levels)
store_sales = store_sales.sort_index()
store_sales.head()

In [None]:
# Mean of `sales` across all index entries that share the same date.
average_sales = store_sales.groupby('date')['sales'].mean()
average_sales.head()

In [None]:
# Plot the average-sales series; `plot_params` is not defined in this
# notebook's own cells (presumably supplied by one of the wildcard imports).
average_sales.plot(title="Average Sales", **plot_params)

In [None]:
y = average_sales.copy()  # the target

# Polynomial time trend up to degree 3 (cubic). No constant column is
# requested from the DeterministicProcess; LinearRegression() below fits its
# own intercept.
dp = DeterministicProcess(
    index=y.index,  # dates from the training data
    order=3,        # cubic polynomial trend
)

# Trend features for the dates in y.index.
X = dp.in_sample()

# Trend features for a 16-day forecast beyond the end of y.index.
X_fore = dp.out_of_sample(steps=16)

model = LinearRegression()
model.fit(X, y)

# Fitted trend on the training dates and its 16-step extrapolation.
y_pred = pd.Series(model.predict(X), index=X.index)
y_fore = pd.Series(model.predict(X_fore), index=X_fore.index)

ax = y.plot(**plot_params, alpha=0.5, title="Average Sales", ylabel="items sold")
ax = y_pred.plot(ax=ax, linewidth=3, label="Trend", color='C0')
ax = y_fore.plot(ax=ax, linewidth=3, label="Trend Forecast", color='C3')
ax.legend();

In [None]:
# Same three series restricted to 2017 via partial-string selection on the
# date index.
ax = y.loc['2017'].plot(
    **plot_params, alpha=0.5, title="Average Sales", ylabel="items sold"
)
ax = y_pred.loc['2017'].plot(ax=ax, linewidth=3, label="Trend", color='C0')
ax = y_fore.loc['2017'].plot(
    ax=ax, linewidth=3, label="Trend Forecast", color='C3'
)
ax.legend();

Over 2017 and the 16-day forecast window, the fitted trend is close to a straight line, so the trend term is left out of the forecasting features below.

# Seasonality

In [None]:
# Periodogram of average sales for 2015 through 2017, used to choose the
# seasonal features below.
plot_periodogram(average_sales.loc['2015':'2017'])

In [None]:
y = average_sales.copy()

# Seasonal design matrix: a constant, seasonal indicator columns, and
# 4th-order calendar Fourier terms at monthly frequency. `drop=True` asks
# statsmodels to drop perfectly collinear columns. No polynomial trend
# (`order`) is requested.
fourier = CalendarFourier(freq="M", order=4)
dp_options = dict(
    index=y.index,
    constant=True,
    seasonal=True,
    additional_terms=[fourier],
    drop=True,
)
dp = DeterministicProcess(**dp_options)

# Build features over the whole training index, then keep the 2017 rows only.
X = dp.in_sample()
X = X.loc['2017']

# Adding New Year

There appears to be an anomaly on New Year's Day: average sales are very low.

In [None]:
# Dates on which the average sales value is below 10.
average_sales.loc[average_sales < 10]

In [None]:
# Indicator column: 1 on January 1st, 0 otherwise.
# `X` was produced by a row slice (`.loc['2017']`), so the column is added
# with `assign`, which returns a new frame, instead of writing into the slice
# (which triggers pandas' SettingWithCopyWarning). Re-running the cell simply
# recomputes the same column.
X = X.assign(NewYear=(X.index.dayofyear == 1).astype(int))
display(X)

# Adding Holidays

In [None]:
# Per-date holiday indicators (`locale_National`, `locale_Regional`).
# - Rebuilt from `holidays_events` on every run, so the cell can be
#   re-executed; previously it overwrote its own input and a second run
#   failed because the `transferred` column was gone.
# - `dtype=float` keeps the indicators numeric; otherwise newer pandas emits
#   booleans that turn into mixed object columns after the join + fillna.
# - `groupby(level=0).max()` collapses dates that appear more than once to a
#   single row, so joining on date can never duplicate feature rows.
holidays = (
    holidays_events
    .query("locale in ['National', 'Regional']")
    .loc[lambda df: ~df['transferred']]          # skip transferred holidays
    .pipe(pd.get_dummies, columns=['locale'], dtype=float)
    .loc[:, ['locale_National', 'locale_Regional']]
    .groupby(level=0)
    .max()
)
display(holidays)

In [None]:
# Attach the holiday indicators by date; dates without a holiday row get 0.
X = X.join(holidays, on='date')
X = X.fillna(0.0)
display(X)

# Adding Oil

In [None]:
# Oil prices up to and including 2017-08-15.
oilX = oil.loc[:'2017-08-15']
display(oilX)

In [None]:
# Standardise the oil price: subtract its mean and divide by its standard
# deviation, keeping the result as a one-column frame.
oil_price = oilX.dcoilwtico
oilX = ((oil_price - oil_price.mean()) / oil_price.std()).to_frame()
display(oilX)

In [None]:
# The oil feature is currently NOT added to X: the join below is disabled.
#X = X.join(oilX, on='date').fillna(0.0)
display(X)

# On promotion

In [None]:
# Load the `onpromotion` column of the training file, keyed the same way as
# the sales frame.
# `infer_datetime_format` is no longer passed: it is deprecated since
# pandas 2.0, where format inference is already the default behaviour.
promo = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/train.csv",
    usecols=['store_nbr', 'family', 'date', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion' : 'float32',
    },
    parse_dates=['date'],
)
# Daily Periods, then a sorted (store_nbr, family, date) MultiIndex.
promo['date'] = promo.date.dt.to_period('D')
promo = promo.set_index(['store_nbr', 'family', 'date']).sort_index()
display(promo)

In [None]:
# Pivot to wide form (one column per store/family pair, one row per date)
# and keep the 2017 rows only.
promo_wide = promo.unstack(['store_nbr', 'family'])
promo = promo_wide.loc["2017"]
display(promo)

In [None]:
# `promo` has 3-level columns while `X` has plain string columns. Joining
# frames with different column depths raises MergeError on pandas >= 2 and
# yields a mix of tuple and string column names on older versions (which
# scikit-learn rejects or warns about). Flatten the promo columns to plain
# strings ("level0_level1_level2") on a copy before joining.
promo_flat = promo.copy()
promo_flat.columns = [
    col if isinstance(col, str) else "_".join(map(str, col))
    for col in promo.columns
]
X = X.join(promo_flat, on='date').fillna(0.0)
display(X)

# Modelling - Linear Regression

In [None]:
# The 2017 restriction below is disabled here; X is shown as built above.
#X = X.loc['2017']
display(X)

In [None]:
# Show the indexed sales frame that the target is built from.
display(store_sales)

In [None]:
# Target in wide form: one column per store/family pair, 2017 rows only.
sales_wide = store_sales.unstack(['store_nbr', 'family'])
y = sales_wide.loc["2017"]
display(y)

In [None]:
# The DeterministicProcess was built with constant=True, so the model is
# fitted without a separate intercept. All columns of `y` are fitted at once.
model = LinearRegression(fit_intercept=False).fit(X, y)
fitted_values = model.predict(X)
y_pred = pd.DataFrame(fitted_values, index=X.index, columns=y.columns)
display(y_pred)

## Loading Test Data

In [None]:
# Load the test file with the same key dtypes as the training data.
# `infer_datetime_format` is no longer passed: it is deprecated since
# pandas 2.0, where format inference is already the default behaviour.
df_test = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/test.csv",
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
)
# Daily Periods, then a sorted (store_nbr, family, date) MultiIndex.
df_test['date'] = df_test.date.dt.to_period('D')
df_test = df_test.set_index(['store_nbr', 'family', 'date']).sort_index()
display(df_test)

## Creating test features

In [None]:
# Seasonal features for the 16 steps following the training index, with the
# index named 'date' so later joins can key on it, plus the same New Year
# indicator that was added to the training features.
X_test = dp.out_of_sample(steps=16)
X_test = X_test.rename_axis(index='date')
is_new_year = X_test.index.dayofyear == 1
X_test['NewYear'] = is_new_year.astype(int)
display(X_test)

### Adding Holidays

In [None]:
# Attach the holiday indicators by date; dates without a holiday row get 0.
X_test = X_test.join(holidays, on='date')
X_test = X_test.fillna(0.0)
display(X_test)

## Adding promotions

In [None]:
# Load the `onpromotion` column of the test file, keyed like the training
# promotions.
# `infer_datetime_format` is no longer passed: it is deprecated since
# pandas 2.0, where format inference is already the default behaviour.
promo_test = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/test.csv",
    usecols=['store_nbr', 'family', 'date', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion' : 'float32',
    },
    parse_dates=['date'],
)
# Daily Periods, then a sorted (store_nbr, family, date) MultiIndex.
promo_test['date'] = promo_test.date.dt.to_period('D')
promo_test = promo_test.set_index(['store_nbr', 'family', 'date']).sort_index()
display(promo_test)

In [None]:
# Pivot to wide form (one column per store/family pair, one row per date)
# and keep the 2017 rows only.
promo_test_wide = promo_test.unstack(['store_nbr', 'family'])
promo_test = promo_test_wide.loc["2017"]
display(promo_test)

In [None]:
# `promo_test` has 3-level columns while `X_test` has plain string columns.
# Joining frames with different column depths raises MergeError on
# pandas >= 2 and yields a mix of tuple and string column names on older
# versions (which scikit-learn rejects or warns about). Flatten the promo
# columns to plain strings ("level0_level1_level2") on a copy before joining.
promo_test_flat = promo_test.copy()
promo_test_flat.columns = [
    col if isinstance(col, str) else "_".join(map(str, col))
    for col in promo_test.columns
]
X_test = X_test.join(promo_test_flat, on='date').fillna(0.0)
display(X_test)

### Adding Oil

In [None]:
# Standardise the forecast-window oil prices with the mean and standard
# deviation of the prices up to 2017-08-15, not with their own statistics.
oilN = oil.loc[:'2017-08-15']
oilY = oil.loc['2017-08-16':'2017-08-31']
reference_mean = oilN.dcoilwtico.mean()
reference_std = oilN.dcoilwtico.std()
oilY = ((oilY.dcoilwtico - reference_mean) / reference_std).to_frame()
display(oilY)

In [None]:
# The oil feature is currently NOT added to X_test: the join below is
# disabled. Print the shape of the final test feature matrix and show it.
#X_test = X_test.join(oilY, on='date').fillna(0.0)
print(X_test.shape)
display(X_test)

In [None]:
# Predict every target column for the test dates.
# An unconstrained linear model can return negative values, but sales cannot
# be negative and the competition's log-based error metric (RMSLE) is
# undefined for negative predictions, so the forecasts are clipped at zero.
y_submit = pd.DataFrame(
    model.predict(X_test), index=X_test.index, columns=y.columns
).clip(lower=0.0)
display(y_submit)

In [None]:
# Back to long form: move the store_nbr and family column levels into the
# row index, next to the date.
y_submit = y_submit.stack(['store_nbr', 'family'])
display(y_submit)

In [None]:
# Look up each row's `id` from the test frame via the shared index levels,
# then keep exactly the two output columns, in that order.
with_ids = y_submit.join(df_test.id)
y_submit = with_ids.reindex(columns=['id', 'sales'])
display(y_submit)

In [None]:
# Write the submission file; index=False leaves the row index out, so only
# the frame's columns are written.
y_submit.to_csv('submission.csv', index=False)