In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import seaborn as sns
from collections import Counter
import numpy as np

import pylab

import os
import gc
import time
import math
import datetime
from math import log, floor
from sklearn.neighbors import KDTree

from pathlib import Path
from sklearn.utils import shuffle
from tqdm.notebook import tqdm as tqdm

import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.colors import Normalize

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import pywt
from statsmodels.robust import mad

import scipy
import statsmodels
from scipy import signal
import statsmodels.api as sm

# !pip install fbprophet
# from fbprophet import Prophet
from scipy.signal import butter, deconvolve
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

# train_path =  '../input/cleanedm5data/cleaned_train_data (1).csv'
# test_path = '../input/cleanedm5data/cleaned_test_data (1).csv'
# sample_path = '../input/afcs2021/sample_submission_afcs2021.csv'

# Load the AFCS 2021 input tables; the five reads are independent of one another.
sales_train_val = pd.read_csv(filepath_or_buffer='../input/afcs2021/sales_train_validation_afcs2021.csv')
sales_test_val = pd.read_csv(filepath_or_buffer='../input/afcs2021/sales_test_validation_afcs2021.csv')
calendar = pd.read_csv(filepath_or_buffer='../input/afcs2021/calendar_afcs2021.csv')
selling_prices = pd.read_csv(filepath_or_buffer='../input/afcs2021/sell_prices_afcs2021.csv')
sample_submission = pd.read_csv(filepath_or_buffer='../input/afcs2021/sample_submission_afcs2021.csv')

# Plot Sales

In [None]:
# Sorted unique series ids, the day columns (names containing 'd_'), and the first series' daily values.
ids = sorted(set(sales_train_val['id']))
d_cols = [col for col in sales_train_val.columns if 'd_' in col]
x_1 = sales_train_val.set_index('id')[d_cols].values[0]

# Single line trace of the first series against its day position.
first_trace = go.Scatter(
    x=np.arange(len(x_1)), y=x_1,
    mode='lines', name="First sample", showlegend=False,
    marker=dict(color="mediumseagreen"),
)

fig = make_subplots(rows=1, cols=1)
fig.add_trace(first_trace, row=1, col=1)
fig.update_layout(title_text="Sales")
fig.show()

# Rolling Average Sales for TX_3 Store

In [None]:
# Reshape to one row per day and one column per series id, then replace the
# d_* labels with the matching calendar dates (the merge must be one-to-one).
day_to_date = calendar.set_index('d')['date']
sales = (
    sales_train_val.set_index('id')[d_cols]
    .T
    .merge(day_to_date, left_index=True, right_index=True, validate='1:1')
    .set_index('date')
)

store_list = selling_prices['store_id'].unique()
means = []
fig = go.Figure()
for s in store_list:
    # Series whose id contains the store id are attributed to that store.
    store_items = [c for c in sales.columns if s in c]
    store_total = sales[store_items].sum(axis=1)
    data = store_total.rolling(90).mean()  # 90-day rolling mean of the store's daily total
    means.append(np.mean(store_total))
    fig.add_trace(go.Scatter(x=np.arange(len(data)), y=data, name=s))

fig.update_layout(yaxis_title="Sales", xaxis_title="Time", title="Rolling Average Sales vs. Time TX_3 Store")

In [None]:
# One box per store, summarising the distribution of the 90-day rolling mean
# of that store's total daily sales.
fig = go.Figure()

for s in store_list:
    # Series whose id contains the store id are attributed to that store.
    store_items = [c for c in sales.columns if s in c]
    data = sales[store_items].sum(axis=1).rolling(90).mean()
    fig.add_trace(go.Box(x=[s]*len(data), y=data, name=s))

# The x axis is categorical by store id, so it is labelled "Store" rather than "Time".
fig.update_layout(yaxis_title="Sales", xaxis_title="Store", title="Rolling Average Sales vs. TX store ")

# Model training

In [None]:
# Of the last 100 day columns, the first 70 are used for training and the final 30 for validation.
train_day_cols, val_day_cols = d_cols[-100:-30], d_cols[-30:]
train_dataset = sales_train_val[train_day_cols]
val_dataset = sales_train_val[val_day_cols]

In [None]:
# Plot the first series: training window in green, followed by the validation window in red.
fig = make_subplots(rows=1, cols=1)

# Derive the x positions from the frames instead of hard-coding 70 / 100.
n_train = len(train_dataset.columns)
n_val = len(val_dataset.columns)

# Trace names describe what is plotted (they show on hover even with the legend hidden).
fig.add_trace(
    go.Scatter(x=np.arange(n_train), mode='lines', y=train_dataset.loc[0].values,
               marker=dict(color="green"), showlegend=False, name="Train"),
    row=1, col=1)

fig.add_trace(
    go.Scatter(x=np.arange(n_train, n_train + n_val), y=val_dataset.loc[0].values, mode='lines',
               marker=dict(color="red"), showlegend=False, name="Val"),
    row=1, col=1)

fig.update_layout(title_text="Train (green) vs. Validation (red) sales")
fig.show()

# Naive

In [None]:
# One-step-ahead naive forecast: the first validation day is predicted with the last
# training day, every later validation day with the validation value of the day before.
last_train_day = train_dataset[train_dataset.columns[-1]].values
previous_val_days = val_dataset[val_dataset.columns[:-1]].values
predictions = np.column_stack([last_train_day, previous_val_days])

# Error over the first three series: Euclidean norm of the residuals divided by the horizon length.
error_naive = np.linalg.norm(predictions[:3] - val_dataset.values[:3])/len(predictions[0])

In [None]:
pred_1 = predictions[0]
fig = make_subplots(rows=1, cols=1)

# (x positions, y values, colour, trace name) for each curve of the first series.
curves = [
    (np.arange(70), train_dataset.loc[0].values, "green", "Train"),
    (np.arange(70, 100), val_dataset.loc[0].values, "red", "Val"),
    (np.arange(70, 100), pred_1, "blue", "Pred"),
]
for xs, ys, colour, trace_name in curves:
    fig.add_trace(
        go.Scatter(x=xs, y=ys, mode='lines', marker=dict(color=colour), name=trace_name),
        row=1, col=1,
    )

fig.update_layout(title_text="Naive approach")
fig.show()

# Moving Average

In [None]:
# Moving-average forecast over the validation horizon.
#   step 0         : mean of the last `window` training days;
#   0 < step < win : average of (a) the mean of the last `window - step` training days
#                    and (b) the mean of the forecasts produced so far;
#   step >= window : mean of the forecasts produced so far (no training days remain in the window).
# The original code tested `i > 31` for the last case, which can never be reached with a
# 30-day horizon and would have skipped step 31 entirely; an if/elif/else chain makes sure
# every step appends exactly one forecast.
window = 30
train_values = train_dataset.values

predictions = []
for i in range(len(val_dataset.columns)):
    if i == 0:
        step_forecast = np.mean(train_values[:, -window:], axis=1)
    elif i < window:
        recent_actuals = np.mean(train_values[:, -window + i:], axis=1)
        prior_forecasts = np.mean(predictions[:i], axis=0)
        step_forecast = 0.5 * (recent_actuals + prior_forecasts)
    else:
        step_forecast = np.mean(predictions[:i], axis=0)
    predictions.append(step_forecast)

# Shape (n_series, horizon), matching val_dataset.values.
predictions = np.transpose(np.array(predictions))
# Error over the first three series: Euclidean norm of the residuals divided by the horizon length.
error_avg = np.linalg.norm(predictions[:3] - val_dataset.values[:3])/len(predictions[0])

In [None]:
pred_1 = predictions[0]
fig = make_subplots(rows=1, cols=1)

# Training window, validation window and forecast for the first series.
train_trace = go.Scatter(x=np.arange(70), y=train_dataset.loc[0].values,
                         mode='lines', marker=dict(color="green"), name="Train")
val_trace = go.Scatter(x=np.arange(70, 100), y=val_dataset.loc[0].values,
                       mode='lines', marker=dict(color="red"), name="Val")
pred_trace = go.Scatter(x=np.arange(70, 100), y=pred_1,
                        mode='lines', marker=dict(color="blue"), name="Pred")

for trace in (train_trace, val_trace, pred_trace):
    fig.add_trace(trace, row=1, col=1)

fig.update_layout(title_text="Moving average")
fig.show()

In [None]:
# Fit a Holt model to the last 30 training days of each of the first three series
# and forecast 30 steps ahead.
# NOTE(review): newer statsmodels releases name the second argument `smoothing_trend` —
# confirm against the installed version.
holt_inputs = train_dataset[train_dataset.columns[-30:]].values[:3]
predictions = [
    Holt(series).fit(smoothing_level = 0.3, smoothing_slope = 0.01).forecast(30)
    for series in tqdm(holt_inputs)
]
predictions = np.array(predictions).reshape((-1, 30))

# Error against the matching validation rows: residual norm divided by the horizon length.
n_fitted = len(predictions)
error_holt = np.linalg.norm(predictions - val_dataset.values[:n_fitted])/len(predictions[0])

In [None]:
pred_1 = predictions[0]
fig = make_subplots(rows=1, cols=1)

# (x positions, y values, colour, trace name) for each curve of the first series.
curves = [
    (np.arange(70), train_dataset.loc[0].values, "green", "Train"),
    (np.arange(70, 100), val_dataset.loc[0].values, "red", "Val"),
    (np.arange(70, 100), pred_1, "blue", "Pred"),
]
for xs, ys, colour, trace_name in curves:
    fig.add_trace(
        go.Scatter(x=xs, y=ys, mode='lines', marker=dict(color=colour), name=trace_name),
        row=1, col=1,
    )

fig.update_layout(title_text="Holt linear")
fig.show()


In [None]:
# Fit ExponentialSmoothing to the last 30 training days of each of the first three series
# and forecast 30 steps ahead.
# NOTE(review): no `trend`/`seasonal` component is requested, so `seasonal_periods=3`
# presumably has no effect — confirm.
smoothing_inputs = train_dataset[train_dataset.columns[-30:]].values[:3]
predictions = [
    ExponentialSmoothing(series, seasonal_periods=3).fit().forecast(30)
    for series in tqdm(smoothing_inputs)
]
predictions = np.array(predictions).reshape((-1, 30))

# Error over the first three series: residual norm divided by the horizon length.
error_exponential = np.linalg.norm(predictions[:3] - val_dataset.values[:3])/len(predictions[0])

In [None]:
pred_1 = predictions[0]
fig = make_subplots(rows=1, cols=1)

# Training window, validation window and forecast for the first series.
train_trace = go.Scatter(x=np.arange(70), y=train_dataset.loc[0].values,
                         mode='lines', marker=dict(color="green"), name="Train")
val_trace = go.Scatter(x=np.arange(70, 100), y=val_dataset.loc[0].values,
                       mode='lines', marker=dict(color="red"), name="Val")
pred_trace = go.Scatter(x=np.arange(70, 100), y=pred_1,
                        mode='lines', marker=dict(color="blue"), name="Pred")

for trace in (train_trace, val_trace, pred_trace):
    fig.add_trace(trace, row=1, col=1)

fig.update_layout(title_text="Exponential smoothing")
fig.show()

# ARIMA

In [None]:
# Fit a SARIMAX model with seasonal order (0, 1, 1, 7) to the last 30 training days of each
# of the first three series and forecast 30 steps ahead.
sarimax_inputs = train_dataset[train_dataset.columns[-30:]].values[:3]
predictions = [
    sm.tsa.statespace.SARIMAX(series, seasonal_order=(0, 1, 1, 7)).fit().forecast(30)
    for series in tqdm(sarimax_inputs)
]
predictions = np.array(predictions).reshape((-1, 30))

# Error over the first three series: residual norm divided by the horizon length.
error_arima = np.linalg.norm(predictions[:3] - val_dataset.values[:3])/len(predictions[0])

In [None]:
pred_1 = predictions[0]
fig = make_subplots(rows=1, cols=1)

# (x positions, y values, colour, trace name) for each curve of the first series.
curves = [
    (np.arange(70), train_dataset.loc[0].values, "green", "Train"),
    (np.arange(70, 100), val_dataset.loc[0].values, "red", "Val"),
    (np.arange(70, 100), pred_1, "blue", "Pred"),
]
for xs, ys, colour, trace_name in curves:
    fig.add_trace(
        go.Scatter(x=xs, y=ys, mode='lines', marker=dict(color=colour), name=trace_name),
        row=1, col=1,
    )

fig.update_layout(title_text="ARIMA")
fig.show()

In [None]:
# Build the submission frame: every item is forecast as the mean of its last 28 observed
# days, repeated across the 28 forecast columns F1..F28.
# The original cell read from `sales_test`, which is never defined in this notebook; the
# frame loaded at the top is `sales_test_val`.
# NOTE(review): assumes `sales_test_val` carries columns d_1914..d_1941 — confirm against the CSV.
days = range(1914, 1941 + 1)
time_series_columns = [f'd_{i}' for i in days]
time_series_data = sales_test_val[time_series_columns]

forecast = pd.DataFrame(time_series_data.iloc[:, -28:].mean(axis=1))
forecast = pd.concat([forecast] * 28, axis=1)
forecast.columns = [f'F{i}' for i in range(1, forecast.shape[1] + 1)]

validation_ids = sales_test_val['id'].values

predictions = pd.DataFrame(validation_ids, columns=['id'])
# Reset the index so the forecasts line up positionally with the id column.
forecast = forecast.reset_index(drop=True)
predictions = pd.concat([predictions, forecast], axis=1)

In [None]:
# Write the submission frame to disk without the row index.
predictions.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Display the submission frame as the cell output.
predictions

In [None]:
# Compare the error of every forecasting approach in a bar chart.
error = [error_naive, error_avg, error_holt, error_exponential, error_arima]
# Keep the rounded errors numeric: formatting them as strings (and transposing them together
# with the names) turns the whole column into text, so the y axis becomes categorical and
# bar heights no longer reflect the error magnitudes.
error_rounded = [round(float(elem), 4) for elem in error]

names = ["Naive approach", "Moving average", "Holt linear", "Exponential smoothing", "ARIMA",]
df = pd.DataFrame({"RMSE Loss": error_rounded, "Model": names})
px.bar(df, y="RMSE Loss", x="Model", title="RMSE Loss for each trained Model")

In [None]:
# Display the error value computed for the naive approach.
error_naive

In [None]:
# wday1 = df[df['wday'] == 2]
# wday1[wday1['event_name_1'] == 0].sum()

# day1 = 18586
# day2 = 68903
# day3 = 59373
# day4 = 29693
# day5 = 38425
# day6 = 50987
# day7 = 15704

In [None]:
# compact_df = df[['item_id', 'date', 'sales', 'sell_price','wday', 'month','year','event_name_1','event_name_2']]
# compact_df.fillna(0,inplace=True)
# compact_df['date'] = pd.to_datetime(compact_df['date'])
# compact_df.head()

In [None]:
# compact_df.loc[compact_df['event_name_1'] == 'ValentinesDay'].sum()

In [None]:
# Total sales per distinct `event_name_1` value, in order of decreasing frequency.
# NOTE(review): `compact_df` is only created in a commented-out cell above — confirm it exists
# before running this cell.
event = []
sales_ = []

for event_name in compact_df['event_name_1'].value_counts().index:
    event.append(event_name)
    # Use a dedicated name here: the original loop assigned to `sales`, overwriting the
    # date-indexed sales DataFrame that the store plots earlier in the notebook rely on.
    event_sales = compact_df.loc[compact_df['event_name_1'] == event_name, "sales"].sum()
    sales_.append(event_sales)

# Drop the most frequent entry — presumably the 0 "no event" marker; verify.
# Guarded so an empty column does not raise IndexError.
if event:
    event.pop(0)
    sales_.pop(0)

In [None]:
import pylab

# Green line of total sales per event, with the event names as vertical x tick labels.
x = range(len(sales_))
pylab.figure(1)
pylab.plot(x, sales_, "g")
pylab.xticks(x, event, rotation=90)

pylab.show()

In [None]:
# Total sales per `wday` value, ordered by how often each value occurs.
weekday = list(compact_df['wday'].value_counts().index)
wsales = [
    compact_df.loc[compact_df['wday'] == day, "sales"].sum()
    for day in weekday
]
#     print('Total sales on {} was {}'.format(i,sales))

In [None]:
# Green line of total sales per weekday value, with the weekday values as vertical x tick labels.
x = range(len(wsales))
pylab.figure(1)
pylab.plot(x, wsales, "g")
pylab.xticks(x, weekday, rotation=90)

pylab.show()

In [None]:
# Total sales per `month` value, in ascending month order.
month = list(compact_df['month'].value_counts().sort_index().index)
msales = [
    compact_df.loc[compact_df['month'] == month_value, "sales"].sum()
    for month_value in month
]
#     print('Total sales on {} was {}'.format(i,sales))

In [None]:
# Green line of total sales per month, with the month values as vertical x tick labels.
x = range(len(msales))
pylab.figure(1)
pylab.plot(x, msales, "g")
pylab.xticks(x, month, rotation=90)

pylab.show()

In [None]:
# 0 is no event, 1 is event
# Compare by value (`!= 0`). The original `is not 0` tested object identity against a literal,
# which raises a SyntaxWarning on Python 3.8+ and treats equal-but-distinct zeros
# (e.g. 0.0 or a NumPy integer) as events.
event_values_1 = [v for v in compact_df['event_name_1'].value_counts().keys() if v != 0]
if event_values_1:
    # Replace every non-zero value with 1 in a single pass.
    compact_df['event_name_1'] = compact_df['event_name_1'].replace(event_values_1, 1)

In [None]:
# 0 is no event, 1 is event
# Compare by value (`!= 0`). The original `is not 0` tested object identity against a literal,
# which raises a SyntaxWarning on Python 3.8+ and treats equal-but-distinct zeros
# (e.g. 0.0 or a NumPy integer) as events.
event_values_2 = [v for v in compact_df['event_name_2'].value_counts().keys() if v != 0]
if event_values_2:
    # Replace every non-zero value with 1 in a single pass.
    compact_df['event_name_2'] = compact_df['event_name_2'].replace(event_values_2, 1)
#         compact_df['event_name_2'] = compact_df['event_name_2'].astype(int)


In [None]:
# fig=plt.gcf()
# fig.set_size_inches(30,12)

# df_correlation = compact_df[:].corr()

# mask = np.array(df_correlation)
# mask[np.tril_indices_from(mask)] = False

# sns.heatmap(data=df_correlation,mask=mask,square=True,annot=True,cbar=True)
# plt.show()

In [None]:
# Index the frame by item id. The guard keeps the cell re-runnable: once 'item_id' is the
# index it is no longer a column, and a second in-place set_index would raise KeyError.
if compact_df.index.name != 'item_id':
    compact_df.set_index('item_id',inplace=True)

In [None]:
# Persist the cleaned frame (index included) as CSV.
compact_df.to_csv(path_or_buf='cleaned_test_data.csv')

## Baseline
The baseline forecast is the mean sales per `item_id`.

In [None]:
# Baseline: mean of every numeric column per item id.
# `numeric_only=True` is explicit because pandas >= 2.0 raises on non-numeric columns
# instead of silently dropping them.
# NOTE(review): an earlier cell rebinds `df` to the model-error table — confirm `df` is the
# sales frame (with 'item_id' and 'sales' columns) when this cell runs.
mean_sales = df.groupby(['item_id']).mean(numeric_only=True)
mean_sales['sales'].head()

## Alternative training approach

In [None]:
# Parse the 'date' column and make it the index. The guard keeps the cell re-runnable: after
# the first run 'date' is the index rather than a column, and repeating the two statements
# would raise KeyError.
# NOTE(review): `train` is not defined in any visible cell — confirm where it is loaded.
if train.index.name != 'date':
    train['date'] = pd.to_datetime(train['date'])
    train.set_index('date',inplace=True)

In [None]:
# Chronological split at 2016-03-27.
# The original used strict `<` and `>`, so rows dated exactly on the split date fell into
# neither set. That leaves a gap between the end of the training series and the first test
# date, so forecasts (which start one step after the training data) are shifted against the
# test dates they are later compared with. The boundary date is now kept in the training
# set; the test set is unchanged.
split_date = pd.to_datetime("2016-03-27", format='%Y-%m-%d')
train_data = train[train.index <= split_date]
test_data = train[train.index > split_date]

# plt.plot(train_data, color = "black")
# plt.plot(test_data, color = "red")
# plt.ylabel('Sales')
# plt.xlabel('Date')
# plt.xticks(rotation=45)
# plt.title("Train/Test split for sales data")
# plt.show()

In [None]:
# Plot the train and test portions of the sales series on the same axes.
# The y axis shows sales; the original "Passenger count" label was a leftover from another
# dataset, and the title was empty.
split_title = 'Train/test split for sales data'
train_data['sales'].plot(figsize=(15,8), title=split_title, fontsize=14, label='train')
test_data['sales'].plot(figsize=(15,8), title=split_title, fontsize=14, label='test')
plt.xlabel("Datetime")
plt.ylabel("Sales")
plt.legend(loc='best')
plt.show()

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [None]:
# Fit SARIMAX with order (1, 0, 1) on the training sales and forecast one value per test row.
y = train_data['sales']
ARMAmodel = SARIMAX(y, order = (1, 0, 1)).fit()

horizon = len(test_data.index)
y_pred = ARMAmodel.get_forecast(horizon)

# 95% interval bounds; the point predictions over the same span are added as a column.
y_pred_df = y_pred.conf_int(alpha = 0.05)
first_step, last_step = y_pred_df.index[0], y_pred_df.index[-1]
y_pred_df["Predictions"] = ARMAmodel.predict(start = first_step, end = last_step)

# Re-label the forecast rows with the test index.
y_pred_df.index = test_data.index
y_pred_out = y_pred_df["Predictions"]

In [None]:
# Draw the predictions on the current axes and add a legend.
current_axes = plt.gca()
current_axes.plot(y_pred_out, color='green', label = 'Predictions')
current_axes.legend()

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the predictions against the test sales.
actual_sales = test_data["sales"].values
predicted_sales = y_pred_df["Predictions"]
arma_mse = mean_squared_error(actual_sales, predicted_sales)
arma_rmse = np.sqrt(arma_mse)
print("RMSE: ", arma_rmse)

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Fit ARIMA with order (2, 2, 2) on the training sales `y`, forecast one value per test row,
# plot the predictions and report their RMSE.
ARIMAmodel = ARIMA(y, order = (2, 2, 2))
ARIMAmodel = ARIMAmodel.fit()

y_pred = ARIMAmodel.get_forecast(len(test_data.index))
# 95% interval bounds; the point predictions over the same span are added as a column.
y_pred_df = y_pred.conf_int(alpha = 0.05)
y_pred_df["Predictions"] = ARIMAmodel.predict(start = y_pred_df.index[0], end = y_pred_df.index[-1])
# Re-label the forecast rows with the test index.
y_pred_df.index = test_data.index
y_pred_out = y_pred_df["Predictions"]
plt.plot(y_pred_out, color='Yellow', label = 'ARIMA Predictions')
plt.legend()

# Stored under its own name: the original reused `arma_rmse`, silently overwriting the
# ARMA model's result with the ARIMA one.
arima_rmse = np.sqrt(mean_squared_error(test_data["sales"].values, y_pred_df["Predictions"]))
print("RMSE: ",arima_rmse)

In [None]:
# Fit SARIMAX with order (5, 4, 2) and seasonal order (2, 2, 2, 12) on the training sales `y`,
# forecast one value per test row and plot the predictions.
# NOTE(review): d=4 together with D=2 at period 12 is unusually heavy differencing —
# confirm these orders are intended.
SARIMAXmodel = SARIMAX(y, order = (5, 4, 2), seasonal_order=(2,2,2,12)).fit()

horizon = len(test_data.index)
y_pred = SARIMAXmodel.get_forecast(horizon)

# 95% interval bounds; the point predictions over the same span are added as a column.
y_pred_df = y_pred.conf_int(alpha = 0.05)
first_step, last_step = y_pred_df.index[0], y_pred_df.index[-1]
y_pred_df["Predictions"] = SARIMAXmodel.predict(start = first_step, end = last_step)

# Re-label the forecast rows with the test index.
y_pred_df.index = test_data.index
y_pred_out = y_pred_df["Predictions"]

plt.plot(y_pred_out, color='Blue', label = 'SARIMA Predictions')
plt.legend()

In [None]:
# Display the latest forecast frame (interval bounds plus the "Predictions" column).
y_pred_df