# Imports and setup

In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to local source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
from tqdm.notebook import tqdm
import seaborn as sns

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, LSTM, Bidirectional, Dropout
from tensorflow.python.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt
from math import sqrt

from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.multitest import multipletests
from statsmodels.tsa.seasonal import seasonal_decompose

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import tensorflow.keras.backend as K

import scipy

import warnings

# Read data

In [None]:
# Directory prefix that is prepended to every input CSV file name.
DATA_PREFIX = '../data/'

In [None]:
# Load the five input tables from DATA_PREFIX, in the same order as the names
# on the left-hand side.
item_categories, items, sales_train, shops, test = (
    pd.read_csv(DATA_PREFIX + file_name)
    for file_name in (
        'item_categories.csv',
        'items.csv',
        'sales_train.csv',
        'shops.csv',
        'test.csv',
    )
)

In [None]:
# Preview the first rows of the shops table.
shops.head()

In [None]:
# Print the number of item categories, then preview the table.
print(len(item_categories))
item_categories.head()

In [None]:
# Print the number of items, then preview the table.
print(len(items))
items.head()

In [None]:
# Optional join of the category table onto every item. It is currently
# disabled, so `items` is previewed unchanged.
# items = items.merge(item_categories, on='item_category_id')
items.head()

In [None]:
# Preview the first rows of the raw sales records.
sales_train.head()

In [None]:
# List the distinct date_block_num values present in the sales records.
sales_train.date_block_num.unique()

In [None]:
# Attach item_category_id to every sale by joining on item_id. The merge uses
# the default inner join, so only sales whose item_id exists in `items` remain.
sales_train = pd.merge(
    sales_train,
    items[['item_id', 'item_category_id']],
    on='item_id',
)

In [None]:
# Summary statistics of the sales table.
sales_train.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the sales table.
sales_train.info()

In [None]:
# Convert the `date` column to datetime64.
# Plain column assignment is used instead of `.loc[:, 'date'] = ...`: on
# pandas 2.x the `.loc` form writes into the existing object-dtype column in
# place, so the column stays `object` and the `.dt` accessor used afterwards
# raises AttributeError.
# NOTE(review): no `format`/`dayfirst` is passed, so parsing relies on pandas'
# inference -- confirm the CSV's date layout and pass an explicit format if
# the dates are day-first.
sales_train['date'] = pd.to_datetime(sales_train['date'])

In [None]:
# Order the sales chronologically, breaking ties by shop, category and item,
# then renumber the rows from zero.
sales_train = sales_train.sort_values(
    by=['date', 'shop_id', 'item_category_id', 'item_id']
)
sales_train = sales_train.reset_index(drop=True)

In [None]:
# Derive calendar features from the parsed `date` column.
sales_train['day'] = sales_train['date'].dt.day
sales_train['month'] = sales_train['date'].dt.month
sales_train['year'] = sales_train['date'].dt.year
sales_train['dayofweek'] = sales_train['date'].dt.dayofweek

In [None]:
# Revenue per sales row: unit price multiplied by the number of units sold.
sales_train['income'] = sales_train['item_price'].mul(sales_train['item_cnt_day'])

In [None]:
# Month names in calendar order.
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Total units sold for every (date_block_num, item_id, shop_id) combination,
# stored in a column named item_cnt_month.
lstm_train = (
    sales_train
    .groupby(['date_block_num', 'item_id', 'shop_id'])['item_cnt_day']
    .sum()
    .reset_index(name='item_cnt_month')
)

# Data Preparation

In [None]:
# Start from the sales rows with the count column renamed to item_cnt_month.
# Only the name changes here; the values are still those of item_cnt_day.
seq_frame = sales_train.rename(columns={'item_cnt_day':'item_cnt_month'})
# Disabled filter that would keep only rows whose count lies in [0, 20]:
#seq_frame = seq_frame.query('item_cnt_month >= 0 and item_cnt_month <= 20')
seq_frame.head()

In [None]:
# Build one row per (shop_id, item_id) with one column per date_block_num that
# holds the summed count for that block; missing combinations become 0.
#
# `values` is a scalar rather than a one-element list so the result has
# single-level columns. With a list, pandas returns MultiIndex columns, and
# merging such a frame with the single-level `test` frame is rejected on
# pandas 2.x ("Not allowed to merge between different levels").
# `aggfunc='sum'` replaces `np.sum`, which newer pandas deprecates as an
# aggregation callable. `fill_value=0` already fills the gaps, so the extra
# `fillna(0)` pass is dropped.
seq_frame = seq_frame.pivot_table(
    index=['shop_id', 'item_id'],
    columns='date_block_num',
    values='item_cnt_month',
    aggfunc='sum',
    fill_value=0,
).reset_index()
seq_frame.head()

In [None]:
# Keep only the rows whose shop and item both occur somewhere in the test set.
seq_frame = seq_frame[
    seq_frame['shop_id'].isin(test['shop_id'].unique())
    & seq_frame['item_id'].isin(test['item_id'].unique())
]

In [None]:
# Align the history with the test rows: every test (shop_id, item_id) pair
# gets exactly one row, and pairs without any sales history are filled with 0.
seq_frame = test.merge(seq_frame, how='left', on=['item_id', 'shop_id']).fillna(0)
seq_frame

In [None]:
# Remove the identifier columns so only the per-block counts remain, then
# clamp every count into the range [0, 20].
seq_frame = seq_frame.drop(columns=['shop_id', 'item_id', 'ID'])
seq_frame = seq_frame.clip(lower=0, upper=20)
seq_frame

In [None]:
#all the columns before the second last one
X_train_seq = seq_frame.iloc[:,:-2].to_numpy()
# the second last one is the label
y_train_seq = seq_frame.iloc[:,-2:-1].to_numpy()
# all columns except last one
X_test_seq = seq_frame.iloc[:,1:-1].to_numpy()
# last one is the label
y_test_seq = seq_frame.iloc[:,-1:].to_numpy()

X_train_seq.shape, y_train_seq.shape, X_test_seq.shape, y_test_seq.shape

In [None]:
# Add a trailing feature axis so each sample has shape (timesteps, 1).
# The timestep count is read from each array instead of being hard-coded as
# 32, so this cell keeps working when the number of history columns changes.
# The result is identical when the arrays have 32 columns.
X_train_seq = X_train_seq.reshape((X_train_seq.shape[0], X_train_seq.shape[1], 1))
X_test_seq = X_test_seq.reshape((X_test_seq.shape[0], X_test_seq.shape[1], 1))
print(X_train_seq.shape, X_test_seq.shape)

# LSTM model for time series

In [None]:
# Single-layer LSTM regressor: 64 recurrent units over windows of shape
# (32, 1), 40% dropout, and one linear output unit, trained with Adam on
# mean squared error.
model = Sequential([
    LSTM(units=64, input_shape=(32, 1)),
    Dropout(0.4),
    Dense(1),
])
model.compile(loss='mse', optimizer='adam')

In [None]:
# Checkpoint the full model (not just weights) to "./models5" whenever the
# monitored training loss reaches a new minimum.
model_check = ModelCheckpoint(
    filepath="./models5",
    monitor='loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)

# Train for 50 epochs in batches of 4096 without shuffling the samples.
history = model.fit(
    x=X_train_seq,
    y=y_train_seq,
    epochs=50,
    batch_size=4096,
    shuffle=False,
    verbose=2,
    callbacks=[model_check],
)

In [None]:
# models 2 - minimum loss 1.9971
# models 3 - minimum loss 1.9813
# models 4 - RMSE of 0.902
# models 5 - RMSE of 0.905, min loss of 0.8230

In [None]:
# Load the saved model from "./models5" for evaluation.
new_model = load_model("./models5")

In [None]:
# Plot the per-epoch training loss recorded by `model.fit`.
plt.plot(history.history['loss'], label='train')
plt.legend()
plt.show()

# Model validation

In [None]:
# Run the reloaded model on the evaluation windows, then drop the trailing
# feature axis so X_test_seq is 2-D (samples, timesteps) again.
predictions = new_model.predict(X_test_seq)
X_test_seq = X_test_seq.reshape(X_test_seq.shape[:2])

In [None]:
# Root-mean-squared error of the predictions against the held-out targets.
# scikit-learn's signature is mean_squared_error(y_true, y_pred), so the
# ground truth goes first. The value is unchanged because squared error is
# symmetric, but the call now matches the documented argument order.
rmse = sqrt(mean_squared_error(y_test_seq, predictions))
print(f'RMSE: {rmse:.3f}')

In [None]:
# Plot the model's predictions for the evaluation windows, one point per row.
# NOTE(review): the legend label says 'test data' although the plotted series
# is `predictions` -- confirm the intended label.
plt.plot(predictions, label='test data')
plt.legend()
plt.show()