<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import,-setups" data-toc-modified-id="Import,-setups-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import, setups</a></span></li><li><span><a href="#Read-data" data-toc-modified-id="Read-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Read data</a></span></li><li><span><a href="#EDA" data-toc-modified-id="EDA-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>EDA</a></span><ul class="toc-item"><li><span><a href="#Impact-of-the-day" data-toc-modified-id="Impact-of-the-day-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Impact of the day</a></span></li><li><span><a href="#Items-categories" data-toc-modified-id="Items-categories-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Items categories</a></span></li><li><span><a href="#For-how-long-items-have-been-sold" data-toc-modified-id="For-how-long-items-have-been-sold-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>For how long items have been sold</a></span></li><li><span><a href="#Shops-analysis" data-toc-modified-id="Shops-analysis-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Shops analysis</a></span></li><li><span><a href="#Did-shops-differ-in-offer?" data-toc-modified-id="Did-shops-differ-in-offer?-3.5"><span class="toc-item-num">3.5&nbsp;&nbsp;</span>Did shops differ in offer?</a></span></li></ul></li></ul></div>

# Import, setups

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
from tqdm.notebook import tqdm
import seaborn as sns

# Use the public `tensorflow.keras` namespace (the same one the
# `tensorflow.keras.backend` import in this cell comes from) instead of the
# private `tensorflow.python.keras` package, which is not a supported API and
# should not be mixed with objects from `tensorflow.keras`.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

import matplotlib.pyplot as plt
from math import sqrt

from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.multitest import multipletests
from statsmodels.tsa.seasonal import seasonal_decompose

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import tensorflow.keras.backend as K

import scipy

import warnings

# Read data

In [None]:
# Directory that holds the raw CSV files, relative to this notebook.
DATA_PREFIX = '../data/'

In [None]:
# Load each raw table into its own DataFrame.
item_categories = pd.read_csv(DATA_PREFIX + 'item_categories.csv')
items = pd.read_csv(DATA_PREFIX + 'items.csv')
sales_train = pd.read_csv(DATA_PREFIX + 'sales_train.csv')
shops = pd.read_csv(DATA_PREFIX + 'shops.csv')
test = pd.read_csv(DATA_PREFIX + 'test.csv')

In [None]:
shops.head()

In [None]:
print(len(item_categories))
item_categories.head()

In [None]:
print(len(items))
items.head()

In [None]:
# Merge category of item to each of the items
# items = items.merge(item_categories, on='item_category_id')
# NOTE(review): the merge above is commented out, so this cell only re-displays `items`.
items.head()

In [None]:
sales_train.head()

In [None]:
# Distinct values of date_block_num present in the sales data.
sales_train.date_block_num.unique()

In [None]:
# Attach each item's category id to its sales rows (merge on item_id, default inner join).
# NOTE(review): this cell overwrites its own input; re-running it merges a second
# time and produces suffixed duplicate category columns.
sales_train = sales_train.merge(items.loc[:, ['item_id', 'item_category_id']], on='item_id')

In [None]:
sales_train.describe()

In [None]:
sales_train.info()

In [None]:
# Parse the textual `date` column into real datetimes.
# * Plain column assignment replaces the column together with its dtype.
#   `sales_train.loc[:, 'date'] = ...` writes into the existing object column in
#   recent pandas versions, which leaves it non-datetime and breaks the `.dt`
#   accessors used in the following cells.
# * NOTE(review): dayfirst=True assumes the CSV stores dates day-first
#   (dd.mm.yyyy) — confirm against the raw file. Without it, ambiguous dates
#   such as 02.01.2013 are read month-first.
sales_train['date'] = pd.to_datetime(sales_train['date'], dayfirst=True)

In [None]:
sales_train = sales_train.sort_values(
    ['date', 'shop_id', 'item_category_id', 'item_id']
).reset_index(drop=True)

In [None]:
sales_train.loc[:, 'day'] = sales_train.date.dt.day
sales_train.loc[:, 'month'] = sales_train.date.dt.month
sales_train.loc[:, 'year'] = sales_train.date.dt.year
sales_train.loc[:, 'dayofweek'] = sales_train.date.dt.dayofweek

In [None]:
sales_train.loc[:, 'income'] = sales_train.item_price * sales_train.item_cnt_day

In [None]:
# Month names in calendar order.
months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
# Units sold per (month block, item, shop): the daily counts summed up and
# stored under the column name 'item_cnt_month'.
lstm_train = (
    sales_train
    .groupby(['date_block_num', 'item_id', 'shop_id'])['item_cnt_day']
    .sum()
    .reset_index()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

# EDA

In [None]:
sales_train.head()

In [None]:
# Distribution of individual sale records over calendar time.
px.histogram(sales_train.date)

In [None]:
# Number of sale records in each date_block_num.
px.line(
    sales_train.groupby('date_block_num')['item_id'].count().reset_index(), 
    x='date_block_num', y='item_id')

In [None]:
# Month names indexed by calendar month number; index 0 is a placeholder so
# that months[1] == 'January'.
months = ['', 'January', 'February', 'March', 'April', 'May', 'June', 'July',
          'August', 'September', 'October', 'November', 'December']
# Total income for every (year, month) pair.
day_sales = sales_train.groupby(['year', 'month'])['income'].sum().reset_index()
# Replace month numbers by names using plain column assignment:
# `.loc[:, 'month'] = ...` would try to write strings into the existing
# integer column in place.
day_sales['month'] = day_sales['month'].map(lambda month_number: months[month_number])
# The fresh RangeIndex counts months since the first one; reset_index exposes
# it as the column 'index', which is used for colouring below.
day_sales = day_sales.reset_index()
fig = px.scatter_polar(
    day_sales, r='income', theta='month', color='index',
    color_continuous_scale='viridis')
fig.update_layout(
    coloraxis_colorbar=dict(
        title="months<br>since<br>beginning",
    ),
)

## Impact of the day

In [None]:
per_day_sales = sales_train.groupby(['dayofweek', 'date'])['income'].sum().reset_index()

In [None]:
px.box(
    per_day_sales, 
    x='dayofweek', y='income', points='outliers')\
.update_layout(
    yaxis_range=[0, per_day_sales.income.quantile(0.99)]
)

In [None]:
results = []
for day1 in range(7):
    for day2 in range(day1 + 1, 7):
        _, p = scipy.stats.ranksums(
            x=per_day_sales.loc[per_day_sales.dayofweek==day1, 'income'],
            y=per_day_sales.loc[per_day_sales.dayofweek==day2, 'income'],
        )
        results.append([(day1, day2), p])

In [None]:
# Benjamini/Hochberg correction for p values when performing many tests
_, p, _, _ = multipletests([i[1] for i in results], alpha=0.05, method='fdr_bh', returnsorted=False)
for i in range(len(results)):
    results[i][1] = p[i]
    
for days, p in results:
    if p < 0.05:
        print(days)

From this test and from the box plot above we see that sales are lowest from Monday to Thursday (among these days we cannot say which one brings in more income — the differences are not statistically significant). Every day from Friday to Sunday produces significantly more income than the other working days. In particular, Saturday outperforms every other day, including Friday and Sunday. There is no significant difference between Friday and Sunday.

## Items categories

In [None]:
# Catalogue size: number of rows in `items` and number of distinct category ids among them.
print(f'Number of unique items {len(items)}\nNumber of categories {len(items.item_category_id.unique())}')

In [None]:
# Histogram of category sizes (how many items each category contains).
px.histogram(items.groupby('item_category_id')['item_id'].count(), nbins=1000)\
.update_layout(xaxis_title='group size', yaxis_title='number of groups')

In [None]:
# Mean selling price of every item, keyed by its category.
tmp = (
    sales_train
    .groupby(['item_category_id', 'item_id'])['item_price']
    .mean()
    .reset_index()
)
# Category ids ordered from the lowest to the highest mean of those item prices.
category_means = tmp.groupby('item_category_id').mean().reset_index()
order = category_means.sort_values('item_price').item_category_id.tolist()
# Box plot of item prices per category (log scale), categories in the order computed above.
price_box = px.box(
    tmp,
    x='item_category_id',
    y='item_price',
    log_y=True,
    category_orders={'item_category_id': order},
)
price_box.update_layout(xaxis_type='category', width=1500)

Categories differ from each other in price.

In [None]:
# Row of `item_categories` with index label 12.
# NOTE(review): presumably the right-most (most expensive) category in the box plot above — confirm.
item_categories.loc[12] 

# indeed, the most expensive category is PS4 :)

In [None]:
# Row of `item_categories` with index label 71.
# NOTE(review): presumably the left-most (cheapest) category in the box plot above — confirm.
item_categories.loc[71]

# Gifts - Bags, Albums, Mouse Pads are the cheapest

## For how long items have been sold 

In [None]:
sales_train

In [None]:
tmp = pd.pivot_table(sales_train, index='item_id', values='date', aggfunc=['min', 'max'])

tmp.loc[:, 'min_date'] = tmp.loc[:, 'min'].date
tmp.loc[:, 'max_date'] = tmp.loc[:, 'max'].date
tmp = tmp.drop(['min', 'max'], axis=1)

tmp.loc[:, 'item_last'] = (tmp.loc[:, 'max_date'] - tmp.loc[:, 'min_date']).dt.days

tmp = tmp.sort_values(['min_date', 'item_last'], axis=0)

In [None]:
# Distribution of the number of days between each item's first and last sale.
lifetime_hist = px.histogram(tmp.item_last)
lifetime_hist.update_layout(
    title='Histogram of time of products being sold',
    xaxis_title='Number of days the product have been sold',
    yaxis_title='# of products',
)

In [None]:
# Draw each item's selling period as a horizontal red segment from its first
# to its last sale date, one row per item (rows follow the order of `tmp`).
# All segments live in ONE trace, separated by None gaps: adding a separate
# trace for every row of `tmp` makes building and rendering the figure very slow.
segment_x, segment_y = [], []
# The first two columns of `tmp` are the first and last sale date.
for i, (first_sale, last_sale) in enumerate(zip(tmp.iloc[:, 0], tmp.iloc[:, 1])):
    segment_x += [first_sale, last_sale, None]
    segment_y += [i, i, None]

fig = go.Figure(
    go.Scattergl(
        x=segment_x, y=segment_y,
        mode='lines',
        line=dict(color='red'),
        showlegend=False,
    )
)
fig.show('browser')

# Release the (large) figure object.
del fig

In [None]:
# Histograms of first/last sale dates: full date, then month of year, then day of month.
for date_part in (None, 'month', 'day'):
    for boundary in (tmp.min_date, tmp.max_date):
        values = boundary if date_part is None else getattr(boundary.dt, date_part)
        px.histogram(x=values).show()

As seen from these plots, most items are introduced within two weeks after New Year. They are most often withdrawn within the first two weeks of December.

## Shops analysis

In [None]:
len(sales_train.shop_id.unique())

In [None]:
tmp = pd.pivot_table(sales_train, index='shop_id', values='date', aggfunc=['min', 'max'])
tmp.loc[:, 'min_date'] = tmp.loc[:, 'min'].date
tmp.loc[:, 'max_date'] = tmp.loc[:, 'max'].date
tmp = tmp.drop(['min', 'max'], axis=1)
tmp.loc[:, 'shop_last'] = (tmp.loc[:, 'max_date'] - tmp.loc[:, 'min_date']).dt.days
tmp = tmp.sort_values(['min_date', 'shop_last'], axis=0)

In [None]:
# Draw each shop's activity period as a horizontal red segment from its first
# to its last sale date, one row per shop (rows follow the order of `tmp`).
# All segments are packed into a single trace, separated by None gaps, rather
# than adding one trace per row.
segment_x, segment_y = [], []
# The first two columns of `tmp` are the first and last sale date.
for i, (first_sale, last_sale) in enumerate(zip(tmp.iloc[:, 0], tmp.iloc[:, 1])):
    segment_x += [first_sale, last_sale, None]
    segment_y += [i, i, None]

fig = go.Figure(
    go.Scattergl(
        x=segment_x, y=segment_y,
        mode='lines',
        line=dict(color='red'),
        showlegend=False,
    )
)
fig.show()
del fig

On average, ~50 shops were constantly active. If a shop happened to be closed or opened, it was usually near New Year.

## Did shops differ in offer?

In [None]:
# Total income per (shop, category); NaN where a shop has no sales rows in a category.
shop_specification = pd.pivot_table(
    sales_train, index='shop_id', columns='item_category_id', values='income', aggfunc='sum'
)
# Binary presence map: True where the summed income is non-zero, False where it
# is missing or exactly zero; shops and categories are clustered hierarchically.
sns.clustermap(shop_specification.fillna(False).astype(bool), cmap='gray', vmin=-0.5, vmax=1.2)

In [None]:
sns.clustermap(np.nan_to_num(shop_specification, 0))

In [None]:
sns.clustermap(np.log10(np.maximum(np.nan_to_num(shop_specification, 0), 1)))

There are 3 clusters of shops. The first comprises most of the shops; they have big or huge earnings and sell similar kinds of products. Within it we can distinguish approx. 6 shops that have enormous income.

The second cluster has smaller earnings, and in general the shops within it have a smaller diversity of products.

The third cluster consists of 2 shops which have a very limited assortment. One of them sells products of a completely different type than the others.

# Data Preparation

In [None]:
# Wide table with one row per (shop, item) and one column per date_block_num,
# holding the units sold in that month (0 where nothing was sold).
# `values` and `columns` are passed as scalars so the result has flat,
# single-level columns: list arguments produce two-level columns, and merging
# such a frame with the single-level `test` frame in a following cell is
# rejected by recent pandas versions.
seq_frame = (
    sales_train
    .pivot_table(
        index=['shop_id', 'item_id'],
        columns='date_block_num',
        values='item_cnt_day',
        aggfunc='sum',
        fill_value=0,  # already fills every missing month, no extra fillna needed
    )
    .reset_index()
)
seq_frame.head()

In [None]:
seq_frame = seq_frame[seq_frame.shop_id.isin(test.shop_id.unique())]
seq_frame = seq_frame[seq_frame.item_id.isin(test.item_id.unique())]

In [None]:
seq_frame = pd.merge(test,seq_frame,on = ['item_id','shop_id'],how = 'left')
seq_frame.fillna(0,inplace = True)
seq_frame

In [None]:
seq_frame.drop(['shop_id','item_id','ID'],inplace = True, axis=1)
seq_frame

In [None]:
# NOTE(review): the slicing below assumes the columns of seq_frame are the
# monthly counts in chronological order — confirm against the pivot above.
# Training inputs: every column except the last two.
X_train_seq = seq_frame.iloc[:,:-2].to_numpy()
# Training label: the second-to-last column (column slice, so the result stays 2-D).
y_train_seq = seq_frame.iloc[:,-2:-1].to_numpy()
# Test inputs: the same window shifted forward by one column
# (drops the first and the last column).
X_test_seq = seq_frame.iloc[:,1:-1].to_numpy()
# Test label: the last column (column slice, so the result stays 2-D).
y_test_seq = seq_frame.iloc[:,-1:].to_numpy()

X_train_seq.shape, y_train_seq.shape, X_test_seq.shape, y_test_seq.shape

In [None]:
# Add a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1).
# The number of timesteps is inferred (-1) instead of being hard-coded to 32,
# so the cell keeps working if the number of month columns changes and is
# safe to re-run on already reshaped arrays.
X_train_seq = X_train_seq.reshape((X_train_seq.shape[0], -1, 1))
X_test_seq = X_test_seq.reshape((X_test_seq.shape[0], -1, 1))
print(X_train_seq.shape, X_test_seq.shape)

In [None]:
lstm_train['item_cnt_month'] = lstm_train['item_cnt_month'].clip(0, 20)

In [None]:
lstm_train = lstm_train[lstm_train['shop_id'].isin(lstm_test.shop_id.unique())]

In [None]:
lstm_train.sort_values(by='date_block_num',ascending=True).head()
time_inf = lstm_train.drop_duplicates(keep='first')

In [None]:
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Decompose the monthly sales series of a few sample (shop, item) pairs.
# `time_inf` has no 'item_category_id' column, and the inner loop iterates
# over item ids, so the pair is selected on 'item_id'. Both conditions are
# combined into one mask instead of chaining two boolean selections.
period = 1
for shop_id in time_inf.shop_id.iloc[0:3]:
    for item_id in time_inf.item_id.iloc[0:3]:
        pair_mask = (time_inf['shop_id'] == shop_id) & (time_inf['item_id'] == item_id)
        data_local = time_inf[pair_mask]
        # seasonal_decompose needs at least two complete cycles of observations.
        if len(data_local) < 2 * period:
            continue
        res = seasonal_decompose(data_local.item_cnt_month.interpolate(), period=period, model='additive')
        resplot = res.plot()

In [None]:
# Income over time. `time_inf` (monthly counts per item/shop) has neither a
# 'date' nor an 'income' column, so the series is built from `sales_train`,
# which has both; income is summed per calendar date to get one point per day.
# NOTE(review): confirm that daily totals are the intended granularity.
sales_train.groupby('date')['income'].sum().plot(figsize=(16, 6))

In [None]:
time_inf['item_id'].isnull().values.any(), time_inf['shop_id'].isnull().values.any()

In [None]:
# maybe input shows that aren't opened on Mondays, etc as 0 income?

In [None]:
len(time_inf[['item_id', 'item_category_id']].duplicated(keep='last')), len(time_inf[['item_id', 'item_category_id']])

In [None]:
len(time_inf[['item_id', 'item_category_id']]), len(time_inf[['item_id', 'item_category_id']].drop_duplicates(keep='last'))

In [None]:
time_inf.head()

In [None]:
time_inf.fillna(0,inplace = True)

In [None]:
lstm_train = time_inf[time_inf.date_block_num<=32]
lstm_test = time_inf[time_inf.date_block_num>32]

In [None]:
lstm_train = lstm_train[lstm_train['shop_id'].isin(lstm_test.shop_id.unique())]

In [None]:
lstm_train['item_cnt_month'] = lstm_train['item_cnt_month'].clip(0, 20)

In [None]:
X_train = lstm_train[['item_id', 'shop_id']].to_numpy()
y_train = lstm_train['item_cnt_month'].to_numpy()
X_test = lstm_test[['item_id', 'shop_id']].to_numpy()
y_test = lstm_test['item_cnt_month'].to_numpy()

In [None]:
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
print(X_train.shape, X_test.shape)

# LSTM model for time series

In [None]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between `y_true` and `y_pred`.

    Built from Keras backend ops (`K.square`, `K.mean`, `K.sqrt`); the mean is
    taken over all elements of the squared difference.
    """
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [None]:
# Single LSTM layer (50 units, ReLU) over 32 timesteps of one feature,
# followed by a one-unit dense output.
model = Sequential([
    LSTM(50, activation='relu', input_shape=(32, 1)),
    Dense(1),
])
#opt = Adam(learning_rate=0.005)
# Mean squared error loss with the default Adam optimizer.
model.compile(optimizer='adam', loss='mse')

In [None]:
# Stop when the training loss has not improved for 3 epochs.
callback = EarlyStopping(monitor='loss', patience=3)
# Save the full model to ./models whenever the training loss reaches a new minimum.
model_check = ModelCheckpoint("./models", monitor='loss',save_best_only=True,save_weights_only=False, mode='min')
# Batch size is 1/32 of the sequences actually being fit. It was previously
# derived from `X_train`, the unrelated (item, shop) feature array, which has
# a different number of rows. max(1, ...) guards against a zero batch size.
batch_size = max(1, len(X_train_seq) // 32)
history = model.fit(X_train_seq, y_train_seq, batch_size = batch_size, epochs=50, verbose=2, shuffle=False, callbacks=[callback, model_check])

In [None]:
# Training loss per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.legend()
plt.show()

# Model validation

In [None]:
predictions = model.predict(X_test_seq)
X_test_seq = X_test_seq.reshape((X_test_seq.shape[0], X_test_seq.shape[1]))

In [None]:
rmse = sqrt(mean_squared_error(predictions, y_test))
print('RMSE: %.3f' % rmse)

In [None]:
# Predicted counts for every test row; the plotted series holds the model's
# predictions, so the legend says so (it used to read 'test data').
plt.plot(predictions, label='predictions')
plt.legend()
plt.show()