## Setup (for Colab)
---

In [None]:
# %%capture
# !pip install pymc3==3.11

# PyShopper example
---
This notebook walks through a quick PyShopper example that covers:
1. Loading data
2. Instantiating and fitting the Shopper model via MCMC sampling or variational inference
3. Inference diagnostics
4. Prediction on unseen test data

In [None]:
# Imports

import logging
import warnings

import filelock
import numpy as np
import pandas as pd
import pymc3 as pm
import theano
from scipy import stats
from tqdm.notebook import tqdm

from pyshopper import shopper

# Ignore FutureWarning and UserWarning
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

In [None]:
# Raise the 'filelock' logger threshold to WARNING so that its DEBUG/INFO
# records are dropped. `logging` is imported in the imports cell at the top of
# the notebook, keeping every dependency in one place.
# NOTE(review): presumably this hides lock acquire/release chatter emitted
# while the model compiles — confirm.
logger = logging.getLogger('filelock')
logger.setLevel(logging.WARNING)

In [None]:
# Base URL of the example datasets in the PyShopper GitHub repository.
# The cells below append a file name plus `?raw=true` to this `blob` URL when
# requesting train.tsv, test.tsv and prices.tsv.
DATA_URL = 'https://github.com/topher-lo/PyShopper/blob/main/data'

## 1. Load data
---

In [None]:
# Load train.tsv together with prices.tsv
data = shopper.load_data(f'{DATA_URL}/train.tsv?raw=true',
                         f'{DATA_URL}/prices.tsv?raw=true')
unique_items = sorted(data['item_id'].unique())
sessions_list = sorted(data['session_id'].unique())

# Limit data to the first C item ids (in sorted order) and the last W sessions.
# NOTE(review): these were previously described as the C "most frequent" items;
# that only holds if item ids are assigned by frequency rank — confirm.
# Note: we filter for trailing sessions because the test dataset's sessions
# begin at the end of the training dataset's sessions.
C = 3
W = 400

# Filter data.
# Both masks are built on the full frame and combined with `&`, so the row
# selection never depends on re-aligning a full-length boolean mask against an
# already-filtered frame (as a second chained `.loc` would).
train_item_mask = data['item_id'].isin(unique_items[:C])
train_session_mask = data['session_id'].isin(sessions_list[-W:])
X_train = data.loc[train_item_mask & train_session_mask].reset_index(drop=True)

X_train

## 2. Instantiate and fit model
---

In [None]:
# Create a Shopper instance from the filtered training data (X_train).
# `model` is reused by the fitting cells below.

model = shopper.Shopper(X_train)

In [None]:
# # Fit model with MCMC sampling

# mcmc_res = model.fit(N=10000, method='MCMC')

In [None]:
# # Results summary:
# # Summary of common posterior statistics 
# # and sampling diagnostics

# mcmc_res.summary()

In [None]:
# Fit model with ADVI approximation; the result object is used by the
# diagnostics and prediction cells below.
# NOTE(review): N is presumably the number of ADVI optimisation iterations —
# confirm against `Shopper.fit`. No random seed is passed here, so results may
# differ between runs.

advi_res = model.fit(N=50000, method='ADVI')

In [None]:
# # Results summary:
# # Summary of common posterior statistics
# # Note: must define number of draws from approximated posterior distribution

# summary = advi_res.summary(draws=100)
# summary

## 3. Diagnostics
---

In [None]:
# # Sampling trace plot

# mcmc_res.trace_plot()

In [None]:
# ELBO plot (ADVI), produced by the fit result's `elbo_plot` method.
# NOTE(review): presumably shows the ELBO per optimisation iteration, which is
# what one would inspect for convergence — confirm.

fig = advi_res.elbo_plot()

In [None]:
# ADVI posterior sampling trace plot.
# `draws=5000` matches the number of draws requested in the prediction cell.
# NOTE(review): presumably the number of samples taken from the fitted
# approximation — confirm against `trace_plot`.

fig = advi_res.trace_plot(draws=5000)

## 4. Prediction
---

In [None]:
# Load test.tsv together with the same prices.tsv used for the training data
test_data = shopper.load_data(f'{DATA_URL}/test.tsv?raw=true',
                              f'{DATA_URL}/prices.tsv?raw=true')
test_sessions_list = sorted(test_data['session_id'].unique())

# Keep roughly a third as many sessions as the training subset (W)
W_test = int(0.33*W)

# Limit data to the same C items as the training subset and the last W_test
# sessions. Both masks are built on the full frame and combined with `&`, so
# the row selection never depends on re-aligning a full-length boolean mask
# against an already-filtered frame.
test_item_mask = test_data['item_id'].isin(unique_items[:C])
test_session_mask = test_data['session_id'].isin(test_sessions_list[-W_test:])
X_test = test_data.loc[test_item_mask & test_session_mask].reset_index(drop=True)

# Preview the first four and last four rows
X_test.iloc[np.r_[0:4, -4:0]]

In [None]:
# ADVI Predictions on the held-out test rows.
# The 'y' entry of the prediction result is wrapped in a DataFrame for the
# comparison cells below.
# NOTE(review): presumably preds['y'] has one row per posterior draw and one
# column per row of X_test — confirm against `predict`.
preds = advi_res.predict(X_test, draws=5000)
sampled_preds = pd.DataFrame(preds['y'])

In [None]:
# Ground-truth labels: integer category codes of the test item ids, held in a
# Series named 'test_labels' so the name carries over as a column header when
# joined.
# NOTE(review): the codes are derived from the item ids present in X_test only;
# this presumably matches the model's own item encoding — confirm.
test_labels = pd.Series(pd.Categorical(X_test['item_id']).codes, name='test_labels')

In [None]:
# Number of correctly labelled outcomes.
# `mode()` takes the most common sampled value in each column; comparing with
# `test_labels` aligns the Series index against those columns. After the
# transpose, `value_counts()` tallies the distinct True/False rows.
# NOTE(review): `mode()` returns more than one row when a column has tied
# modes, in which case each tally is a tuple of booleans — confirm this is
# acceptable.
(sampled_preds.mode() == test_labels).T.value_counts()

In [None]:
# Sanity check: show the per-column mode of the sampled predictions (transposed
# to one row per column) next to the 'test_labels' column, joined on the index.
sampled_preds.mode().T.join(test_labels)