# Ablation tests for group-by component

## Prepare dataset

Loading and preparing dataset as in [the training for Movielens/IMDB dataset](https://github.com/tinyclues/recsys-multi-atrribute-benchmark/blob/master/training/movielens%20simple%20model.ipynb).

In [1]:
DATASET = 'rees_ecommerce'
TECHNICAL_COLUMNS = ['user_id', 'date']
TASKS = ['product_id', 'category1', 'category2', 'category3', 'brand', 'priceCluster']
BATCH_SIZE_TRAIN = 5040
BATCH_SIZE_EVAL = 5040

In [2]:
from utils import load_dataset

datasets = {}
for split_name in ['train', 'val', 'test']:
    datasets[split_name] = load_dataset(DATASET, split_name)

In [3]:
from utils import AGG_PREFIX

all_columns = list(datasets['train'].element_spec.keys())
user_features = list(filter(lambda x: x.startswith(AGG_PREFIX), all_columns))
offer_features = list(filter(lambda x: x not in user_features + TECHNICAL_COLUMNS, all_columns))

In [4]:
from functools import partial
from uuid import uuid4

from utils import rebatch_by_events

datasets['train'] = rebatch_by_events(datasets['train'], batch_size=BATCH_SIZE_TRAIN, date_column='date', nb_events_by_user_by_day=8)
for key in ['val', 'test']:
    datasets[key] = rebatch_by_events(datasets[key], batch_size=BATCH_SIZE_EVAL, date_column='date', nb_events_by_user_by_day=8,
                                      seed=1729).cache(f'/tmp/{uuid4()}.tf')

In [5]:
from utils import add_equal_weights

for key in datasets:
    datasets[key] = datasets[key].map(partial(add_equal_weights, features=offer_features))

In [6]:
from utils import load_inverse_lookups
inverse_lookups = load_inverse_lookups(DATASET)

## Prepare evaluation dataset

In [7]:
%%time
from utils import prepare_single_task_dataset
test_datasets = {}
for task_offer_feature in TASKS:
    test_datasets[task_offer_feature] = \
        prepare_single_task_dataset(datasets['test'], task_offer_feature, offer_features)

CPU times: user 7min 23s, sys: 32.2 s, total: 7min 55s
Wall time: 7min 9s


## Model

In [8]:
if DATASET == 'movielens_imdb':
    from model_parameters import movielens_model as model_definition
    from model_parameters import MOVIELENS_EPOCHS as EPOCHS
elif DATASET == 'rees_ecommerce':
    from model_parameters import rees_model as model_definition
    from model_parameters import REES_EPOCHS as EPOCHS

NUMBER_OF_NEGATIVES = 4

In [9]:
import itertools

MODELS, EVAL_MODELS, EMBEDDING_MODELS = {}, {}, {}
for group_by, mask_net, bi_linear_interaction in itertools.product(*([[True, False]] * 3)):
    if mask_net and not group_by:
        continue
    key = group_by, mask_net, bi_linear_interaction
    name = '_'.join(map(lambda x: str(x).lower(), key))
    MODELS[key], EVAL_MODELS[key], EMBEDDING_MODELS[key] = \
        model_definition(user_features, offer_features, inverse_lookups, NUMBER_OF_NEGATIVES, name, *key)

## Train

In [10]:
len(MODELS)

6

In [11]:
for model in MODELS.values():
    model.fit(datasets['train'], epochs=EPOCHS, validation_data=datasets['val'])

Epoch 1/8


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 1/8




Epoch 2/8
Epoch 3/8
Epoch 4/8

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Epoch 8/8


## Evaluation

In [12]:
%%time
from collections import defaultdict
from utils import evaluate_model, wAUC

aucs = defaultdict(dict)
for task_offer_feature in TASKS:
    for key, eval_model in EVAL_MODELS.items():
        aucs[task_offer_feature][key] = \
                evaluate_model(eval_model, task_offer_feature, test_datasets, NUMBER_OF_NEGATIVES, inverse_lookups)

  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(i

CPU times: user 4h 4min 14s, sys: 19min 39s, total: 4h 23min 54s
Wall time: 1h 2min 52s


In [13]:
from utils import save_metrics
save_metrics(aucs, DATASET, 'ablation')

In [14]:
import pandas as pd
results = pd.DataFrame()
for task_name in aucs:
    for model_name in aucs[task_name]:
        w_auc = wAUC(aucs[task_name][model_name])
        group_by, mask_net, bi_linear_interaction = model_name
        results = pd.concat([results,
                             pd.Series({'wAUC': w_auc, 'offers': task_name,
                                        'group_by': group_by, 'mask_net': mask_net,
                                        'bi_linear_interaction': bi_linear_interaction}).to_frame().T],
                            ignore_index=True)

In [15]:
pd.pivot_table(results, 'wAUC', ['group_by', 'mask_net', 'bi_linear_interaction'], 'offers')\
    .rename(columns={'priceCluster': 'price'})[['product_id', 'category3', 'brand', 'category2', 'category1', 'price']]\
    .style.background_gradient(cmap='coolwarm').format(precision=3)

Unnamed: 0_level_0,Unnamed: 1_level_0,offers,product_id,category3,brand,category2,category1,price
group_by,mask_net,bi_linear_interaction,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,False,False,0.761,0.67,0.745,0.684,0.66,0.653
False,False,True,0.761,0.625,0.714,0.621,0.59,0.617
True,False,False,0.761,0.732,0.754,0.737,0.721,0.682
True,False,True,0.767,0.691,0.751,0.692,0.678,0.676
True,True,False,0.761,0.727,0.746,0.733,0.728,0.678
True,True,True,0.771,0.741,0.758,0.74,0.736,0.694
