# Training simple model and evalualing its predictions on different tasks

## Prepare dataset for training

Following the same steps as in [the training for Movielens/IMDB dataset](https://github.com/tinyclues/recsys-multi-atrribute-benchmark/blob/master/training/movielens%20simple%20model.ipynb), we first load splitted dataset generated in [notebook](https://github.com/tinyclues/recsys-multi-atrribute-benchmark/blob/master/dataset_preprocessing/rees_ecommerce.ipynb)

In [1]:
DATASET = 'rees_ecommerce'

In [2]:
from utils import load_dataset

datasets = {}
for split_name in ['train', 'val', 'test']:
    datasets[split_name] = load_dataset(DATASET, split_name)

We can parse features' names, they were chosen to easily distinguish between offer features and user features (aggregated history up to chosen date).

In [3]:
from utils import AGG_PREFIX

all_columns = list(datasets['train'].element_spec.keys())
technical_columns = ['user_id', 'date']
user_features = list(filter(lambda x: x.startswith(AGG_PREFIX), all_columns))
offer_features = list(filter(lambda x: x not in user_features + technical_columns, all_columns))

In [4]:
user_features

['aggregated_cart_category1',
 'aggregated_purchase_category2',
 'aggregated_cart_priceCluster',
 'aggregated_purchase_category3',
 'aggregated_cart_product_id',
 'aggregated_purchase_priceCluster',
 'aggregated_cart_category3',
 'aggregated_cart_brand',
 'aggregated_cart_category2',
 'aggregated_purchase_category1',
 'aggregated_purchase_product_id',
 'aggregated_purchase_brand']

In [5]:
offer_features

['priceCluster', 'category3', 'category2', 'category1', 'product_id', 'brand']

### Rebatching datasets

Splitting dataset into smaller batches in the same way as described in [the training for Movielens/IMDB dataset](https://github.com/tinyclues/recsys-multi-atrribute-benchmark/blob/master/training/movielens%20simple%20model.ipynb)

In [6]:
%%time

from functools import partial
from uuid import uuid4

from utils import rebatch_by_events, add_equal_weights

datasets['train'] = rebatch_by_events(datasets['train'], batch_size=5040, date_column='date', nb_events_by_user_by_day=8)
for key in ['val', 'test']:
    datasets[key] = rebatch_by_events(datasets[key], batch_size=5040, date_column='date', nb_events_by_user_by_day=8,
                                      seed=1729).cache(f'/tmp/{uuid4()}.tf')

for key in datasets:
    datasets[key] = datasets[key].map(partial(add_equal_weights, features=offer_features))

CPU times: user 26.2 s, sys: 1.26 s, total: 27.4 s
Wall time: 15.1 s


## Define simple model

In [7]:
from utils import load_inverse_lookups
inverse_lookups = load_inverse_lookups(DATASET)

In [8]:
import re

vocabulary_sizes = {}

for feature in offer_features:
    vocabulary_sizes[feature] = inverse_lookups[feature].vocabulary_size()

for feature in user_features:
    for key in inverse_lookups:
        pattern = re.compile(r"{}(\w+)_{}".format(AGG_PREFIX, key))
        if pattern.match(feature):
            vocabulary_sizes[feature] = vocabulary_sizes[key]

### Model architecture

In [9]:
import tensorflow as tf

<img src="resources/two_towers_model.png" alt="two tower model" width="800" />

### Model parameters

To choose model's parameters we did some manual tuning using validation set to maximize train and validation AUC while keeping mismatch between them small.

In [10]:
# model parameters
EMBEDDING_DIM = 100
L1_COEFF = 2e-7
DROPOUT = 0.1


def REGULARIZER():
    return {'class_name': 'L1L2', 'config': {'l1': L1_COEFF, 'l2': 0.}}

def USER_TOWER():
    return tf.keras.Sequential([
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(100,
                              kernel_regularizer=REGULARIZER(),
                              bias_regularizer=REGULARIZER()),
        tf.keras.layers.Dropout(DROPOUT),
        tf.keras.layers.Activation('tanh'),
        tf.keras.layers.Dense(50,
                              kernel_regularizer=REGULARIZER(),
                              bias_regularizer=REGULARIZER()),
        tf.keras.layers.Dropout(DROPOUT),
        tf.keras.layers.Activation('tanh'),
    ], name='user_tower')

def OFFER_TOWER():
    return tf.keras.Sequential([
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(100,
                              kernel_regularizer=REGULARIZER(),
                              bias_regularizer=REGULARIZER()),
        tf.keras.layers.Dropout(DROPOUT),
        tf.keras.layers.Activation('tanh'),
        tf.keras.layers.Dense(50,
                              kernel_regularizer=REGULARIZER(),
                              bias_regularizer=REGULARIZER()),
        tf.keras.layers.Dropout(DROPOUT),
        tf.keras.layers.Activation('tanh'),
    ], name='offer_tower')

EPOCHS = 8

NUMBER_OF_NEGATIVES = 4
LOSS = tf.keras.losses.BinaryCrossentropy(from_logits=True)
AUC_METRIC = tf.keras.metrics.AUC(from_logits=True)

import tensorflow_addons as tfa
OPTIMIZER = tfa.optimizers.AdamW(weight_decay=4e-8, learning_rate=0.0008)

In [11]:
from layers import get_input_layer, WeightedEmbeddings
from utils import WEIGHT_SUFFIX

embeddings, inputs = {}, {}
for feature in user_features + offer_features:
    if feature in offer_features:
        # for offer features we need weights:
        # with dummy weights during training, and the ones used for a feature's averaging at inference time
        inputs[f'{feature}{WEIGHT_SUFFIX}'] = get_input_layer(f'{feature}{WEIGHT_SUFFIX}', tf.float32)
    inputs[feature] = get_input_layer(feature)
    # here we use input feature modality from `vocabulary_sizes` to know embeddings matrix dimensions
    emb_layer = WeightedEmbeddings(vocabulary_sizes[feature],
                                   EMBEDDING_DIM, name=f'{feature}_embedding',
                                   embeddings_regularizer=REGULARIZER())
    embeddings[feature] = emb_layer(inputs[feature], inputs.get(f'{feature}{WEIGHT_SUFFIX}'))

In [12]:
embedded_user_features = [embeddings[feature] for feature in user_features]
embedded_offer_features = [embeddings[feature] for feature in offer_features]
user_tower = USER_TOWER()(tf.keras.layers.Concatenate(name='concat_user')(embedded_user_features))
offer_tower = OFFER_TOWER()(tf.keras.layers.Concatenate(name='concat_offer')(embedded_offer_features))

In [13]:
from layers import DotWithNegatives

# we don't apply sigmoid on the output and will have from_logits=True in both loss and metrics
output = DotWithNegatives(NUMBER_OF_NEGATIVES, name='prediction')([user_tower, offer_tower],
                                                                  generate_negatives=True)

In [14]:
from utils import BroadcastLoss, BroadcastMetric

model = tf.keras.Model(inputs, output, name='two_tower_model')
model.compile(optimizer=OPTIMIZER,
              loss=BroadcastLoss(LOSS, NUMBER_OF_NEGATIVES),
              metrics=[BroadcastMetric(AUC_METRIC, NUMBER_OF_NEGATIVES)])

### Training

In [15]:
model.fit(datasets['train'], epochs=EPOCHS, validation_data=datasets['val'])

Epoch 1/8


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f6dd4711df0>

## Single task models benchmark

As described in [notebook for Movielens](https://github.com/tinyclues/group-by-augmentations-model/blob/master/training/movielens%20simple%20model.ipynb) we can consider predictions on one chosen offer column as a single task and the whole setup as a multi-task problem. Let's now evaluate performance of a common model on a subset of tasks. We will compare its results against single task models sharing the same architecture, but using only one offer feature at time.

In [16]:
# offer columns we want to evaluate, specific to dataset we test
TASKS = ['product_id', 'category1', 'category2', 'category3', 'brand', 'priceCluster']

For simplicity of further code, let's wrap whole model definition into a function:

In [17]:
def two_tower_model(offer_features, name='two_tower_model'):
    # user_features, vocabulary_sizes, EMBEDDING_DIM, REGULARIZER, USER_TOWER, OFFER_TOWER,
    # OPTIMIZER, LOSS, NUMBER_OF_NEGATIVES
    # come from global scope, but can be passed as params instead
    embeddings, inputs = {}, {}
    for feature in user_features + offer_features:
        if feature in offer_features:
            # for offer features we need weights:
            # with dummy weights during training, and the ones used for a feature's averaging at inference time
            inputs[f'{feature}{WEIGHT_SUFFIX}'] = get_input_layer(f'{feature}{WEIGHT_SUFFIX}', tf.float32)
        inputs[feature] = get_input_layer(feature)
        # here we use input feature modality from `vocabulary_sizes` to know embeddings matrix dimensions
        emb_layer = WeightedEmbeddings(vocabulary_sizes[feature],
                                       EMBEDDING_DIM, name=f'{feature}_embedding',
                                       embeddings_regularizer=REGULARIZER())
        embeddings[feature] = emb_layer(inputs[feature], inputs.get(f'{feature}{WEIGHT_SUFFIX}'))
    
    embedded_user_features = [embeddings[feature] for feature in user_features]
    embedded_offer_features = [embeddings[feature] for feature in offer_features]
    user_tower = USER_TOWER()(tf.keras.layers.Concatenate(name='concat_user')(embedded_user_features))
    offer_tower = OFFER_TOWER()(tf.keras.layers.Concatenate(name='concat_offer')(embedded_offer_features))
    
    output = DotWithNegatives(NUMBER_OF_NEGATIVES, name='prediction')([user_tower, offer_tower], generate_negatives=True)
    model = tf.keras.Model(inputs, output, name=name)
    model.compile(optimizer=OPTIMIZER,
                  loss=BroadcastLoss(LOSS, NUMBER_OF_NEGATIVES),
                  metrics=[BroadcastMetric(AUC_METRIC, NUMBER_OF_NEGATIVES)])
    
    return model

We train models that use only one offer feature with same hyperparameters as the initial model.

In [18]:
mono_feature_models = {}
for task_offer_feature in TASKS:
    mono_feature_models[task_offer_feature] = two_tower_model([task_offer_feature],
                                                              name=f'{task_offer_feature}_model')
    mono_feature_models[task_offer_feature].fit(datasets['train'],
                                                epochs=EPOCHS,
                                                validation_data=datasets['val'])

Epoch 1/8


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8


  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


## Evaluation

Now let's load test dataset and generate some offers from it:
* we will consider all batches from test dataset
* we perform a group by using each feature from `TASKS` as a group by key
* for all offer features except the one we are using as key we generate ragged tensors with bag of values it can take
* we remove least popular values in each list
* so now each line of dataset corresponds to an offer of type `task_offer_feature = 'value'`

In [19]:
%%time
from utils import prepare_single_task_dataset
test_datasets = {}
for task_offer_feature in TASKS:
    test_datasets[task_offer_feature] = \
        prepare_single_task_dataset(datasets['test'], task_offer_feature, offer_features)

CPU times: user 3min 33s, sys: 16.9 s, total: 3min 50s
Wall time: 3min 18s


Now we can apply model on grouped features for each task and calculate AUC for each offer of type `task_offer_feature = 'value'`. Note, that negatives are generated in the same way as for training.

In [20]:
%%time
from collections import defaultdict
from utils import evaluate_model, wAUC

aucs = defaultdict(dict)
for task_offer_feature in TASKS:
    for model_name in TASKS:
        aucs[task_offer_feature][f'MONO:{model_name}'] = \
            evaluate_model(mono_feature_models[model_name],
                           task_offer_feature, test_datasets, NUMBER_OF_NEGATIVES, inverse_lookups)
    aucs[task_offer_feature]['simple model'] = \
            evaluate_model(model, task_offer_feature, test_datasets, NUMBER_OF_NEGATIVES, inverse_lookups)

  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(i

CPU times: user 1h 43min 38s, sys: 6min 28s, total: 1h 50min 7s
Wall time: 46min 40s


In [None]:
from utils import save_metrics
save_metrics(aucs, DATASET, 'simple_model')

## Aggregating results

### Popular offers

We can aggregate AUCs from individual offers to have one value we can compare among models: weighted macro AUC. We will keep only offers with more than 200 positive events and weight their AUCs by number of events:

In [21]:
import numpy as np
import pandas as pd

results = pd.DataFrame()
for task_name in aucs:
    for model_name in aucs[task_name]:
        w_auc = wAUC(aucs[task_name][model_name])
        results = pd.concat([results,
                             pd.Series({'wAUC': w_auc, 'offers': task_name, 'model': model_name}).to_frame().T],
                            ignore_index=True)

In [27]:
pd.pivot_table(results, 'wAUC', 'model', 'offers')\
    .rename(columns={'priceCluster': 'price'}, index={'MONO:priceCluster': 'MONO:price'})\
    .iloc[[6, 5, 3, 0, 2, 1, 4]][['product_id', 'category3', 'brand', 'category2', 'category1', 'price']]\
    .style.background_gradient(cmap='coolwarm').format(precision=3)

offers,product_id,category3,brand,category2,category1,price
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
simple model,0.762,0.679,0.743,0.675,0.656,0.654
MONO:product_id,0.756,0.663,0.741,0.662,0.648,0.642
MONO:category3,0.671,0.74,0.666,0.717,0.703,0.579
MONO:brand,0.728,0.643,0.756,0.643,0.627,0.604
MONO:category2,0.678,0.71,0.668,0.739,0.722,0.573
MONO:category1,0.672,0.703,0.655,0.725,0.737,0.572
MONO:price,0.701,0.596,0.653,0.582,0.579,0.698


### Cold-start

In [28]:
results = pd.DataFrame()
for task_name in ['product_id', 'brand', 'category3']:
    for model_name in aucs[task_name]:
        w_auc = wAUC(aucs[task_name][model_name], cutoff_low=10, cutoff_high=200)
        results = pd.concat([results,
                             pd.Series({'wAUC': w_auc, 'offers': task_name, 'model': model_name}).to_frame().T],
                            ignore_index=True)

In [29]:
pd.pivot_table(results, 'wAUC', 'model', 'offers')\
    .rename(columns={'priceCluster': 'price'}, index={'MONO:priceCluster': 'MONO:price'})\
    .iloc[[6, 5, 3, 0]][['product_id', 'category3', 'brand']]\
    .style.background_gradient(cmap='coolwarm').format(precision=3)

offers,product_id,category3,brand
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
simple model,0.76,0.76,0.769
MONO:product_id,0.752,0.752,0.754
MONO:category3,0.697,0.751,0.709
MONO:brand,0.728,0.755,0.769
