# Training a simple model and evaluating its predictions on different tasks

## Prepare dataset for training

Let's follow the same steps as in [the notebook for Movielens/IMDB dataset](https://github.com/tinyclues/recsys-multi-atrribute-benchmark/blob/master/training/movielens%20simple%20model.ipynb)

In [None]:
# Name of the dataset used throughout the notebook; it is passed to `load_dataset`
# and `load_inverse_lookups`.
DATASET = 'rees_ecommerce'

In [None]:
import tensorflow as tf

from utils import load_dataset

# Load every split once and keep them addressable by split name.
datasets = {
    split_name: load_dataset(DATASET, split_name)
    for split_name in ('train', 'val', 'test')
}

We can parse the feature names: they were chosen to make it easy to distinguish offer features (which will be used to model the product) from user features (history aggregated up to the chosen date).

In [None]:
from utils import AGG_PREFIX

# Columns are split by naming convention:
#   * technical columns are listed explicitly,
#   * user features are the columns starting with AGG_PREFIX,
#   * everything else is an offer feature.
technical_columns = ['user_id', 'date']
all_columns = list(datasets['train'].element_spec.keys())
user_features = [column for column in all_columns if column.startswith(AGG_PREFIX)]
non_offer_columns = user_features + technical_columns
offer_features = [column for column in all_columns if column not in non_offer_columns]

In [None]:
# Display the user-side (aggregated history) feature names.
user_features

['aggregated_cart_product_id',
 'aggregated_purchase_brand',
 'aggregated_cart_category3',
 'aggregated_purchase_product_id',
 'aggregated_cart_category2',
 'aggregated_cart_brand',
 'aggregated_purchase_category2',
 'aggregated_purchase_category3',
 'aggregated_cart_category1',
 'aggregated_purchase_priceCluster',
 'aggregated_cart_priceCluster',
 'aggregated_purchase_category1']

In [None]:
# Display the offer-side feature names.
offer_features

['priceCluster', 'category2', 'brand', 'category1', 'product_id', 'category3']

### Rebatch dataset by events

First we will unnest the events of each user (stored in the second dimension of the saved tensors) and keep only a limited number of them. This operation is needed later to avoid collisions when generating negative examples. Then we will rebatch the results into smaller batches (`5040` events per batch for the train, validation and test sets).

In [None]:
%%time

from functools import partial
from uuid import uuid4

from utils import rebatch_by_events, add_equal_weights

# Rebatching settings shared by all three splits.
rebatch = partial(rebatch_by_events, batch_size=5040, date_column='date', nb_events_by_user_by_day=8)

datasets['train'] = rebatch(datasets['train'])
for key in ('val', 'test'):
    # Evaluation splits use a fixed seed and are cached on disk under a unique file name.
    rebatched = rebatch(datasets[key], seed=1729)
    datasets[key] = rebatched.cache(f'/tmp/{uuid4()}.tf')

# presumably adds the constant weight inputs the model expects for each offer feature
# -- confirm in `utils.add_equal_weights`
attach_weights = partial(add_equal_weights, features=offer_features)
for key in datasets:
    datasets[key] = datasets[key].map(attach_weights)

CPU times: user 30.1 s, sys: 732 ms, total: 30.9 s
Wall time: 16.6 s


## Define simple model

In [None]:
from utils import load_inverse_lookups
# Lookup objects indexed by feature name; each one is later queried with
# `.vocabulary_size()` to obtain the number of distinct values of that feature.
inverse_lookups = load_inverse_lookups(DATASET)

In [None]:
import re

# Vocabulary size of every offer feature, read from its lookup.
vocabulary_sizes = {
    feature: inverse_lookups[feature].vocabulary_size()
    for feature in offer_features
}

# An aggregated user feature is named `<AGG_PREFIX><event type>_<offer feature>` and shares
# the vocabulary of the offer feature it aggregates.
# Patterns are built once per lookup key, with the literal parts escaped, and are matched
# against the WHOLE feature name: an unanchored match would also accept a key that is only
# a prefix of the real suffix (e.g. `category` for `..._category1`) and silently keep the
# last matching key.
aggregated_patterns = {
    key: re.compile(r"{}(\w+)_{}".format(re.escape(AGG_PREFIX), re.escape(key)))
    for key in inverse_lookups
}

for feature in user_features:
    for key, pattern in aggregated_patterns.items():
        if pattern.fullmatch(feature):
            vocabulary_sizes[feature] = vocabulary_sizes[key]

### Model architecture

<img src="resources/two_towers_model.png" alt="two tower model" width="800" />

### Model parameters

To choose the model's parameters we did some manual tuning on the validation set, maximizing train and validation AUC while keeping the gap between them small.

In [None]:
# model parameters
EMBEDDING_DIM = 100  # second argument given to every `WeightedEmbeddings` layer
L1_COEFF = 4e-7      # L1 coefficient used by REGULARIZER
DROPOUT = 0.1        # rate given to the Dropout layers of both towers


def REGULARIZER():
    """Return a fresh serialized L1L2 regularizer config (L1 only, L2 disabled)."""
    l1l2_config = dict(l1=L1_COEFF, l2=0.)
    return dict(class_name='L1L2', config=l1l2_config)

def _build_tower(name):
    """Build one tower: BatchNormalization, then two Dense -> Dropout -> tanh stages.

    The Dense layers have 100 then 50 units; kernel and bias each receive their own
    regularizer config from REGULARIZER().
    """
    tower_layers = [tf.keras.layers.BatchNormalization()]
    for units in (100, 50):
        tower_layers.append(tf.keras.layers.Dense(units,
                                                  kernel_regularizer=REGULARIZER(),
                                                  bias_regularizer=REGULARIZER()))
        tower_layers.append(tf.keras.layers.Dropout(DROPOUT))
        tower_layers.append(tf.keras.layers.Activation('tanh'))
    return tf.keras.Sequential(tower_layers, name=name)

def USER_TOWER():
    """Return a new, unbuilt Sequential named 'user_tower'."""
    return _build_tower('user_tower')

def OFFER_TOWER():
    """Return a new, unbuilt Sequential named 'offer_tower'."""
    return _build_tower('offer_tower')

# number of epochs given to every `fit` call
EPOCHS = 10

# passed to `DotWithNegatives`, `BroadcastLoss` and `BroadcastMetric`;
# presumably the number of negatives generated per positive event -- confirm in `layers.DotWithNegatives`
NUMBER_OF_NEGATIVES = 4
# from_logits=True: the model output is a raw score, no sigmoid is applied to it
LOSS = tf.keras.losses.BinaryCrossentropy(from_logits=True)
AUC_METRIC = tf.keras.metrics.AUC(from_logits=True)

import tensorflow_addons as tfa
# NOTE(review): OPTIMIZER and AUC_METRIC are single module-level instances; every `compile`
# call that receives them directly shares their internal state.
OPTIMIZER = tfa.optimizers.AdamW(weight_decay=4e-8, learning_rate=0.0009)

In [None]:
from layers import get_input_layer, WeightedEmbeddings
from utils import WEIGHT_SUFFIX

embeddings, inputs = {}, {}
for feature in user_features + offer_features:
    weight_name = f'{feature}{WEIGHT_SUFFIX}'
    if feature in offer_features:
        # offer features come with a weight input:
        # dummy weights during training, averaging weights of the feature at inference time
        inputs[weight_name] = get_input_layer(weight_name, tf.float32)
    inputs[feature] = get_input_layer(feature)
    # the embedding matrix is sized with the feature's entry in `vocabulary_sizes`
    emb_layer = WeightedEmbeddings(vocabulary_sizes[feature],
                                   EMBEDDING_DIM,
                                   name=f'{feature}_embedding',
                                   embeddings_regularizer=REGULARIZER())
    # user features have no weight input, so `.get` hands None to the layer for them
    embeddings[feature] = emb_layer(inputs[feature], inputs.get(weight_name))

In [None]:
# Embedded features are concatenated per side, then each side goes through its own tower.
embedded_user_features = [embeddings[feature] for feature in user_features]
embedded_offer_features = [embeddings[feature] for feature in offer_features]

user_tower_network = USER_TOWER()
concatenated_user = tf.keras.layers.Concatenate(name='concat_user')(embedded_user_features)
user_tower = user_tower_network(concatenated_user)

offer_tower_network = OFFER_TOWER()
concatenated_offer = tf.keras.layers.Concatenate(name='concat_offer')(embedded_offer_features)
offer_tower = offer_tower_network(concatenated_offer)

In [None]:
from layers import DotWithNegatives

# The output is a raw score: no sigmoid is applied here, which is why both the loss
# and the metric are created with from_logits=True.
prediction_layer = DotWithNegatives(NUMBER_OF_NEGATIVES, name='prediction')
output = prediction_layer([user_tower, offer_tower], generate_negatives=True)

In [None]:
from utils import BroadcastLoss, BroadcastMetric

# Assemble the functional model from the input dict and the prediction output, then
# compile it with the loss and the AUC metric wrapped by the Broadcast* helpers.
model = tf.keras.Model(inputs=inputs, outputs=output, name='two_tower_model')
broadcast_loss = BroadcastLoss(LOSS, NUMBER_OF_NEGATIVES)
broadcast_auc = BroadcastMetric(AUC_METRIC, NUMBER_OF_NEGATIVES)
model.compile(optimizer=OPTIMIZER, loss=broadcast_loss, metrics=[broadcast_auc])

### Training

In [None]:
# Train for EPOCHS epochs on the train split, evaluating on the validation split.
model.fit(datasets['train'], epochs=EPOCHS, validation_data=datasets['val'])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7faccd203ed0>

## Single task models benchmark

As described in (TODO link to article), we can consider predictions on one chosen offer column as a single task and the whole setup as a multi-task problem. Let's now evaluate the performance of a common model on a subset of tasks. We will compare its results against single-task models that share the same architecture but use only one offer feature at a time.

In [None]:
# Offer columns we want to evaluate; specific to the dataset under test.
# Each name is used both as the single offer feature of a dedicated model and as an evaluation task.
TASKS = ['product_id', 'category1', 'category2', 'category3', 'brand', 'priceCluster']

For simplicity of further code, let's wrap whole model definition into a function:

In [None]:
def two_tower_model(offer_features, name='two_tower_model'):
    """Build and compile a two-tower model that uses only the given offer features.

    Args:
        offer_features: list of offer feature names fed to the offer tower; each of them
            also gets a `<feature><WEIGHT_SUFFIX>` float input.
        name: name given to the resulting `tf.keras.Model`.

    Returns:
        A compiled `tf.keras.Model` whose inputs are all user features, the requested
        offer features and their weights.

    The optimizer and the AUC metric are rebuilt from the configuration of the global
    OPTIMIZER / AUC_METRIC for every call. Handing the same instances to several models
    would share the optimizer step counter and slot variables (and the metric
    accumulators) between otherwise independent models, and would keep every previously
    trained model reachable from the optimizer.
    """
    # user_features, vocabulary_sizes, EMBEDDING_DIM, REGULARIZER, USER_TOWER, OFFER_TOWER,
    # OPTIMIZER, LOSS, AUC_METRIC, NUMBER_OF_NEGATIVES
    # come from global scope, but can be passed as params instead
    embeddings, inputs = {}, {}
    for feature in user_features + offer_features:
        weight_name = f'{feature}{WEIGHT_SUFFIX}'
        if feature in offer_features:
            # for offer features we need weights:
            # with dummy weights during training, and the ones used for a feature's averaging at inference time
            inputs[weight_name] = get_input_layer(weight_name, tf.float32)
        inputs[feature] = get_input_layer(feature)
        # here we use input feature modality from `vocabulary_sizes` to know embeddings matrix dimensions
        emb_layer = WeightedEmbeddings(vocabulary_sizes[feature],
                                       EMBEDDING_DIM, name=f'{feature}_embedding',
                                       embeddings_regularizer=REGULARIZER())
        # user features have no weight input: `.get` then passes None
        embeddings[feature] = emb_layer(inputs[feature], inputs.get(weight_name))

    embedded_user_features = [embeddings[feature] for feature in user_features]
    embedded_offer_features = [embeddings[feature] for feature in offer_features]
    user_tower = USER_TOWER()(tf.keras.layers.Concatenate(name='concat_user')(embedded_user_features))
    offer_tower = OFFER_TOWER()(tf.keras.layers.Concatenate(name='concat_offer')(embedded_offer_features))

    output = DotWithNegatives(NUMBER_OF_NEGATIVES, name='prediction')([user_tower, offer_tower], generate_negatives=True)
    model = tf.keras.Model(inputs, output, name=name)

    # Fresh, identically configured optimizer and metric for this model only.
    optimizer = type(OPTIMIZER).from_config(OPTIMIZER.get_config())
    auc_metric = type(AUC_METRIC).from_config(AUC_METRIC.get_config())
    model.compile(optimizer=optimizer,
                  loss=BroadcastLoss(LOSS, NUMBER_OF_NEGATIVES),
                  metrics=[BroadcastMetric(auc_metric, NUMBER_OF_NEGATIVES)])

    return model

We train models that use only one offer feature, with the same hyperparameters as the initial model.

In [None]:
# One model per task, each restricted to that task's single offer feature.
mono_feature_models = {}
for task_offer_feature in TASKS:
    single_feature_model = two_tower_model([task_offer_feature], name=f'{task_offer_feature}_model')
    # the model is registered before training starts
    mono_feature_models[task_offer_feature] = single_feature_model
    single_feature_model.fit(datasets['train'], epochs=EPOCHS, validation_data=datasets['val'])

Epoch 1/10


  inputs = self._flatten_to_reference_inputs(inputs)


     61/Unknown - 41s 153ms/step - loss: 0.6308 - auc_4: 0.7034

## Evaluation

Now let's load test dataset and generate some offers from it:
* we will consider all batches from test dataset
* we perform a group by using each feature from `TASKS` as a group by key
* for every offer feature except the one used as the key, we generate ragged tensors holding the bag of values that feature can take
* we remove least popular values in each list
* so now each line of dataset corresponds to an offer of type `task_offer_feature = 'value'`

In [None]:
# Load the test split again: `datasets['test']` has already been unnested and rebatched.
raw_test_ds = load_dataset(DATASET, 'test')

In [None]:
%%time
from utils import prepare_single_task_dataset

# One prepared test dataset per task, keyed by the task's offer feature.
test_datasets = {
    task_offer_feature: prepare_single_task_dataset(
        raw_test_ds, 5040, task_offer_feature, offer_features, 'date')
    for task_offer_feature in TASKS
}

CPU times: user 6min 7s, sys: 29.2 s, total: 6min 36s
Wall time: 5min 42s


Now we can apply the model to the grouped features for each task and calculate the AUC for each offer of type `task_offer_feature = 'value'`. Note that negatives are generated in the same way as for training.

In [None]:
## %%time
from collections import defaultdict
from utils import evaluate_model, wAUC

# Models under comparison, in evaluation order: every single-feature model (in TASKS order),
# then the model trained on all offer features.
evaluated_models = [(f'MONO:{model_name}', mono_feature_models[model_name]) for model_name in TASKS]
evaluated_models.append(('simple model', model))

# aucs[task][model label] holds what `evaluate_model` returns for that model on that task
aucs = defaultdict(dict)
for task_offer_feature in TASKS:
    for model_label, evaluated_model in evaluated_models:
        aucs[task_offer_feature][model_label] = evaluate_model(
            evaluated_model, task_offer_feature, test_datasets, inverse_lookups, NUMBER_OF_NEGATIVES)

  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(inputs)
  inputs = self._flatten_to_reference_inputs(i

We can aggregate AUCs from individual offers to have one value we can compare among models: weighted macro AUC. We will keep only offers with more than 200 positive events and weight their AUCs by number of events:

In [None]:
import pandas as pd

# Collect one row per (task, model) pair and build the frame in a single call.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, and calling
# it in a loop copies the whole frame on every iteration.
result_rows = []
for task_name in aucs:
    for model_name in aucs[task_name]:
        w_auc = wAUC(aucs[task_name][model_name])
        result_rows.append({'wAUC': w_auc, 'offers': task_name, 'model': model_name})

# explicit columns keep the frame well-formed even when there are no rows
results = pd.DataFrame(result_rows, columns=['wAUC', 'offers', 'model'])

In [None]:
# Models as rows, tasks as columns, wAUC as cell values; colour-graded for display.
pd.pivot_table(
    results, values='wAUC', index='model', columns='offers',
).style.background_gradient(cmap='coolwarm')

offers,brand,category1,category2,category3,priceCluster,product_id
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MONO:brand,0.720142,0.598288,0.607077,0.59804,0.606463,0.7185
MONO:category1,0.627355,0.668074,0.663821,0.646415,0.576787,0.662828
MONO:category2,0.627053,0.643934,0.676486,0.650058,0.568405,0.660762
MONO:category3,0.626376,0.625109,0.641843,0.668536,0.56729,0.647156
MONO:priceCluster,0.647901,0.572067,0.584299,0.591195,0.694362,0.716155
MONO:product_id,0.706076,0.610038,0.624329,0.62354,0.642318,0.761413
simple model,0.709834,0.623426,0.628428,0.634945,0.643003,0.760167
