# Embeddings

## Prepare dataset

Load and prepare the dataset in the same way as in [the training notebook for the MovieLens/IMDB dataset](https://github.com/tinyclues/recsys-multi-atrribute-benchmark/blob/master/training/movielens%20simple%20model.ipynb).

In [None]:
# Dataset identifier; it is passed to load_dataset and load_inverse_lookups in later cells.
DATASET = 'rees_ecommerce'

In [None]:
from utils import load_dataset

# One dataset per split, keyed by the split name.
datasets = {
    split_name: load_dataset(DATASET, split_name)
    for split_name in ['train', 'val', 'test']
}

In [None]:
from utils import AGG_PREFIX

# Columns that are treated neither as user features nor as offer features.
technical_columns = ['user_id', 'date']
all_columns = list(datasets['train'].element_spec.keys())

# User features are the columns whose name starts with AGG_PREFIX.
user_features = [column for column in all_columns if column.startswith(AGG_PREFIX)]
# Every remaining non-technical column is an offer feature.
offer_features = [column for column in all_columns
                  if column not in user_features and column not in technical_columns]

In [None]:
from functools import partial
from uuid import uuid4

from utils import rebatch_by_events

# Rebatching settings shared by all three splits.
rebatch = partial(rebatch_by_events, batch_size=5040, date_column='date', nb_events_by_user_by_day=8)

datasets['train'] = rebatch(datasets['train'])
for key in ['val', 'test']:
    # Evaluation splits are rebatched with a fixed seed, then `.cache` is given
    # a uniquely named path under /tmp.
    rebatched = rebatch(datasets[key], seed=1729)
    datasets[key] = rebatched.cache(f'/tmp/{uuid4()}.tf')

In [None]:
from utils import add_equal_weights

# Map every split through add_equal_weights, bound to the offer features.
weight_adder = partial(add_equal_weights, features=offer_features)
for key, dataset in list(datasets.items()):
    datasets[key] = dataset.map(weight_adder)

In [None]:
from utils import load_inverse_lookups
# Lookup objects for DATASET, indexed by feature name; the next cell reads
# their vocabulary_size().
inverse_lookups = load_inverse_lookups(DATASET)

In [None]:
import re

# Offer features take their vocabulary size directly from their inverse lookup.
vocabulary_sizes = {
    feature: inverse_lookups[feature].vocabulary_size()
    for feature in offer_features
}

# One pattern per lookup key: AGG_PREFIX, at least one word character, "_", then the key.
key_patterns = {
    key: re.compile(r"{}(\w+)_{}".format(AGG_PREFIX, key))
    for key in inverse_lookups
}

# A user feature whose name matches a key's pattern reuses that key's vocabulary size.
for feature in user_features:
    for key, pattern in key_patterns.items():
        if pattern.match(feature):
            vocabulary_sizes[feature] = vocabulary_sizes[key]

## Prepare evaluation dataset

In [None]:
# Offer attributes for which a dedicated test dataset is built in the next cell.
TASKS = ['product_id', 'category1', 'category2', 'category3', 'brand', 'priceCluster']

In [None]:
%%time
from utils import get_task_offer_features, remap_features_using_key
test_datasets, test_offer_tensors = {}
for task_offer_feature in TASKS:
    test_offer_tensors[task_offer_feature] = \
        get_task_offer_features(datasets['test'], task_offer_feature, offer_features)
    test_datasets[task_offer_feature] = \
        remap_features_using_key(datasets['test'], task_offer_feature,
                                 test_offer_tensors[task_offer_feature])

## Model

In [None]:
import tensorflow as tf

In [None]:
# model parameters
EMBEDDING_DIM = 100  # second argument given to every WeightedEmbeddings layer
L1_COEFF = 3e-7      # L1 coefficient used by REGULARIZER
DROPOUT = 0.1        # rate given to the Dropout layers and to the custom layers


def REGULARIZER():
    """Return a new regularizer config dict: class 'L1L2' with l1=L1_COEFF and l2=0."""
    return dict(class_name='L1L2', config=dict(l1=L1_COEFF, l2=0.))

def USER_TOWER():
    """Build a new Sequential model named 'user_tower'.

    Layout: BatchNormalization, then, for 100 and 50 units in turn, a Dense
    layer (kernel and bias regularized with REGULARIZER()) followed by
    Dropout(DROPOUT) and a tanh activation.
    """
    stack = [tf.keras.layers.BatchNormalization()]
    for units in (100, 50):
        stack.append(tf.keras.layers.Dense(units,
                                           kernel_regularizer=REGULARIZER(),
                                           bias_regularizer=REGULARIZER()))
        stack.append(tf.keras.layers.Dropout(DROPOUT))
        stack.append(tf.keras.layers.Activation('tanh'))
    return tf.keras.Sequential(stack, name='user_tower')

def OFFER_TOWER():
    """Build a new Sequential model named 'offer_tower'.

    Layout: BatchNormalization, then, for 100 and 50 units in turn, a Dense
    layer (kernel and bias regularized with REGULARIZER()) followed by
    Dropout(DROPOUT) and a tanh activation.
    """
    tower_layers = [tf.keras.layers.BatchNormalization()]
    for width in (100, 50):
        tower_layers += [
            tf.keras.layers.Dense(width,
                                  kernel_regularizer=REGULARIZER(),
                                  bias_regularizer=REGULARIZER()),
            tf.keras.layers.Dropout(DROPOUT),
            tf.keras.layers.Activation('tanh'),
        ]
    return tf.keras.Sequential(tower_layers, name='offer_tower')

# Number of epochs given to model.fit in the training cell.
EPOCHS = 10

# Passed to the interaction layers and to BroadcastLoss / BroadcastMetric in _model.
NUMBER_OF_NEGATIVES = 4
# Both the loss and the AUC metric are configured for raw logits (from_logits=True).
LOSS = tf.keras.losses.BinaryCrossentropy(from_logits=True)
AUC_METRIC = tf.keras.metrics.AUC(from_logits=True)

import tensorflow_addons as tfa
# NOTE(review): LOSS, AUC_METRIC and OPTIMIZER are single module-level instances that
# _model hands to compile() for every model it builds — confirm that sharing them
# (and any state they hold) across models is intended.
OPTIMIZER = tfa.optimizers.AdamW(weight_decay=4e-8, learning_rate=0.00085)

In [None]:
# Number of group-by augmentations generated in _model when group_by is enabled.
NB_AUGMENTATIONS = 3
# Passed to KeyGenerator as average_number_of_attributes_in_key.
AVERAGE_NUMBER_OF_FEATURES_IN_AUGMENTATION = 2
# First argument of UserFeaturesCompressor.
USER_META_FEATURES = 5
# First argument of OfferFeaturesCompressor and MaskNet.
OFFER_META_FEATURES = 3


def OUTPUT_DNN():
    """Build a new Sequential model named 'output_dnn'.

    Layout: BatchNormalization, two hidden stages (Dense of 100 then 50 units,
    each followed by Dropout(DROPOUT) and tanh) and a final single-unit Dense
    layer with no activation. Every Dense layer regularizes kernel and bias
    with REGULARIZER().
    """
    def regularized_dense(units):
        # Same keyword evaluation order as a direct Dense(...) call.
        return tf.keras.layers.Dense(units,
                                     kernel_regularizer=REGULARIZER(),
                                     bias_regularizer=REGULARIZER())

    dnn_layers = [tf.keras.layers.BatchNormalization()]
    for hidden_units in (100, 50):
        dnn_layers.append(regularized_dense(hidden_units))
        dnn_layers.append(tf.keras.layers.Dropout(DROPOUT))
        dnn_layers.append(tf.keras.layers.Activation('tanh'))
    dnn_layers.append(regularized_dense(1))
    return tf.keras.Sequential(dnn_layers, name='output_dnn')

In [None]:
from layers import *
from utils import *

def _model(group_by=False, mask_net=False, bilinear_interaction=False, name='model'):
    """Build the training, evaluation and offer-embedding models for one configuration.

    All three returned models share the same inputs and layers; they differ
    only in which tensor they expose as output.

    Args:
        group_by: when True, NB_AUGMENTATIONS augmented versions of the offer
            embeddings are produced with KeyGenerator + GroupBy; otherwise the
            stacked offer embeddings are used as the single "augmentation".
        mask_net: when True, user embeddings go through UserFeaturesCompressor
            and offer embeddings through OfferFeaturesCompressor multiplied by
            a MaskNet output.
            NOTE(review): with group_by=False the variance passed to these
            layers is None; the model-building cell skips that combination.
        bilinear_interaction: when True, predictions come from
            BiLinearInteraction followed by OUTPUT_DNN; otherwise from
            USER_TOWER / OFFER_TOWER combined by DotWithNegatives.
        name: name of the training model; the other two models get the
            suffixes '_eval' and '_emb'.

    Returns:
        Tuple (model, eval_model, emb_model): `model` is compiled with
        OPTIMIZER, BroadcastLoss and BroadcastMetric and outputs the
        concatenated predictions of all augmentations; `eval_model` outputs
        the prediction for the non-augmented offer embeddings; `emb_model`
        outputs the offer representation used for that prediction.
    """
    inputs = {}
    embedded_user_features, embedded_offer_features, variance_offer_features = {}, {}, {}
    for feature in user_features:
        inputs[feature] = get_input_layer(feature)
        emb_layer = WeightedEmbeddings(vocabulary_sizes[feature],
                                       EMBEDDING_DIM, name=f'{feature}_embedding',
                                       embeddings_regularizer=REGULARIZER())
        embedded_user_features[feature] = emb_layer(inputs[feature])
    for feature in offer_features:
        # for offer features we need weights:
        # with dummy weights during training, and the ones used for a feature's averaging at inference time
        inputs[f'{feature}_weight'] = get_input_layer(f'{feature}_weight', tf.float32)
        inputs[feature] = get_input_layer(feature)
        emb_layer = WeightedEmbeddings(vocabulary_sizes[feature],
                                       EMBEDDING_DIM, name=f'{feature}_embedding',
                                       embeddings_regularizer=REGULARIZER(),
                                       calculate_variance=True)
        embedded_offer_features[feature], variance_offer_features[feature] =\
            emb_layer(inputs[feature], inputs[f'{feature}_weight'])

    # Stack per-feature tensors along a new axis 1 (one slot per feature).
    user_stacked = tf.stack(list(embedded_user_features.values()), axis=1)
    offer_stacked = tf.stack(list(embedded_offer_features.values()), axis=1)
    offer_variance = tf.stack(list(variance_offer_features.values()), axis=1)
    # Raw offer attribute ids (the `.values` of each offer input), used to build group-by keys.
    stacked_raw_offer_attrs = tf.stack([tf.cast(inp.values, tf.int32) for feature, inp in inputs.items()
                                        if feature in offer_features], axis=1)

    if group_by:
        group_by_layer = GroupBy(name='group_by')
        key_generator = KeyGenerator(number_of_offer_attributes=len(offer_features),
                                     average_number_of_attributes_in_key=AVERAGE_NUMBER_OF_FEATURES_IN_AUGMENTATION,
                                     name='grp_key_generator')

        # The same layers are called once per augmentation, each time with a new key.
        augmentations = []
        for _ in range(NB_AUGMENTATIONS):
            group_by_key = key_generator(stacked_raw_offer_attrs)
            augmentations.append(group_by_layer(group_by_key, offer_stacked))

    else:
        # No augmentation: a single (mean, variance) pair with no variance.
        augmentations = [(offer_stacked, None)]

    if mask_net:
        user_compressed = UserFeaturesCompressor(USER_META_FEATURES, DROPOUT,
                                                 name='user_compressor')(user_stacked)

        offer_features_compressor = OfferFeaturesCompressor(OFFER_META_FEATURES, DROPOUT, name='offer_compressor')
        # Kept under its own name so the boolean `mask_net` parameter is not
        # overwritten: it is tested again in the bilinear branch below.
        mask_generator = MaskNet(OFFER_META_FEATURES, DROPOUT, name='mask_generation')
        apply_mask = tf.keras.layers.Multiply(name='apply_mask')

        attention_augmentations = []
        for mean_offer_emb, variance_offer_emb in augmentations:
            compressed_offer_embeddings = offer_features_compressor([mean_offer_emb, variance_offer_emb])
            mask = mask_generator([mean_offer_emb, variance_offer_emb])
            attention_augmentations.append(apply_mask([compressed_offer_embeddings, mask]))

        # Evaluation path: same layers applied to the non-augmented embeddings and their variance.
        compressed_offer_embeddings = offer_features_compressor([offer_stacked, offer_variance])
        mask = mask_generator([offer_stacked, offer_variance])
        eval_offer_embeddings = apply_mask([compressed_offer_embeddings, mask])
    else:
        user_compressed = user_stacked
        attention_augmentations = [mean_offer_emb for mean_offer_emb, _ in augmentations]
        eval_offer_embeddings = offer_stacked

    if bilinear_interaction:
        if not mask_net:
            # we need to apply compression to keep model's footprint limited
            # and also to keep model robust with same hyperparams
            user_compressed = UserFeaturesCompressor(USER_META_FEATURES, DROPOUT,
                                                     name='user_compressor')(user_compressed)

        bi_linear_interaction = BiLinearInteraction(number_of_negatives=NUMBER_OF_NEGATIVES, dropout_rate=DROPOUT,
                                                    initializer='random_normal', regularizer=REGULARIZER(),
                                                    name='interaction')
        output_dnn = OUTPUT_DNN()

        augmentation_predictions = []
        for masked_offer_embeddings in attention_augmentations:
            augmentation_predictions.append(
                output_dnn(bi_linear_interaction([user_compressed, masked_offer_embeddings], generate_negatives=True))
            )
        output = tf.concat(augmentation_predictions, axis=1)

        # NOTE(review): the evaluation output also uses generate_negatives=True — confirm this is intended.
        eval_output = output_dnn(bi_linear_interaction([user_compressed, eval_offer_embeddings], generate_negatives=True))
    else:
        user_tower = USER_TOWER()(tf.keras.layers.Reshape((-1,), name='concat_user')(user_compressed))

        offer_reshape = tf.keras.layers.Reshape((-1,), name='concat_offer')
        offer_tower_layer = OFFER_TOWER()
        dot_interaction = DotWithNegatives(NUMBER_OF_NEGATIVES, name='prediction')
        augmentation_predictions = []
        for masked_offer_embeddings in attention_augmentations:
            offer_tower = offer_tower_layer(offer_reshape(masked_offer_embeddings))
            augmentation_predictions.append(dot_interaction([user_tower, offer_tower], generate_negatives=True))
        output = tf.concat(augmentation_predictions, axis=1)

        # In this branch the embedding model exposes the offer tower output.
        eval_offer_embeddings = offer_tower_layer(offer_reshape(eval_offer_embeddings))
        eval_output = dot_interaction([user_tower, eval_offer_embeddings],
                                      generate_negatives=True)

    model = tf.keras.Model(inputs, output, name=name)
    model.compile(optimizer=OPTIMIZER,
                  loss=BroadcastLoss(LOSS, NUMBER_OF_NEGATIVES),
                  metrics=[BroadcastMetric(AUC_METRIC, NUMBER_OF_NEGATIVES)])

    eval_model = tf.keras.Model(inputs, eval_output, name=f'{name}_eval')
    emb_model = tf.keras.Model(inputs, eval_offer_embeddings, name=f'{name}_emb')

    return model, eval_model, emb_model

In [None]:
import itertools

# Registries keyed by the (group_by, mask_net, bilinear_interaction) tuple.
MODELS, EVAL_MODELS, EMBEDDING_MODELS = {}, {}, {}
for key in [(True, True, True), (False, False, True)]:
    group_by, mask_net, bilinear_interaction = key
    # Skip the mask-net-without-group-by combination.
    if mask_net and not group_by:
        continue
    # Model name is the lower-cased flags joined by "_", e.g. "true_true_true".
    model_name = '_'.join(str(flag).lower() for flag in key)
    MODELS[key], EVAL_MODELS[key], EMBEDDING_MODELS[key] = _model(*key, name=model_name)

## Train

In [None]:
# Number of model variants registered in MODELS (shown as the cell output).
len(MODELS)

In [None]:
# Train every registered model in turn on the train split, validating on the val split.
for model in MODELS.values():
    model.fit(datasets['train'], epochs=EPOCHS, validation_data=datasets['val'])

## Embedding similarities

In [None]:
# Embedding models of the two variants; keys are (group_by, mask_net, bilinear_interaction).
model_with_augmentations = EMBEDDING_MODELS[(True, True, True)]
model_wo_augmentations = EMBEDDING_MODELS[(False, False, True)]

In [None]:
from utils import AGG_PREFIX

# NOTE(review): this cell repeats the column-splitting cell from the
# "Prepare dataset" section — confirm it is still needed here.
technical_columns = ['user_id', 'date']
all_columns = list(datasets['train'].element_spec.keys())

# User features are the columns whose name starts with AGG_PREFIX.
user_features = [column for column in all_columns if column.startswith(AGG_PREFIX)]
# Every remaining non-technical column is an offer feature.
offer_features = [column for column in all_columns
                  if column not in user_features and column not in technical_columns]

In [None]:
from functools import partial
from uuid import uuid4

from utils import rebatch_by_events

# NOTE(review): the same rebatching was already applied to `datasets` in the
# "Prepare dataset" section — confirm that applying it a second time is intended.
rebatch = partial(rebatch_by_events, batch_size=5040, date_column='date', nb_events_by_user_by_day=8)

datasets['train'] = rebatch(datasets['train'])
for key in ['val', 'test']:
    # Evaluation splits are rebatched with a fixed seed, then `.cache` is given
    # a uniquely named path under /tmp.
    rebatched = rebatch(datasets[key], seed=1729)
    datasets[key] = rebatched.cache(f'/tmp/{uuid4()}.tf')

In [None]:
from utils import add_equal_weights

# NOTE(review): add_equal_weights was already mapped over every split in the
# "Prepare dataset" section — confirm that mapping it again is intended.
weight_adder = partial(add_equal_weights, features=offer_features)
for key, dataset in list(datasets.items()):
    datasets[key] = dataset.map(weight_adder)

In [None]:
from utils import load_inverse_lookups
# Lookup objects for DATASET, indexed by feature name; the next cell reads
# their vocabulary_size().
# NOTE(review): identical to the load done in the "Prepare dataset" section.
inverse_lookups = load_inverse_lookups(DATASET)

In [None]:
import re

# NOTE(review): this cell repeats the vocabulary-size computation from the
# "Prepare dataset" section — confirm it is still needed here.

# Offer features take their vocabulary size directly from their inverse lookup.
vocabulary_sizes = {
    feature: inverse_lookups[feature].vocabulary_size()
    for feature in offer_features
}

# One pattern per lookup key: AGG_PREFIX, at least one word character, "_", then the key.
key_patterns = {
    key: re.compile(r"{}(\w+)_{}".format(AGG_PREFIX, key))
    for key in inverse_lookups
}

# A user feature whose name matches a key's pattern reuses that key's vocabulary size.
for feature in user_features:
    for key, pattern in key_patterns.items():
        if pattern.match(feature):
            vocabulary_sizes[feature] = vocabulary_sizes[key]