In [None]:
from experimentations.pipeline_artifacts import PipelineArtifacts

# Run parameters: fill these in before executing the notebook.
domain = ''
group_by_pipeline_uuid = ''
two_tower_pipeline_uuid = ''
model_name = ''

# Artifacts of the group-by pipeline run: data loaders and the production task.
artifacts = PipelineArtifacts(domain, group_by_pipeline_uuid)

datasets = {}
datasets['train'], training_ds_for_eval, test_ds = artifacts.dataloaders(model_name)
(group_by_task, _) = artifacts.task_and_vectorization(model_name)

# Per-feature sizes, later used as the first argument of each embedding layer.
vocabulary_sizes = group_by_task._model_kwargs['size_dict']
# Offer side = the task's target features; user side = every other feature with a known size.
offer_features = list(group_by_task.target_features)
user_features = [feature for feature in vocabulary_sizes if feature not in offer_features]

In [None]:
# Display the offer feature names; the single-task benchmarks (TASKS) are chosen among offer features
offer_features

In [None]:
# Offer features to benchmark as single tasks. Must be filled in by hand:
# the assertion stops the run while the list is still empty.
TASKS = []
assert TASKS, 'Choose some offer features for single task benchmarks'

## Preparing training and evaluation datasets

In [None]:
def pop_response(batch):
    """Split a batch mapping into a ``(features, response)`` pair.

    The 'response' entry is removed from `batch` itself (the mapping is mutated,
    not copied) and returned as the second element of the pair.
    """
    response = batch.pop('response')
    return batch, response

# Turn the 'test' split into (features, response) pairs; this dataset is used both as
# validation data in the fit calls and as the source of the single-task datasets
datasets['test'] = test_ds['test'].map(pop_response)

In [None]:
%%time
from utils import prepare_single_task_dataset
single_task_datasets = {}
for task_offer_feature in TASKS:
    single_task_datasets[task_offer_feature] = \
        prepare_single_task_dataset(datasets['test'], task_offer_feature, offer_features)

## Training models

### Two tower model from scratch

In [None]:
from layers import *
from utils import WEIGHT_SUFFIX, BroadcastLoss, BroadcastMetric

In [None]:
# One can take the parameters from the production config or change them.
# Displayed for reference: the production task's model kwargs, optimizer kwargs and epochs.
group_by_task.model_kwargs, group_by_task._optimizer_kwargs, group_by_task.epochs

In [None]:
# model parameters
EMBEDDING_DIM = 30  # second argument of every WeightedEmbeddings layer
L1_COEFF = 2e-7  # L1 coefficient returned by REGULARIZER()
DROPOUT = 0.05  # dropout rate used in the towers, the output DNN and the custom layers


def REGULARIZER():
    """Return a new L1L2 regularizer config dict with l1=L1_COEFF and l2=0.

    A fresh dict is built on every call, so callers never share the same object.
    """
    l1_only = {'l1': L1_COEFF, 'l2': 0.}
    return {'class_name': 'L1L2', 'config': l1_only}

def _gelu_tower(name, hidden_units=(30, 20)):
    """Build the MLP stack shared by the user and offer towers.

    The stack is a BatchNormalization layer followed, for each entry of
    `hidden_units`, by Dense -> Dropout(DROPOUT) -> Activation('gelu').
    Kernel and bias of every Dense layer use a fresh REGULARIZER() config.

    Args:
        name: name given to the returned Sequential model.
        hidden_units: width of each Dense layer, in order.

    Returns:
        A new, unbuilt tf.keras.Sequential.
    """
    layers = [tf.keras.layers.BatchNormalization()]
    for units in hidden_units:
        layers += [
            tf.keras.layers.Dense(units,
                                  kernel_regularizer=REGULARIZER(),
                                  bias_regularizer=REGULARIZER()),
            tf.keras.layers.Dropout(DROPOUT),
            tf.keras.layers.Activation('gelu'),
        ]
    return tf.keras.Sequential(layers, name=name)

def USER_TOWER():
    """Return a new user tower (Dense 30 -> Dense 20) named 'user_tower'."""
    return _gelu_tower('user_tower')

def OFFER_TOWER():
    """Return a new offer tower (Dense 30 -> Dense 20) named 'offer_tower'."""
    return _gelu_tower('offer_tower')

EPOCHS = 3  # epochs for every model.fit call on the from-scratch models

# Passed to DotWithNegatives / BiLinearInteraction, BroadcastLoss, BroadcastMetric and evaluate_model
NUMBER_OF_NEGATIVES = 3
LOSS = tf.keras.losses.BinaryCrossentropy(from_logits=True)
AUC_METRIC = tf.keras.metrics.AUC(from_logits=True)

import tensorflow_addons as tfa
# NOTE(review): OPTIMIZER and AUC_METRIC are single instances passed to every model.compile call
# in this notebook, so optimizer / metric state may be shared between models — confirm this is intended.
OPTIMIZER = tfa.optimizers.AdamW(weight_decay=4e-7, learning_rate=0.001)

In [None]:
# if these assertions are wrong, need to pass correct negative_positive_ratio in
# BroadcastLoss, BroadcastMetric and further in evaluate_model
assert NUMBER_OF_NEGATIVES == group_by_task.model_kwargs['negative_positive_ratio'], \
    'NUMBER_OF_NEGATIVES differs from negative_positive_ratio of group_by_task'
# `two_tower_task` is created in a later cell, so on a fresh kernel (Restart & Run All) it does
# not exist yet; check it only when it is already loaded, e.g. when this cell is re-run
if 'two_tower_task' in globals():
    assert NUMBER_OF_NEGATIVES == two_tower_task.model_kwargs['negative_positive_ratio'], \
        'NUMBER_OF_NEGATIVES differs from negative_positive_ratio of two_tower_task'

In [None]:
def get_two_tower_model(offer_features, name='two_tower_model'):
    """Build and compile a two tower model over the user features and `offer_features`.

    user_features, vocabulary_sizes, EMBEDDING_DIM, REGULARIZER, USER_TOWER, OFFER_TOWER,
    OPTIMIZER, LOSS, AUC_METRIC and NUMBER_OF_NEGATIVES come from global scope,
    but can be passed as params instead.

    Args:
        offer_features: list of offer feature names fed to the offer tower.
        name: name of the resulting Keras model.

    Returns:
        The compiled tf.keras.Model taking a dict of inputs keyed by feature name
        (plus one `<feature><WEIGHT_SUFFIX>` input per offer feature).
    """
    inputs = {}
    embeddings = {}
    for feature in user_features + offer_features:
        weight_key = f'{feature}{WEIGHT_SUFFIX}'
        if feature in offer_features:
            # offer features also get a weight input: dummy weights during training,
            # and the ones used for a feature's averaging at inference time
            inputs[weight_key] = get_input_layer(weight_key, tf.float32)
        inputs[feature] = get_input_layer(feature)
        # the embedding matrix dimensions come from the feature's entry in `vocabulary_sizes`
        embed = WeightedEmbeddings(vocabulary_sizes[feature],
                                   EMBEDDING_DIM,
                                   name=f'{feature}_embedding',
                                   embeddings_regularizer=REGULARIZER())
        # user features have no weight input, so `.get` hands None to the layer for them
        embeddings[feature] = embed(inputs[feature], inputs.get(weight_key))

    user_embeddings = [embeddings[feature] for feature in user_features]
    offer_embeddings = [embeddings[feature] for feature in offer_features]
    user_tower = USER_TOWER()(
        tf.keras.layers.Concatenate(name='concat_user')(user_embeddings))
    offer_tower = OFFER_TOWER()(
        tf.keras.layers.Concatenate(name='concat_offer')(offer_embeddings))

    prediction_layer = DotWithNegatives(NUMBER_OF_NEGATIVES, name='prediction')
    output = prediction_layer([user_tower, offer_tower], generate_negatives=True)

    model = tf.keras.Model(inputs, output, name=name)
    model.compile(optimizer=OPTIMIZER,
                  loss=BroadcastLoss(LOSS, NUMBER_OF_NEGATIVES),
                  metrics=[BroadcastMetric(AUC_METRIC, NUMBER_OF_NEGATIVES)])
    return model

In [None]:
# Build the from-scratch two tower model on all offer features
two_tower_model = get_two_tower_model(offer_features, name='two_tower_model')

In [None]:
# Train on the training dataset; the (features, response) test dataset is used for validation
two_tower_model.fit(datasets['train'], epochs=EPOCHS, validation_data=datasets['test'])

### Two tower like model defined with production code

In [None]:
# imported here as well: this cell runs before the one that originally imported train_fn
from moksha.helpers.trainer import train_fn

# Load the production task of the two tower pipeline run (uuid set in the first cell)
artifacts_two_tower = PipelineArtifacts(domain, two_tower_pipeline_uuid)
two_tower_task, _ = artifacts_two_tower.task_and_vectorization(model_name)

# if this assertion is wrong, need to pass correct negative_positive_ratio in
# BroadcastLoss, BroadcastMetric and further in evaluate_model
assert NUMBER_OF_NEGATIVES == two_tower_task.model_kwargs['negative_positive_ratio'], \
    'NUMBER_OF_NEGATIVES differs from negative_positive_ratio of two_tower_task'

# align vectorizations
two_tower_task.training_data_schema_by_block = group_by_task.training_data_schema_by_block
two_tower_task._model_kwargs['size_dict'] = group_by_task._model_kwargs['size_dict']
# presumably forces `.model` to be rebuilt with the aligned size_dict — TODO confirm
two_tower_task._model = None

two_tower_task.compile(two_tower_task.model)
# only the first two returned values are used; the rest is discarded
metrics, eval_metrics, *_ = train_fn(two_tower_task, datasets['train'], training_ds_for_eval, test_ds,
                                     verbose="auto")

In [None]:
# First value returned by train_fn for the production two tower task
metrics

In [None]:
# Second value returned by train_fn for the production two tower task
eval_metrics

### Group-by augmentations model from scratch

In [None]:
NB_AUGMENTATIONS = 3  # number of group-by passes whose predictions are concatenated in the training output
AVERAGE_NUMBER_OF_FEATURES_IN_AUGMENTATION = 2  # passed to KeyGenerator as average_number_of_attributes_in_key
USER_META_FEATURES = 7  # first argument of UserFeaturesCompressor
OFFER_META_FEATURES = 5  # first argument of OfferFeaturesCompressor and MaskNet

def OUTPUT_DNN():
    """Return a new Sequential head named 'output_dnn'.

    Layout: BatchNormalization, then Dense(30) and Dense(20) each followed by
    Dropout(DROPOUT) and a gelu activation, then a final Dense(1) without activation.
    Every Dense layer regularizes kernel and bias with a fresh REGULARIZER() config.
    """
    def regularized_dense(units):
        return tf.keras.layers.Dense(units,
                                     kernel_regularizer=REGULARIZER(),
                                     bias_regularizer=REGULARIZER())

    stack = [tf.keras.layers.BatchNormalization()]
    for hidden_units in (30, 20):
        stack.append(regularized_dense(hidden_units))
        stack.append(tf.keras.layers.Dropout(DROPOUT))
        stack.append(tf.keras.layers.Activation('gelu'))
    stack.append(regularized_dense(1))
    return tf.keras.Sequential(stack, name='output_dnn')

In [None]:
def get_group_by_augm_model(offer_features, name='group_by_model'):
    """Build the group-by augmentation model and the evaluation model sharing its layers.

    user_features, vocabulary_sizes, EMBEDDING_DIM, REGULARIZER, DROPOUT, OUTPUT_DNN,
    NB_AUGMENTATIONS, AVERAGE_NUMBER_OF_FEATURES_IN_AUGMENTATION, USER_META_FEATURES,
    OFFER_META_FEATURES, OPTIMIZER, LOSS, AUC_METRIC and NUMBER_OF_NEGATIVES
    come from global scope, but can be passed as params instead.

    Args:
        offer_features: list of offer feature names.
        name: name of the training model; the evaluation model is named `<name>_eval`.

    Returns:
        (model, eval_model):
        - model: compiled model whose output concatenates (axis=1) the predictions of
          NB_AUGMENTATIONS group-by passes;
        - eval_model: uncompiled model on the same inputs and layers, fed with the
          per-offer embeddings and variances instead of group-by aggregates.
    """
    inputs = {}
    embedded_user_features, embedded_offer_features, variance_offer_features = {}, {}, {}
    for feature in user_features:
        inputs[feature] = get_input_layer(feature)
        emb_layer = WeightedEmbeddings(vocabulary_sizes[feature],
                                       EMBEDDING_DIM, name=f'{feature}_embedding',
                                       embeddings_regularizer=REGULARIZER())
        embedded_user_features[feature] = emb_layer(inputs[feature])
    for feature in offer_features:
        # for offer features we need weights:
        # with dummy weights during training, and the ones used for a feature's averaging at inference time
        # (same WEIGHT_SUFFIX naming as in get_two_tower_model, both models read the same datasets)
        weight_name = f'{feature}{WEIGHT_SUFFIX}'
        inputs[weight_name] = get_input_layer(weight_name, tf.float32)
        inputs[feature] = get_input_layer(feature)
        emb_layer = WeightedEmbeddings(vocabulary_sizes[feature],
                                       EMBEDDING_DIM, name=f'{feature}_embedding',
                                       embeddings_regularizer=REGULARIZER(),
                                       calculate_variance=True)
        embedded_offer_features[feature], variance_offer_features[feature] = \
            emb_layer(inputs[feature], inputs[weight_name])

    user_stacked = tf.stack(list(embedded_user_features.values()), axis=1)
    offer_stacked = tf.stack(list(embedded_offer_features.values()), axis=1)
    offer_variance = tf.stack(list(variance_offer_features.values()), axis=1)
    # raw offer attribute ids, stacked in the same feature order as `offer_stacked`
    stacked_raw_offer_attrs = tf.stack([tf.cast(inputs[feature].values, tf.int32)
                                        for feature in offer_features], axis=1)

    group_by = GroupBy(name='group_by')
    key_generator = KeyGenerator(number_of_offer_attributes=len(offer_features),
                                 average_number_of_attributes_in_key=AVERAGE_NUMBER_OF_FEATURES_IN_AUGMENTATION,
                                 name='grp_key_generator')

    user_compressed = UserFeaturesCompressor(USER_META_FEATURES, DROPOUT,
                                             name='user_compressor')(user_stacked)
    offer_features_compressor = OfferFeaturesCompressor(OFFER_META_FEATURES, DROPOUT, name='offer_compressor')
    mask_net = MaskNet(OFFER_META_FEATURES, DROPOUT, name='mask_generation')
    apply_mask = tf.keras.layers.Multiply(name='apply_mask')
    bi_linear_interaction = BiLinearInteraction(number_of_negatives=NUMBER_OF_NEGATIVES, dropout_rate=DROPOUT,
                                                initializer='random_normal', regularizer=REGULARIZER(),
                                                name='interaction')
    output_dnn = OUTPUT_DNN()

    # training graph: the same layers are applied once per augmentation,
    # each time on a newly generated group-by key
    augmentation_predictions = []
    for _ in range(NB_AUGMENTATIONS):
        group_by_key = key_generator(stacked_raw_offer_attrs)
        mean_offer_emb, variance_offer_emb = group_by(group_by_key, offer_stacked)
        compressed_offer_embeddings = offer_features_compressor([mean_offer_emb, variance_offer_emb])
        mask = mask_net([mean_offer_emb, variance_offer_emb])
        masked_offer_embeddings = apply_mask([compressed_offer_embeddings, mask])
        _output = output_dnn(bi_linear_interaction([user_compressed, masked_offer_embeddings], generate_negatives=True))
        augmentation_predictions.append(_output)
    output = tf.concat(augmentation_predictions, axis=1)

    # evaluation graph: same layers, without the group-by step
    compressed_offer_embeddings = offer_features_compressor([offer_stacked, offer_variance])
    mask = mask_net([offer_stacked, offer_variance])
    masked_offer_embeddings = apply_mask([compressed_offer_embeddings, mask])

    eval_output = output_dnn(bi_linear_interaction([user_compressed, masked_offer_embeddings], generate_negatives=True))

    model = tf.keras.Model(inputs, output, name=name)
    model.compile(optimizer=OPTIMIZER,
                  loss=BroadcastLoss(LOSS, NUMBER_OF_NEGATIVES),
                  metrics=[BroadcastMetric(AUC_METRIC, NUMBER_OF_NEGATIVES)])

    # only the training model is compiled; eval_model is returned uncompiled
    eval_model = tf.keras.Model(inputs, eval_output, name=f'{name}_eval')

    return model, eval_model

In [None]:
# Build the training model and the evaluation model (same layers) on all offer features
group_by_model, group_by_eval_model = get_group_by_augm_model(offer_features, name='group_by_model')

In [None]:
# Train the augmentation model; group_by_eval_model is built from the same layer objects
group_by_model.fit(datasets['train'], epochs=EPOCHS, validation_data=datasets['test'])

### Group-by augmentations model defined with production code

In [None]:
from moksha.helpers.trainer import train_fn
# Train the production group-by task; only the first two returned values are kept.
# NOTE: this rebinds `metrics` / `eval_metrics` from the production two tower section.
metrics, eval_metrics, *_ = train_fn(group_by_task,
                                     datasets['train'], training_ds_for_eval, test_ds,
                                     verbose="auto")

In [None]:
# First value returned by train_fn for the production group-by task
metrics

In [None]:
# Second value returned by train_fn for the production group-by task
eval_metrics

## Training baseline models

In [None]:
def bi_linear_interaction_model(single_task_feature, name='bi_linear_model'):
    """Build and compile a baseline model using a single offer feature.

    user_features, vocabulary_sizes, EMBEDDING_DIM, REGULARIZER, DROPOUT, OUTPUT_DNN,
    USER_META_FEATURES, OFFER_META_FEATURES, OPTIMIZER, LOSS, AUC_METRIC and
    NUMBER_OF_NEGATIVES come from global scope, but can be passed as params instead.

    Args:
        single_task_feature: name of the only offer feature the model takes as input.
        name: name of the resulting Keras model.

    Returns:
        The compiled tf.keras.Model.
    """
    inputs = {}
    embedded_user_features = {}
    for feature in user_features:
        inputs[feature] = get_input_layer(feature)
        emb_layer = WeightedEmbeddings(vocabulary_sizes[feature],
                                       EMBEDDING_DIM, name=f'{feature}_embedding',
                                       embeddings_regularizer=REGULARIZER())
        embedded_user_features[feature] = emb_layer(inputs[feature])

    # for offer feature we need weights:
    # with dummy weights during training, and the ones used for a feature's averaging at inference time
    # (same WEIGHT_SUFFIX naming as in get_two_tower_model, both models read the same datasets)
    weight_name = f'{single_task_feature}{WEIGHT_SUFFIX}'
    inputs[weight_name] = get_input_layer(weight_name, tf.float32)
    inputs[single_task_feature] = get_input_layer(single_task_feature)
    emb_layer = WeightedEmbeddings(vocabulary_sizes[single_task_feature],
                                   EMBEDDING_DIM, name=f'{single_task_feature}_embedding',
                                   embeddings_regularizer=REGULARIZER())
    embedded_offer_feature = emb_layer(inputs[single_task_feature], inputs[weight_name])

    user_stacked = tf.stack(list(embedded_user_features.values()), axis=1)
    # a single offer feature: add the feature axis that tf.stack creates for the user side
    offer_stacked = tf.expand_dims(embedded_offer_feature, axis=1)

    user_compressed = UserFeaturesCompressor(USER_META_FEATURES, DROPOUT,
                                             name='user_compressor')(user_stacked)
    mask_net = MaskNet(OFFER_META_FEATURES, DROPOUT, name='mask_generation')
    apply_mask = tf.keras.layers.Multiply(name='apply_mask')
    bi_linear_interaction = BiLinearInteraction(number_of_negatives=NUMBER_OF_NEGATIVES, dropout_rate=DROPOUT,
                                                initializer='random_normal', regularizer=REGULARIZER(),
                                                name='interaction')
    output_dnn = OUTPUT_DNN()

    # NOTE(review): the same tensor is passed for both MaskNet inputs and the mask is applied to the
    # uncompressed embedding, whereas get_group_by_augm_model passes [mean, variance] and masks the
    # OfferFeaturesCompressor output — confirm this is intended for the baseline.
    mask = mask_net([offer_stacked, offer_stacked])
    masked_offer_embeddings = apply_mask([offer_stacked, mask])

    output = output_dnn(bi_linear_interaction([user_compressed, masked_offer_embeddings],
                                              generate_negatives=True))

    model = tf.keras.Model(inputs, output, name=name)
    model.compile(optimizer=OPTIMIZER,
                  loss=BroadcastLoss(LOSS, NUMBER_OF_NEGATIVES),
                  metrics=[BroadcastMetric(AUC_METRIC, NUMBER_OF_NEGATIVES)])

    return model

In [None]:
# One single-feature baseline per benchmarked offer feature, trained like the other models
mono_feature_models = {}
for task_offer_feature in TASKS:
    mono_model = bi_linear_interaction_model(task_offer_feature, name=f'{task_offer_feature}_model')
    # registered before training, so the model stays reachable even if fit is interrupted
    mono_feature_models[task_offer_feature] = mono_model
    mono_model.fit(datasets['train'], epochs=EPOCHS, validation_data=datasets['test'])

## Evaluation and comparison with baselines

In [None]:
# Display name -> model to evaluate. For the from-scratch group-by model the
# evaluation variant is used, not the training model.
MODEL_TO_EVAL = {
    'two_tower': two_tower_model,
    'group_by': group_by_eval_model,
    'prod two_tower': two_tower_task.model,
    'prod group_by': group_by_task.model,
}
# the single-feature baselines come after the four main models, in TASKS order
MODEL_TO_EVAL.update(
    (f'Mono:{task_offer_feature}', mono_feature_models[task_offer_feature])
    for task_offer_feature in TASKS
)

In [None]:
%%time
from collections import defaultdict
from utils import evaluate_model

aucs = defaultdict(dict)

for task_offer_feature in TASKS:
    for model_name, model in MODEL_TO_EVAL.items():
        print(task_offer_feature, model_name)
        aucs[task_offer_feature][model_name] = evaluate_model(model, task_offer_feature,
                                                              single_task_datasets, NUMBER_OF_NEGATIVES)

In [None]:
import os
import pickle

# open(..., 'wb') does not create missing directories: make sure the report folder exists
os.makedirs('_reports', exist_ok=True)
# Persist the raw evaluation results so the report can be rebuilt without re-evaluating
with open(f'_reports/{domain}_eval.pickle', 'wb') as f:
    pickle.dump(aucs, f)

## Reporting wAUC

In [None]:
def wAUC(auc_df, cutoff=200):
    """Mean of the 'auc' column weighted by the 'number of events' column.

    Rows whose index equals 0 and rows with `cutoff` events or fewer are
    left out before averaging.
    """
    eligible = (auc_df.index != 0) & (auc_df['number of events'] > cutoff)
    kept = auc_df[eligible]
    events = kept['number of events']
    weighted_auc_sum = (kept['auc'] * events).sum()
    return weighted_auc_sum / events.sum()


def meanAUC(auc_df, cutoff=200):
    """Unweighted mean of the 'auc' column.

    Rows whose index equals 0 and rows with `cutoff` events or fewer
    (column 'number of events') are left out before averaging.
    """
    eligible = (auc_df.index != 0) & (auc_df['number of events'] > cutoff)
    kept = auc_df[eligible]
    return kept['auc'].mean()

In [None]:
import pandas as pd
import numpy as np

# One record per (task, model) pair, turned into a frame in a single step:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
# The loop variable is not called `model_name`, which holds the pipeline model name
# set in the first cell.
records = [
    {'wAUC': np.round(wAUC(task_aucs[eval_name]), 3), 'offers': task_name, 'model': eval_name}
    for task_name, task_aucs in aucs.items()
    for eval_name in task_aucs
]
results = pd.DataFrame(records)

# models as rows, benchmarked offer features as columns
pd.pivot_table(results, 'wAUC', 'model', 'offers').style.background_gradient(cmap='coolwarm')