In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os.path as op

from zipfile import ZipFile
try:
    from urllib.request import urlretrieve
except ImportError:  # Python 2 compat
    from urllib import urlretrieve


# Location of the MovieLens 100k archive and the local names derived from it.
ML_100K_URL = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
# The local archive name is the last path component of the URL.
ML_100K_FILENAME = ML_100K_URL.rsplit('/', 1)[1]
# Folder expected to exist once the archive has been unpacked in the
# current working directory.
ML_100K_FOLDER = 'ml-100k'

# Both steps are guarded so that re-running the cell does not download or
# unpack the data again.
if not op.exists(ML_100K_FILENAME):
    print('Downloading %s to %s...' % (ML_100K_URL, ML_100K_FILENAME))
    urlretrieve(ML_100K_URL, ML_100K_FILENAME)

if not op.exists(ML_100K_FOLDER):
    print('Extracting %s to %s...' % (ML_100K_FILENAME, ML_100K_FOLDER))
    # Open the archive in a context manager so that its file handle is
    # closed deterministically, even if the extraction fails.
    with ZipFile(ML_100K_FILENAME) as archive:
        archive.extractall('.')

In [2]:
# The two splits are tab-separated files read with explicitly supplied
# column names.
data_train = pd.read_csv(
    op.join(ML_100K_FOLDER, 'ua.base'),
    sep='\t',
    names=["user_id", "item_id", "rating", "timestamp"],
)
data_test = pd.read_csv(
    op.join(ML_100K_FOLDER, 'ua.test'),
    sep='\t',
    names=["user_id", "item_id", "rating", "timestamp"],
)

# Summary statistics of each split, training split first.
print(data_train.describe())
print(data_test.describe())

            user_id       item_id        rating     timestamp
count  90570.000000  90570.000000  90570.000000  9.057000e+04
mean     461.494038    428.104891      3.523827  8.835073e+08
std      266.004364    333.088029      1.126073  5.341684e+06
min        1.000000      1.000000      1.000000  8.747247e+08
25%      256.000000    174.000000      3.000000  8.794484e+08
50%      442.000000    324.000000      4.000000  8.828143e+08
75%      682.000000    636.000000      4.000000  8.882049e+08
max      943.000000   1682.000000      5.000000  8.932866e+08
           user_id      item_id       rating     timestamp
count  9430.000000  9430.000000  9430.000000  9.430000e+03
mean    472.000000   400.800954     3.587805  8.837354e+08
std     272.234934   306.859789     1.120240  5.360562e+06
min       1.000000     1.000000     1.000000  8.747247e+08
25%     236.000000   182.000000     3.000000  8.794515e+08
50%     472.000000   303.000000     4.000000  8.833904e+08
75%     708.000000   566.0000

In [3]:
# Peek at the first rows of the training split.
data_train.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [4]:
# Largest user / item identifier found in either split.
max_user_id = max(frame['user_id'].max() for frame in (data_train, data_test))
max_item_id = max(frame['item_id'].max() for frame in (data_train, data_test))

# One more than the largest id, so that the raw ids can be used directly
# as indices in range [0, n).
n_users = 1 + max_user_id
n_items = 1 + max_item_id

print('n_users=%d, n_items=%d' % (n_users, n_items))

n_users=944, n_items=1683


In [5]:
# Keep only the ratings of 4 and above: these rows are treated as the
# positive feedback in the rest of the notebook.
pos_data_train = data_train[data_train['rating'] >= 4]
pos_data_test = data_test[data_test['rating'] >= 4]

print(pos_data_train.head())
print(pos_data_test.head())

   user_id  item_id  rating  timestamp
0        1        1       5  874965758
2        1        3       4  878542960
5        1        6       5  887431973
6        1        7       4  875071561
8        1        9       5  878543541
   user_id  item_id  rating  timestamp
0        1       20       4  887431883
1        1       33       4  878542699
2        1       61       4  878542420
5        1      160       4  875072547
6        1      171       5  889751711


In [6]:
# Number of positive (rating >= 4) rows in the training split.
pos_data_train['rating'].count()

49906

In [7]:
# Number of positive (rating >= 4) rows in the test split.
pos_data_test['rating'].count()

5469

In [8]:
import tensorflow as tf

def identity_loss(y_true, y_pred):
    """Return the mean of ``y_pred``; ``y_true`` has no effect on the value.

    The model output is expected to already be the per-sample loss. Keras
    still calls a loss as ``loss(y_true, y_pred)``, so the target is
    multiplied by zero and added to the prediction: it appears in the
    expression but contributes nothing to the result.
    """
    zeroed_target = 0 * y_true
    return tf.reduce_mean(y_pred + zeroed_target)

def margin_comparator_loss(inputs, margin=1.):
    """Hinge-style comparator loss on a (positive, negative) similarity pair.

    ``inputs`` is a pair ``(positive_pair_sim, negative_pair_sim)``. The
    returned value is ``max(negative - positive + margin, 0)``, computed
    element-wise: it is zero as soon as the positive similarity exceeds
    the negative one by at least ``margin``.

    Cosine similarities lie in (-1, 1), so their difference lies in
    (-2, 2) and a margin of 1. is a sensible choice. For similarities
    that are not normalized, a larger margin can be beneficial.
    """
    positive_pair_sim, negative_pair_sim = inputs
    similarity_gap = negative_pair_sim - positive_pair_sim
    return tf.maximum(similarity_gap + margin, 0)

def cos_mode(inputs, epsilon=1e-12):
    """Compute the cosine similarity of two unnormalized embeddings.

    Work around for Keras bug with merge([...], mode='cos').

    Parameters
    ----------
    inputs : pair of tensors
        ``(latent_codes_1, latent_codes_2)``; both are reduced over their
        last axis.
    epsilon : float, default 1e-12
        Lower bound applied to the product of the squared norms before
        taking the square root. Without it an all-zero embedding yields
        0 / 0 = NaN, which would then propagate into the loss. Results
        are unchanged whenever the product is at least ``epsilon``.

    Returns
    -------
    Tensor holding one similarity value per entry of the leading axes.
    """
    latent_codes_1, latent_codes_2 = inputs
    sq_norm_1 = tf.reduce_sum(latent_codes_1 ** 2, axis=-1)
    sq_norm_2 = tf.reduce_sum(latent_codes_2 ** 2, axis=-1)
    dot = tf.reduce_sum(latent_codes_1 * latent_codes_2, axis=-1)
    # Clamp the denominator away from zero (see ``epsilon`` above).
    return dot / tf.sqrt(tf.maximum(sq_norm_1 * sq_norm_2, epsilon))

In [9]:
from keras.models import Model
from keras.layers import Embedding, Flatten, Input, Dense, merge
from keras.regularizers import l2

def build_models(n_users, n_items, latent_dim=64, l2_reg=0):
    """Build a triplet model and its companion similarity model

    The triplet model is used to train the weights of the companion
    similarity model. The triplet model takes 1 user, 1 positive item
    (relative to the selected user) and one negative item and is
    trained with comparator loss.

    The similarity model takes one user and one item as input and returns
    a compatibility score (aka the match score).

    Parameters
    ----------
    n_users, n_items : int
        Number of rows of the user and item embedding tables.
    latent_dim : int, default 64
        Size of both the user and the item embeddings.
    l2_reg : float, default 0
        L2 penalty on the embedding weights; 0 disables regularization.

    Returns
    -------
    (triplet_model, match_model) : the two models share their embedding
    layers, so fitting the first one also updates the second one.
    """
    # Imported locally: the functional `merge(..., mode=callable)` helper
    # previously used here is a deprecated legacy API, whereas `Lambda`
    # wraps the same callables and keeps the same layer names.
    from keras.layers import Lambda

    # Common architectural components for the two models:
    # - symbolic input placeholders
    user_input = Input((1,), name='user_input')
    positive_item_input = Input((1,), name='positive_item_input')
    negative_item_input = Input((1,), name='negative_item_input')

    # - embeddings
    embeddings_reg = None if l2_reg == 0 else l2(l2_reg)
    user_layer = Embedding(n_users, latent_dim, input_length=1,
                           name='user_embedding',
                           embeddings_regularizer=embeddings_reg)

    # The following embedding parameters will be shared to encode both
    # the positive and negative items.
    item_layer = Embedding(n_items, latent_dim, input_length=1,
                           name="item_embedding",
                           embeddings_regularizer=embeddings_reg)

    user_embedding = Flatten()(user_layer(user_input))
    positive_item_embedding = Flatten()(item_layer(positive_item_input))
    negative_item_embedding = Flatten()(item_layer(negative_item_input))

    # - similarity computation between embeddings
    positive_similarity = Lambda(cos_mode, output_shape=(1,),
                                 name="positive_similarity")(
        [user_embedding, positive_item_embedding])
    negative_similarity = Lambda(cos_mode, output_shape=(1,),
                                 name="negative_similarity")(
        [user_embedding, negative_item_embedding])

    # The triplet network model, only used for training: its output is
    # directly the value of the comparator loss.
    triplet_loss = Lambda(margin_comparator_loss, output_shape=(1,),
                          name='comparator_loss')(
        [positive_similarity, negative_similarity])

    triplet_model = Model(inputs=[user_input,
                                  positive_item_input,
                                  negative_item_input],
                          outputs=triplet_loss)

    # The match-score model, only used at inference to rank items for a
    # given user: the model weights are shared with the triplet_model
    # therefore we do not need to train it and therefore we do not need to
    # plug a loss and an optimizer.
    match_model = Model(inputs=[user_input, positive_item_input],
                        outputs=positive_similarity)

    return triplet_model, match_model

# Instantiate the trainable triplet model and the match model used for
# ranking; both are built on the same embedding layers.
triplet_model, match_model = build_models(n_users, n_items, latent_dim=64, l2_reg=1e-6)

Using TensorFlow backend.
  name=name)


In [10]:
# summary() prints the layer table itself and returns None, so wrapping it
# in print() only appended a stray "None" line after the table.
match_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user_input (InputLayer)          (None, 1)             0                                            
____________________________________________________________________________________________________
positive_item_input (InputLayer) (None, 1)             0                                            
____________________________________________________________________________________________________
user_embedding (Embedding)       (None, 1, 64)         60416       user_input[0][0]                 
____________________________________________________________________________________________________
item_embedding (Embedding)       (None, 1, 64)         107712      positive_item_input[0][0]        
___________________________________________________________________________________________

In [11]:
# summary() prints the layer table itself and returns None, so wrapping it
# in print() only appended a stray "None" line after the table.
triplet_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user_input (InputLayer)          (None, 1)             0                                            
____________________________________________________________________________________________________
positive_item_input (InputLayer) (None, 1)             0                                            
____________________________________________________________________________________________________
negative_item_input (InputLayer) (None, 1)             0                                            
____________________________________________________________________________________________________
user_embedding (Embedding)       (None, 1, 64)         60416       user_input[0][0]                 
___________________________________________________________________________________________

In [12]:
from sklearn.metrics import roc_auc_score

def average_roc_auc(match_model, data_train, data_test):
    """Compute the ROC AUC for each user and average over users.

    For every user id from 1 to the largest id found in either frame, the
    candidate items are all item ids except those present in that user's
    rows of ``data_train``. Candidates present in the user's rows of
    ``data_test`` are the positives, all the other candidates are the
    negatives. Users without any positive candidate are skipped.

    Parameters
    ----------
    match_model : object
        Must expose ``predict([user_ids, item_ids], batch_size=...)`` and
        return one score per (user, item) pair.
    data_train, data_test : pandas.DataFrame
        Frames with ``user_id`` and ``item_id`` columns.

    Returns
    -------
    float
        Mean of the per-user ROC AUC scores, or NaN when no user has a
        positive candidate to rank (the mean is undefined in that case).
    """
    max_user_id = max(data_train['user_id'].max(), data_test['user_id'].max())
    max_item_id = max(data_train['item_id'].max(), data_test['item_id'].max())

    # The full catalogue of item ids does not depend on the user.
    all_item_ids = np.arange(1, max_item_id + 1)

    user_auc_scores = []
    for user_id in range(1, max_user_id + 1):
        pos_item_train = data_train[data_train['user_id'] == user_id]
        pos_item_test = data_test[data_test['user_id'] == user_id]

        # Rank only the items NOT already seen in the training set for
        # this user.
        items_to_rank = np.setdiff1d(all_item_ids,
                                     pos_item_train['item_id'].values)

        # Ground truth: True for each item positively present in the test
        # set and False otherwise. np.isin replaces the deprecated
        # np.in1d and is equivalent for 1-D inputs.
        expected = np.isin(items_to_rank, pos_item_test['item_id'].values)

        if np.sum(expected) >= 1:
            # At least one positive test value to rank
            repeated_user_id = np.full_like(items_to_rank, user_id)

            predicted = match_model.predict([repeated_user_id, items_to_rank],
                                            batch_size=4096)
            user_auc_scores.append(roc_auc_score(expected, predicted))

    if not user_auc_scores:
        # No user could be scored: report NaN rather than failing with an
        # unrelated ZeroDivisionError.
        return float('nan')
    return sum(user_auc_scores) / len(user_auc_scores)

In [13]:
# Score of the match model on the positive test pairs at this point of the
# notebook, i.e. before any fit call has been issued on the triplet model.
average_roc_auc(match_model, pos_data_train, pos_data_test)

0.49961645600763538

In [14]:
def sample_triplets(pos_data, max_item_id, random_seed=0):
    """Pair every positive (user, item) row with a randomly drawn item.

    The third array is drawn uniformly from the item ids 1..max_item_id
    (inclusive) with a generator seeded by ``random_seed``; the draw does
    not look at the user, so it is not guaranteed to differ from the
    user's positive items.

    Returns the list ``[user_ids, pos_item_ids, neg_item_ids]``, three
    arrays with one entry per row of ``pos_data``.
    """
    users = pos_data['user_id'].values
    positives = pos_data['item_id'].values

    sampler = np.random.RandomState(random_seed)
    negatives = sampler.randint(low=1, high=max_item_id + 1,
                                size=len(users))

    return [users, positives, negatives]

In [15]:
# The triplet model already outputs the loss value, so it is compiled with
# the identity loss and fitted against a dummy target that the loss
# ignores: this is what lets the regular Keras training API be used.
triplet_model.compile(optimizer="adam", loss=identity_loss)
fake_y = np.ones_like(pos_data_train['user_id'])

n_epochs = 15

for epoch in range(n_epochs):
    # Draw fresh negatives so that every epoch sees different triplets
    triplet_inputs = sample_triplets(pos_data_train, max_item_id,
                                     random_seed=epoch)

    # One incremental pass over the triplets sampled for this epoch
    triplet_model.fit(triplet_inputs, fake_y, epochs=1, batch_size=64,
                      shuffle=True, verbose=2)

    # Track convergence on the held-out positive pairs
    test_auc = average_roc_auc(match_model, pos_data_train, pos_data_test)
    print("Epoch %d/%d: test ROC AUC: %0.4f"
          % (epoch + 1, n_epochs, test_auc))

Epoch 1/1
3s - loss: 0.7703
Epoch 1/15: test ROC AUC: 0.8500
Epoch 1/1
3s - loss: 0.3843
Epoch 2/15: test ROC AUC: 0.8846
Epoch 1/1
3s - loss: 0.3556
Epoch 3/15: test ROC AUC: 0.8980
Epoch 1/1
3s - loss: 0.3440
Epoch 4/15: test ROC AUC: 0.9035
Epoch 1/1
3s - loss: 0.3355
Epoch 5/15: test ROC AUC: 0.9088
Epoch 1/1
3s - loss: 0.3285
Epoch 6/15: test ROC AUC: 0.9121
Epoch 1/1
3s - loss: 0.3306
Epoch 7/15: test ROC AUC: 0.9144
Epoch 1/1
3s - loss: 0.3245
Epoch 8/15: test ROC AUC: 0.9146
Epoch 1/1
3s - loss: 0.3230
Epoch 9/15: test ROC AUC: 0.9151
Epoch 1/1
3s - loss: 0.3208
Epoch 10/15: test ROC AUC: 0.9157
Epoch 1/1
3s - loss: 0.3189
Epoch 11/15: test ROC AUC: 0.9163
Epoch 1/1
3s - loss: 0.3193
Epoch 12/15: test ROC AUC: 0.9172
Epoch 1/1
3s - loss: 0.3182
Epoch 13/15: test ROC AUC: 0.9174
Epoch 1/1
3s - loss: 0.3218
Epoch 14/15: test ROC AUC: 0.9193
Epoch 1/1
3s - loss: 0.3171
Epoch 15/15: test ROC AUC: 0.9199


In [16]:
# %load solutions/deep_implicit_feedback_recsys.py
from keras.models import Model, Sequential
from keras.layers import Embedding, Flatten, Input, Dense, Dropout, merge
from keras.layers.merge import concatenate
from keras.regularizers import l2

def make_interaction_mlp(input_dim, n_hidden=1, hidden_size=64,
                         dropout=0, l2_reg=None):
    """Build the multi layer perceptron shared by both item branches.

    The network maps a vector of size ``input_dim`` to a single unit.
    With ``n_hidden == 0`` it is just that output unit; otherwise the
    output unit is preceded by hidden Dense layers of width
    ``hidden_size``, each followed by Dropout. Every Dense layer,
    including the output unit, uses a relu activation and ``l2_reg`` as
    kernel regularizer.
    """
    mlp = Sequential()

    if n_hidden == 0:
        # No hidden layer: the output unit is plugged directly on the
        # input, and no dropout is needed.
        mlp.add(Dense(1, input_dim=input_dim,
                      activation='relu', kernel_regularizer=l2_reg))
        return mlp

    # First hidden layer: the only one that declares the input size.
    mlp.add(Dense(hidden_size, input_dim=input_dim,
                  activation='relu', kernel_regularizer=l2_reg))
    mlp.add(Dropout(dropout))

    # Remaining hidden layers, if any.
    for _ in range(1, n_hidden):
        mlp.add(Dense(hidden_size, activation='relu',
                      kernel_regularizer=l2_reg))
        mlp.add(Dropout(dropout))

    # Single output unit producing the score.
    mlp.add(Dense(1, activation='relu', kernel_regularizer=l2_reg))
    return mlp

def build_models(n_users, n_items, user_dim=32, item_dim=64,
                 n_hidden=1, hidden_size=64, dropout=0, l2_reg=0):
    """Build models to train a deep triplet network

    The (user, item) score is computed by a shared MLP applied to the
    concatenation of the user and item embeddings.

    Parameters
    ----------
    n_users, n_items : int
        Number of rows of the user and item embedding tables.
    user_dim, item_dim : int
        Sizes of the user and item embeddings.
    n_hidden, hidden_size, dropout
        Forwarded to ``make_interaction_mlp``; ``dropout`` is also
        applied to the concatenated embeddings.
    l2_reg : float, default 0
        L2 penalty shared by the embeddings and the MLP kernels; 0
        disables regularization.

    Returns
    -------
    (deep_match_model, deep_triplet_model) : note that the match model
    comes first. Both models share all their weights.
    """
    # Imported locally: the functional `merge(..., mode=callable)` helper
    # previously used for the loss is a deprecated legacy API, whereas
    # `Lambda` wraps the same callable and keeps the same layer name.
    from keras.layers import Lambda

    user_input = Input((1,), name='user_input')
    positive_item_input = Input((1,), name='positive_item_input')
    negative_item_input = Input((1,), name='negative_item_input')

    regularizer = None if l2_reg == 0 else l2(l2_reg)
    user_layer = Embedding(n_users, user_dim, input_length=1,
                           name='user_embedding',
                           embeddings_regularizer=regularizer)

    # The following embedding parameters will be shared to encode both
    # the positive and negative items.
    item_layer = Embedding(n_items, item_dim, input_length=1,
                           name="item_embedding",
                           embeddings_regularizer=regularizer)

    user_embedding = Flatten()(user_layer(user_input))
    positive_item_embedding = Flatten()(item_layer(positive_item_input))
    negative_item_embedding = Flatten()(item_layer(negative_item_input))

    # Similarity computation between embeddings using a MLP similarity
    positive_embeddings_pair = concatenate(
        [user_embedding, positive_item_embedding],
        name="positive_embeddings_pair")
    positive_embeddings_pair = Dropout(dropout)(positive_embeddings_pair)
    negative_embeddings_pair = concatenate(
        [user_embedding, negative_item_embedding],
        name="negative_embeddings_pair")
    negative_embeddings_pair = Dropout(dropout)(negative_embeddings_pair)

    # Instantiate the shared similarity architecture: the very same MLP
    # (hence the same weights) scores the positive and the negative pair.
    interaction_layers = make_interaction_mlp(
        user_dim + item_dim, n_hidden=n_hidden, hidden_size=hidden_size,
        dropout=dropout, l2_reg=regularizer)

    positive_similarity = interaction_layers(positive_embeddings_pair)
    negative_similarity = interaction_layers(negative_embeddings_pair)

    # The triplet network model, only used for training: its output is
    # directly the value of the comparator loss.
    triplet_loss = Lambda(margin_comparator_loss, output_shape=(1,),
                          name='comparator_loss')(
        [positive_similarity, negative_similarity])

    deep_triplet_model = Model(inputs=[user_input,
                                       positive_item_input,
                                       negative_item_input],
                               outputs=triplet_loss)

    # The match-score model, only used at inference
    deep_match_model = Model(inputs=[user_input, positive_item_input],
                             outputs=positive_similarity)

    return deep_match_model, deep_triplet_model

# Settings of the deep triplet network, forwarded as keyword arguments to
# build_models.
hyper_parameters = {
    'user_dim': 32,
    'item_dim': 64,
    'n_hidden': 1,
    'hidden_size': 128,
    'dropout': 0.1,
    'l2_reg': 0,
}

# This build_models returns the match model first.
deep_match_model, deep_triplet_model = build_models(
    n_users, n_items, **hyper_parameters)

# The triplet model outputs the loss value itself, hence the identity loss
# and the dummy target array.
deep_triplet_model.compile(optimizer='adam', loss=identity_loss)
fake_y = np.ones_like(pos_data_train['user_id'])

n_epochs = 15

for epoch in range(n_epochs):
    # Draw fresh negatives so that every epoch sees different triplets
    triplet_inputs = sample_triplets(pos_data_train, max_item_id,
                                     random_seed=epoch)

    # One incremental pass over the triplets sampled for this epoch
    deep_triplet_model.fit(triplet_inputs, fake_y, epochs=1, batch_size=64,
                           shuffle=True, verbose=2)

    # Track convergence on the held-out positive pairs
    test_auc = average_roc_auc(deep_match_model, pos_data_train, pos_data_test)
    print("Epoch %d/%d: test ROC AUC: %0.4f" % (epoch + 1, n_epochs, test_auc))

  name=name)


Epoch 1/1
3s - loss: 0.4693
Epoch 1/15: test ROC AUC: 0.8548
Epoch 1/1
2s - loss: 0.3791
Epoch 2/15: test ROC AUC: 0.8635
Epoch 1/1
2s - loss: 0.3728
Epoch 3/15: test ROC AUC: 0.8650
Epoch 1/1
2s - loss: 0.3666
Epoch 4/15: test ROC AUC: 0.8652
Epoch 1/1
2s - loss: 0.3624
Epoch 5/15: test ROC AUC: 0.8691
Epoch 1/1
2s - loss: 0.3519
Epoch 6/15: test ROC AUC: 0.8779
Epoch 1/1
2s - loss: 0.3374
Epoch 7/15: test ROC AUC: 0.8882
Epoch 1/1
2s - loss: 0.3210
Epoch 8/15: test ROC AUC: 0.8918
Epoch 1/1
2s - loss: 0.3135
Epoch 9/15: test ROC AUC: 0.8948
Epoch 1/1
2s - loss: 0.3050
Epoch 10/15: test ROC AUC: 0.8956
Epoch 1/1
2s - loss: 0.2997
Epoch 11/15: test ROC AUC: 0.8991
Epoch 1/1
2s - loss: 0.2958
Epoch 12/15: test ROC AUC: 0.9025
Epoch 1/1
2s - loss: 0.2883
Epoch 13/15: test ROC AUC: 0.9051
Epoch 1/1
2s - loss: 0.2864
Epoch 14/15: test ROC AUC: 0.9079
Epoch 1/1
2s - loss: 0.2766
Epoch 15/15: test ROC AUC: 0.9115
