## BPR: Bayesian Personalized Ranking from Implicit Feedback

Ref: 
* https://arxiv.org/pdf/1205.2618
* https://medium.com/radon-dev/implicit-bayesian-personalized-ranking-in-tensorflow-b4dfa733c478

In [126]:
tf.disable_eager_execution()

In [None]:
WIP

In [127]:
import tensorflow as tf
# tf.enable_eager_execution()
# import tensorflow.contrib.eager as tfe
import pandas as pd
import numpy as np
import scipy.sparse as sp
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

In [128]:
#---------------------------
# LOAD AND PREPARE THE DATA
#---------------------------

# Load the dataframe from a tab separated file.
df = pd.read_csv('data/movielens/ml-latest-small/ratings.csv', sep=',')
    
# Add column names
df = df.drop(df.columns[3], axis=1)
df_movie = pd.read_csv('data/movielens/ml-latest-small/movies.csv')

In [129]:
movie_encoder = LabelEncoder()
df_movie['movie_id']=movie_encoder.fit_transform(df_movie.movieId)

In [130]:
df_movie.head()

Unnamed: 0,movieId,title,genres,movie_id
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3
4,5,Father of the Bride Part II (1995),Comedy,4


In [131]:

# Drop any rows with missing values
df = df.dropna()

# Drop any rows with 0 rating
df = df.loc[df.rating != 0]

# Convert movies into numerical IDs
df['user_id'] = df['userId'].astype("category").cat.codes
df['movie_id'] = movie_encoder.transform(df['movieId'].values)

# Create a lookup frame so we can get the movie
# names back in readable form later.
item_lookup = df[['movie_id', 'movieId']].drop_duplicates()
item_lookup['movie_id'] = item_lookup.movie_id.astype(str)

# We drop our old user and item columns
df = df.drop(['userId', 'movieId'], axis=1)

# Drop any rows with 0 rating
df = df.loc[df.rating != 0]

# Create lists of all users, movies and ratings
users = list(np.sort(df.user_id.unique()))
movies = list(np.sort(df_movie.movie_id.unique()))
ratings = list(df.rating)
print(f"#users: {len(users):,}, #items: {len(movies):,} #ratings: {len(ratings):,}" )

# Get the rows and columns for our new matrix
rows = df.user_id.astype(float)
cols = df.movie_id.astype(float)

# Contruct a sparse matrix for our users and items containing number of ratings
data_sparse = sp.csr_matrix((ratings, (rows, cols)), shape=(len(users), len(movies)))

# Get the values of our matrix as a list of user ids
# and item ids. Note that our litsts have the same length
# as each user id repeats one time for each rated movie.
uids, iids = data_sparse.nonzero()

#users: 610, #items: 9,742 #ratings: 100,836


In [132]:
title_vectorizer = TfidfVectorizer()
movie_titles = title_vectorizer.fit_transform(df_movie.title)
genre_vectorizer = TfidfVectorizer()
movie_genre = genre_vectorizer.fit_transform(df_movie.genres)
print(movie_titles.shape, movie_genre.shape)
#movie_features = sp.hstack((movie_titles,movie_genre))
movie_features = movie_genre
movie_features = movie_features.tocsr()
movie_item_features = movie_features[iids,:]

(9742, 9269) (9742, 24)


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [133]:
movie_features

<9742x24 sparse matrix of type '<class 'numpy.float64'>'
	with 23219 stored elements in Compressed Sparse Row format>

In [10]:
# cols_list=[]
# value_list=[]
# for r in range(movie_item_features.shape[0]):
#     cols_list.append([])
#     value_list.append([])    
# for (nzr, nzc) in zip(non_zero_rows, non_zero_cols):
#     value = movie_item_features[nzr,nzc]
#     cols_list[nzr].append(nzc)
#     value_list[nzr].append(value)

        

In [134]:
#-------------
# HYPERPARAMS
#-------------

epochs = 50
# How many (u,i,j) triplets we sample for each batch
samples = 150
batches = math.ceil(uids.shape[0]/samples)
# batches = min(batches,30)
num_factors = 64 # Number of latent features

# Independent lambda regularization values 
# for user, items and bias.
lambda_user = 0.0000001
lambda_item = 0.0000001
lambda_bias = 0.0000001

# Our learning rate 
lr = 0.005



## Eager way to iterator through dataset

In [11]:
# import tensorflow as tf
# tf.enable_eager_execution()
# import tensorflow.contrib.eager as tfe
# uids=list(range(10))
# iids=list(range(10,20))
# train_data = tf.data.Dataset.from_tensor_slices((uids, iids))
# train_data = train_data.shuffle(len(uids))
# train_data = train_data.batch(2)

# for eps in range(3):
#     batch_id = 0
#     for x, y in train_data:
#         print(f"eps: {eps}, batch: {batch_id}, data: {x}, {y} ")
#         batch_id+=1 

In [135]:
#-------------------------
# TENSORFLOW GRAPH
#-------------------------

# Set up our Tensorflow graph
graph = tf.Graph()

def init_variable(size, dim, name=None):
    '''
    Helper function to initialize a new variable with
    uniform random values.
    '''
    std = np.sqrt(2 / dim)
    return tf.Variable(tf.random_uniform([size, dim], -std, std), name=name)


def get_variable(graph, session, name):
    '''
    Helper function to get the value of a
    Tensorflow variable by name.
    '''
    v = graph.get_operation_by_name(name)
    v = v.values()[0]
    v = v.eval(session=session)
    return v

In [136]:
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensor(indices, coo.data, coo.shape)

In [144]:
def build_sparse_tensor(row,col,value,shape):
    indices = np.mat([row, col]).transpose()
    print(indices)
    print(value)
    print(shape)
    return tf.SparseTensor(indices, value, shape)

In [145]:
movie_item_features[10].data

array([0.79380421, 0.41927294, 0.44055087])

In [139]:
#https://stackoverflow.com/questions/49531286/tensorflow-tf-data-dataset-cannot-batch-tensors-with-different-shapes-in-compo
def gen():
    for u,i in zip(uids, iids):
        j = np.random.randint(0, len(movies))
        i_item_row,i_item_col = movie_item_features[i].nonzero()
        i_item_values = movie_item_features[i].data
        j_item_row, j_item_col = movie_item_features[j].nonzero()
        j_item_values = movie_item_features[j].data
        #i_item = tf.SparseTensor([0,i_item_index],i_item_values,[1,movie_features.shape[1]])
        #j_item = tf.SparseTensor([0,i_item_index],i_item_values,[1,movie_features.shape[1]])
        #yield (u,i,j,j_item,j_item)
        yield (u,i,j,i_item_row,i_item_col,i_item_values,j_item_row, j_item_col,j_item_values)
        
train_data = tf.data.Dataset.from_generator(gen,
                                                (tf.int32, tf.int32, tf.int32,
                                                 tf.int32, tf.int32, tf.float32,
                                                 tf.int32, tf.int32, tf.float32,))

In [140]:
i_features = build_sparse_tensor(i_item_row,i_item_col,i_item_values,shape=[tf.size(i),movie_features.shape[1]])
j_features = build_sparse_tensor(j_item_row,j_item_col,j_item_values,shape=[tf.size(j),movie_features.shape[1]])
user_factors = init_variable(len(users), num_factors, "user_factors") # V matrix
u_factors = tf.nn.embedding_lookup(user_factors, u)
# Known and unknown item embeddings
item_factors = init_variable(len(movies), num_factors, "item_factors") # V matrix
item_feature_weights = init_variable(movie_features.shape[1], num_factors, "item_feature_weights") # V matrix
i_identity_factors = tf.nn.embedding_lookup(item_factors, i)
j_identity_factors = tf.nn.embedding_lookup(item_factors, j)
i_features_factors = tf.sparse.sparse_dense_matmul(i_features,item_feature_weights)
j_features_factors = tf.sparse.sparse_dense_matmul(j_features,item_feature_weights)

i_factors = i_identity_factors + i_features_factors
j_factors = j_identity_factors + j_features_factors

# i and j bias embeddings.
item_bias = init_variable(len(movies), 1, "item_bias")
i_bias = tf.nn.embedding_lookup(item_bias, i)
i_bias = tf.squeeze(i_bias)
j_bias = tf.nn.embedding_lookup(item_bias, j)
j_bias = tf.squeeze(j_bias)

# Calculate the dot product + bias for known and unknown
# item to get xui and xuj.
ui = tf.reduce_sum(u_factors * i_factors, axis=1)
xui = i_bias + ui
uj = tf.reduce_sum(u_factors * j_factors, axis=1)
xuj = j_bias + uj

# We calculate xuij.
xuij = xui - xuj

RuntimeError: Attempting to capture an EagerTensor without building a function.

In [146]:

with graph.as_default():
    '''
    Loss function: 
    -SUM ln σ(xui - xuj) + λ(w1)**2 + λ(w2)**2 + λ(w3)**2 ...
    ln = the natural log
    σ(xuij) = the sigmoid function of xuij.
    λ = lambda regularization value.
    ||W||**2 = the squared L2 norm of our model parameters.
    
    '''
    train_data = tf.data.Dataset.from_generator(gen,
                                                (tf.int32, tf.int32, tf.int32,
                                                 tf.int32, tf.int32, tf.float32,
                                                 tf.int32, tf.int32, tf.float32,))
    #train_data = train_data.batch(samples)
    train_data = train_data.prefetch(4)
    iterator = train_data.make_initializable_iterator()
    movie_features_tf = convert_sparse_matrix_to_sparse_tensor(movie_features)
    u,i,j,i_item_row,i_item_col,i_item_values,j_item_row, j_item_col,j_item_values = iterator.get_next()
    
    i_features = build_sparse_tensor(i_item_row,i_item_col,i_item_values,shape=[tf.size(i),movie_features.shape[1]])
    j_features = build_sparse_tensor(j_item_row,j_item_col,j_item_values,shape=[tf.size(j),movie_features.shape[1]])

    # User feature embedding
    user_factors = init_variable(len(users), num_factors, "user_factors") # V matrix
    u_factors = tf.nn.embedding_lookup(user_factors, u)
    # Known and unknown item embeddings
    item_factors = init_variable(len(movies), num_factors, "item_factors") # V matrix
    item_feature_weights = init_variable(len(movie_features.shape[1]), num_factors, "item_feature_weights") # V matrix
    i_identity_factors = tf.nn.embedding_lookup(item_factors, i)
    j_identity_factors = tf.nn.embedding_lookup(item_factors, j)
    i_features_factors = tf.sparse.sparse_dense_matmul(i_features,item_feature_weights)
    j_features_factors = tf.sparse.sparse_dense_matmul(j_features,item_feature_weights)
    
    i_factors = i_identity_factors + i_features_factors
    j_factors = j_identity_factors + j_features_factors

    # i and j bias embeddings.
    item_bias = init_variable(len(movies), 1, "item_bias")
    i_bias = tf.nn.embedding_lookup(item_bias, i)
    i_bias = tf.squeeze(i_bias)
    j_bias = tf.nn.embedding_lookup(item_bias, j)
    j_bias = tf.squeeze(j_bias)

    # Calculate the dot product + bias for known and unknown
    # item to get xui and xuj.
    ui = tf.reduce_sum(u_factors * i_factors, axis=1)
    xui = i_bias + ui
    uj = tf.reduce_sum(u_factors * j_factors, axis=1)
    xuj = j_bias + uj

    # We calculate xuij.
    xuij = xui - xuj

    # Calculate the mean AUC (area under curve).
    # if xuij is greater than 0, that means that 
    # xui is greater than xuj (and thats what we want).
    u_auc = tf.reduce_mean(tf.cast(xuij > 0,tf.float32))

    # Output the AUC value to tensorboard for monitoring.
    tf.summary.scalar('auc', u_auc)

    # Calculate the squared L2 norm ||W||**2 multiplied by λ.
    l2_norm = tf.add_n([
        lambda_user * tf.reduce_sum(tf.multiply(u_factors, u_factors)),
        lambda_item * tf.reduce_sum(tf.multiply(i_factors, i_factors)),
        lambda_item * tf.reduce_sum(tf.multiply(j_factors, j_factors)),
        lambda_bias * tf.reduce_sum(tf.multiply(i_bias, i_bias)),
        lambda_bias * tf.reduce_sum(tf.multiply(j_bias, j_bias))
        ])

    # Calculate the loss as ||W||**2 - ln σ(Xuij)
    #loss = l2_norm - tf.reduce_mean(tf.log(tf.sigmoid(xuij)))
    loss = -tf.reduce_mean(tf.log(tf.sigmoid(xuij))) + l2_norm
    
    # Train using the Adam optimizer to minimize 
    # our loss function.
    opt = tf.train.AdamOptimizer(learning_rate=lr)
    step = opt.minimize(loss)

    # Initialize all tensorflow variables.
    init = tf.global_variables_initializer()

[[<tf.Tensor 'IteratorGetNext_3:3' shape=<unknown> dtype=int32>]
 [<tf.Tensor 'IteratorGetNext_3:4' shape=<unknown> dtype=int32>]]
Tensor("IteratorGetNext_3:5", dtype=float32)
[<tf.Tensor 'Size_2:0' shape=() dtype=int32>, 24]


ValueError: setting an array element with a sequence.

In [None]:
movie_features_tf

In [None]:
%%time
#------------------
# GRAPH EXECUTION
#------------------

# Run the session. 
session = tf.Session(config=None, graph=graph)
session.run(init)

# This has noting to do with tensorflow but gives
# us a nice progress bar for the training.
progress = tqdm(total=batches*epochs)
idx=np.arange(uids.shape[0])
for _ in range(epochs):
    session.run(iterator.initializer)

    # We run the session.
    _, l, auc = session.run([step, loss, u_auc])
    progress.update(batches)
    progress.set_description('Loss: %.3f | AUC: %.3f' % (l, auc))

progress.close()