# Setup

## Constants

In [1]:
# Select a dataset among 'ML-100K' and 'ML-1M'
dataset = 'ML-100K'
ML_1M_TEST_SIZE = 0.1

# Model hyperparameters
BATCH_SIZE = 1
LEARNING_RATE = 0.002
REGULARIZATION = 0.05
EPOCHS = 10 # TODO - Increase this value to 100

# Matrix factorization hyperparameters
LATENT_DIM = 25 # Concepts count

# Session
SESSION_TIME_GAP_SEC = 604800 # 7 days

## Imports

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD

import numpy as np

from sklearn.metrics import root_mean_squared_error

import pickle

## Defining Data Loaders

In [3]:
def load_data_100k(path='./', delimiter='\t'):
    train = np.loadtxt(path+'movielens_100k_u1.base', skiprows=0, delimiter=delimiter).astype('int32')
    test = np.loadtxt(path+'movielens_100k_u1.test', skiprows=0, delimiter=delimiter).astype('int32')

    total = np.concatenate((train, test), axis=0)
    test_size = len(test)
    total = total[total[:,3].argsort()] # Sort by timestamp
    
    train = total[:-test_size]
    test = total[-test_size:]

    train_users, train_items = set(train[:, 0]), set(train[:, 1])
    test = test[[(test_record[0] in train_users and test_record[1] in train_items) for test_record in test]]

    n_u = np.unique(train[:,0]).size  # num of users
    n_m = np.unique(train[:,1]).size  # num of movies
    n_train = train.shape[0]  # num of training ratings
    n_test = test.shape[0]  # num of test ratings

    user_id_dict = {}
    for i, user_id in enumerate(np.unique(train[:,0]).tolist()):
        user_id_dict[user_id] = i
    
    item_id_dict = {}
    for i, item_id in enumerate(np.unique(train[:,1]).tolist()):
        item_id_dict[item_id] = i

    train = np.array([(user_id_dict[record[0]], item_id_dict[record[1]], record[2], record[3]) for record in train])
    test = np.array([(user_id_dict[record[0]], item_id_dict[record[1]], record[2], record[3]) for record in test])

    print('data matrix loaded')
    print('num of users: {}'.format(n_u))
    print('num of movies: {}'.format(n_m))
    print('num of training ratings: {}'.format(n_train))
    print('num of test ratings: {}'.format(n_test))

    return n_m, n_u, train, test

def load_data_1m(path='./', delimiter='::', test_size=ML_1M_TEST_SIZE):
    data = np.genfromtxt(path+'movielens_1m_dataset.dat', skip_header=0, delimiter=delimiter).astype('int32')
    data = data[(-data[:,3]).argsort()]

    n_u = np.unique(data[:,0]).size  # num of users
    n_m = np.unique(data[:,1]).size  # num of movies
    n_r = data.shape[0]  # num of ratings

    user_dict = {}
    for i, user_id in enumerate(np.unique(data[:,0]).tolist()):
        user_dict[user_id] = i
    item_dict = {}
    for i, item_id in enumerate(np.unique(data[:,1]).tolist()):
        item_dict[item_id] = i

    idx = np.arange(n_r)
    
    train, test = [], []
    training_users = set()
    
    for i in range(n_r - 1, -1, -1):
        user_id = user_dict[data[idx[i], 0]]
        item_id = item_dict[data[idx[i], 1]]
        rating = data[idx[i], 2]
        timestamp = data[idx[i], 3]
        if i < int(test_size * n_r): # test set
            if user_id not in training_users:
                continue
            test.append((user_id - 1, item_id, rating, timestamp))
        else: # training set
            training_users.add(user_id)
            train.append((user_id, item_id, rating, timestamp))

    train, test = np.array(train), np.array(test)

    n_train = train.shape[0]  # num of training ratings
    n_test = test.shape[0]  # num of test ratings

    print('data matrix loaded')
    print('num of users: {}'.format(n_u))
    print('num of movies: {}'.format(n_m))
    print('num of ratings: {}'.format(n_r))
    print('num of training ratings: {}'.format(n_train))
    print('num of test ratings: {}'.format(n_test))

    return n_m, n_u, train, test

# Load Data

In [4]:
# Insert the path of a data directory by yourself (e.g., '/content/.../data')
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
data_path = 'data'
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._

# Data Load
try:
    if dataset == 'ML-100K':
        path = data_path + '/MovieLens_100K/'
        n_m, n_u, train, test = load_data_100k(path=path, delimiter='\t')

    elif dataset == 'ML-1M':
        path = data_path + '/MovieLens_1M/'
        n_m, n_u, train, test = load_data_1m(path=path, delimiter='::')

    else:
        raise ValueError

except ValueError as e:
    print('Error: Unable to load data')

data matrix loaded
num of users: 751
num of movies: 1616
num of training ratings: 80000
num of test ratings: 2863


# Session initializations

In [5]:
def divide_to_sessions(data: list[tuple[int, int, int, int]]) -> dict[tuple[int, int], list[list[int]]]:
    user_sessions_dict = {}
    for user_id, item_id, _, timestamp in data:
        user_id, item_id, timestamp = int(user_id), int(item_id), int(timestamp)
        if user_id not in user_sessions_dict:
            user_sessions_dict[user_id] = []
        if len(user_sessions_dict[user_id]) == 0 or timestamp - user_sessions_dict[user_id][-1][-1][1] > SESSION_TIME_GAP_SEC:
            user_sessions_dict[user_id].append([])
        user_sessions_dict[user_id][-1].append((item_id, timestamp))
    for user_id, user_sessions in user_sessions_dict.items():
        for i, user_session in enumerate(user_sessions):
            user_sessions_dict[user_id][i] = [item_id for item_id, _ in user_session]
    user_item_sessions_dict = {}
    for user_id, user_sessions in user_sessions_dict.items():
        for user_session in user_sessions:
            for item_id in user_session:
                user_item_sessions_dict[(user_id, item_id)] = tuple(user_session)
    return user_item_sessions_dict

In [6]:
sessions_dict = divide_to_sessions(train)

# Matrix Factorization

In [7]:
class MatrixFactorization(tf.keras.Model):
    def __init__(self, num_users, num_items, avg_ratings, latent_dim=LATENT_DIM, reg=REGULARIZATION):
        super().__init__()
        self.reg = reg
        self.user_emb = Embedding(num_users, latent_dim,
                                  embeddings_regularizer=l2(reg),
                                  name="user_embedding")
        self.item_emb = Embedding(num_items, latent_dim,
                                  embeddings_regularizer=l2(reg),
                                  name="item_embedding")
        self.avg_rating = avg_ratings
        self.user_bias = Embedding(num_users, 1,
                                   embeddings_regularizer=l2(reg),
                                   name="user_bias")
        self.item_bias = Embedding(num_items, 1,
                                   embeddings_regularizer=l2(reg),
                                   name="item_bias")
        self.session_bias = Embedding(len(set(sessions_dict.values())), 1,
                                      embeddings_regularizer=l2(reg),
                                      name="session_bias")
        print("finish init")

    def get_session_predict(self, user_vec, session_items):
        session_items = tf.boolean_mask(session_items, session_items != -1)
        session_items_vecs = self.item_emb(session_items)
        
        session_items_scores = tf.reduce_sum(user_vec[:, None, :] * session_items_vecs, axis=2)

        return tf.reduce_mean(session_items_scores, axis=1)

    def call(self, inputs):
        user, item, session_items = inputs
        user_vec = self.user_emb(user)
        item_vec = self.item_emb(item)
        dot_product = tf.reduce_sum(user_vec * item_vec, axis=1)
        session_predict = self.get_session_predict(user_vec, session_items)
        bias = (
            tf.squeeze(self.user_bias(user)) +
            tf.squeeze(self.item_bias(item)) +
            tf.squeeze(self.session_bias(session_predict))
        )
        return dot_product + self.avg_rating + bias
    
    def l2_loss(self, y_true, y_pred, user, item, session_items):
        squared_error = tf.square(y_true - y_pred)
        user_vec, item_vec = self.user_emb(user), self.item_emb(item)
        user_vec_norm = tf.reduce_sum(tf.square(user_vec), axis=1)
        item_vec_norm = tf.reduce_sum(tf.square(item_vec), axis=1)
        user_bias = tf.squeeze(self.user_bias(user))
        item_bias = tf.squeeze(self.item_bias(item))
        session_bias = tf.squeeze(self.session_bias(self.get_session_predict(user_vec, session_items)))
        user_bias_norm = tf.square(user_bias)
        item_bias_norm = tf.square(item_bias)
        session_norm = tf.square(session_bias)

        reg_loss = self.reg * (user_vec_norm + item_vec_norm + user_bias_norm + item_bias_norm + session_norm)
        return squared_error + reg_loss
    
    def train_step(self, data):
        (user, item, session_items), y_true = data

        with tf.GradientTape() as tape:
            y_pred = self((user, item, session_items), training=True)
            loss = self.l2_loss(y_true, y_pred, user, item, session_items)

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        return {"loss": loss}

In [8]:
train_users, train_items, train_ratings = train[:,0], train[:,1], np.float32(train[:,2])
train_sessions_items = [sessions_dict[(row[0], row[1])] for row in train]
max_session_length = max(len(session_items) for session_items in train_sessions_items)
train_sessions_items = np.array([session_items + (-1, ) * (max_session_length - len(session_items)) for session_items in train_sessions_items])
test_users, test_items, test_ratings = test[:,0], test[:,1], np.float32(test[:,2])

train_avg_rating = np.mean(train_ratings)

In [9]:
example_session_items = np.array([1,2,3, -1])
example_session_tensor = tf.constant(example_session_items, dtype=tf.int32)
example_tensor_no_negative = tf.boolean_mask(example_session_tensor, example_session_tensor != 0)
print(example_tensor_no_negative)

tf.Tensor([ 1  2  3 -1], shape=(4,), dtype=int32)


In [10]:
model = MatrixFactorization(n_u, n_m, train_avg_rating)
model.compile(optimizer=SGD(learning_rate=LEARNING_RATE))
history = model.fit(
    [train_users, train_items, train_sessions_items], train_ratings,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1
)

finish init
Epoch 1/10
[1m80000/80000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 881us/step - loss: 1.1101
Epoch 2/10
[1m80000/80000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 925us/step - loss: 0.9832
Epoch 3/10
[1m80000/80000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 885us/step - loss: 0.9419
Epoch 4/10
[1m80000/80000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 925us/step - loss: 0.9207
Epoch 5/10
[1m80000/80000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 900us/step - loss: 0.9072
Epoch 6/10
[1m80000/80000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 905us/step - loss: 0.8979
Epoch 7/10
[1m80000/80000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 935us/step - loss: 0.8911
Epoch 8/10
[1m80000/80000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 952us/step - loss: 0.8861
Epoch 9/10
[1m80000/80000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 947us/step - loss: 0.8818
Epoch 10/10
[1m

In [12]:
users_matrix, items_matrix, user_biases, item_biases, avg_rating = model.get_weights()
ratings_matrix = np.dot(users_matrix, items_matrix.T) + user_biases + item_biases.T + train_avg_rating
ratings_matrix = np.clip(ratings_matrix, 1, 5)

# Save the model
with open(path + "mf_prediction.pickle", 'wb') as f:
    pickle.dump(ratings_matrix, f)

In [None]:
# Get Test Score
# test_ratings_predicted = np.clip(model.predict([test_users, test_items]), 1, 5) # predict using the model
test_ratings_predicted = np.array([ratings_matrix[test_user, test_item] for test_user, test_item in zip(test_users, test_items)]) # predict using the matrix


# check test rmse
test_rmse = root_mean_squared_error(test_ratings, test_ratings_predicted)
print(f"Test RMSE: {test_rmse}")

Test RMSE: 1.0086641311645508
