# Setup

## Constants

In [1]:
# Select a dataset among 'ML-100K' and 'ML-1M'
dataset = 'ML-100K'
ML_1M_TEST_SIZE = 0.1

# Model hyperparameters
BATCH_SIZE = 1
LEARNING_RATE = 0.002
REGULARIZATION = 0.05
EPOCHS = 10 # TODO - Increase this value to 100

# Matrix factorization hyperparameters
LATENT_DIM = 25 # Concepts count

# Session
SESSION_TIME_GAP_SEC = 604800 # 7 days

## Imports

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD

import numpy as np

from sklearn.metrics import root_mean_squared_error

import pickle

## Defining Data Loaders

In [3]:
def load_data_100k(path='./', delimiter='\t'):
    train = np.loadtxt(path+'movielens_100k_u1.base', skiprows=0, delimiter=delimiter).astype('int32')
    test = np.loadtxt(path+'movielens_100k_u1.test', skiprows=0, delimiter=delimiter).astype('int32')

    total = np.concatenate((train, test), axis=0)
    test_size = len(test)
    total = total[total[:,3].argsort()] # Sort by timestamp
    
    train = total[:-test_size]
    test = total[-test_size:]

    train_users, train_items = set(train[:, 0]), set(train[:, 1])
    test = test[[(test_record[0] in train_users and test_record[1] in train_items) for test_record in test]]

    n_u = np.unique(train[:,0]).size  # num of users
    n_m = np.unique(train[:,1]).size  # num of movies
    n_train = train.shape[0]  # num of training ratings
    n_test = test.shape[0]  # num of test ratings

    user_id_dict = {}
    for i, user_id in enumerate(np.unique(train[:,0]).tolist()):
        user_id_dict[user_id] = i
    
    item_id_dict = {}
    for i, item_id in enumerate(np.unique(train[:,1]).tolist()):
        item_id_dict[item_id] = i

    train = np.array([(user_id_dict[record[0]], item_id_dict[record[1]], record[2], record[3]) for record in train])
    test = np.array([(user_id_dict[record[0]], item_id_dict[record[1]], record[2], record[3]) for record in test])

    print('data matrix loaded')
    print('num of users: {}'.format(n_u))
    print('num of movies: {}'.format(n_m))
    print('num of training ratings: {}'.format(n_train))
    print('num of test ratings: {}'.format(n_test))

    return n_m, n_u, train, test

def load_data_1m(path='./', delimiter='::', test_size=ML_1M_TEST_SIZE):
    data = np.genfromtxt(path+'movielens_1m_dataset.dat', skip_header=0, delimiter=delimiter).astype('int32')
    data = data[(-data[:,3]).argsort()]

    n_u = np.unique(data[:,0]).size  # num of users
    n_m = np.unique(data[:,1]).size  # num of movies
    n_r = data.shape[0]  # num of ratings

    user_dict = {}
    for i, user_id in enumerate(np.unique(data[:,0]).tolist()):
        user_dict[user_id] = i
    item_dict = {}
    for i, item_id in enumerate(np.unique(data[:,1]).tolist()):
        item_dict[item_id] = i

    idx = np.arange(n_r)
    
    train, test = [], []
    training_users = set()
    
    for i in range(n_r - 1, -1, -1):
        user_id = user_dict[data[idx[i], 0]]
        item_id = item_dict[data[idx[i], 1]]
        rating = data[idx[i], 2]
        timestamp = data[idx[i], 3]
        if i < int(test_size * n_r): # test set
            if user_id not in training_users:
                continue
            test.append((user_id - 1, item_id, rating, timestamp))
        else: # training set
            training_users.add(user_id)
            train.append((user_id, item_id, rating, timestamp))

    train, test = np.array(train), np.array(test)

    n_train = train.shape[0]  # num of training ratings
    n_test = test.shape[0]  # num of test ratings

    print('data matrix loaded')
    print('num of users: {}'.format(n_u))
    print('num of movies: {}'.format(n_m))
    print('num of ratings: {}'.format(n_r))
    print('num of training ratings: {}'.format(n_train))
    print('num of test ratings: {}'.format(n_test))

    return n_m, n_u, train, test

# Load Data

In [4]:
# Insert the path of a data directory by yourself (e.g., '/content/.../data')
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
data_path = 'data'
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._

# Data Load
try:
    if dataset == 'ML-100K':
        path = data_path + '/MovieLens_100K/'
        n_m, n_u, train, test = load_data_100k(path=path, delimiter='\t')

    elif dataset == 'ML-1M':
        path = data_path + '/MovieLens_1M/'
        n_m, n_u, train, test = load_data_1m(path=path, delimiter='::')

    else:
        raise ValueError

except ValueError as e:
    print('Error: Unable to load data')

data matrix loaded
num of users: 751
num of movies: 1616
num of training ratings: 80000
num of test ratings: 2863


# Session initializations

In [5]:
def divide_to_sessions(data: list[tuple[int, int, int, int]]) -> dict[tuple[int, int], list[list[int]]]:
    user_sessions_dict = {}
    for user_id, item_id, _, timestamp in data:
        user_id, item_id, timestamp = int(user_id), int(item_id), int(timestamp)
        if user_id not in user_sessions_dict:
            user_sessions_dict[user_id] = []
        if len(user_sessions_dict[user_id]) == 0 or timestamp - user_sessions_dict[user_id][-1][-1][1] > SESSION_TIME_GAP_SEC:
            user_sessions_dict[user_id].append([])
        user_sessions_dict[user_id][-1].append((item_id, timestamp))
    for user_id, user_sessions in user_sessions_dict.items():
        for i, user_session in enumerate(user_sessions):
            user_sessions_dict[user_id][i] = [item_id for item_id, _ in user_session]
    user_item_sessions_dict = {}
    for user_id, user_sessions in user_sessions_dict.items():
        for user_session in user_sessions:
            for item_id in user_session:
                user_item_sessions_dict[(user_id, item_id)] = tuple(user_session)
    return user_item_sessions_dict

In [6]:
sessions_dict = divide_to_sessions(train)

# Matrix Factorization

In [43]:
class MatrixFactorization(tf.keras.Model):
    def __init__(self, num_users, num_items, avg_ratings, sessions_dict, latent_dim=LATENT_DIM, reg=REGULARIZATION):
        super().__init__()
        self.reg = reg
        self.user_emb = Embedding(num_users, latent_dim,
                                  embeddings_regularizer=l2(reg),
                                  name="user_embedding")
        self.item_emb = Embedding(num_items, latent_dim,
                                  embeddings_regularizer=l2(reg),
                                  name="item_embedding")
        self.avg_rating = avg_ratings
        self.user_bias = Embedding(num_users, 1,
                                   embeddings_regularizer=l2(reg),
                                   name="user_bias")
        self.item_bias = Embedding(num_items, 1,
                                   embeddings_regularizer=l2(reg),
                                   name="item_bias")
        self.session_bias = Embedding(len(set(sessions_dict.values())), 1,
                                      embeddings_regularizer=l2(reg),
                                      name="session_bias")
        
        keys = [f"{user}_{item}" for user, item in sessions_dict.keys()]
        sessions = [",".join(map(str, session)) for session in sessions_dict.values()]
        print(sessions[:5])
        keys, sessions = tf.constant(keys), tf.constant(sessions)
        print(keys.shape, sessions.shape)

        self.sessions_dict = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(keys, sessions),
            default_value=""
)

    # def get_session_predict(self, user, item):
    #     key = tf.strings.join([tf.strings.format('{}', user), tf.strings.format('{}', item)], separator="_")
    #     session_items = self.sessions_dict.lookup(key)
    #     session_items = tf.strings.split(session_items, sep=",")
    #     session_items = tf.map_fn(lambda x: tf.cast(x, tf.int32), session_items)
    #     session_items_embeddings = tf.map_fn(self.item_emb, session_items)
    #     items_predicts = tf.map_fn(lambda session_item_vec: tf.reduce_sum(self.user_emb(user) * session_item_vec, axis=1), session_items_embeddings)
    #     return tf.reduce_mean(items_predicts, axis=0)

    def get_session_predict(self, user, item):
        key = tf.strings.join([tf.strings.format('{}', user), tf.strings.format('{}', item)], separator="_")
        session_items = self.sessions_dict.lookup(key)  # Get the session string
        
        session_items = tf.strings.split(session_items, sep=",")  # Split into string tensors
        session_items = tf.strings.to_number(session_items, out_type=tf.int32)  # Convert to int tensor

        session_items_float = tf.cast(session_items, tf.float32)  # Ensure compatibility with embeddings
        session_items_embeddings = tf.map_fn(self.item_emb, session_items_float)  # Get embeddings

        user_embedding = self.user_emb(user)  # Get user embedding (assumed float32)

        items_predicts = tf.map_fn(
            lambda session_item_vec: tf.reduce_sum(user_embedding * session_item_vec, axis=1),
            session_items_embeddings
        )

        return tf.reduce_mean(items_predicts, axis=0)  # Final prediction



    def call(self, inputs):
        user, item = inputs
        user_vec = self.user_emb(user)
        print(user_vec)
        item_vec = self.item_emb(item)
        print(item_vec)
        dot_product = tf.reduce_sum(user_vec * item_vec, axis=1)
        session_predict = self.get_session_predict(user, item)
        bias = (
            tf.squeeze(self.user_bias(user)) +
            tf.squeeze(self.item_bias(item)) +
            tf.squeeze(self.session_bias(session_predict))
        )
        return dot_product + self.avg_rating + bias
    
    def l2_loss(self, y_true, y_pred, user, item):
        squared_error = tf.square(y_true - y_pred)
        user_vec, item_vec = self.user_emb(user), self.item_emb(item)
        user_vec_norm = tf.reduce_sum(tf.square(user_vec), axis=1)
        item_vec_norm = tf.reduce_sum(tf.square(item_vec), axis=1)
        user_bias = tf.squeeze(self.user_bias(user))
        item_bias = tf.squeeze(self.item_bias(item))
        session_bias = tf.squeeze(self.session_bias(self.get_session_predict(user, item)))
        user_bias_norm = tf.square(user_bias)
        item_bias_norm = tf.square(item_bias)
        session_norm = tf.square(session_bias)

        reg_loss = self.reg * (user_vec_norm + item_vec_norm + user_bias_norm + item_bias_norm + session_norm)
        return squared_error + reg_loss
    
    def train_step(self, data):
        (user, item), y_true = data

        with tf.GradientTape() as tape:
            y_pred = self((user, item), training=True)
            loss = self.l2_loss(y_true, y_pred, user, item)

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        return {"loss": loss}

In [44]:
train_users, train_items, train_ratings = train[:,0], train[:,1], np.float32(train[:,2])
test_users, test_items, test_ratings = test[:,0], test[:,1], np.float32(test[:,2])

train_avg_rating = np.mean(train_ratings)

In [45]:
model = MatrixFactorization(n_u, n_m, train_avg_rating, sessions_dict)
model.compile(optimizer=SGD(learning_rate=LEARNING_RATE))
history = model.fit(
    [train_users, train_items], train_ratings,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1
)

['254,285,297,184,172,107,768,287,914,116,199,404,1060,175,356,209,180,316,97,11,96', '254,285,297,184,172,107,768,287,914,116,199,404,1060,175,356,209,180,316,97,11,96', '254,285,297,184,172,107,768,287,914,116,199,404,1060,175,356,209,180,316,97,11,96', '254,285,297,184,172,107,768,287,914,116,199,404,1060,175,356,209,180,316,97,11,96', '254,285,297,184,172,107,768,287,914,116,199,404,1060,175,356,209,180,316,97,11,96']
(80000,) (80000,)
Epoch 1/10
Tensor("user_embedding_1/GatherV2:0", shape=(1, 25), dtype=float32)
Tensor("item_embedding_1/GatherV2:0", shape=(1, 25), dtype=float32)
Tensor("matrix_factorization_12_1/user_embedding_1/GatherV2:0", shape=(1, 25), dtype=float32)
Tensor("matrix_factorization_12_1/item_embedding_1/GatherV2:0", shape=(1, 25), dtype=float32)
Tensor("matrix_factorization_12_1/user_embedding_1/GatherV2:0", shape=(1, 25), dtype=float32)
Tensor("matrix_factorization_12_1/item_embedding_1/GatherV2:0", shape=(1, 25), dtype=float32)


InvalidArgumentError: Graph execution error:

Detected at node StringToNumber defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\ipykernel\kernelapp.py", line 701, in start

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\tornado\platform\asyncio.py", line 195, in start

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\asyncio\windows_events.py", line 321, in run_forever

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\asyncio\base_events.py", line 608, in run_forever

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\asyncio\base_events.py", line 1936, in _run_once

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\asyncio\events.py", line 84, in _run

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\ipykernel\kernelbase.py", line 534, in dispatch_queue

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\ipykernel\kernelbase.py", line 523, in process_one

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\ipykernel\kernelbase.py", line 429, in dispatch_shell

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\ipykernel\kernelbase.py", line 767, in execute_request

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\ipykernel\ipkernel.py", line 429, in do_execute

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3051, in run_cell

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3106, in _run_cell

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3311, in run_cell_async

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3493, in run_ast_nodes

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3553, in run_code

  File "C:\Users\t-yzelinger\AppData\Local\Temp\ipykernel_28600\1484275807.py", line 3, in <module>

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 368, in fit

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 216, in function

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 129, in multi_step_on_iterator

  File "c:\Users\t-yzelinger\AppData\Local\anaconda3\envs\school_venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 110, in one_step_on_data

  File "C:\Users\t-yzelinger\AppData\Local\Temp\ipykernel_28600\3907597624.py", line 98, in train_step

  File "C:\Users\t-yzelinger\AppData\Local\Temp\ipykernel_28600\3907597624.py", line 85, in l2_loss

  File "C:\Users\t-yzelinger\AppData\Local\Temp\ipykernel_28600\3907597624.py", line 47, in get_session_predict

StringToNumberOp could not correctly convert string: 
	 [[{{node StringToNumber}}]] [Op:__inference_multi_step_on_iterator_16137]

In [None]:
test_ratings_predicted = model.predict([test_users, test_items])
test_ratings_predicted = np.clip(test_ratings_predicted, 1, 5)

# check test rmse
test_rmse = root_mean_squared_error(test_ratings, test_ratings_predicted)
print(f"Test RMSE: {test_rmse}")

In [None]:
users_matrix, items_matrix, user_biases, item_biases = model.get_weights()
ratings_matrix = np.dot(users_matrix, items_matrix.T) + user_biases + item_biases.T + train_avg_rating
ratings_matrix = np.clip(ratings_matrix, 1, 5)

# Save the model
with open(path + "mf_prediction.pickle", 'wb') as f:
    pickle.dump(ratings_matrix, f)