# Setup

## Constants

In [1]:
# Dataset selector: either 'ML-100K' or 'ML-1M'
dataset = 'ML-100K'

# Per-user hold-out sizes consumed by the data loaders
TEST_USERS_RATINGS_COUNT = 6
VALIDATION_USERS_RATINGS_COUNT = 4

# Optimisation settings
EPOCHS = 10  # TODO - Increase this value to 100
BATCH_SIZE = 1
LEARNING_RATE = 2e-3
REGULARIZATION = 5e-2

# Matrix factorization: width of every embedding (number of latent concepts)
LATENT_DIM = 25

## Imports

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping

import numpy as np

from sklearn.metrics import root_mean_squared_error

import pickle

## Defining Data Loaders

In [3]:
def load_data_100k(path='./', delimiter='\t'):
    """Load the MovieLens-100K u1 files and split them per user by recency.

    The `u1.base` and `u1.test` files found under `path` are merged and
    ordered by their fourth column (timestamp). Ratings are then visited from
    newest to oldest: each user's first TEST_USERS_RATINGS_COUNT ratings go to
    the test set, the next VALIDATION_USERS_RATINGS_COUNT to the validation
    set, and all remaining ones to the training set.

    Args:
        path: Prefix prepended to the two file names (must end with a
            separator when it is a directory).
        delimiter: Field separator used in the files.

    Returns:
        Tuple `(n_m, n_u, train, validation, test)` where `n_m` / `n_u` are
        the numbers of distinct movie / user ids and the three arrays hold
        `(user_id - 1, item_id - 1, rating)` rows.
    """
    base_part = np.loadtxt(path + 'movielens_100k_u1.base', skiprows=0, delimiter=delimiter).astype('int32')
    test_part = np.loadtxt(path + 'movielens_100k_u1.test', skiprows=0, delimiter=delimiter).astype('int32')

    total = np.concatenate((base_part, test_part), axis=0)
    chronological_order = total[:, 3].argsort()  # ascending timestamp
    total = total[chronological_order]

    # Rank thresholds (1 = newest rating of a user) that delimit the splits.
    test_cutoff = TEST_USERS_RATINGS_COUNT
    validation_cutoff = VALIDATION_USERS_RATINGS_COUNT + TEST_USERS_RATINGS_COUNT

    seen_per_user = {}
    train_rows, validation_rows, test_rows = [], [], []
    for user_id, item_id, rating, _ in total[::-1]:  # newest rating first
        rank = seen_per_user.get(user_id, 0) + 1
        seen_per_user[user_id] = rank
        if rank <= test_cutoff:
            bucket = test_rows
        elif rank <= validation_cutoff:
            bucket = validation_rows
        else:
            bucket = train_rows
        # Shift the ids down by one so they start at 0.
        bucket.append((user_id - 1, item_id - 1, rating))

    train = np.array(train_rows)
    validation = np.array(validation_rows)
    test = np.array(test_rows)

    n_u = np.unique(total[:, 0]).size  # num of users
    n_m = np.unique(total[:, 1]).size  # num of movies
    n_train, n_validation, n_test = train.shape[0], validation.shape[0], test.shape[0]

    print('data matrix loaded')
    print('num of users: {}'.format(n_u))
    print('num of movies: {}'.format(n_m))
    print('num of training ratings: {}'.format(n_train))
    print('num of validation ratings: {}'.format(n_validation))
    print('num of test ratings: {}'.format(n_test))

    return n_m, n_u, train, validation, test

def load_data_1m(path='./', delimiter='::', test_count=None, validation_count=None):
    """Load the MovieLens-1M ratings file and split it per user by recency.

    Rows are ordered from the newest to the oldest timestamp (fourth column).
    Raw user and item ids are remapped to contiguous 0-based indices, then for
    every user the first `test_count` ratings encountered go to the test set,
    the next `validation_count` to the validation set and the rest to the
    training set.

    Args:
        path: Prefix prepended to 'movielens_1m_dataset.dat' (must end with a
            separator when it is a directory).
        delimiter: Field separator used in the file.
        test_count: Ratings per user reserved for testing. Defaults to the
            notebook constant TEST_USERS_RATINGS_COUNT.
        validation_count: Ratings per user reserved for validation. Defaults
            to the notebook constant VALIDATION_USERS_RATINGS_COUNT.

    Returns:
        Tuple `(n_m, n_u, train, validation, test)` where `n_m` / `n_u` are
        the numbers of distinct movie / user ids and the three arrays hold
        `(user_index, item_index, rating)` rows with indices in
        `[0, n_u)` and `[0, n_m)` respectively.
    """
    # Fall back to the notebook-level split sizes when no override is given.
    if test_count is None:
        test_count = TEST_USERS_RATINGS_COUNT
    if validation_count is None:
        validation_count = VALIDATION_USERS_RATINGS_COUNT

    data = np.genfromtxt(path+'movielens_1m_dataset.dat', skip_header=0, delimiter=delimiter).astype('int32')
    data = data[(-data[:,3]).argsort()]  # newest rating first

    unique_users = np.unique(data[:,0])
    unique_items = np.unique(data[:,1])
    n_u = unique_users.size  # num of users
    n_m = unique_items.size  # num of movies
    n_r = data.shape[0]  # num of ratings

    # Raw id -> contiguous 0-based index.
    user_dict = {raw_id: index for index, raw_id in enumerate(unique_users.tolist())}
    item_dict = {raw_id: index for index, raw_id in enumerate(unique_items.tolist())}

    users_ratings_counts = {}
    train, validation, test = [], [], []
    for row in data:
        user_id = user_dict[int(row[0])]
        item_id = item_dict[int(row[1])]
        rating = row[2]
        users_ratings_counts[user_id] = users_ratings_counts.get(user_id, 0) + 1
        # The remapped indices are already 0-based, so they are stored as-is;
        # subtracting 1 again would give the first user/item the index -1.
        sample = (user_id, item_id, rating)
        if users_ratings_counts[user_id] <= test_count:
            test.append(sample)
        elif users_ratings_counts[user_id] <= validation_count + test_count:
            validation.append(sample)
        else:
            train.append(sample)

    train, validation, test = np.array(train), np.array(validation), np.array(test)

    n_train = train.shape[0]  # num of training ratings
    n_validation = validation.shape[0]  # num of validation ratings
    n_test = test.shape[0]  # num of test ratings

    print('data matrix loaded')
    print('num of users: {}'.format(n_u))
    print('num of movies: {}'.format(n_m))
    print('num of ratings: {}'.format(n_r))
    print('num of training ratings: {}'.format(n_train))
    print('num of validation ratings: {}'.format(n_validation))
    print('num of test ratings: {}'.format(n_test))

    return n_m, n_u, train, validation, test

# Load Data

In [4]:
# Insert the path of a data directory by yourself (e.g., '/content/.../data')
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
data_path = 'data'
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._

# Data Load
try:
    if dataset == 'ML-100K':
        path = data_path + '/MovieLens_100K/'
        n_m, n_u, train, validation, test = load_data_100k(path=path, delimiter='\t')

    elif dataset == 'ML-1M':
        path = data_path + '/MovieLens_1M/'
        n_m, n_u, train, validation, test = load_data_1m(path=path, delimiter='::')

    else:
        raise ValueError(
            "Unknown dataset {!r}: expected 'ML-100K' or 'ML-1M'".format(dataset)
        )

except ValueError:
    print('Error: Unable to load data')
    # Re-raise instead of swallowing: the following cells need `train`,
    # `validation`, `test`, `n_u`, `n_m` and `path`, and would otherwise fail
    # with a NameError or silently reuse values left over in the kernel.
    raise

data matrix loaded
num of users: 943
num of movies: 1682
num of training ratings: 90570
num of validation ratings: 3772
num of test ratings: 5658


# Matrix Factorization

In [5]:
class MatrixFactorization(tf.keras.Model):
    """Biased matrix-factorization rating predictor with a custom training step.

    A rating is predicted as
    `avg_rating + user_bias + item_bias + sum(user_vec * item_vec)`.

    Args:
        num_users: Number of rows of the user embedding / user bias tables.
        num_items: Number of rows of the item embedding / item bias tables.
        avg_ratings: Constant added to every prediction (global rating offset).
        latent_dim: Width of the user and item embeddings.
        reg: Regularization strength, used both for the Keras layer
            regularizers and for the penalty computed in `l2_loss`.
    """

    def __init__(self, num_users, num_items, avg_ratings, latent_dim=LATENT_DIM, reg=REGULARIZATION):
        super().__init__()
        # NOTE: the `embeddings_regularizer` penalties below are not read by
        # `train_step`, which builds its loss exclusively through `l2_loss`.
        self.user_emb = Embedding(num_users, latent_dim,
                                  embeddings_regularizer=l2(reg),
                                  name="user_embedding")
        self.item_emb = Embedding(num_items, latent_dim,
                                  embeddings_regularizer=l2(reg),
                                  name="item_embedding")
        self.user_bias = Embedding(num_users, 1,
                                   embeddings_regularizer=l2(reg),
                                   name="user_bias")
        self.item_bias = Embedding(num_items, 1,
                                   embeddings_regularizer=l2(reg),
                                   name="item_bias")
        self.avg_rating = avg_ratings
        # Kept so that `l2_loss` honours the strength passed to the constructor
        # instead of the notebook-level REGULARIZATION constant.
        self.reg_strength = reg

    def call(self, inputs):
        """Return the predicted rating for each `(user, item)` index pair.

        Args:
            inputs: Pair `(user, item)` of integer index tensors.
                Assumed to be 1-D of equal length - TODO confirm with callers.
        """
        user, item = inputs
        user_vec = self.user_emb(user)
        item_vec = self.item_emb(item)
        dot_product = tf.reduce_sum(user_vec * item_vec, axis=1)

        # Drop only the trailing size-1 bias axis; a bare tf.squeeze would
        # also drop the batch axis whenever the batch holds a single sample.
        bias = (
            tf.squeeze(self.user_bias(user), axis=-1) +
            tf.squeeze(self.item_bias(item), axis=-1)
        )
        return dot_product + self.avg_rating + bias

    def l2_loss(self, y_true, y_pred, user, item):
        """Per-sample squared error plus an L2 penalty on the rows involved.

        The penalty covers the user/item embedding vectors and the user/item
        biases looked up for `user` and `item`, scaled by the regularization
        strength given to the constructor.
        """
        squared_error = tf.square(y_true - y_pred)
        user_vec, item_vec = self.user_emb(user), self.item_emb(item)
        user_vec_norm = tf.reduce_sum(tf.square(user_vec), axis=1)
        item_vec_norm = tf.reduce_sum(tf.square(item_vec), axis=1)
        user_bias, item_bias = self.user_bias(user), self.item_bias(item)
        user_bias_norm = tf.squeeze(tf.square(user_bias), axis=-1)
        item_bias_norm = tf.squeeze(tf.square(item_bias), axis=-1)
        reg_loss = self.reg_strength * (user_vec_norm + item_vec_norm + user_bias_norm + item_bias_norm)
        return squared_error + reg_loss

    def train_step(self, data):
        """Run one optimizer update on a batch and report its mean loss.

        Args:
            data: `((user, item), y_true)` as produced by `fit`.

        Returns:
            Dict with a scalar `"loss"` entry (mean of the per-sample losses).
        """
        (user, item), y_true = data

        with tf.GradientTape() as tape:
            y_pred = self((user, item), training=True)
            loss = self.l2_loss(y_true, y_pred, user, item)

        # Differentiating the per-sample vector yields the gradient of its sum,
        # exactly as before; only the logged value below is reduced.
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        return {"loss": tf.reduce_mean(loss)}

In [6]:
# Split every (user, item, rating) array into its columns; ratings become float32.
train_users, train_items = train[:, 0], train[:, 1]
train_ratings = train[:, 2].astype(np.float32)

validation_users, validation_items = validation[:, 0], validation[:, 1]
validation_ratings = validation[:, 2].astype(np.float32)

test_users, test_items = test[:, 0], test[:, 1]
test_ratings = test[:, 2].astype(np.float32)

# Mean training rating, passed to the model as its constant offset.
train_avg_rating = train_ratings.mean()

In [7]:
# Build the model and train it with plain SGD on the training split.
model = MatrixFactorization(num_users=n_u, num_items=n_m, avg_ratings=train_avg_rating)
model.compile(optimizer=SGD(learning_rate=LEARNING_RATE))

# Validation monitoring / early stopping are currently disabled:
#   validation_data=([validation_users, validation_items], validation_ratings),
#   callbacks=[EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)],
history = model.fit(
    x=[train_users, train_items],
    y=train_ratings,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
)

Epoch 1/10
[1m90570/90570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 761us/step - loss: 1.0913
Epoch 2/10
[1m90570/90570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 767us/step - loss: 0.9703
Epoch 3/10
[1m90570/90570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 786us/step - loss: 0.9318
Epoch 4/10
[1m90570/90570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 760us/step - loss: 0.9118
Epoch 5/10
[1m90570/90570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 761us/step - loss: 0.8992
Epoch 6/10
[1m90570/90570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 793us/step - loss: 0.8904
Epoch 7/10
[1m90570/90570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 774us/step - loss: 0.8838
Epoch 8/10
[1m90570/90570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 790us/step - loss: 0.8788
Epoch 9/10
[1m90570/90570[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 790us/step - loss: 0.8747
Epoch 10/10
[1m90570/90570

In [8]:
# Predict the held-out test ratings and bound them to the 1-5 range.
test_ratings_predicted = np.clip(model.predict([test_users, test_items]), 1, 5)

# Report the root mean squared error on the test split.
test_rmse = root_mean_squared_error(test_ratings, test_ratings_predicted)
print(f"Test RMSE: {test_rmse}")

[1m177/177[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
Test RMSE: 1.0371140241622925


In [9]:
# Unpack the learned parameters.
# NOTE(review): relies on get_weights() returning them in this order
# (presumably the layer creation order) - confirm.
users_matrix, items_matrix, user_biases, item_biases = model.get_weights()

# Dense user x item table of predictions, bounded to the 1-5 range.
ratings_matrix = np.clip(
    users_matrix.dot(items_matrix.T) + user_biases + item_biases.T + train_avg_rating,
    1,
    5,
)

# Persist the prediction table next to the dataset files.
with open(path + "mf_prediction.pickle", 'wb') as out_file:
    pickle.dump(ratings_matrix, out_file)