In [24]:
!pip -q install fast_map

In [25]:
import os

from fast_map import fast_map
import numpy as np
import scipy.sparse as sp
from sklearn.linear_model import ElasticNet
from tqdm import tqdm
import pandas as pd
from matplotlib import pyplot as plt


class RecommenderModel:
    def __init__(self):
        self.urm: sp.csr_matrix | None = None
        self.icm: sp.csr_matrix | None = None
        self.urm_pred: sp.csr_matrix | None = None

    def fit(self, urm: sp.csr_matrix, icm: sp.csr_matrix, urm_val: sp.csr_matrix, progress_bar: bool = True, **kwargs) -> None:
        """Fits (trains) the model on the given URM and (or) ICM, depending on the algorithm. To be overridden in
        subclasses.

        :param urm: User Ratings Matrix for training
        :type urm: sp.csr_matrix
        :param icm: Item Content Matrix
        :type icm: sp.csr_matrix
        :param urm_val: User Ratings Matrix for validation
        :type urm_val: sp.csr_matrix
        :param progress_bar: If true, progress bar will be shown (if implemented if subclass)
        :type progress_bar: bool
        """
        raise NotImplementedError

    def recommend(self, user_id: int, at: int = 10) -> np.ndarray:
        """Gives the top {at} recommended items for this user.

        :param user_id: ID of the user to recommend to
        :type user_id: int
        :param at: Number of items to recommend
        :type at: int
        :return: The {at} most relevant recommended items
        :rtype: np.ndarray
        """
        recommendations_predictions = self._get_recommendations_predictions(user_id).astype(np.float32)
        self._exclude_seen_items(user_id, recommendations_predictions)

        top_n_ratings_idx = np.argpartition(-recommendations_predictions, at)[:at]
        top_n_ratings = recommendations_predictions[top_n_ratings_idx]

        return top_n_ratings_idx[
            np.argsort(-top_n_ratings)
        ]

    def _get_recommendations_predictions(self, user_id: int) -> np.ndarray:
        """Gives the recommendations predictions for a given user, which are the probabilities or top-n (the higher,
        the better) that the items should be recommended to the user. It should be overridden in some subclasses

        :param user_id: ID of the user to recommend to
        :type user_id: int
        :return: The recommendations predictions for all the items of the urm
        :rtype: np.ndarray
        """
        if isinstance(self.urm_pred, sp.spmatrix):
            return self.urm_pred[user_id].toarray().ravel()
        elif isinstance(self.urm_pred, np.ndarray):
            return self.urm_pred[user_id]
        else:
            raise "Unknown type of urm predictions"

    def _exclude_seen_items(self, user_id: int, predicted_ratings: np.ndarray) -> None:
        """Excludes the items the user has already seen in the predicted ratings list. In-place operation!

        :param user_id: The id of the user
        :type user_id: int
        :param predicted_ratings: The predicted ratings of items for a user
        :type predicted_ratings: np.ndarray
        """
        seen_items = self.urm.indices[self.urm.indptr[user_id]:self.urm.indptr[user_id + 1]]
        predicted_ratings[seen_items] = -np.inf




def open_dataset() -> tuple[sp.csr_matrix, sp.csr_matrix]:
	"""Opens the dataset (URM and ICM matrices) into sparse matrices

	:return: The URM and ICM as sparse matrices
	:rtype: tuple[sp.csr_matrix, sp.csr_matrix]
	"""
	train = pd.read_csv("./data_train.csv")
	icm_metadata = pd.read_csv("./data_ICM_metadata.csv")
	urm = sp.csr_matrix((train['data'], (train['user_id'], train['item_id']))).astype(np.float32)
	icm = sp.csr_matrix((icm_metadata['data'], (icm_metadata['item_id'], icm_metadata['feature_id']))).astype(np.float32)
	return urm, icm


def train_test_split(urm: sp.csr_matrix, test_size: float = .2) -> tuple[sp.csr_matrix, sp.csr_matrix]:
	"""Splits the URM matrix into a train and test dataset over the users.

	:param urm: The User-Rating matrix
	:type urm: sp.csr_matrix
	:param test_size: The test size (in [0,1])
	:type test_size: float
	:return: The train and test URM matrices
	:rtype: tuple[sp.csr_matrix, sp.csr_matrix]
	"""
	train_mask = np.random.choice([True, False], urm.getnnz(), p=[1 - test_size, test_size])
	test_mask = ~train_mask

	urm_coo = urm.tocoo()
	urm_train = sp.csr_matrix((urm_coo.data[train_mask], (urm_coo.row[train_mask], urm_coo.col[train_mask])))
	if test_size > 0:
		urm_test = sp.csr_matrix((urm_coo.data[test_mask], (urm_coo.row[test_mask], urm_coo.col[test_mask])))
	else:
		urm_test = sp.csr_matrix([])

	return urm_train, urm_test


def average_precision(recommendations: np.ndarray, y: np.ndarray, k: int = 10) -> float:
	"""Computes the Average Precision of a recommendation

	:param recommendations: Recommendations for a user
	:type recommendations: np.ndarray
	:param y: Ground truth array of relevant items to be recommended
	:type y: np.ndarray
	:param k: Number of items to consider (AP@k)
	:type k: int
	:return: The Average Precision at k for these particular recommendations
	:rtype: float
    """
	relevance_mask = np.isin(recommendations[:k], y)
	precisions = np.cumsum(relevance_mask) / (np.arange(1, k+1))
	return np.sum(precisions * relevance_mask) / min(len(y), k) if len(y) > 0 else 0.


def evaluate_model(trained_model: RecommenderModel, urm_test: sp.csr_matrix, at: int = 10, users_to_test: float = 1.) -> float:
	"""Evaluates a recommender model using the MAP metric

	:param trained_model: A fitted recommender model
	:type trained_model: RecommenderModel
	:param urm_test: The test URM matrix
	:type urm_test: sp.csr_matrix
	:param at: The number of items to recommend to each user
	:type at: int
	:param users_to_test: The ratio of users to test (in [0,1])
	:type users_to_test: float
	:return: The MAP metric for this model on this test data
	:rtype: float
	"""
	cum_ap = 0.
	eval_count = 0

	num_users = urm_test.shape[0]
	users_ids = np.arange(num_users) if users_to_test == 1 else np.random.choice(num_users, size=int(users_to_test * num_users))

	for user_id in users_ids:
		y = urm_test.indices[urm_test.indptr[user_id]:urm_test.indptr[user_id+1]]
		if len(y) > 0:
			eval_count += 1
			recommendations = trained_model.recommend(user_id, at=at)
			cum_ap += average_precision(recommendations, y, k=at)

	return (cum_ap / eval_count).item()


def train_model(model: RecommenderModel, at: int = 10, test_size: float = .2, users_to_test: float = 1, print_eval: bool = True, **kwargs) -> tuple[RecommenderModel, float]:
	"""Given a recommender model, trains it and evaluates it on test data, then returns the trained model.

	:param model: The model to train, an instance of a recommender model
	:type model: RecommenderModel
	:param at: The number of recommendations given to each user
	:type at: int
	:param test_size: The test size (in [0,1]) for the train/test split. If set to zero, the model uses the whole
	dataset to train and is not evaluated
	:type test_size: float
	:param users_to_test:
	:param print_eval: Indicates if the function should print the model evaluation after training
	:type print_eval: bool
	:return: The fitted (trained) recommender model and the MAP@10 score
	:rtype: tuple[RecommenderModel, float]
	"""
	urm, icm = open_dataset()
	urm_train, urm_test = train_test_split(urm, test_size=test_size)

	model.fit(urm=urm_train, icm=icm, urm_val=urm_test, **kwargs)

	map_10 = 0
	if print_eval and test_size > 0:
		map_10 = evaluate_model(model, urm_test, at=at, users_to_test=users_to_test)
		print(f"MAP@{at} evaluation of the {model.__class__.__name__} model: {map_10:.5f}")

	return model, map_10


def write_submission(trained_model: RecommenderModel, filename: str = "submission.csv", at: int = 10) -> None:
	"""Builds the submission file from a trained recommender model. The file is saved in a CSV format.

	:param trained_model: A fitted recommender model
	:type trained_model: RecommenderModel
	:param filename: The filename of the submission for this particular recommender model
	:type filename: str
	:param at: Number of items to recommend
	:type at: int
	"""
	target_users_test = pd.read_csv("./data_target_users_test.csv",).to_numpy().ravel()

	recommendations = np.array([
		trained_model.recommend(user_id, at) for user_id in target_users_test
	])

	if not os.path.exists("../submissions"):
		os.makedirs("../submissions")
	with open(f"../submissions/{filename}", "w") as f:
		f.write("user_id,item_list\n")
		for user_id, recs in zip(target_users_test, recommendations):
			f.write(f"{user_id},{' '.join(map(str, recs))}\n")


def tf_idf(mat: sp.csr_matrix) -> sp.csr_matrix:
	"""Rescales the matrix values by weighting the features of the matrix (typically the ICM) using TF-IDF

	:param mat: The sparse matrix
	:type mat: sp.csr_matrix
	:return: The matrix rescaled by TF-IDF
	:rtype: sp.csr_matrix
	"""
	mat = mat.copy()
	df = np.asarray(mat.sum(axis=0)).ravel()
	idf = np.log(mat.shape[0] / (df + 1))
	mat.data = mat.data * idf[mat.tocoo().col]
	mat.eliminate_zeros()
	return mat


def plot_losses(epochs: int, loss_history: np.ndarray | list, loss_history_val: np.ndarray | list = None, num_batch_per_epochs: int = 1, other_data: tuple = None) -> None:
	"""Plots the losses history of a training.

	:param epochs: The number of epochs
	:type epochs: int
	:param loss_history: The loss history
	:type loss_history: np.ndarray | list
	:param loss_history_val: The validation loss history
	:type loss_history_val: np.ndarray | list
	:param num_batch_per_epochs: The number of batches per epoch
	:type num_batch_per_epochs: int
	:param other_data: Other data to plot (optional). The format is (label: str, x: list, y: list)
	:type other_data: tuple
	"""
	plt.plot(loss_history, label="Train loss")
	if loss_history_val is not None:
		plt.plot([x * num_batch_per_epochs for x in range(epochs + 1)], loss_history_val, label="Validation loss")
	plt.xlabel("Train iteration")
	plt.ylabel("Loss")
	plt.title("Loss history")
	plt.legend(loc="upper right")
	if other_data:
		label, x, y = other_data
		ax2 = plt.gca().twinx()
		ax2.plot(x, y, label=label, c="C2")
		plt.legend(loc="lower left")

	plt.grid(True)
	plt.show()

class EASER(RecommenderModel):
    """
    Important note:
        On this dataset with a URM about the size of (35000,40000), this needs at least 60GB of RAM.
        Run on g-colab TPUs to get 300+GB of RAM.
    """
    def __init__(self):
        super(EASER, self).__init__()
        self.lambda_reg: float = 0

    def fit(self, urm: sp.csr_matrix, lambda_reg: float = 45, **kwargs) -> None:
        self.urm = urm.astype(np.float32)
        self.lambda_reg = lambda_reg

        g = (self.urm.T @ self.urm)
        g += sp.identity(g.shape[0], dtype=np.float32) * self.lambda_reg
        g = g.toarray().astype(np.float32)
        p = np.linalg.inv(g)
        b = p / (-np.diag(p))
        np.fill_diagonal(b, 0.)

        self.urm_pred = self.urm @ b

class SLIMElasticNet(RecommenderModel):
    def __init__(self):
        super(SLIMElasticNet, self).__init__()
        self.alpha: float = 0
        self.l1_ratio: float = 0
        self.top_k: int = 0
        self.max_iter: int = 0
        self.similarity_matrix: sp.csr_matrix | None = None

    @staticmethod
    def process_item(item_idx: int, urm_csc: sp.csc_matrix, top_k: int, alpha: float, l1_ratio: float, max_iter: int):
        elastic_net = ElasticNet(
            alpha=alpha,
            l1_ratio=l1_ratio,
            fit_intercept=False,
            positive=True,
            copy_X=False,
            selection='random',
            max_iter=max_iter,
            tol=1e-3
        )

        y = urm_csc[:, item_idx].toarray()
        x = urm_csc.copy()
        x.data[x.indptr[item_idx]:x.indptr[item_idx + 1]] = 0.

        elastic_net.fit(urm_csc, y)

        coeffs_idxs = elastic_net.sparse_coef_.indices
        coeffs_vals = elastic_net.sparse_coef_.data

        if coeffs_idxs.shape[0] > top_k:
            relevant_items = np.argpartition(-np.abs(coeffs_vals), top_k)[:top_k]
            coeffs_idxs = coeffs_idxs[relevant_items]
            coeffs_vals = coeffs_vals[relevant_items]

        return item_idx, coeffs_idxs, coeffs_vals

    def fit(
        self,
        urm: sp.csr_matrix,
        progress_bar: bool = True,
        top_k: int = 300,
        l1_reg: float = 1e-7,
        l2_reg: float = 1e-5,
        max_iter: int = 100,
        **kwargs
    ) -> None:
        self.urm = urm
        urm_csc = self.urm.tocsc()
        num_items = self.urm.shape[1]

        self.top_k = min(top_k, num_items - 1)
        self.alpha = l1_reg + l2_reg
        self.l1_ratio = l1_reg / self.alpha
        self.max_iter = max_iter

        s_rows = []
        s_cols = []
        s_vals = []

        mapper = fast_map(self.process_item, range(num_items), [urm_csc] * num_items, [self.top_k] * num_items, [self.alpha] * num_items, [self.l1_ratio] * num_items, [self.max_iter] * num_items, threads_limit=10)
        iterator = tqdm(mapper, desc="Items", total=num_items) if progress_bar else mapper

        for item_idx, coeffs_idxs, coeffs_vals in iterator:
            s_rows.extend([item_idx] * len(coeffs_idxs))
            s_cols.extend(coeffs_idxs)
            s_vals.extend(coeffs_vals)

        self.similarity_matrix = sp.csr_matrix(
            (s_vals, (s_rows, s_cols)),
            shape=(num_items, num_items),
            dtype=np.float32
        )

        self.urm_pred = self.urm @ self.similarity_matrix


class HybridSLIMEASE(RecommenderModel):
    def __init__(self):
        super(HybridSLIMEASE, self).__init__()
        self.slim = None
        self.ease = None

    def fit(
        self,
        urm: sp.csr_matrix,
        slim_ratio,
        **kwargs
    ) -> None:
        self.urm = urm
        self.slim_ratio = slim_ratio

        self.slim = SLIMElasticNet()
        self.slim.fit(urm, **kwargs)

        self.ease = EASER()
        self.ease.fit(urm, **kwargs)

    def _get_recommendations_predictions(self, user_id: int) -> np.ndarray:
        return self.slim_ratio * self.slim.urm_pred[user_id] + (1 - self.slim_ratio) * self.ease.urm_pred[user_id]

In [26]:
urm, icm = open_dataset()
urm_train, urm_test = train_test_split(urm, test_size=.2)

In [27]:
model = HybridSLIMEASE()
model.fit(urm_train, slim_ratio=0.5, urm_val=urm_test, icm=icm)

Items: 100%|██████████| 38121/38121 [09:31<00:00, 66.70it/s] 


In [29]:
for slim_ratio in np.arange(0, 1.1, .1):
    print(f"{slim_ratio=}: MAP@10={evaluate_model(model, urm_test, at=10, users_to_test=1)}")

IndexError: index 120 is out of bounds for axis 0 with size 1

In [49]:
class TestModel(RecommenderModel):
    def __init__(self, slim, ease, urm):
        super(TestModel, self).__init__()
        self.slim = slim
        self.ease = ease
        self.slim_ratio = .5
        self.urm=urm
    def _get_recommendations_predictions(self, user_id: int) -> np.ndarray:
        return self.slim_ratio * self.slim.urm_pred[user_id].toarray().ravel() + (1 - self.slim_ratio) * self.ease.urm_pred[user_id]

In [50]:
m = TestModel(model.slim, model.ease, urm_train)

In [61]:
for slim_ratio in np.arange(0, 1.1, .1):
    m.slim_ratio = slim_ratio
    print(f"{slim_ratio=}: MAP@10={evaluate_model(m, urm_test, at=10, users_to_test=1)}")

slim_ratio=0.0: MAP@10=0.07768906611989392
slim_ratio=0.1: MAP@10=0.07782439251224023
slim_ratio=0.2: MAP@10=0.07801684533026695
slim_ratio=0.30000000000000004: MAP@10=0.07824286867179803
slim_ratio=0.4: MAP@10=0.07858456928336222
slim_ratio=0.5: MAP@10=0.079050506661552
slim_ratio=0.6000000000000001: MAP@10=0.0795870275702205
slim_ratio=0.7000000000000001: MAP@10=0.08043719500680566
slim_ratio=0.8: MAP@10=0.08147913004163017
slim_ratio=0.9: MAP@10=0.08174388818746725
slim_ratio=1.0: MAP@10=0.04123829385797967


In [64]:
m.slim_ratio = .6
write_submission(m)