# План
1. Реализовать алгоритмы **item2item**, **ALS**, **IALS**
2. Посчитать метрику предсказаний **MRR@100** выбрасывая случайный лайк пользователя

Будем решать задачу предсказания: на 4/5 пользователей учимся, на 1/5 выбрасываем случайный лайк и пытаемся предсказать его беря топ 100 наших лучших предсказаний для этого пользователя.

MRR@100 будет равно $1/(p+1)$, где $p$ - позиция на которой оказался выброшенный лайк в нашем ранжировании и 0 если в топ 100 его не было.

3. Подобрать параметры алгоритмов для максимизации MRR@100 (1 балл)
4. Сравнить похожести айтемов получаюшиеся для item2item, ALS, IALS (1 балл)

Замерить насколько получаются похожими топы похожестей. Так же рекомендуется взять 5-топовых (или любимых) треков и посмотреть на похожести которые получаются для них в разных алгоритмах.

1. Item2item implementation

In [None]:
import os, zipfile
import numpy as np
import pandas as pd
import scipy.sparse as sp
from tqdm.notebook import tqdm, tnrange

In [2]:
user_item_matrix = sp.load_npz("/kaggle/input/music-dataset/data_train.npz")

In [3]:
item_weights = np.array(user_item_matrix.tocsc().sum(0))[0]
top_to_bottom_order = np.argsort(-item_weights)
item_mapping = np.empty(top_to_bottom_order.shape, dtype=int)
item_mapping[top_to_bottom_order] = np.arange(len(top_to_bottom_order))
total_item_count = (item_weights > 0).sum()
total_user_count = user_item_matrix.shape[0]

def build_dataset(user_item_matrix, item_pct, user_pct):
    user_count, item_count = int(total_user_count * user_pct), int(total_item_count * item_pct)
    item_ids = top_to_bottom_order[:item_count]
    user_ids = np.random.choice(np.arange(user_item_matrix.shape[0]), size=user_count, replace=False)
    train = user_item_matrix[user_ids]
    train = train[:, item_ids]
    return train

In [4]:
small_dataset = build_dataset(user_item_matrix, 0.05, 0.05)
small_dataset.get_shape()

(68661, 50000)

In [5]:

del user_item_matrix

In [6]:
import random
from tqdm.auto import trange

class Item2Item:
    def __init__(self, ui_matrix, max_items):
        self.max_items = max_items
        self.ui_matrix = ui_matrix
        self.n_items = self.ui_matrix.get_shape()[1]
        self.n_users = self.ui_matrix.get_shape()[1]

        self.sim_matrix = self.ui_matrix.T @ self.ui_matrix

    def get_simmilarity(self, item1, item2):
        return self.sim_matrix[item1, item2]

    def recommend(self, user, k):
        exist_indices = self.ui_matrix.getrow(user).indices
        random.shuffle(exist_indices)
        ratings = self.sim_matrix[exist_indices[:self.max_items]].sum(axis=0).reshape(-1)
        answer = []
        for i in range(ratings.shape[1]):
            if i not in exist_indices:
                answer.append((ratings[0, i], i))
                answer = sorted(answer)[::-1]
                answer = answer[:k]
        return [elem[1] for elem in answer]

In [7]:
p = np.random.permutation(small_dataset.shape[0])
train_size = int(0.8 * small_dataset.shape[0])
train_users, test_users = p[:train_size], p[train_size:]

In [8]:
n_test_ui = 5000
seen_ui = set()
for i in range(n_test_ui):
    random_user = random.choice(test_users)
    if len(small_dataset.getrow(random_user).indices) == 0:
        continue
    random_item = random.choice(small_dataset.getrow(random_user).indices)
    if (random_user, random_item) in seen_ui:
        continue
    seen_ui.add((random_user, random_item))
    small_dataset[random_user, random_item] = 0
small_dataset.eliminate_zeros()
test_ui = list(seen_ui)

In [9]:
from tqdm.auto import tqdm

def evaluate_mrr(k, model, test_ui):
    sum_mrr = 0
    for user, item in tqdm(test_ui):
        predictions = model.recommend(user, k)
        for i, elem in enumerate(predictions):
            if elem == item:
                sum_mrr += 1 / (1 + i)
    return sum_mrr / (len(test_ui) + 1e-9)

In [10]:
model = Item2Item(small_dataset, 3)

In [11]:
model.recommend(3, 10)

[8, 9, 22, 3, 35, 93, 42, 33, 5, 75]

In [12]:
evaluate_mrr(100, model, test_ui)

  0%|          | 0/4914 [00:00<?, ?it/s]

0.02805322307168879

2. ALS implementation

In [13]:
from tqdm.auto import trange, tqdm


class ALS:
    def __init__(self, ui_matrix, hid_dim, l2_coef):
        self.ui_matrix = ui_matrix.toarray()
        self.n_items = self.ui_matrix.shape[1]
        self.n_users = self.ui_matrix.shape[0]
        self.item_embeddings = np.random.rand(self.n_items, hid_dim)
        self.user_embeddings = np.random.rand(self.n_users, hid_dim)
        self.l2_coef = l2_coef

    def step_opt_item(self):
        Y = self.user_embeddings
        new_items = np.linalg.inv(Y.T @ Y + self.l2_coef * np.eye(Y.shape[1])) @ Y.T @ self.ui_matrix
        self.item_embeddings = new_items.T

    def step_opt_user(self):
        Y = self.item_embeddings
        new_items = np.linalg.inv(Y.T @ Y + self.l2_coef * np.eye(Y.shape[1])) @ Y.T @ self.ui_matrix.T
        self.user_embeddings = new_items.T
        
    def als_func(self):
        return np.mean(((self.item_embeddings[:5] @ self.user_embeddings[:5].T) - self.ui_matrix[:5, :5]) ** 2)

    def train(self, num_iters):
        history = []
        for i in trange(num_iters):
            self.step_opt_item()
            self.step_opt_user()
            history.append(self.als_func())
        return history

    def recommend(self, user_id, k):
        ratings = (self.item_embeddings @ self.user_embeddings[user_id].reshape(-1, 1)).reshape(-1)
        indices = self.ui_matrix[user_id, :] > 0.5
        ratings[indices] = -10000
        return np.argsort(ratings)[-k:][::-1]

In [14]:
model = ALS(small_dataset, 20, 0.1)
model.train(10)

  0%|          | 0/10 [00:00<?, ?it/s]

[0.04019324835821434,
 0.0412737382024636,
 0.040973993973280116,
 0.04106928091195297,
 0.041173677850295035,
 0.041235710097185053,
 0.04126845335840702,
 0.041286801628035844,
 0.04129921678517426,
 0.041309645204579325]

In [15]:
evaluate_mrr(100, model, test_ui)

  0%|          | 0/4914 [00:00<?, ?it/s]

0.02198107477067858

In [16]:
del model

3. IALS implementation

In [17]:
from tqdm.auto import trange, tqdm
import threading
import random

class IALS:
    def __init__(self, ui_matrix, hid_dim, l2_coef, alpha):
        self.ui_matrix = ui_matrix.toarray()
        self.n_items = self.ui_matrix.shape[1]
        self.n_users = self.ui_matrix.shape[0]
        self.item_embeddings = np.random.rand(self.n_items, hid_dim)
        self.user_embeddings = np.random.rand(self.n_users, hid_dim)
        self.l2_coef = l2_coef
        self.alpha = alpha

    def step_opt_item(self, item_id):
        features = self.user_embeddings
        targets = self.ui_matrix[:, item_id]
        c = sp.diags(1 + self.alpha * targets)
        new_item_embedding = np.linalg.inv(features.T @ c @ features + self.l2_coef * np.eye(features.shape[1]))
        new_item_embedding = new_item_embedding @ features.T @ c @ targets
        self.item_embeddings[item_id] = new_item_embedding

    def step_opt_user(self, user_id):
        features = self.item_embeddings
        targets = self.ui_matrix[user_id, :]
        c = sp.diags(1 + self.alpha * targets)
        new_user_embedding = np.linalg.inv(features.T @ c @ features + self.l2_coef * np.eye(features.shape[1]))
        new_user_embedding = new_user_embedding @ features.T @ targets
        self.user_embeddings[user_id] = new_user_embedding

    def train_items(self, items):
        for item in tqdm(items):
            self.step_opt_item(item)

    def train_users(self, users):
        for user in tqdm(users):
            self.step_opt_user(user)

    def als_func(self):
        return np.mean(((self.item_embeddings @ self.user_embeddings.T) - self.ui_matrix) ** 2)

    def train(self, num_iters):
        for _ in trange(num_iters):
            if random.randint(0, 1) == 0:
                self.step_opt_item(random.randint(0, self.n_items - 1))
            else:
                self.step_opt_user(random.randint(0, self.n_users - 1))

    def recommend(self, user_id, k):
        ratings = (self.item_embeddings @ self.user_embeddings[user_id].reshape(-1, 1)).reshape(-1)
        indices = self.ui_matrix[user_id, :] > 0.5
        ratings[indices] = -10000
        return np.argsort(ratings)[-k:][::-1]

In [18]:
model = IALS(small_dataset, 20, 1, 1)


In [19]:
model.train(2000)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [20]:
model.recommend(2, 10)

array([ 7226, 36331, 19180, 39160,  8355,  2111, 11947, 42867, 10219,
       10265])

In [21]:
evaluate_mrr(100, model, test_ui)

  0%|          | 0/4914 [00:00<?, ?it/s]

1.2043012447397103e-05

In [22]:
del model

4. Ищем гиперпараметры

4.1 Для ALS

In [23]:
possible_l2 = [0.01, 0.3, 0.5, 1]

opt_l2 = -1
max_score = 0

for coef_l2 in possible_l2:
    model = ALS(small_dataset, 20, coef_l2)
    model.train(10)
    score = evaluate_mrr(100, model, test_ui)
    if score > max_score:
        opt_l2 = coef_l2
        max_score = score

opt_als_l2 = opt_l2

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/4914 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/4914 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/4914 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/4914 [00:00<?, ?it/s]

4.2 Для IALS

In [24]:
possible_l2 = [0.2, 0.6]
alphas = [0.2, 0.6]

opt_l2, opt_alpha = -1, -1
max_score = 0

for coef_l2 in possible_l2:
    for alpha in alphas:
        model = IALS(small_dataset, 20, coef_l2, alpha)
        model.train(20000)
        score = evaluate_mrr(100, model, test_ui)
        if score > max_score:
            opt_alpha = alpha
            opt_l2 = coef_l2
            max_score = score

opt_ials_l2, opt_ials_alpha = opt_l2, opt_alpha

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/4914 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/4914 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/4914 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/4914 [00:00<?, ?it/s]

5. Оцениваем похожести

In [25]:
def get_recommendations(model, users, k, lim=100):
    answer = []
    for user in users[:lim]:
        rec1 = set(model.recommend(user, k))
        answer.append(rec1)
    return answer


def measure_sim(recs1, recs2):
    sum_sim = 0
    for rec1, rec2 in zip(recs1, recs2):
        sum_sim += len(rec1.intersection(rec2)) / len(rec1)
    return sum_sim / len(recs2)
    


In [26]:
model1 = Item2Item(small_dataset, 3)
recs1 = get_recommendations(model1, test_users, 100)
del model1
print("DONE 1")

model2 = ALS(small_dataset,  20, opt_als_l2)
model2.train(10)
recs2 = get_recommendations(model2, test_users, 100)
print(f"ALS score : {evaluate_mrr(100, model2, test_ui)}")
del model2
print("DONE 2")


model3 = IALS(small_dataset, 20, opt_ials_l2, opt_ials_alpha)
model3.train(550000)
print(f"IALS score : {evaluate_mrr(100, model3, test_ui)}")
recs3 = get_recommendations(model3, test_users, 100)
del model3 
print("DONE 3")

DONE 1


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/4914 [00:00<?, ?it/s]

ALS score : 0.022622090939418582
DONE 2


  0%|          | 0/550000 [00:00<?, ?it/s]

  0%|          | 0/4914 [00:00<?, ?it/s]

IALS score : 0.007432326301437072
DONE 3


In [27]:
print(measure_sim(recs1, recs2))
print(measure_sim(recs1, recs3))
print(measure_sim(recs2, recs3))


0.4509
0.3206
0.24360000000000004
