In [5]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from math import sqrt

In [6]:

ratings = pd.read_csv('./ml-1m/ratings.dat', delimiter='::', header=None,
                      names=['user_id', 'movie_id', 'rating', 'timestamp'],
                      usecols=['user_id', 'movie_id', 'rating'], engine='python')
movie_info = pd.read_csv('./ml-1m/movies.dat', delimiter='::', header=None,
                         names=['movie_id', 'name', 'category'], engine='python')
user_info = pd.read_csv('./ml-1m/users.dat', delimiter='::', header=None,
                        names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], engine='python')

def get_movie_names(ids):
    return [movie_info[movie_info['movie_id'] == mid]['name'].to_string() for mid in ids]

get_user_history = lambda user_id: [movie_info[movie_info['movie_id'] == mid]['name'].to_string() + "   " + str(rat)
                                    for mid, rat in zip(ratings[ratings['user_id'] == user_id]['movie_id'],
                                                        ratings[ratings['user_id'] == user_id]['rating'])]

сделаем implicit матрицу рейтингов

In [7]:
imp_ratings = ratings.loc[(ratings['rating'] >= 4)]
imp_users = imp_ratings['user_id']
imp_movies = imp_ratings['movie_id']
imp_user_item = sp.coo_matrix((np.ones_like(imp_users), (imp_users, imp_movies)))
imp_user_item_csr = imp_user_item.tocsr()

In [8]:
class ALS:

    def __init__(self, factors=10, epochs=10, alpha=10, lambda_val=0.01, logging=False):
        self.factors = factors
        self.epochs = epochs
        self.alpha = alpha
        self.lambda_val = lambda_val
        self.logging = logging
        self.x, self.y = np.empty((0, 0)), np.empty((0, 0))

    def fit(self, raw_matrix):
        # в ALS решил все индексировать с 0, а не с 1, как в датасете
        matrix = raw_matrix[1:, 1:]
        n, m = matrix.shape
        fact = self.factors
        self.x = np.random.uniform(0, 1 / sqrt(fact), (n, fact))
        self.y = np.random.uniform(0, 1 / sqrt(fact), (m, fact))
        i_factors, i_users, i_items = self.lambda_val * np.eye(fact), np.eye(n), np.eye(m)
        for ep in range(self.epochs):
            xtx = self.x.T @ self.x + i_factors
            yty = self.y.T @ self.y + i_factors
            for u in range(n):
                user_imp = matrix[u].toarray()[0]
                nonzero = user_imp.nonzero()
                # ускоряем за счет того, что в матрице (С-I) из формулы много нулевых столбцов
                A = yty + (self.y.T[:, nonzero] @ self.y[nonzero]).reshape((fact, fact)) * self.alpha
                # ускоряем за счет того что в матрице C*p(u) из формулы много нулевых столбцов
                b = (self.y.T[:, nonzero] @ user_imp.T[nonzero] * (self.alpha + 1))
                # print(self.x[u].shape, np.linalg.solve(A, b).shape, np.linalg.solve(A, b).T.shape)
                self.x[u] = np.linalg.solve(A, b).reshape(fact)
            for i in range(m):
                item_imp = matrix[:, i].T.toarray()[0]
                nonzero = item_imp.nonzero()
                A = xtx + (self.x.T[:, nonzero] @ self.x[nonzero]).reshape((fact, fact)) * self.alpha
                b = (self.x.T[:, nonzero] @ item_imp.T[nonzero] * (self.alpha + 1))
                self.y[i] = np.linalg.solve(A, b).reshape(fact)
            if self.logging:
                print(f'Epoch {ep + 1}: ', end='')
                if ep % 2 == 0:
                    print(f'RMSE = {self.rmse(raw_matrix)}', end='')
                print()

    def predict(self, i, j):
        return self.x[i - 1] @ self.y[j - 1].T

    def similar_item_ids(self, item_id, cnt=10):
        item_embeddings = self.y[item_id - 1]
        dists = [(np.linalg.norm(item_embeddings - self.y[i]), i + 1) for i in range(len(self.y))]
        dists.sort()
        return [x[1] for x in dists[1:cnt + 1]]

    def recommend_ids(self, user_id, ratings, cnt=10):
        used = [mid for _, mid in ratings[ratings['user_id'] == user_id]['movie_id'].items()]
        predicts = [(self.predict(user_id, it), it) for it in range(1, len(self.y) + 1) if it not in used]
        predicts.sort()
        predicts.reverse()
        return [x[1] for x in predicts[:cnt]]

    def rmse(self, raw_matrix):
        matrix = raw_matrix[1:, 1:]
        n, m = matrix.shape
        loss = 0
        predicts = self.x @ self.y.T
        for i in range(n):
            # считается долго, поэтому убрал все обращение к sparse-матрице по индексам,
            # учитывая все nonzero() индексы как 1, остальное - 0
            nz = matrix[i].nonzero()[1]
            if len(nz) == 0:
                for j in range(m):
                    loss += predicts[i][j] ** 2
                continue
            cur_nz = 0
            for j in range(m):
                if nz[cur_nz] == j:
                    loss += (1 + self.alpha) * (predicts[i][j] - 1) ** 2
                    cur_nz += 1
                    if cur_nz >= len(nz):
                        for k in range(j + 1, m):
                            loss += predicts[i][k] ** 2
                        break
                    continue
                loss += predicts[i][j] ** 2
        return sqrt(loss / (n * m))


In [9]:
model = ALS(epochs=15, factors=64, alpha=40, logging=True)
model.fit(imp_user_item_csr)

Epoch 1: RMSE = 1.0373820665746563
Epoch 2: 
Epoch 3: RMSE = 0.6690536805104136
Epoch 4: 
Epoch 5: RMSE = 0.4733873349346077
Epoch 6: 
Epoch 7: RMSE = 0.38504333910114374
Epoch 8: 
Epoch 9: RMSE = 0.3465262049164095
Epoch 10: 
Epoch 11: RMSE = 0.3272746832341109
Epoch 12: 
Epoch 13: RMSE = 0.3200705299324511
Epoch 14: 
Epoch 15: RMSE = 0.31411919308894376


симилары для Toy Story

In [10]:
print(movie_info.loc[0])
similar_ids = model.similar_item_ids(1)
get_movie_names(similar_ids)

movie_id                              1
name                   Toy Story (1995)
category    Animation|Children's|Comedy
Name: 0, dtype: object


['1245    Groundhog Day (1993)',
 '3045    Toy Story 2 (1999)',
 '33    Babe (1995)',
 "2286    Bug's Life, A (1998)",
 '49    Usual Suspects, The (1995)',
 '2647    Ghostbusters (1984)',
 '352    Forrest Gump (1994)',
 '293    Pulp Fiction (1994)',
 '315    Shawshank Redemption, The (1994)',
 '2255    Life Is Beautiful (La Vita � bella) (1997)']

In [11]:
get_user_history(4)

['3399    Hustler, The (1961)   5',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)   3',
 '2882    Fistful of Dollars, A (1964)   4',
 '1196    Alien (1979)   4',
 '1023    Die Hard (1988)   4',
 '257    Star Wars: Episode IV - A New Hope (1977)   5',
 '1959    Saving Private Ryan (1998)   5',
 '476    Jurassic Park (1993)   4',
 '1178    Star Wars: Episode V - The Empire Strikes Back...   2',
 '1180    Raiders of the Lost Ark (1981)   5',
 '1885    Rocky (1976)   5',
 '1081    E.T. the Extra-Terrestrial (1982)   4',
 '3349    Thelma & Louise (1991)   4',
 '3633    Mad Max (1979)   4',
 '2297    King Kong (1933)   4',
 '1366    Jaws (1975)   5',
 '3458    Predator (1987)   1',
 '1183    Good, The Bad and The Ugly, The (1966)   5',
 '2623    Run Lola Run (Lola rennt) (1998)   5',
 '2878    Goldfinger (1964)   5',
 '1220    Terminator, The (1984)   5']

Рекомендации для юзера 4

In [12]:
recommend_ids = model.recommend_ids(4, ratings, cnt=15)
get_movie_names(recommend_ids)

['847    Godfather, The (1972)',
 '1182    Aliens (1986)',
 '1203    Godfather: Part II, The (1974)',
 '108    Braveheart (1995)',
 '585    or 2: Judgment Day (1991)',
 '2502    Matrix, The (1999)',
 '1204    Full Metal Jacket (1987)',
 '1267    Ben-Hur (1959)',
 '1271    Indiana Jones and the Last Crusade (1989)',
 '453    Fugitive, The (1993)',
 '3634    Mad Max 2 (a.k.a. The Road Warrior) (1981)',
 '1950    Seven Samurai (The Magnificent Seven) (Shichin...',
 '1179    Princess Bride, The (1987)',
 '1222    Glory (1989)',
 '2460    Planet of the Apes (1968)']

Выдало даже продолжение Mad Max
