In [4]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from math import sqrt, exp

ratings = pd.read_csv('./ml-1m/ratings.dat', delimiter='::', header=None,
                      names=['user_id', 'movie_id', 'rating', 'timestamp'],
                      usecols=['user_id', 'movie_id', 'rating'], engine='python')
movie_info = pd.read_csv('./ml-1m/movies.dat', delimiter='::', header=None,
                         names=['movie_id', 'name', 'category'], engine='python')
user_info = pd.read_csv('./ml-1m/users.dat', delimiter='::', header=None,
                        names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], engine='python')

def get_movie_names(ids):
    return [movie_info[movie_info['movie_id'] == mid]['name'].to_string() for mid in ids]

get_user_history = lambda user_id: [movie_info[movie_info['movie_id'] == mid]['name'].to_string() + "   " + str(rat)
                                    for mid, rat in zip(ratings[ratings['user_id'] == user_id]['movie_id'],
                                                        ratings[ratings['user_id'] == user_id]['rating'])]

imp_ratings = ratings.loc[(ratings['rating'] >= 4)]
imp_users = imp_ratings['user_id']
imp_movies = imp_ratings['movie_id']
imp_user_item = sp.coo_matrix((np.ones_like(imp_users), (imp_users, imp_movies)))
imp_user_item_csr = imp_user_item.tocsr()

In [5]:
from bisect import bisect_right

class BPR:

    def __init__(self, epochs=10, factors=16, regularization=0.02,
                 learning_rate=0.005, logging=False):
        self.epochs = epochs
        self.factors = factors
        self.reg = regularization
        self.lr = learning_rate
        self.logging = logging
        self.users, self.items = np.array(0), np.array(0)

    def fit(self, matrix):
        n, m = matrix.shape
        self.users = np.random.uniform(0, 1 / sqrt(self.factors), (n, self.factors))
        self.items = np.random.uniform(0, 1 / sqrt(self.factors), (m, self.factors))
        for ep in range(self.epochs):
            # Проходимся по всем юзерам
            for u in range(1, n):
                positives = matrix[u].nonzero()[1]
                # Проходимся по всем позитивам
                for pos in positives:
                    # Рандомим итем, пока попадаем в позитивы
                    neg = np.random.randint(1, m)
                    while neg in positives:
                        neg = np.random.randint(1, m)

                    x_uij = self.predict(u, pos) - self.predict(u, neg)
                    e = exp(-x_uij) / (1 + exp(-x_uij))
                    u_copy = self.users[u].copy()

                    diff = self.items[pos] - self.items[neg]
                    self.users[u] += self.lr * (e * diff + self.reg * self.users[u])

                    diff = u_copy
                    self.items[pos] += self.lr * (e * diff + self.reg * self.items[pos])

                    diff = -u_copy
                    self.items[neg] += self.lr * (e * diff + self.reg * self.items[neg])
            if self.logging:
                print(f'Epoch {ep + 1}: ', end='')
                print(f'AUC = {self.auc(matrix)}')

    def predict(self, i, j):
        return self.users[i] @ self.items[j].T

    def auc(self, matrix):
        n, m = matrix.shape
        auc = 0
        for u in range(1, n):
            positives = matrix[u].nonzero()[1]
            predicts = [self.predict(u, i) for i in range(m)]
            negative_predicts = [predicts[i] for i in range (1, m) if i not in positives]
            user_auc = 0
            if len(positives) == 0 or len(negative_predicts) == 0:
                continue
            negative_predicts.sort()
            for pos in positives:
                # Быстро подсчитываю кол-во негативов, у которых скор меньше текущего позитива
                user_auc += bisect_right(negative_predicts, predicts[pos])
                # Ниже закомменчена медленная версия, negatives - индексы негативов
                # for neg in negatives:
                #     x_uij = predicts[pos] - predicts[neg]
                #     if x_uij > 0:
                #         user_auc += 1
            auc += user_auc / (len(positives) * len(negative_predicts))
        return auc / (n - 1)

    def similar_item_ids(self, item_id, cnt=10):
        item_embeddings = self.items[item_id]
        all_items = self.items[1:]
        dists = [(np.linalg.norm(item_embeddings - all_items[i]), i + 1) for i in range(len(all_items))]
        dists.sort()
        return [x[1] for x in dists[1:cnt + 1]]

    def recommend_ids(self, user_id, ratings, cnt=10):
        used = [mid for _, mid in ratings[ratings['user_id'] == user_id]['movie_id'].items()]
        predicts = [(self.predict(user_id, it), it) for it in range(1, len(self.items)) if it not in used]
        predicts.sort()
        predicts.reverse()
        return [x[1] for x in predicts[:cnt]]

In [6]:
model = BPR(epochs=9, factors=64, logging=True)
model.fit(imp_user_item_csr)

Epoch 1: AUC = 0.885487154989309
Epoch 2: AUC = 0.889155911781217
Epoch 3: AUC = 0.8900681839018211
Epoch 4: AUC = 0.8903813115478301
Epoch 5: AUC = 0.8905382257437138
Epoch 6: AUC = 0.8906313148334517
Epoch 7: AUC = 0.8907263128796129
Epoch 8: AUC = 0.8907737228508505
Epoch 9: AUC = 0.8908529516501202


симилары для Toy Story

In [7]:
print(movie_info.loc[0])
similar_ids = model.similar_item_ids(1)
get_movie_names(similar_ids)

movie_id                              1
name                   Toy Story (1995)
category    Animation|Children's|Comedy
Name: 0, dtype: object


['49    Usual Suspects, The (1995)',
 '1081    E.T. the Extra-Terrestrial (1982)',
 '1245    Groundhog Day (1993)',
 '1539    Men in Black (1997)',
 '352    Forrest Gump (1994)',
 '453    Fugitive, The (1993)',
 '1196    Alien (1979)',
 '2928    Being John Malkovich (1999)',
 '1220    Terminator, The (1984)',
 '476    Jurassic Park (1993)']

In [8]:
get_user_history(4)

['3399    Hustler, The (1961)   5',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)   3',
 '2882    Fistful of Dollars, A (1964)   4',
 '1196    Alien (1979)   4',
 '1023    Die Hard (1988)   4',
 '257    Star Wars: Episode IV - A New Hope (1977)   5',
 '1959    Saving Private Ryan (1998)   5',
 '476    Jurassic Park (1993)   4',
 '1178    Star Wars: Episode V - The Empire Strikes Back...   2',
 '1180    Raiders of the Lost Ark (1981)   5',
 '1885    Rocky (1976)   5',
 '1081    E.T. the Extra-Terrestrial (1982)   4',
 '3349    Thelma & Louise (1991)   4',
 '3633    Mad Max (1979)   4',
 '2297    King Kong (1933)   4',
 '1366    Jaws (1975)   5',
 '3458    Predator (1987)   1',
 '1183    Good, The Bad and The Ugly, The (1966)   5',
 '2623    Run Lola Run (Lola rennt) (1998)   5',
 '2878    Goldfinger (1964)   5',
 '1220    Terminator, The (1984)   5']

Рекомендации для юзера 4

In [9]:
recommend_ids = model.recommend_ids(4, ratings, cnt=15)
get_movie_names(recommend_ids)


['2789    American Beauty (1999)',
 '589    Silence of the Lambs, The (1991)',
 '2502    Matrix, The (1999)',
 '2693    Sixth Sense, The (1999)',
 '604    Fargo (1996)',
 "523    Schindler's List (1993)",
 '315    Shawshank Redemption, The (1994)',
 '585    or 2: Judgment Day (1991)',
 '847    Godfather, The (1972)',
 '108    Braveheart (1995)',
 '1179    Princess Bride, The (1987)',
 '1250    Back to the Future (1985)',
 '1575    L.A. Confidential (1997)',
 '2327    Shakespeare in Love (1998)',
 '293    Pulp Fiction (1994)']