In [3]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

In [4]:
ratings = pd.read_csv('./ml-1m/ratings.dat', delimiter='::', header=None,
                      names=['user_id', 'movie_id', 'rating', 'timestamp'],
                      usecols=['user_id', 'movie_id', 'rating'], engine='python')
movie_info = pd.read_csv('./ml-1m/movies.dat', delimiter='::', header=None,
                         names=['movie_id', 'name', 'category'], engine='python')
user_info = pd.read_csv('./ml-1m/users.dat', delimiter='::', header=None,
                        names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], engine='python')

In [5]:
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [6]:
from math import sqrt

class SGD:

    def __init__(self, factors=64, epochs=10,
                 regularization=0.02, learning_rate=0.005,
                 logging=False):
        self.epochs = epochs
        self.factors = factors
        self.regularization = regularization
        self.learning_rate = learning_rate
        self.users, self.bu = np.empty(0), np.empty(0)
        self.items, self.bi = np.empty(0), np.empty(0)
        self.mu = 0
        self.logging = logging
        self.rmse = None

    def fit(self, matrix):
        n, m = matrix.shape
        self.users = np.random.uniform(0, 1 / sqrt(self.factors), (n, self.factors))
        self.items = np.random.uniform(0, 1 / sqrt(self.factors), (m, self.factors))
        self.bu = np.zeros(n)
        self.bi = np.zeros(m)
        self.mu = matrix.data.mean()

        indices = matrix.nonzero()
        order = [o for o in range(len(indices[0]))]

        for ep in range(self.epochs):
            np.random.shuffle(order)
            errors = []
            for o in order:
                i, j = indices[0][o], indices[1][o]
                error = self.predict(i, j) - matrix[i, j]
                self.bu[i] -= (error + self.regularization * self.bu[i]) * self.learning_rate
                self.bi[j] -= (error + self.regularization * self.bi[j]) * self.learning_rate
                self.users[i] -= (error * self.items[j] + self.regularization * self.users[i]) * self.learning_rate
                self.items[j] -= (error * self.users[i] + self.regularization * self.items[j]) * self.learning_rate
                errors.append(self.predict(i, j) - matrix[i, j])
            errors = np.array(errors)
            loss = np.square(errors).mean()
            self.rmse = sqrt(loss)
            if self.logging:
                print(f'Epoch {ep + 1}: RMSE = {self.rmse}')

    def predict(self, i, j):
        return self.users[i] @ self.items[j] + self.bu[i] + self.bi[j] + self.mu

    def similar_item_ids(self, item_id, cnt=10):
        item_embeddings = self.items[item_id]
        all_items = self.items[1:]
        dists = [(np.linalg.norm(item_embeddings - all_items[i]), i + 1) for i in range(len(all_items))]
        dists.sort()
        return [x[1] for x in dists[1:cnt + 1]]

    def recommend_ids(self, user_id, matrix, cnt=10):
        used = matrix.indices[matrix.indptr[user_id]:matrix.indptr[user_id + 1]]
        predicts = [self.predict(user_id, it) for it in range(1, len(self.items)) if it not in used]
        predicts.sort()
        predicts.reverse()
        return [x[1] for x in predicts[:cnt]]

In [7]:
exp_user_item = sp.coo_matrix((ratings['rating'], (ratings['user_id'], ratings['movie_id'])))
exp_user_item_csr = exp_user_item.tocsr()

In [22]:
model = SGD(epochs=20, logging=True)
model.fit(exp_user_item_csr)

KeyboardInterrupt: 

найдем симилары

In [21]:
def get_movie_names(ids):
    return [movie_info[movie_info['movie_id'] == mid]['name'].to_string() for mid in ids]

def get_similar_movies(_model, item_id):
    similar_ids = model.similar_item_ids(item_id)
    return get_movie_names(similar_ids)

movie_id                              1
name                   Toy Story (1995)
category    Animation|Children's|Comedy
Name: 0, dtype: object


NameError: name 'model' is not defined

симилары для Toy Story

In [None]:
print(movie_info.loc[0])
get_similar_movies(model, 1)

Рейтинги фильмов от юзера 4

In [26]:
get_user_history = lambda user_id: [movie_info[movie_info['movie_id'] == mid]['name'].to_string() + "   " + str(rat)
                                    for mid, rat in zip(ratings[ratings['user_id'] == user_id]['movie_id'],
                                                        ratings[ratings['user_id'] == user_id]['rating'])]
get_user_history(4)

['3399    Hustler, The (1961)   5',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)   3',
 '2882    Fistful of Dollars, A (1964)   4',
 '1196    Alien (1979)   4',
 '1023    Die Hard (1988)   4',
 '257    Star Wars: Episode IV - A New Hope (1977)   5',
 '1959    Saving Private Ryan (1998)   5',
 '476    Jurassic Park (1993)   4',
 '1178    Star Wars: Episode V - The Empire Strikes Back...   2',
 '1180    Raiders of the Lost Ark (1981)   5',
 '1885    Rocky (1976)   5',
 '1081    E.T. the Extra-Terrestrial (1982)   4',
 '3349    Thelma & Louise (1991)   4',
 '3633    Mad Max (1979)   4',
 '2297    King Kong (1933)   4',
 '1366    Jaws (1975)   5',
 '3458    Predator (1987)   1',
 '1183    Good, The Bad and The Ugly, The (1966)   5',
 '2623    Run Lola Run (Lola rennt) (1998)   5',
 '2878    Goldfinger (1964)   5',
 '1220    Terminator, The (1984)   5']

Получим рекомендации для юзера 4

In [27]:
def get_recommends(_model, user_id, matrix):
    recommend_ids = _model.recommend_ids(user_id, matrix)
    return get_movie_names(recommend_ids)

get_recommends(model, 4, exp_user_item_csr)

IndexError: invalid index to scalar variable.

сделаем implicit матрицу рейтингов

In [8]:
imp_ratings = ratings.loc[(ratings['rating'] >= 4)]
imp_users = imp_ratings['user_id']
imp_movies = imp_ratings['movie_id']
imp_user_item = sp.coo_matrix((np.ones_like(imp_users), (imp_users, imp_movies)))
imp_user_item_csr = imp_user_item.tocsr()

In [32]:
class ALS:

    def __init__(self, factors=10, epochs=10, alpha=10, lambda_val=0.1, logging=False):
        self.factors = factors
        self.epochs = epochs
        self.alpha = alpha
        self.lambda_val = lambda_val
        self.logging = logging
        self.x, self.y = np.empty((0, 0)), np.empty((0, 0))

    def fit(self, matrix):
        n, m = matrix.shape
        n, m = n - 1, m - 1
        self.x = np.random.uniform(0, 1 / sqrt(self.factors), (n, self.factors))
        self.y = np.random.uniform(0, 1 / sqrt(self.factors), (m, self.factors))
        i_factors, i_users, i_items = self.lambda_val * np.eye(self.factors), np.eye(n), np.eye(m)
        for ep in range(self.epochs):
            xtx = self.x.T @ self.x
            yty = self.y.T @ self.y
            for u in range(n):
                user_imp = matrix.getrow(u + 1).toarray()[0][1:]
                nonzero = user_imp.nonzero()
                # ускоряем за счет того, что в матрице С из формулы много нулевых столбцов
                A = yty + (self.y.T[:, nonzero] @ self.y[nonzero])[:, 0, :] * self.alpha + i_factors
                b = (self.y.T[:, nonzero] @ user_imp.T[nonzero] * (self.alpha + 1)).T[0]
                self.x[u] = np.linalg.solve(A, b).T[0]
            for i in range(m):
                item_imp = matrix.getcol(i + 1).T.toarray()[0][1:]
                nonzero = item_imp.nonzero()
                A = xtx + (self.x.T[:, nonzero] @ self.x[nonzero])[:, 0, :] * self.alpha + i_factors
                b = (self.x.T[:, nonzero] @ item_imp.T[nonzero] * (self.alpha + 1)).T[0]
                self.y[i] = np.linalg.solve(A, b).T[0]
            if self.logging:
                print(f'Epoch {ep + 1}: ', end='')
                print(f'RMSE = {self.rmse(matrix)}')

    def predict(self, i, j):
        return self.x[i - 1] @ self.y[j - 1].T

    def similar_item_ids(self, item_id, cnt=10):
        item_embeddings = self.y[item_id - 1]
        dists = [(np.linalg.norm(item_embeddings - self.y[i]), i + 1) for i in range(len(self.y))]
        dists.sort()
        return [x[1] for x in dists[1:cnt + 1]]

    def recommend_ids(self, user_id, matrix, cnt=10):
        used = matrix.indices[matrix.indptr[user_id]:matrix.indptr[user_id + 1]]
        predicts = [(self.predict(user_id, it + 1), it + 1) for it in range(len(self.y)) if it not in used]
        predicts.sort()
        predicts.reverse()
        return [x[1] for x in predicts[:cnt]]

    def rmse(self, matrix):
        n, m = matrix.shape
        loss = 0
        predicts = self.x @ self.y.T
        for i in range(1, n):
            nz = matrix.getrow(i).nonzero()[1]
            if len(nz) == 0:
                for j in range(1, m):
                    loss += predicts[i - 1][j - 1] ** 2
                continue
            cur_nz = 0
            for j in range(1, m):
                if nz[cur_nz] == j:
                    loss += (1 + self.alpha) * (predicts[i - 1][j - 1] - 1) ** 2
                    cur_nz += 1
                    if cur_nz >= len(nz):
                        for k in range(j + 1, m):
                            loss += predicts[i - 1][k - 1] ** 2
                        break
                    continue
                loss += predicts[i - 1][j - 1] ** 2
        return sqrt(loss / (n * m))


In [34]:
model_als = ALS(epochs=10, factors=64, alpha=40, logging=True)
model_als.fit(imp_user_item_csr)

Epoch 1: RMSE = 0.9845780945096181
Epoch 2: RMSE = 0.6703251252969984
Epoch 3: RMSE = 0.4894378160536007
Epoch 4: RMSE = 0.4874016724451012
Epoch 5: 

KeyboardInterrupt: 

симилары для Toy Story

In [35]:
print(movie_info.loc[0])
get_similar_movies(model_als, 1)

movie_id                              1
name                   Toy Story (1995)
category    Animation|Children's|Comedy
Name: 0, dtype: object


['1061    Sleeper (1973)',
 '2966    Mister Roberts (1955)',
 '964    Farewell to Arms, A (1932)',
 '1057    For the Moment (1994)',
 '907    Wizard of Oz, The (1939)',
 '3412    High Fidelity (2000)',
 '3833    Urbania (2000)',
 '2708    Cobra (1925)',
 '2096    Your Friends and Neighbors (1998)',
 '3161    Odessa File, The (1974)']

In [38]:
get_user_history(4)

['3352    Animal House (1978)   4',
 '1595    Full Monty, The (1997)   2',
 '642    Mission: Impossible (1996)   3',
 '1373    Raising Arizona (1987)   4',
 '3465    28 Days (2000)   3',
 '102    Happy Gilmore (1996)   4',
 '2666    Golden Child, The (1986)   4',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)   4',
 '1407    Beverly Hills Ninja (1997)   3',
 '3798    Naked Gun: From the Files of Police Squad!, Th...   3',
 '1063    Fish Called Wanda, A (1988)   5',
 '2928    Being John Malkovich (1999)   3',
 '1573    Edge, The (1997)   5',
 '1271    Indiana Jones and the Last Crusade (1989)   4',
 '1239    Stand by Me (1986)   5',
 '647    Dragonheart (1996)   4',
 '2098    Blade (1998)   5',
 '1539    Men in Black (1997)   3',
 '3550    Hollywood Knights, The (1980)   2',
 '257    Star Wars: Episode IV - A New Hope (1977)   5',
 '2789    American Beauty (1999)   4',
 '3045    Toy Story 2 (1999)   3',
 '1035    Ghost and the Darkness, The (1996)   4',
 '1241    Evil Dead 

Рекомендации для юзера 4

In [37]:
get_recommends(model_als, 4, imp_user_item_csr)

['2789    American Beauty (1999)',
 '257    Star Wars: Episode IV - A New Hope (1977)',
 '1959    Saving Private Ryan (1998)',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)',
 '2693    Sixth Sense, The (1999)',
 '1178    Star Wars: Episode V - The Empire Strikes Back...',
 '589    Silence of the Lambs, The (1991)',
 '108    Braveheart (1995)',
 '585    or 2: Judgment Day (1991)',
 '2502    Matrix, The (1999)']