In [127]:
import pandas as pd
import numpy as np
import scipy.sparse as sp

In [128]:
ratings = pd.read_csv('./ml-1m/ratings.dat', delimiter='::', header=None,
                      names=['user_id', 'movie_id', 'rating', 'timestamp'],
                      usecols=['user_id', 'movie_id', 'rating'], engine='python')
movie_info = pd.read_csv('./ml-1m/movies.dat', delimiter='::', header=None,
                         names=['movie_id', 'name', 'category'], engine='python')
user_info = pd.read_csv('./ml-1m/users.dat', delimiter='::', header=None,
                        names=['user_id', 'gender', 'age', 'occupation', 'zip_code'], engine='python')

In [129]:
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [130]:
from math import sqrt

class SGD:

    def __init__(self, factors=64, epochs=10,
                 regularization=0.02, learning_rate=0.005,
                 logging=False):
        self.epochs = epochs
        self.factors = factors
        self.regularization = regularization
        self.learning_rate = learning_rate
        self.users, self.bu = np.empty(0), np.empty(0)
        self.items, self.bi = np.empty(0), np.empty(0)
        self.mu = 0
        self.logging = logging
        self.rmse = None

    def fit(self, matrix):
        n, m = matrix.shape
        self.users = np.random.uniform(0, 1 / sqrt(self.factors), (n, self.factors))
        self.items = np.random.uniform(0, 1 / sqrt(self.factors), (m, self.factors))
        self.bu = np.zeros(n)
        self.bi = np.zeros(m)
        self.mu = matrix.data.mean()

        indices = matrix.nonzero()
        order = [o for o in range(len(indices[0]))]

        for ep in range(self.epochs):
            np.random.shuffle(order)
            errors = []
            for o in order:
                i, j = indices[0][o], indices[1][o]
                error = self.predict(i, j) - matrix[i, j]
                self.bu[i] -= (error + self.regularization * self.bu[i]) * self.learning_rate
                self.bi[j] -= (error + self.regularization * self.bi[j]) * self.learning_rate
                self.users[i] -= (error * self.items[j] + self.regularization * self.users[i]) * self.learning_rate
                self.items[j] -= (error * self.users[i] + self.regularization * self.items[j]) * self.learning_rate
                errors.append(self.predict(i, j) - matrix[i, j])
            errors = np.array(errors)
            loss = np.square(errors).mean()
            self.rmse = sqrt(loss)
            if self.logging:
                print(f'Epoch {ep + 1}: RMSE = {self.rmse}')

    def predict(self, i, j):
        return self.users[i] @ self.items[j] + self.bu[i] + self.bi[j] + self.mu

    def similar_item_ids(self, item_id, cnt=10):
        item_embeddings = self.items[item_id]
        all_items = self.items[1:]
        dists = [(np.linalg.norm(item_embeddings - all_items[i]), i + 1) for i in range(len(all_items))]
        dists.sort()
        return [x[1] for x in dists[1:cnt + 1]]

    def recommend_ids(self, user_id, matrix, cnt=10):
        used = matrix.indices[matrix.indptr[user_id]:matrix.indptr[user_id + 1]]
        predicts = [(self.predict(user_id, it), it) for it in range(1, len(self.items)) if it not in used]
        predicts.sort()
        predicts.reverse()
        return [x[1] for x in predicts[:cnt]]

In [131]:
exp_user_item = sp.coo_matrix((ratings['rating'], (ratings['user_id'], ratings['movie_id'])))
exp_user_item_csr = exp_user_item.tocsr()

In [132]:
model = SGD(epochs=20, logging=True)
model.fit(exp_user_item_csr)

Epoch 1: RMSE = 0.9641046836931749
Epoch 2: RMSE = 0.9083677212613752
Epoch 3: RMSE = 0.897212343314062
Epoch 4: RMSE = 0.8911538469935903
Epoch 5: RMSE = 0.8862186122079216
Epoch 6: RMSE = 0.8811921437816636
Epoch 7: RMSE = 0.8756536555582612
Epoch 8: RMSE = 0.869528416123844
Epoch 9: RMSE = 0.8627494720989339
Epoch 10: RMSE = 0.8552521220514373
Epoch 11: RMSE = 0.8469192922153579
Epoch 12: RMSE = 0.8378873362218319
Epoch 13: RMSE = 0.8282737422909228
Epoch 14: RMSE = 0.8181771843728675
Epoch 15: RMSE = 0.8077019594571974
Epoch 16: RMSE = 0.79695773359207
Epoch 17: RMSE = 0.7861468833531436
Epoch 18: RMSE = 0.7752618831304625
Epoch 19: RMSE = 0.7644246895809227
Epoch 20: RMSE = 0.7536787042336003


In [133]:
def get_movie_names(ids):
    return [movie_info[movie_info['movie_id'] == mid]['name'].to_string() for mid in ids]

Найдем симилары для Toy Story

In [134]:
print(movie_info.loc[0])
similar_ids = model.similar_item_ids(1)
get_movie_names(similar_ids)

movie_id                              1
name                   Toy Story (1995)
category    Animation|Children's|Comedy
Name: 0, dtype: object


['3045    Toy Story 2 (1999)',
 "2286    Bug's Life, A (1998)",
 '584    Aladdin (1992)',
 '1838    Mulan (1998)',
 '2618    Tarzan (1999)',
 '1526    Hercules (1997)',
 '3090    Fantasia 2000 (1999)',
 '1664    Mouse Hunt (1997)',
 '2728    Big (1988)',
 '1029    That Thing You Do! (1996)']

Рейтинги фильмов от юзера 4

In [135]:
get_user_history = lambda user_id: [movie_info[movie_info['movie_id'] == mid]['name'].to_string() + "   " + str(rat)
                                    for mid, rat in zip(ratings[ratings['user_id'] == user_id]['movie_id'],
                                                        ratings[ratings['user_id'] == user_id]['rating'])]
get_user_history(4)

['3399    Hustler, The (1961)   5',
 '1192    Star Wars: Episode VI - Return of the Jedi (1983)   3',
 '2882    Fistful of Dollars, A (1964)   4',
 '1196    Alien (1979)   4',
 '1023    Die Hard (1988)   4',
 '257    Star Wars: Episode IV - A New Hope (1977)   5',
 '1959    Saving Private Ryan (1998)   5',
 '476    Jurassic Park (1993)   4',
 '1178    Star Wars: Episode V - The Empire Strikes Back...   2',
 '1180    Raiders of the Lost Ark (1981)   5',
 '1885    Rocky (1976)   5',
 '1081    E.T. the Extra-Terrestrial (1982)   4',
 '3349    Thelma & Louise (1991)   4',
 '3633    Mad Max (1979)   4',
 '2297    King Kong (1933)   4',
 '1366    Jaws (1975)   5',
 '3458    Predator (1987)   1',
 '1183    Good, The Bad and The Ugly, The (1966)   5',
 '2623    Run Lola Run (Lola rennt) (1998)   5',
 '2878    Goldfinger (1964)   5',
 '1220    Terminator, The (1984)   5']

Получим рекомендации для юзера 4

In [136]:
recommend_ids = model.recommend_ids(4, exp_user_item_csr)
get_movie_names(recommend_ids)

['2836    Sanjuro (1962)',
 '1950    Seven Samurai (The Magnificent Seven) (Shichin...',
 '900    Casablanca (1942)',
 '911    Citizen Kane (1941)',
 '1189    To Kill a Mockingbird (1962)',
 '2953    General, The (1927)',
 '3269    For All Mankind (1989)',
 '740    Dr. Strangelove or: How I Learned to Stop Worr...',
 '1162    Paths of Glory (1957)',
 '3238    City Lights (1931)']