## Получаем данные

In [1]:
import pandas as pd

In [18]:
import numpy as np
from tqdm import tqdm_notebook

In [4]:
df_ratings = pd.read_csv('../lecture-1/ratings.csv')
df_movies = pd.read_csv('../lecture-1/movies.csv')

In [6]:
df = pd.merge(df_ratings, df_movies, on='movieId')

In [8]:
del df['timestamp']
del df['genres']

In [9]:
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


## Формируем векторное описание для фильма

In [10]:
df['userId'].describe()

count    100836.000000
mean        326.127564
std         182.618491
min           1.000000
25%         177.000000
50%         325.000000
75%         477.000000
max         610.000000
Name: userId, dtype: float64

In [11]:
MAX_USER_ID = 610
MIN_USER_ID = 1

In [13]:
movie_names = df['title'].unique()

In [15]:
movie_names = movie_names.tolist()

In [17]:
movie_to_vector = {}

In [26]:
for movie in tqdm_notebook(movie_names):
    movie_to_vector[movie] = np.zeros((MAX_USER_ID,))
    for r in df[df['title'] == movie].iterrows():
        movie_to_vector[movie][r[1]['userId'] - MIN_USER_ID] = r[1]['rating']

HBox(children=(IntProgress(value=0, max=9719), HTML(value='')))




## Ищем похожие

In [29]:
def find_similar(movie, dist_func, top=10):
    distances = {}
    target_movie = movie_to_vector[movie]
    for m in movie_names:
        distances[m] = dist_func(target_movie, movie_to_vector[m])
        
    distances_with_idx = [(i, distances[m]) for i, m in enumerate(movie_names)]
    distances_with_idx = sorted(distances_with_idx, key=lambda t: t[1], reverse=False)
    distances_with_idx = distances_with_idx[:top]
    
    return [(movie_names[i], d) for i, d in distances_with_idx]

In [33]:
from scipy.spatial.distance import cosine, euclidean, cityblock

In [34]:
find_similar('Toy Story (1995)', cityblock)

[('Toy Story (1995)', 0.0),
 ('Toy Story 2 (1999)', 608.5),
 ("Bug's Life, A (1998)", 698.5),
 ('Groundhog Day (1993)', 714.0),
 ('Nutty Professor, The (1996)', 714.0),
 ('Willy Wonka & the Chocolate Factory (1971)', 718.0),
 ('Mission: Impossible (1996)', 722.0),
 ('Babe (1995)', 722.5),
 ('Monsters, Inc. (2001)', 725.0),
 ('Toy Story 3 (2010)', 728.0)]

## User 2 Item

In [36]:
!pip install surprise



In [53]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

In [39]:
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [42]:
df_for_surpise = df_ratings[['userId', 'movieId', 'rating']]

In [43]:
df_for_surpise.columns = ['uid', 'iid', 'rating']

In [44]:
df_for_surpise.head()

Unnamed: 0,uid,iid,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [41]:
reader = Reader(rating_scale=(0.5, 5))

In [45]:
dataset = Dataset.load_from_df(df_for_surpise, reader)

In [47]:
trainset, testset = train_test_split(dataset, test_size=0.2)

In [83]:
algo = KNNBasic(k=40, sim_options={'name': 'cosine', 'user_based': True}, user_based=True)

In [84]:
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1242112e8>

In [85]:
predictions = algo.test(testset)

In [86]:
accuracy.mae(predictions)

MAE:  0.7462


0.7462106589568676

In [87]:
testset

[(496, 904, 5.0),
 (339, 1198, 5.0),
 (292, 5299, 4.0),
 (52, 73017, 5.0),
 (474, 25795, 3.5),
 (1, 2048, 5.0),
 (83, 364, 3.5),
 (274, 45517, 3.5),
 (338, 30749, 3.5),
 (376, 7254, 4.0),
 (23, 1960, 3.0),
 (429, 185, 5.0),
 (84, 1307, 5.0),
 (438, 30898, 3.0),
 (42, 2144, 3.0),
 (477, 593, 5.0),
 (68, 1049, 3.5),
 (477, 77455, 4.5),
 (555, 1518, 1.0),
 (6, 355, 4.0),
 (587, 3418, 5.0),
 (121, 237, 4.0),
 (128, 945, 5.0),
 (438, 2540, 2.5),
 (474, 1230, 4.0),
 (91, 1215, 5.0),
 (176, 434, 4.0),
 (353, 593, 5.0),
 (182, 6645, 4.5),
 (448, 134853, 4.0),
 (76, 1278, 4.0),
 (528, 91529, 4.5),
 (468, 292, 4.0),
 (212, 86190, 3.5),
 (474, 2664, 4.0),
 (318, 86880, 3.5),
 (469, 2872, 5.0),
 (20, 4022, 3.5),
 (5, 592, 3.0),
 (401, 5952, 3.0),
 (47, 2121, 2.5),
 (474, 2396, 4.0),
 (517, 1967, 1.0),
 (19, 2468, 2.0),
 (356, 3477, 5.0),
 (438, 2712, 3.0),
 (332, 1590, 2.5),
 (318, 3717, 3.0),
 (249, 2605, 3.5),
 (419, 175, 4.0),
 (414, 96610, 4.5),
 (260, 5219, 3.0),
 (480, 4643, 2.5),
 (282, 271

In [88]:
algo.predict(353, 904)

Prediction(uid=353, iid=904, r_ui=None, est=4.239282394498953, details={'actual_k': 40, 'was_impossible': False})

In [89]:
algo.get_neighbors(353, 40)

[3,
 46,
 62,
 75,
 79,
 87,
 100,
 115,
 121,
 125,
 131,
 139,
 143,
 163,
 193,
 208,
 221,
 227,
 232,
 236,
 241,
 243,
 253,
 261,
 303,
 307,
 311,
 330,
 338,
 376,
 382,
 387,
 390,
 395,
 404,
 406,
 411,
 417,
 421,
 427]

In [101]:
algo.predict(algo.trainset.to_inner_uid(292), algo.trainset.to_inner_iid(164909))

Prediction(uid=58, iid=3282, r_ui=None, est=3.5022871522784746, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'})

In [108]:
algo.get_neighbors(iid=algo.trainset.to_inner_uid(353), k=5)

[15, 43, 84, 87, 94]

In [112]:
pd.merge(df[df['userId']==353], df[df['userId']==algo.trainset.to_raw_uid(87)], on='movieId')

Unnamed: 0,userId_x,movieId,rating_x,title_x,userId_y,rating_y,title_y
0,353,1,5.0,Toy Story (1995),44,3.0,Toy Story (1995)
1,353,6,4.0,Heat (1995),44,3.0,Heat (1995)
2,353,112,5.0,Rumble in the Bronx (Hont faan kui) (1995),44,5.0,Rumble in the Bronx (Hont faan kui) (1995)


In [123]:
def get_votes_for_film_user(uid, iid):
    neighbors = algo.get_neighbors(iid=algo.trainset.to_inner_uid(uid), k=40)
    for n in neighbors:
        res = df_ratings[(df_ratings['userId'] == n) & (df_ratings['movieId'] == iid)]
        print (n, res['rating'].tolist()[0] if len(res['rating'].tolist()) > 0 else "None")

In [124]:
get_votes_for_film_user(353, 1)

15 2.5
43 5.0
84 None
87 None
94 None
96 5.0
100 None
101 None
107 4.0
122 None
139 None
141 4.0
159 4.5
208 None
219 3.5
221 None
231 None
233 3.0
239 4.0
241 None
243 None
251 None
273 5.0
283 3.0
296 None
307 4.0
311 None
313 None
329 None
334 3.5
339 4.0
340 None
343 None
348 None
361 None
364 5.0
369 None
379 None
382 4.5
394 None


In [125]:
algo.predict(353, 1)

Prediction(uid=353, iid=1, r_ui=None, est=4.149814663807331, details={'actual_k': 40, 'was_impossible': False})

## Item-based рекомендация

In [127]:
algo = KNNBasic(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [128]:
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x638caccc0>

In [130]:
predictions = algo.test(testset)

In [131]:
accuracy.mae(predictions)

MAE:  0.6963


0.6962531062387705

In [132]:
from surprise import KNNWithMeans

In [133]:
algo = KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [134]:
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x637554630>

In [135]:
predictions = algo.test(testset)

In [136]:
accuracy.mae(predictions)

MAE:  0.6704


0.6704108257652016

In [137]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [142]:
similar_films = algo.get_neighbors(algo.trainset.to_inner_iid(1), k=10)

In [143]:
similar_films = [algo.trainset.to_raw_iid(i) for i in similar_films]

In [146]:
similar_films

[588, 3114, 6377, 8961, 1097, 2716, 2797, 2355, 1198, 3471]

In [145]:
df_movies[df_movies['movieId'].isin(similar_films)]

Unnamed: 0,movieId,title,genres
506,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
836,1097,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi
900,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
1757,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
2038,2716,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi
2103,2797,Big (1988),Comedy|Drama|Fantasy|Romance
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2596,3471,Close Encounters of the Third Kind (1977),Adventure|Drama|Sci-Fi
4360,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy
5374,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy


In [147]:
from surprise.model_selection import KFold

In [148]:
kfold = KFold(5)

In [149]:
scores = []
for trainset, testset in tqdm_notebook(kfold.split(dataset)):
    algo = KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})
    algo.fit(trainset)
    predictions = algo.test(testset)
    scores.append(accuracy.mae(predictions))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6679
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6729
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6659
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6771
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6684



In [150]:
scores

[0.667898127047499,
 0.672901924296151,
 0.6658599117298951,
 0.6770688689744095,
 0.6684150491909416]