In [1]:
import numpy as np
import pandas as pd

In [2]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [3]:
ratings.drop(['timestamp'], axis=1, inplace=True)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [49]:
ratings.groupby("rating")["rating"].value_counts()

rating
0.5     1370
1.0     2811
1.5     1791
2.0     7551
2.5     5550
3.0    20047
3.5    13136
4.0    26818
4.5     8551
5.0    13211
Name: count, dtype: int64

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Combining the Movies and Ratings DataFrame

In [5]:
df_combined = pd.merge(ratings, movies, on = 'movieId')
df_combined.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


Lấy các items có trên 100 ratings.

In [6]:
groupby_items_Ratings = df_combined.groupby('movieId')['rating']
groupby_items_Ratings = pd.DataFrame(groupby_items_Ratings.count())
item_list_min100_ratings = groupby_items_Ratings[groupby_items_Ratings['rating'] >= 100].index
df_combined_100 =  df_combined[df_combined['movieId'].isin(item_list_min100_ratings)]

In [7]:
from collections import Counter
# Đếm số lượng mỗi thể loại phim
genre_counts = Counter('|'.join(movies[movies['movieId'].isin(item_list_min100_ratings)]['genres']).split('|'))
# In kết quả
for genre, count in genre_counts.items():
    print(genre, count)

Adventure 49
Animation 10
Children 16
Comedy 41
Fantasy 18
Action 58
Crime 33
Thriller 45
Mystery 11
Sci-Fi 33
Drama 55
Romance 24
War 7
IMAX 7
Horror 6
Musical 4
Western 1


### Create Ratings Matrix

In [8]:
ratings_matrix = df_combined_100.pivot_table(index = 'userId', columns = 'title', values = 'rating')
ratings_matrix

title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),Apocalypse Now (1979),...,"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),Up (2009),"Usual Suspects, The (1995)",V for Vendetta (2006),WALL·E (2008),Waterworld (1995),Willy Wonka & the Chocolate Factory (1971),X-Men (2000)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,4.0,,,5.0,5.0,,4.0,...,,,3.0,,5.0,,,,5.0,5.0
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,4.0,,,,5.0,,,,...,,2.0,,,,,,,4.0,
5,,3.0,4.0,,,,,,,,...,,,,,4.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,5.0,,,4.0,3.5,4.5,4.5,4.0,1.0,4.5,...,4.5,4.0,,,4.5,,4.0,,,
607,,,,3.0,,,3.0,,,,...,,,5.0,,,,,3.0,,3.0
608,3.0,3.5,3.0,4.0,4.5,,5.0,4.0,2.5,3.0,...,4.5,3.5,3.0,,4.5,4.0,,3.0,3.5,4.0
609,,,,,,,,,,,...,,,,,,,,3.0,,


<a id="there_you_go_3.2"></a>
## 1. Memory Based Collaborative Filtering

### `User-based Collaborative Filtering`

Tạo ma trận tương quan Pearson

In [51]:
user_corr_mat = ratings_matrix.T.corr(min_periods=5)  # nếu số items chung < min_periods=5 thì sim=0
user_corr_mat.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,,,0.395182,0.180151,-0.439941,-0.053108,0.464277,,-0.037987,...,0.091574,0.254514,0.106031,-0.5,0.770709,0.303854,-0.012077,0.228651,-0.175412,0.071553
2,,1.0,,,,,,,,,...,-0.583333,,,,,0.583333,,,,0.765641
3,,,,,,,,,,,...,,,,,,,,,,
4,0.395182,,,1.0,-0.394823,0.421927,0.704093,0.055442,,0.541119,...,-0.360844,0.5625,0.196187,-0.158114,0.905206,0.057797,-0.020659,-0.29637,,-0.123595
5,0.180151,,,-0.394823,1.0,-0.006888,0.328889,0.030168,,,...,,0.231642,0.131108,0.068621,-0.245026,0.377341,0.228218,0.263139,0.384111,0.040582


Lấy cột 1 của ma trận tương quan và sắp xếp giảm dần, tưc là hệ số tương quan giữa user 1 và các users khác

In [10]:
corr_user_1 = user_corr_mat.iloc[0]
corr_user_1.sort_values(ascending=False, inplace=True)
corr_user_1

userId
1      1.000000
511    0.925820
366    0.872872
90     0.838525
430    0.801784
         ...   
564         NaN
575         NaN
583         NaN
595         NaN
598         NaN
Name: 1, Length: 597, dtype: float64

Xem các bộ phim mà user 1 đã đánh giá

In [52]:
df_combined_100[df_combined_100['userId'] == 1]

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
267,1,6,4.0,Heat (1995),Action|Crime|Thriller
369,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
572,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
854,1,110,4.0,Braveheart (1995),Action|Drama|War
1261,1,223,3.0,Clerks (1994),Comedy
1365,1,231,5.0,Dumb & Dumber (Dumb and Dumber) (1994),Adventure|Comedy
1568,1,260,5.0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
1819,1,296,3.0,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
2126,1,316,3.0,Stargate (1994),Action|Adventure|Sci-Fi


Ta lấy ví dụ để dự đoán rating của phim có movied là 32 (chưa được rated bở user 1) để thự dự đoán

In [53]:
print('32nd Movie : ', movies['title'][ movies['movieId'] == 32 ].values)

32nd Movie :  ['Twelve Monkeys (a.k.a. 12 Monkeys) (1995)']


In [13]:
# top-k peer group
K=20

In [14]:
user_peer_group_1_32 = {}

i = 0
for user in corr_user_1.keys():
    rating = df_combined[ (df_combined['userId'] == user) & (df_combined['movieId'] == 32) ]['rating'].sum()
    if  rating:
        i +=1
        user_peer_group_1_32[user] = (corr_user_1[user], rating)
    if i >=K: break   

print(user_peer_group_1_32)

{90: (0.8385254915624212, 4.0), 476: (0.7205766921228922, 4.0), 513: (0.645457935453148, 4.0), 375: (0.6270894413356966, 5.0), 302: (0.6071428571428571, 3.0), 120: (0.5948744389248242, 3.0), 72: (0.5944677716119164, 4.5), 32: (0.5873890337253248, 4.0), 112: (0.5838924858069249, 5.0), 312: (0.5675951758150258, 4.0), 521: (0.5601120336112037, 3.0), 445: (0.5489123137745419, 5.0), 19: (0.5446755607721149, 4.0), 57: (0.5397923479506446, 4.0), 512: (0.5172087743594052, 5.0), 451: (0.5, 5.0), 160: (0.49667154035181316, 5.0), 206: (0.48809353009197637, 3.0), 226: (0.474159243932272, 4.0), 8: (0.4642767787236135, 3.0)}


Dự đoán rating của user 1 cho bộ phim 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)'

In [15]:
row_means = ratings_matrix.mean(axis=1)

In [16]:
def predict_rating(peer_group, target_user):
    sum_similarity = 0
    weighted_ratings = 0
    for user, (sim, rating) in peer_group.items():
        weighted_ratings += sim * (rating - row_means[user])
        sum_similarity += sim

    return weighted_ratings / sum_similarity + row_means[target_user]
    
predict_rating(user_peer_group_1_32, 1)

4.577054897479311

### `Item-based Collaborative Filtering`

Đưa ma trận ratings về mean-centering với trung bình theo các hàng.

In [17]:
ratings_matrix_mean_centered = ratings_matrix.sub(row_means, axis=0)
ratings_matrix_mean_centered

title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),Apocalypse Now (1979),...,"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),Up (2009),"Usual Suspects, The (1995)",V for Vendetta (2006),WALL·E (2008),Waterworld (1995),Willy Wonka & the Chocolate Factory (1971),X-Men (2000)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,-0.403509,,,0.596491,0.596491,,-0.403509,...,,,-1.403509,,0.596491,,,,0.596491,0.596491
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,0.555556,,,,1.555556,,,,...,,-1.444444,,,,,,,0.555556,
5,,-0.461538,0.538462,,,,,,,,...,,,,,0.538462,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,1.095745,,,0.095745,-0.404255,0.595745,0.595745,0.095745,-2.904255,0.595745,...,0.595745,0.095745,,,0.595745,,0.095745,,,
607,,,,-0.901961,,,-0.901961,,,,...,,,1.098039,,,,,-0.901961,,-0.901961
608,-0.528689,-0.028689,-0.528689,0.471311,0.971311,,1.471311,0.471311,-1.028689,-0.528689,...,0.971311,-0.028689,-0.528689,,0.971311,0.471311,,-0.528689,-0.028689,0.471311
609,,,,,,,,,,,...,,,,,,,,-0.333333,,


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

#Thay các giá trị NaN thành 0 để tính toán.
ratings_matrix_mean = ratings_matrix_mean_centered.fillna(0)
# Tính ma trận cosine similarity
cosine_matrix = cosine_similarity(ratings_matrix_mean.T)
# Chuyển ma trận thành DataFrame
item_corr_matrix = pd.DataFrame(cosine_matrix, index=ratings_matrix_mean.columns, columns=ratings_matrix_mean.columns)
item_corr_matrix.head()


title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),Apocalypse Now (1979),...,"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),Up (2009),"Usual Suspects, The (1995)",V for Vendetta (2006),WALL·E (2008),Waterworld (1995),Willy Wonka & the Chocolate Factory (1971),X-Men (2000)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),1.0,-0.014845,0.001521,0.105313,0.101089,0.069858,0.07977,0.013828,-0.066684,0.229947,...,-0.076424,-0.012274,-0.104741,-0.000862,0.044413,-0.053646,-0.034517,-0.091608,0.042026,-0.121792
Ace Ventura: Pet Detective (1994),-0.014845,1.0,0.02578,-0.139801,-0.091529,-0.182317,-0.070204,-0.135812,0.095028,-0.019967,...,-1e-06,-0.107931,0.14479,-0.020208,-0.149242,-0.016139,-0.00457,0.257542,-0.010647,0.02677
Aladdin (1992),0.001521,0.02578,1.0,-0.15897,-0.079859,-0.011985,-0.033305,0.015476,-0.002645,-0.083436,...,0.081168,-0.122079,-0.015213,0.066249,-0.027101,-0.050094,-0.000563,-0.043856,-0.039654,-0.033673
Alien (1979),0.105313,-0.139801,-0.15897,1.0,0.457603,0.061172,0.048994,-0.100824,-0.112016,0.127213,...,-0.025498,-0.010685,-0.04642,-0.145028,0.043808,0.027272,0.032279,-0.040057,-0.043623,-0.072927
Aliens (1986),0.101089,-0.091529,-0.079859,0.457603,1.0,0.058207,-0.001458,-0.049714,-0.086591,0.032714,...,0.013785,-0.037877,-0.06359,-0.052891,0.006415,0.065775,0.015659,-0.077779,-0.030854,-0.033516


Lấy độ tương quan của item 32 với các items và sort

In [19]:
Monkeys_corr = item_corr_matrix['Twelve Monkeys (a.k.a. 12 Monkeys) (1995)']
Monkeys_corr = Monkeys_corr.sort_values(ascending=False)
Monkeys_corr

title
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)    1.000000
Pulp Fiction (1994)                          0.195314
Trainspotting (1996)                         0.157325
Seven (a.k.a. Se7en) (1995)                  0.151670
Full Metal Jacket (1987)                     0.150368
                                               ...   
Batman Forever (1995)                       -0.154539
GoldenEye (1995)                            -0.161201
Clear and Present Danger (1994)             -0.162595
Clueless (1995)                             -0.175387
Independence Day (a.k.a. ID4) (1996)        -0.181700
Name: Twelve Monkeys (a.k.a. 12 Monkeys) (1995), Length: 138, dtype: float64

Tìm peer group

In [20]:
item_peer_group_1_32= {}

i = 0
for item in Monkeys_corr.keys():
    rating = df_combined[ (df_combined['userId'] == 1) & (df_combined['title'] == item) ]['rating'].sum()
    if  rating:
        i +=1
        item_peer_group_1_32[item] = (Monkeys_corr[item], rating)
    if i >=K: break   

print(item_peer_group_1_32)

{'Pulp Fiction (1994)': (0.19531432723044323, 3.0), 'Seven (a.k.a. Se7en) (1995)': (0.15167016979585257, 5.0), 'Full Metal Jacket (1987)': (0.15036830628641462, 5.0), 'Fight Club (1999)': (0.13610940556188864, 5.0), 'Braveheart (1995)': (0.13032640666687617, 4.0), 'Usual Suspects, The (1995)': (0.1076720131666151, 5.0), 'American History X (1998)': (0.08350807813216914, 5.0), 'Big Lebowski, The (1998)': (0.0751322701330421, 5.0), 'Clockwork Orange, A (1971)': (0.06981092388046745, 5.0), 'Monty Python and the Holy Grail (1975)': (0.0607464072428375, 5.0), 'Willy Wonka & the Chocolate Factory (1971)': (0.06044000901798768, 5.0), 'Princess Bride, The (1987)': (0.059763196248658966, 5.0), 'Clerks (1994)': (0.04712099474035346, 3.0), 'Star Wars: Episode VI - Return of the Jedi (1983)': (0.04504566163982933, 5.0), 'Silence of the Lambs, The (1991)': (0.04443339426759507, 4.0), 'Forrest Gump (1994)': (0.04316975698158809, 4.0), 'Goodfellas (1990)': (0.042953155235551724, 5.0), 'Apocalypse Now

In [21]:
def predict_rating_item(peer_group, target_item):
    sum_similarity = 0
    weighted_ratings = 0
    for sim, rating in peer_group.values():
        weighted_ratings += sim * rating
        sum_similarity += sim

    return weighted_ratings / sum_similarity
    
predict_rating_item(item_peer_group_1_32, 32)

4.540115673875433

*Ở đây hoàn toàn có thể sử dụng chương trình trong phương pháp User-based cho ma trận transpove để thực hiện pp Item-based và cho ra kết quả tương tự.*

## `Sử dụng thư viên scikit-surprise`

In [22]:
from surprise import Reader, Dataset, KNNBasic, SVD, NMF, accuracy, KNNWithMeans
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split

Tạo dataset của `surpise` từ dataFrame và chia tập train và tập test

In [23]:
reader = Reader(rating_scale=(0.5, 5.0))
# Load dữ liệu từ pandas
data = Dataset.load_from_df(df_combined_100[['userId', 'movieId', 'rating']], reader = reader )
dataset = data.build_full_trainset()

In [24]:
# User-based
sim_options = {'name' : 'pearson', 'min_support': 5}
algo_user = KNNWithMeans(k=K, sim_options=sim_options)
algo_user.fit(dataset)

uid = 1
iid = 32 
pre = algo_user.predict(uid, iid, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
user: 1          item: 32         r_ui = None   est = 4.58   {'actual_k': 20, 'was_impossible': False}


In [25]:
# Item-based
sim_options = {'name' : 'pearson', "user_based": False, 'min_support': 5}
algo_item = KNNWithMeans(k=K, sim_options=sim_options)
algo_item.fit(dataset)

uid = 1
iid = 32 
pre = algo_item.predict(uid, iid, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
user: 1          item: 32         r_ui = None   est = 4.54   {'actual_k': 20, 'was_impossible': False}


Chia tập train tập test

In [26]:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25, random_state=10)

`User-based`

In [27]:
sim_options = {'name' : 'pearson', 'min_support': 5}
algo_user = KNNWithMeans(k=K, sim_options=sim_options)

# Train the algorithm on the trainset, and predict ratings for the testset
algo_user.fit(trainset)
predictions = algo_user.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8164


0.8163821647966252

`Item-based`

In [28]:
sim_options = {'name' : 'pearson', "user_based": False, 'min_support': 5}
algo_item = KNNWithMeans(k=K, sim_options=sim_options)

# Train the algorithm on the trainset, and predict ratings for the testset
algo_item.fit(trainset)

predictions = algo_item.test(testset)
# Then compute RMSE
accuracy.rmse(predictions)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8084


0.8084497187594657

##  Model Based Collaborative Filtering

### `SVD`

In [29]:
algo_SVD = SVD(n_factors=17, random_state=0)

# Train the algorithm on the trainset, and predict ratings for the testset
algo_SVD.fit(trainset)

# Then compute RMSE
predictions = algo_SVD.test(testset)
accuracy.rmse(predictions)

uid = 1
iid = 32
# get a prediction for specific users and items.
pre = algo_SVD.predict(uid, iid, verbose=True)

RMSE: 0.8087
user: 1          item: 32         r_ui = None   est = 4.47   {'was_impossible': False}


### `NMF`

In [30]:
algo_NMF = NMF(n_factors=17, random_state=0)

# Train the algorithm on the trainset, and predict ratings for the testset
algo_NMF.fit(trainset)

# Then compute RMSE
predictions = algo_NMF.test(testset)
accuracy.rmse(predictions)
uid = 1
iid = 32 
# get a prediction for specific users and items.
pre = algo_NMF.predict(uid, iid, verbose=True)

RMSE: 0.8433
user: 1          item: 32         r_ui = None   est = 4.49   {'was_impossible': False}


## `Recommender System`

Phần code trên chỉ thử dự đoán rating của một user cho 1 item. Tuy nhiên trong thực tế, phần lớn các doanh nhiệp cần một hệ thống đề xuất các items chứ không phải là dự đoán ratings. Vì vậy ta cần viết các lớp và hàm đề thực hiện việc đó (trong thư viên surprise không hỗ trợ vệ đưa ra top-k users or items).

In [31]:
#setting up for customized printing
from IPython.display import Markdown, display
def printmd(string, color=None):
    colorstr = "<span style='color:{}'>{}</span>".format(color, string)
    display(Markdown(colorstr))
#function to display dataframes side by side    
from IPython.display import display_html
def display_side_by_side(args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline;margin-left:50px !important;margin-right: 40px !important"'),raw=True)

In [32]:
from collections import defaultdict
# Hàm đưa ra top-k users (items) từ dữ liệu được dự đoán
def get_top_k(predictions, k=10):
    # First map the predictions to each user.
    top_k = defaultdict(list)
    for uid, iid, _, est, _ in predictions:
        top_k[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_k.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_k[uid] = user_ratings[:k]

    return top_k

In [33]:
# Class xây dựng mô hình collab filtering từ các mô hình trong surprise.
class collab_filtering_based_recommender_model():
    def __init__(self, model, trainset, testset, data):
        self.model = model
        self.trainset = trainset
        self.testset = testset
        self.data = data
        self.pred_test = None
        self.recommendations = None
        self.top_n = None
        self.recommenddf = None

    def fit_and_predict(self):        
        printmd('**Fitting the train data...**', color='blue')
        self.model.fit(self.trainset)       

        printmd('**Predicting the test data...**', color='blue')
        self.pred_test = self.model.test(self.testset)        
        rmse = round(accuracy.rmse(self.pred_test), 3)
        printmd('**RMSE for the predicted result is ' + str(rmse) + '**', color='brown')   
        
        self.top_n = get_top_k(self.pred_test)
        self.recommenddf = pd.DataFrame(columns=['userId', 'movieId', 'rating'])
        for item in self.top_n:
            subdf = pd.DataFrame(self.top_n[item], columns=['movieId', 'rating'])
            subdf['userId'] = item
            cols = subdf.columns.tolist()
            cols = cols[-1:] + cols[:-1]
            subdf = subdf[cols]        
            self.recommenddf = pd.concat([self.recommenddf, subdf], axis = 0)        
        return rmse
        
    def cross_validate(self):
        printmd('**Cross Validating the data...**', color='blue')
        cv_result = cross_validate(self.model, self.data, n_jobs=-1)
        cv_result = round(cv_result['test_rmse'].mean(),3)
        printmd('**Mean CV RMSE is ' + str(cv_result)  + '**', color='brown')
        return cv_result

    def recommend(self, user_id, k=5):
        printmd(f'**Recommending top {k} movies for userid : {user_id} ...**', color='brown')
        
        df = self.recommenddf[self.recommenddf['userId'] == user_id].head(k)
        display(df)
        return df
        

In [34]:
from surprise.model_selection import RandomizedSearchCV
# Tìm parameters tốt nhất của model cho tập dữ liệu.
def find_best_model(model, parameters,data):
    clf = RandomizedSearchCV(model, parameters, n_jobs=-1, measures=['rmse'])
    clf.fit(data)             
    print(clf.best_score)
    print(clf.best_params)
    print(clf.best_estimator)
    return clf

## Sử dụng các hàm trên để đưa ra đề xuất:

KNN With Means - Memory Based Collaborative Filtering

In [35]:
# Tìm parameters tối ưu cho mô hình
sim_options = {
    "name": ["msd", "cosine", "pearson", "pearson_baseline"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}
params = { 'k': range(30,50,1), 'sim_options': sim_options}
clf = find_best_model(KNNWithMeans, params, data)

{'rmse': 0.8015085359954147}
{'rmse': {'k': 40, 'sim_options': {'name': 'pearson', 'min_support': 3, 'user_based': False}}}
{'rmse': <surprise.prediction_algorithms.knns.KNNWithMeans object at 0x0000015289753A90>}


In [36]:
# Tạo model collaborative filtering
knnwithmeans = clf.best_estimator['rmse']
col_fil_knnwithmeans = collab_filtering_based_recommender_model(knnwithmeans, trainset, testset, data)

# Fit and Predict
knnwithmeans_rmse = col_fil_knnwithmeans.fit_and_predict()

# cross validate
knnwithmeans_cv_rmse = col_fil_knnwithmeans.cross_validate()

<span style='color:blue'>**Fitting the train data...**</span>

Computing the pearson similarity matrix...
Done computing similarity matrix.


<span style='color:blue'>**Predicting the test data...**</span>

RMSE: 0.8073


<span style='color:brown'>**RMSE for the predicted result is 0.807**</span>

<span style='color:blue'>**Cross Validating the data...**</span>

<span style='color:brown'>**Mean CV RMSE is 0.802**</span>

In [37]:
# Recommend
result_knn_user1 = col_fil_knnwithmeans.recommend(user_id=10, k=5)
result_knn_user2 = col_fil_knnwithmeans.recommend(user_id=50, k=5)
result_knn_user3 = col_fil_knnwithmeans.recommend(user_id=100, k=5)

<span style='color:brown'>**Recommending top 5 movies for userid : 10 ...**</span>

Unnamed: 0,userId,movieId,rating
0,10,356,3.595843
1,10,33794,3.413133
2,10,2571,3.272401
3,10,588,3.268039
4,10,4306,3.053897


<span style='color:brown'>**Recommending top 5 movies for userid : 50 ...**</span>

Unnamed: 0,userId,movieId,rating
0,50,318,3.635508
1,50,296,3.633967
2,50,2959,3.491238
3,50,1258,3.433256
4,50,7438,3.139019


<span style='color:brown'>**Recommending top 5 movies for userid : 100 ...**</span>

Unnamed: 0,userId,movieId,rating
0,100,1213,4.225142
1,100,5989,4.053429
2,100,1265,3.9857
3,100,648,3.880563
4,100,2716,3.815342


SVD - Model Based Collaborative Filtering

In [38]:
# Tìm parameters tối ưu cho mô hình
params= {
    "n_factors" :[17],
    "n_epochs": [5, 10, 15, 20],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
clf = find_best_model(SVD, params, data)

{'rmse': 0.8270181420000817}
{'rmse': {'n_factors': 17, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4}}
{'rmse': <surprise.prediction_algorithms.matrix_factorization.SVD object at 0x000001528F4E1250>}


In [39]:
svd = clf.best_estimator['rmse']
col_fil_svd = collab_filtering_based_recommender_model(svd, trainset, testset, data)

# Fit and Predict
svd_rmse = col_fil_svd.fit_and_predict()
# cross validate
svd_cv_rmse = col_fil_svd.cross_validate()

<span style='color:blue'>**Fitting the train data...**</span>

<span style='color:blue'>**Predicting the test data...**</span>

RMSE: 0.8187


<span style='color:brown'>**RMSE for the predicted result is 0.819**</span>

<span style='color:blue'>**Cross Validating the data...**</span>

<span style='color:brown'>**Mean CV RMSE is 0.826**</span>

In [40]:
# Recommend
result_svd_user1 = col_fil_svd.recommend(user_id=10, k=5)
result_svd_user2 = col_fil_svd.recommend(user_id=50, k=5)
result_svd_user3 = col_fil_svd.recommend(user_id=100, k=5)

<span style='color:brown'>**Recommending top 5 movies for userid : 10 ...**</span>

Unnamed: 0,userId,movieId,rating
0,10,2571,3.711812
1,10,356,3.69063
2,10,296,3.659183
3,10,2858,3.609069
4,10,33794,3.4518


<span style='color:brown'>**Recommending top 5 movies for userid : 50 ...**</span>

Unnamed: 0,userId,movieId,rating
0,50,318,3.743392
1,50,2959,3.67414
2,50,296,3.56874
3,50,1258,3.549464
4,50,7438,3.42066


<span style='color:brown'>**Recommending top 5 movies for userid : 100 ...**</span>

Unnamed: 0,userId,movieId,rating
0,100,1213,4.182664
1,100,1265,3.977293
2,100,5989,3.949236
3,100,2716,3.905758
4,100,1968,3.815115


NMF - Model Based Collaborative Filtering

In [41]:
# Tìm parameters tối ưu cho mô hình
params= {
    "n_factors" :[17],
    "n_epochs": [5, 10, 15, 20],
    "reg_pu": [0.06, 0.08],
    "lr_bu": [0.005, 0.006]
}
clf = find_best_model(NMF, params, data)

{'rmse': 0.8626445585704372}
{'rmse': {'n_factors': 17, 'n_epochs': 20, 'reg_pu': 0.08, 'lr_bu': 0.005}}
{'rmse': <surprise.prediction_algorithms.matrix_factorization.NMF object at 0x000001528F9EDB90>}


In [42]:
# Tạo model collaborative filtering
NMF = clf.best_estimator['rmse']
col_fil_nmf = collab_filtering_based_recommender_model(NMF, trainset, testset, data)

# Fit and Predict
nmf_rmse = col_fil_nmf.fit_and_predict()
# cross validate
nmf_cv_rmse = col_fil_nmf.cross_validate()


<span style='color:blue'>**Fitting the train data...**</span>

<span style='color:blue'>**Predicting the test data...**</span>

RMSE: 0.8685


<span style='color:brown'>**RMSE for the predicted result is 0.869**</span>

<span style='color:blue'>**Cross Validating the data...**</span>

<span style='color:brown'>**Mean CV RMSE is 0.86**</span>

In [43]:
# Recommend
result_svd_user1 = col_fil_nmf.recommend(user_id= 10, k=5)
result_svd_user2 = col_fil_nmf.recommend(user_id= 50, k=5)
result_svd_user3 = col_fil_nmf.recommend(user_id=100, k=5)

<span style='color:brown'>**Recommending top 5 movies for userid : 10 ...**</span>

Unnamed: 0,userId,movieId,rating
0,10,296,4.076939
1,10,2858,3.981808
2,10,2571,3.910263
3,10,33794,3.483645
4,10,4306,3.470316


<span style='color:brown'>**Recommending top 5 movies for userid : 50 ...**</span>

Unnamed: 0,userId,movieId,rating
0,50,2959,3.997497
1,50,1258,3.855093
2,50,318,3.848371
3,50,7438,3.796323
4,50,296,3.454302


<span style='color:brown'>**Recommending top 5 movies for userid : 100 ...**</span>

Unnamed: 0,userId,movieId,rating
0,100,1265,4.372052
1,100,1213,4.249726
2,100,2716,4.207357
3,100,1968,3.93514
4,100,357,3.889634


Bảng các recomend cho user 10, 50, 100 với các phương pháp khác nhau.

![](recommed.png)

## So Sánh độ chính xác và thời gian chạy của các mô hình

In [45]:
from surprise import KNNWithMeans
from surprise import SVD
from surprise import NMF
benchmark = []
# Iterate over all algorithms
models = [KNNWithMeans(sim_options = {'name' : 'pearson', "user_based": False, 'min_support': 5}),
          KNNWithMeans(sim_options = {'name' : 'pearson', 'min_support': 5}),
          SVD(n_factors=19, n_epochs=20, lr_all=0.005, reg_all=0.4),
          NMF(n_factors=19, n_epochs=20, reg_pu=0.005, lr_bu=0.6)]
for i, algorithm in enumerate(models):
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=5, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([tmp, pd.Series([str(algorithm).split(' ')[0].split('.')[-1]+"_"+ str(i)], index=['Algorithm'])], axis = 0)      
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse') 

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNWithMeans_0,0.801095,0.093338,0.276265
KNNWithMeans_1,0.81373,0.280931,0.55393
SVD_2,0.824785,0.093155,0.024065
NMF_3,1.042966,0.129536,0.033207


## References

- https://www.kaggle.com/code/fuzzywizard/rec-sys-collaborative-filtering-dl-techniques
- https://www.kaggle.com/code/rangarajansaranathan/collaborative-filtering-based-recommender-system