In [1]:
pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357251 sha256=c70a257caa2cf85978f1120036660c1c28d57c108b98539ebc74422783204190
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [12]:
import pandas as pd
import numpy as np
from surprise import Dataset
from surprise import Reader
from surprise.dataset import BUILTIN_DATASETS
from surprise import SVD, KNNBasic, accuracy
from surprise.model_selection import train_test_split

In [6]:
data = Dataset.load_from_file(
    "u.data.txt",
    reader=Reader(line_format="user item rating timestamp", sep="\t"),)

In [7]:
df = pd.DataFrame(data.raw_ratings, columns=['userId', 'movieId', 'rating', 'timestamp'])
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [9]:
df['movieId'].nunique()

1682

In [10]:
df['userId'].nunique()

943

In [11]:
df['rating'].value_counts(ascending=False)

rating
4.0    34174
3.0    27145
5.0    21201
2.0    11370
1.0     6110
Name: count, dtype: int64

In [20]:
train, test = train_test_split(data, test_size=0.25, random_state=13)

In [23]:
len(test)

25000

In [24]:
sim_options = {
    'name': 'cosine',
    'user_based': False
}

knn = KNNBasic(sim_options=sim_options)
knn.fit(train)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7e25eedb3cd0>

In [25]:
predictions = knn.test(test)
predictions

[Prediction(uid='7', iid='633', r_ui=5.0, est=4.199452349030111, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='422', iid='287', r_ui=3.0, est=3.4703437660463736, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='804', iid='163', r_ui=3.0, est=3.5716736533692854, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='189', iid='480', r_ui=5.0, est=4.222825780855538, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='238', iid='546', r_ui=3.0, est=3.473417286928204, details={'actual_k': 17, 'was_impossible': False}),
 Prediction(uid='804', iid='216', r_ui=4.0, est=3.922551907749182, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='350', iid='204', r_ui=4.0, est=4.345238219480267, details={'actual_k': 38, 'was_impossible': False}),
 Prediction(uid='708', iid='993', r_ui=4.0, est=3.4458505791534115, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='193', iid='1078', r_ui=4.0, es

In [37]:
df[(df['userId'] == '500') & (df['movieId'] == '699')]

Unnamed: 0,userId,movieId,rating,timestamp
34367,500,699,3.0,883875523


In [41]:
for prediction in predictions:
    if prediction.uid == '500' and prediction.iid == '699':
        print(prediction.r_ui)
        print(round(prediction.est, 2))
        break

3.0
3.47


In [42]:
accuracy.rmse(predictions)

RMSE: 1.0272


1.0271678039029761

In [43]:
pred = pd.DataFrame(predictions)
pred.sort_values(by=['est'],inplace=True,ascending = False)

In [44]:
pred

Unnamed: 0,uid,iid,r_ui,est,details
22469,849,234,5.0,4.951929,"{'actual_k': 19, 'was_impossible': False}"
1974,849,427,4.0,4.950547,"{'actual_k': 19, 'was_impossible': False}"
8272,849,568,4.0,4.949215,"{'actual_k': 19, 'was_impossible': False}"
5138,849,174,5.0,4.947691,"{'actual_k': 19, 'was_impossible': False}"
22021,688,1127,5.0,4.928412,"{'actual_k': 15, 'was_impossible': False}"
...,...,...,...,...,...
15746,405,194,1.0,1.000000,"{'actual_k': 40, 'was_impossible': False}"
21245,405,197,4.0,1.000000,"{'actual_k': 40, 'was_impossible': False}"
13891,405,511,2.0,1.000000,"{'actual_k': 40, 'was_impossible': False}"
21639,181,151,2.0,1.000000,"{'actual_k': 40, 'was_impossible': False}"


In [46]:
recom = pred[pred.uid =='849']['iid'].to_list()
recom

['234', '427', '568', '174']

In [47]:
sim_options = {
    'name': 'cosine',
    'user_based': True
}

knn_ub = KNNBasic(sim_options=sim_options)
knn_ub.fit(train)

predictions_ub = knn_ub.test(test)
predictions_ub

Computing the cosine similarity matrix...
Done computing similarity matrix.


[Prediction(uid='7', iid='633', r_ui=5.0, est=4.150997362033697, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='422', iid='287', r_ui=3.0, est=3.754016750770759, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='804', iid='163', r_ui=3.0, est=3.70096140390557, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='189', iid='480', r_ui=5.0, est=4.524549229206855, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='238', iid='546', r_ui=3.0, est=3.1991812561559425, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='804', iid='216', r_ui=4.0, est=3.9500160918588327, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='350', iid='204', r_ui=4.0, est=4.1983705425936275, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='708', iid='993', r_ui=4.0, est=3.6795202365264075, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid='193', iid='1078', r_ui=4.0, es

In [48]:
accuracy.rmse(predictions_ub)

RMSE: 1.0175


1.0174852296380237

In [49]:
svd = SVD()
svd.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7e25ee7afa90>

In [51]:
pred_svd = svd.test(test)
pred_svd

[Prediction(uid='7', iid='633', r_ui=5.0, est=4.328041788485123, details={'was_impossible': False}),
 Prediction(uid='422', iid='287', r_ui=3.0, est=3.727322204079105, details={'was_impossible': False}),
 Prediction(uid='804', iid='163', r_ui=3.0, est=3.4972225034355646, details={'was_impossible': False}),
 Prediction(uid='189', iid='480', r_ui=5.0, est=4.824090378324065, details={'was_impossible': False}),
 Prediction(uid='238', iid='546', r_ui=3.0, est=2.996506969014903, details={'was_impossible': False}),
 Prediction(uid='804', iid='216', r_ui=4.0, est=4.2744853569962284, details={'was_impossible': False}),
 Prediction(uid='350', iid='204', r_ui=4.0, est=4.510688467994853, details={'was_impossible': False}),
 Prediction(uid='708', iid='993', r_ui=4.0, est=3.5801061561848764, details={'was_impossible': False}),
 Prediction(uid='193', iid='1078', r_ui=4.0, est=2.9432884823868806, details={'was_impossible': False}),
 Prediction(uid='847', iid='173', r_ui=5.0, est=3.7715031508880497, de

In [52]:
accuracy.rmse(pred_svd)

RMSE: 0.9404


0.9403577216052652