In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=2811616 sha256=239a5b896c9fa3b2be21f0116e544bea3705c61a6b8ac4b421c0971143edf41e
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
import pandas as pd

from surprise import Dataset, SVD, SVDpp, NMF, accuracy
from surprise.model_selection import cross_validate, GridSearchCV

from collections import defaultdict
import statistics
from surprise.model_selection import train_test_split

In [3]:
data = Dataset.load_builtin("ml-100k")

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


In [4]:
with open("/root/.surprise_data/ml-100k/ml-100k/u.info", "r") as f:
    print(f.read())

943 users
1682 items
100000 ratings



In [5]:
raw_ratings = data.raw_ratings
raw_ratings = [(user_id, item_id, rating) for user_id, item_id, rating, _ in raw_ratings]

columns = ['user_id', 'item_id', 'rating']
df = pd.DataFrame(raw_ratings, columns=columns)

print(df.head())

  user_id item_id  rating
0     196     242     3.0
1     186     302     3.0
2      22     377     1.0
3     244      51     2.0
4     166     346     1.0


In [6]:
mean_rating = df['rating'].mean()
std_rating = df['rating'].std()

print("Mean Rating:", mean_rating)
print("Standard Deviation of Ratings:", std_rating)

Mean Rating: 3.52986
Standard Deviation of Ratings: 1.125673599144316


In [7]:
trainset, testset = train_test_split(data, test_size=0.2)

## SVD

In [None]:
param_grid_svd = {
    'n_epochs': [10, 15, 20],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.04, 0.06],
    'n_factors': [1, 20, 50],
    'init_mean': [mean_rating],
    'init_std_dev': [0.1, 1]
}

In [9]:
grid_search_svd = GridSearchCV(SVD, param_grid_svd, measures=['rmse', 'mae'], cv=5, n_jobs=-1, joblib_verbose=5)

grid_search_svd.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   49.8s
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  2.1min finished


In [10]:
print(grid_search_svd.best_params)
print(grid_search_svd.best_score['rmse'])
print(grid_search_svd.best_score['mae'])

{'rmse': {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.06, 'n_factors': 1, 'init_mean': 3.52986, 'init_std_dev': 0.1}, 'mae': {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.06, 'n_factors': 1, 'init_mean': 3.52986, 'init_std_dev': 0.1}}
0.9520599508460064
0.7500099963415784


In [12]:
algo_svd = SVD(n_epochs=20, n_factors=1, lr_all=0.005, reg_all=0.06, init_mean=3.52986, init_std_dev=0.1)

algo_svd.fit(trainset)

predictions = algo_svd.test(testset)

rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 0.9491
MAE:  0.7471


## SVD++

In [14]:
param_grid_svd_plus = {
    'n_epochs': [10, 15, 20],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.04, 0.06],
    'n_factors': [1, 20, 50],
    'init_mean': [mean_rating],
    'init_std_dev': [0.1]
}

In [15]:
grid_search_svd_plus = GridSearchCV(SVDpp, param_grid_svd_plus, measures=['rmse', 'mae'], cv=3, n_jobs=-1, joblib_verbose=5)

grid_search_svd_plus.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed: 25.8min finished


In [16]:
print(grid_search_svd_plus.best_params)
print(grid_search_svd_plus.best_score['rmse'])
print(grid_search_svd_plus.best_score['mae'])

{'rmse': {'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.04, 'n_factors': 1, 'init_mean': 3.52986, 'init_std_dev': 0.1}, 'mae': {'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.04, 'n_factors': 1, 'init_mean': 3.52986, 'init_std_dev': 0.1}}
1.6989419051965815
1.3738269567125165


In [18]:
algo_svd_plus = SVDpp(n_epochs=20, n_factors=1, lr_all=0.002, reg_all=0.04, init_mean=3.52986, init_std_dev=0.1)

algo_svd_plus.fit(trainset)

predictions = algo_svd_plus.test(testset)

rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 1.5882
MAE:  1.2787


## NMF

In [19]:
param_grid_nmf = {
    'n_epochs': [10, 15, 20],
    'n_factors': [1, 20, 50],
    'reg_pu': [0.02, 0.04, 0.06],
    'reg_qi': [0.02, 0.04, 0.06]
}

In [20]:
grid_search_nmf = GridSearchCV(NMF, param_grid_nmf, measures=['rmse', 'mae'], cv=3, n_jobs=-1, joblib_verbose=5)

grid_search_nmf.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed:  2.0min finished


In [22]:
print(grid_search_nmf.best_params)
print(grid_search_nmf.best_score['rmse'])
print(grid_search_nmf.best_score['mae'])

{'rmse': {'n_epochs': 20, 'n_factors': 20, 'reg_pu': 0.06, 'reg_qi': 0.06}, 'mae': {'n_epochs': 20, 'n_factors': 20, 'reg_pu': 0.06, 'reg_qi': 0.06}}
1.1075987808563452
0.8432669462048706


In [23]:
algo_nmf = NMF(n_epochs=20, n_factors=20, reg_pu=0.06, reg_qi=0.06)

algo_nmf.fit(trainset)

predictions = algo_nmf.test(testset)

rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 1.1129
MAE:  0.8439


# Висновок

1. SVD:
- Найкращі параметри: {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.06, 'n_factors': 1, 'init_mean': 3.52986, 'init_std_dev': 0.1}
- Найкраща RMSE: 0.9521
- Найкраща MAE: 0.7500

2. SVD++:
- Найкращі параметри: {'n_epochs': 20, 'lr_all': 0.002, 'reg_all': 0.04, 'n_factors': 1, 'init_mean': 3.52986, 'init_std_dev': 0.1}
- Найкраща RMSE: 1.6989
- Найкраща MAE: 1.3738

3. NMF:
- Найкращі параметри: {'n_epochs': 20, 'n_factors': 20, 'reg_pu': 0.06, 'reg_qi': 0.06}
- Найкраща RMSE: 1.1076
- Найкраща MAE: 0.8433


SVD показує найкращі результати серед усіх трьох алгоритмів з найнижчими значеннями RMSE (0.9521) та MAE (0.7500).

SVD++ показує значно гірші результати порівняно з SVD, з набагато вищими значеннями RMSE (1.6989) та MAE (1.3738). SVD++ враховує неявний відгук, крім явних оцінок, але в даному випадку це не призвело до поліпшення результатів.

NMF показує кращі результати, ніж SVD++, але гірші, ніж SVD. Його RMSE (1.1076) та MAE (0.8433) нижчі, ніж у SVD++, але все ж вищі, ніж у SVD. NMF має інший підхід до факторизації матриць, який спрямований на розкладання матриць на невід'ємні компоненти.

Загалом, SVD виділяється як найефективніший алгоритм для даного набору даних, надаючи найкращу точність прогнозування для рейтингів фільмів.

