In [30]:
from surprise import KNNWithMeans, KNNBasic, KNNWithZScore, KNNBaseline
from surprise.prediction_algorithms.baseline_only import BaselineOnly
from surprise.prediction_algorithms.slope_one import SlopeOne
from surprise.prediction_algorithms.matrix_factorization import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd

In [2]:
movies = pd.read_csv('C:\\jupyter\\recommended system\\ml-latest-small\\movies.csv')
ratings = pd.read_csv('C:\\jupyter\\recommended system\\ml-latest-small\\ratings.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [5]:
%%time
#создаем общую таблицу из которой понятно какой пользователь какому фильму поставил какую оценку
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
movies_with_ratings.head()

Wall time: 97.8 ms


Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [6]:
movies_with_ratings

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,9.649827e+08
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,8.474350e+08
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1.106636e+09
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1.510578e+09
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1.305696e+09
...,...,...,...,...,...,...
100849,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184.0,4.0,1.537109e+09
100850,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184.0,3.5,1.537110e+09
100851,193585,Flint (2017),Drama,184.0,3.5,1.537110e+09
100852,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184.0,3.5,1.537110e+09


In [7]:
%%time
#создаем датасет для surprise
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

Wall time: 2 ms


In [8]:
dataset

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5
...,...,...,...
100849,184.0,Black Butler: Book of the Atlantic (2017),4.0
100850,184.0,No Game No Life: Zero (2017),3.5
100851,184.0,Flint (2017),3.5
100852,184.0,Bungo Stray Dogs: Dead Apple (2018),3.5


In [9]:
%%time
#создаем данные для surprise
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

Wall time: 87 ms


In [10]:
%%time
trainset, testset = train_test_split(data, test_size=0.20, random_state = 40)

Wall time: 98.5 ms


### Попробуем различные алгоритмы для построения рекомендательной системы

In [15]:
%%time
algo = KNNBasic(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)
test_pred = algo.test(testset)
print(accuracy.rmse(test_pred))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.9151
0.9151177941778924
Wall time: 16.3 s


In [16]:
%%time
algo = KNNWithZScore(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)
test_pred = algo.test(testset)
print(accuracy.rmse(test_pred))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8931
0.8930539833775664
Wall time: 17.1 s


In [17]:
%%time
algo = KNNBaseline(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)
test_pred = algo.test(testset)
print(accuracy.rmse(test_pred))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8525
0.8524833968417639
Wall time: 17.6 s


In [18]:
%%time
algo = KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})
algo.fit(trainset)
test_pred = algo.test(testset)
print(accuracy.rmse(test_pred))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8875
0.887452193628495
Wall time: 17.7 s


### KNNBaseline показал наиболучший результат. Подберем остальные параметры

In [20]:
%%time
#проверим разные similarities module
similarities = ['msd', 'cosine', 'pearson', 'pearson_baseline']

for sim in similarities: 
    algo = KNNBaseline(k=40, sim_options={'name': sim, 'user_based': False})
    algo.fit(trainset)
    test_pred = algo.test(testset)
    print(f'при {sim} rmse = {accuracy.rmse(test_pred)} \n' )

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8680
при msd rmse = 0.8679661883312856 

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8912
при cosine rmse = 0.8912462254733767 

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8791
при pearson rmse = 0.879087520101148 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8525
при pearson_baseline rmse = 0.8524833968417639 

Wall time: 1min 17s


### Лучший результат был получен при использовании модуля подобия 'pearson_baseline'

####  Проверим user-based параметр

In [22]:
%%time
algo = KNNBaseline(k=40, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)
test_pred = algo.test(testset)
print(f'при {sim} rmse = {accuracy.rmse(test_pred)}')

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8794
при pearson_baseline rmse = 0.879381000545582
Wall time: 2 s


#### Вероятнее всего на небольшой выборке лучше работает user_based = False

### Подберем параметр k

In [23]:
%%time
#проверим разные К
k_list = [30, 40, 50, 75]

for k in k_list: 
    algo = KNNBaseline(k = k, sim_options={'name': 'pearson_baseline', 'user_based': False})
    algo.fit(trainset)
    test_pred = algo.test(testset)
    print(f'при k = {k} rmse = {accuracy.rmse(test_pred)} \n' )

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8534
при k = 30 rmse = 0.853364580930994 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8525
при k = 40 rmse = 0.8524833968417639 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8521
при k = 50 rmse = 0.8521444666433746 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8521
при k = 75 rmse = 0.8521126822959002 

Wall time: 1min 10s


#### При k = 50 результат чуть лучше, но незначительно, дальнейшее увеличение параметра k не изменяет целевую метрику качества

### Подберем параметр k_min

In [25]:
%%time
#проверим разные min_K
k_n_list = [3,7,10,15]

for k_n in k_n_list: 
    algo = KNNBaseline(k = 50, min_k = k_n, sim_options={'name': 'pearson_baseline', 'user_based': False})
    algo.fit(trainset)
    test_pred = algo.test(testset)
    print(f'при min_k = {k_n} rmse = {accuracy.rmse(test_pred)} \n' )

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8510
при min_k = 3 rmse = 0.8509987639768933 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8499
при min_k = 7 rmse = 0.8498545412494931 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8500
при min_k = 10 rmse = 0.8499779926254192 

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8510
при min_k = 15 rmse = 0.8510483039750644 

Wall time: 1min 9s


#### Наилучший результат был получен при использовании min_k = 7, дальнейшее увеличение или уменьшение приводило к ухудшению результата

#### Проверим влияние другого алгоритма для построения baseline

In [26]:
%%time
algo = KNNBaseline(k=50, min_k = 7, sim_options={'name': 'pearson_baseline', 'user_based': False}, \
                   bsl_options = {'method': 'sgd', 'learning_rate': 0.00005})
algo.fit(trainset)
test_pred = algo.test(testset)
print(f'при {sim} rmse = {accuracy.rmse(test_pred)} \n' )

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8700
при pearson_baseline rmse = 0.8700113364777952 

Wall time: 18.1 s


#### По-умолчаню als алгоритм показывает лучше результаты чем sgd

### Попробуем другие алгоритмы не основанные на K ближайщих соседей

In [23]:
%%time
algo = BaselineOnly()
algo.fit(trainset)
test_pred = algo.test(testset)
print(f'при {sim} rmse = {accuracy.rmse(test_pred)} \n' )

Estimating biases using als...
RMSE: 0.8737
при pearson_baseline rmse = 0.8736728936077199 

Wall time: 337 ms


In [25]:
%%time
algo = SlopeOne()
algo.fit(trainset)
test_pred = algo.test(testset)
print(f'при {sim} rmse = {accuracy.rmse(test_pred)} \n' )

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


RMSE: 0.9038
при pearson_baseline rmse = 0.9037635689719309 

Wall time: 8.67 s


In [27]:
%%time
algo = SVD()
algo.fit(trainset)
test_pred = algo.test(testset)
print(f'при {sim} rmse = {accuracy.rmse(test_pred)} \n' )

RMSE: 0.8767
при pearson_baseline rmse = 0.8766719000609471 

Wall time: 4.09 s


In [28]:
%%time
algo = SVDpp()
algo.fit(trainset)
test_pred = algo.test(testset)
print(f'при {sim} rmse = {accuracy.rmse(test_pred)} \n' )

RMSE: 0.8629
при pearson_baseline rmse = 0.8628636877468244 

Wall time: 7min 52s


### Таким образом для предложенного набора данных, наилучший параметр рекомендательной модели по метрике rmse был получен при:
- KNNBaseline
- k = 50
- 'pearson_baseline' в качестве similarities module
- параметр 'user_based' = False
- min_k = 7