# Рекомендательная система на библиотеке surprise

Сайт библиотеки: http://surpriselib.com/

Установка библиотеки:

```
$ pip install scikit-surprise
```

или

```
$ conda install -c conda-forge scikit-surprise
```

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the demo ratings table: one row per user (`id`), one column per item (A-F),
# with empty cells where a user has not rated an item. The file is ';'-separated.
df = pd.read_csv("recdemo.csv", sep=";")  # small dataset

In [3]:
# Show the whole table (it is only a few rows long).
df

Unnamed: 0,id,A,B,C,D,E,F
0,1,3.0,4.0,2.0,4.0,1.0,
1,2,3.0,4.0,2.0,4.0,,2.0
2,3,,2.0,5.0,5.0,,5.0
3,4,,,,,4.0,
4,5,3.0,,2.0,4.0,4.0,
5,6,,5.0,5.0,5.0,,5.0
6,7,1.0,,,2.0,,3.0
7,8,,,,,4.0,4.0


In [4]:
# Reshape from wide (one column per item) to long: one row per (id, item) pair.
# Unrated pairs are still present at this point, as NaN.
df_unpivot = df.melt(id_vars=['id'])
df_unpivot.head()

Unnamed: 0,id,variable,value
0,1,A,3.0
1,2,A,3.0
2,3,A,
3,4,A,
4,5,A,3.0


In [5]:
# Keep only the pairs that actually carry a rating, and give the three columns
# the user / item / rating names used in the rest of the notebook.
df_unpivot = (
    df_unpivot
    .dropna()
    .rename(columns={'id': 'userID', 'variable': 'itemID', 'value': 'rating'})
)

In [6]:
# Check the result: NaN rows are gone and the original row labels are kept (note the gaps).
df_unpivot.head()

Unnamed: 0,userID,itemID,rating
0,1,A,3.0
1,2,A,3.0
4,5,A,3.0
6,7,A,1.0
8,1,B,4.0


In [7]:
from surprise import Dataset
from surprise import Reader

In [8]:
# Tell the library which range the ratings lie in.
reader = Reader(rating_scale=(1, 5))  # set the rating range
# Wrap the (userID, itemID, rating) frame in the Dataset object the library works with.
data = Dataset.load_from_df(df_unpivot, reader)

In [9]:
# Build the training set from ALL known ratings (hence `full`): nothing is held out.
trainset = data.build_full_trainset()

# Build the "test" set from every (user, item) pair that has NO rating in the
# training set, i.e. the set complement of the known ratings (hence `anti`).
# These are not the NaN rows from the original table (those were dropped earlier);
# the pairs are regenerated from the users and items present in `trainset`.
testset = trainset.build_anti_testset()

In [19]:
# Map the raw item id 'B' to the inner integer id the trainset uses for it.
trainset.to_inner_iid('B')

1

In [20]:
# Number of distinct items in the training set.
trainset.n_items

6

In [13]:
# Item ratings: inner item id -> list of (inner user id, rating).
trainset.ir

defaultdict(list,
            {0: [(0, 3.0), (1, 3.0), (2, 3.0), (3, 1.0)],
             1: [(0, 4.0), (1, 4.0), (4, 2.0), (5, 5.0)],
             2: [(0, 2.0), (1, 2.0), (4, 5.0), (2, 2.0), (5, 5.0)],
             3: [(0, 4.0), (1, 4.0), (4, 5.0), (2, 4.0), (5, 5.0), (3, 2.0)],
             4: [(0, 1.0), (6, 4.0), (2, 4.0), (7, 4.0)],
             5: [(1, 2.0), (4, 5.0), (5, 5.0), (3, 3.0), (7, 4.0)]})

In [12]:
# User ratings: inner user id -> list of (inner item id, rating).
trainset.ur

defaultdict(list,
            {0: [(0, 3.0), (1, 4.0), (2, 2.0), (3, 4.0), (4, 1.0)],
             1: [(0, 3.0), (1, 4.0), (2, 2.0), (3, 4.0), (5, 2.0)],
             2: [(0, 3.0), (2, 2.0), (3, 4.0), (4, 4.0)],
             3: [(0, 1.0), (3, 2.0), (5, 3.0)],
             4: [(1, 2.0), (2, 5.0), (3, 5.0), (5, 5.0)],
             5: [(1, 5.0), (2, 5.0), (3, 5.0), (5, 5.0)],
             6: [(4, 4.0)],
             7: [(4, 4.0), (5, 4.0)]})

In [10]:
# Each entry is (raw user id, raw item id, placeholder rating); the placeholder
# shown in the output is the same value for every pair (the mean of the known ratings).
testset[0:10]

[(1, 'F', 3.4642857142857144),
 (2, 'E', 3.4642857142857144),
 (5, 'B', 3.4642857142857144),
 (5, 'F', 3.4642857142857144),
 (7, 'B', 3.4642857142857144),
 (7, 'C', 3.4642857142857144),
 (7, 'E', 3.4642857142857144),
 (3, 'A', 3.4642857142857144),
 (3, 'E', 3.4642857142857144),
 (6, 'A', 3.4642857142857144)]

In [11]:
from surprise import KNNBaseline

In [12]:
# KNN with baseline correction, using only the single nearest neighbour.
algo = KNNBaseline(k=1)
# fit() returns the fitted algorithm, so training and prediction can be chained;
# the predictions cover every pair of the anti-testset.
predictions = algo.fit(trainset).test(testset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


In [13]:
# `r_ui` is only the placeholder carried over from the anti-testset; `est` is the predicted rating.
predictions[0:5]

[Prediction(uid=1, iid='F', r_ui=3.4642857142857144, est=1.9577925900897966, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=2, iid='E', r_ui=3.4642857142857144, est=1.0422074099102034, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=5, iid='B', r_ui=3.4642857142857144, est=4.082661845573158, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=5, iid='F', r_ui=3.4642857142857144, est=2.0826618455731576, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid=7, iid='B', r_ui=3.4642857142857144, est=3.8701641215302014, details={'actual_k': 1, 'was_impossible': False})]

In [14]:
# Append the predicted ratings to the known ones.
# The predicted rows are collected into one frame and concatenated once,
# instead of calling pd.concat for every prediction (which re-copies the
# growing frame on each iteration).
predicted_rows = pd.DataFrame(
    [(p.uid, p.iid, p.est) for p in predictions],
    columns=['userID', 'itemID', 'rating'],
)
# ignore_index=True gives a fresh 0..n-1 index, as the row-by-row version did.
df_unpivot1 = pd.concat([df_unpivot, predicted_rows], ignore_index=True)

In [15]:
# Back to the wide user x item layout; cells that were empty in the original
# table now hold the predicted ratings.
df_unpivot1.pivot(index='userID', columns='itemID', values='rating')

itemID,A,B,C,D,E,F
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3.0,4.0,2.0,4.0,1.0,1.957793
2,3.0,4.0,2.0,4.0,1.042207,2.0
3,3.311189,2.0,5.0,5.0,4.090446,5.0
4,3.072513,4.197382,2.072513,4.072513,4.0,3.976639
5,3.0,4.082662,2.0,4.0,4.0,2.082662
6,3.469084,5.0,5.0,5.0,4.248341,5.0
7,1.0,3.870164,1.870164,2.0,3.691628,3.0
8,3.095874,1.909554,2.095874,4.095874,4.0,4.0


In [16]:
# Original table again, for side-by-side comparison with the filled-in matrix.
df

Unnamed: 0,id,A,B,C,D,E,F
0,1,3.0,4.0,2.0,4.0,1.0,
1,2,3.0,4.0,2.0,4.0,,2.0
2,3,,2.0,5.0,5.0,,5.0
3,4,,,,,4.0,
4,5,3.0,,2.0,4.0,4.0,
5,6,,5.0,5.0,5.0,,5.0
6,7,1.0,,,2.0,,3.0
7,8,,,,,4.0,4.0


In [17]:
# Same model, but now using up to three nearest neighbours.
algo = KNNBaseline(k=3)
algo.fit(trainset)
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
predictions = algo.test(testset)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


In [18]:
# Append the k=3 predictions to the known ratings.
# Build all predicted rows at once and concatenate a single time, rather than
# growing the frame with pd.concat inside a loop (quadratic copying).
predicted_rows = pd.DataFrame(
    [(p.uid, p.iid, p.est) for p in predictions],
    columns=['userID', 'itemID', 'rating'],
)
df_unpivot3 = pd.concat([df_unpivot, predicted_rows], ignore_index=True)
# Wide user x item view: known ratings plus predictions for the missing cells.
df_unpivot3.pivot(index='userID', columns='itemID', values='rating')

itemID,A,B,C,D,E,F
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3.0,4.0,2.0,4.0,1.0,2.507435
2,3.0,4.0,2.0,4.0,2.601745,2.0
3,3.2561,2.0,5.0,5.0,3.527693,5.0
4,3.083864,4.197382,2.083864,4.083864,4.0,3.976639
5,3.0,3.834977,2.0,4.0,4.0,3.013312
6,3.418235,5.0,5.0,5.0,3.590444,5.0
7,1.0,3.366011,1.857717,2.0,3.095321,3.0
8,2.579863,3.471928,3.46324,4.46324,4.0,4.0


## Кросс-валидация

In [19]:
from surprise.model_selection import cross_validate

In [20]:
# 2-fold cross-validation of the current `algo`; with no `measures` given the
# output reports both RMSE and MAE per fold.
# NOTE(review): no random seed is set, so the fold split (and therefore these
# scores) presumably changes from run to run - confirm and pin a seed if needed.
cross_validate(algo, data, cv=2, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 2 split(s).

                  Fold 1  Fold 2  Mean    Std     
RMSE (testset)    1.5844  1.4716  1.5280  0.0564  
MAE (testset)     1.1267  1.2586  1.1927  0.0659  
Fit time          0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    


{'test_rmse': array([1.58438275, 1.47163185]),
 'test_mae': array([1.12673283, 1.25859051]),
 'fit_time': (0.00033283233642578125, 0.00011682510375976562),
 'test_time': (0.00017142295837402344, 0.00011277198791503906)}

In [21]:
# 3-fold cross-validation, RMSE only. `verbose=False` here hides the summary
# table; the "Estimating biases..." lines still appear because they are printed
# by the algorithm itself, which was created without verbose=False.
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([1.39059437, 1.37461253, 1.52913922]),
 'fit_time': (0.00016641616821289062,
  9.274482727050781e-05,
  7.963180541992188e-05),
 'test_time': (0.0001125335693359375,
  8.368492126464844e-05,
  7.987022399902344e-05)}

In [22]:
# Compare neighbourhood sizes k = 1..5 by mean RMSE over 3 folds.
for k in range(1, 6):
    algo = KNNBaseline(k=k, verbose=False)  # silence the fitting log
    cv = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)
    print(f'{k}NN:', cv['test_rmse'].mean())

1NN: 1.6958873596265132
2NN: 1.4544076238672032
3NN: 1.6722911012214372
4NN: 1.4764059261055642
5NN: 1.6234816844390345


## Item-based подход

In [23]:
# Reference run for the comparison in this section: KNNBaseline with k=1 and
# default similarity options, fitting log silenced.
algo = KNNBaseline(k=1, verbose=False)
predictions = algo.fit(trainset).test(testset)
# Collect all predicted rows and concatenate once, instead of calling
# pd.concat per prediction (which re-copies the growing frame every time).
predicted_rows = pd.DataFrame(
    [(p.uid, p.iid, p.est) for p in predictions],
    columns=['userID', 'itemID', 'rating'],
)
df_unpivot5 = pd.concat([df_unpivot, predicted_rows], ignore_index=True)
# Wide user x item view: known ratings plus predictions for the missing cells.
df_unpivot5.pivot(index='userID', columns='itemID', values='rating')

itemID,A,B,C,D,E,F
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3.0,4.0,2.0,4.0,1.0,1.957793
2,3.0,4.0,2.0,4.0,1.042207,2.0
3,3.311189,2.0,5.0,5.0,4.090446,5.0
4,3.072513,4.197382,2.072513,4.072513,4.0,3.976639
5,3.0,4.082662,2.0,4.0,4.0,2.082662
6,3.469084,5.0,5.0,5.0,4.248341,5.0
7,1.0,3.870164,1.870164,2.0,3.691628,3.0
8,3.095874,1.909554,2.095874,4.095874,4.0,4.0


In [24]:
# Compute similarities between items rather than between users.
sim_options = {'user_based': False }  # item-based approach

In [25]:
# Same k=1 model, but with the item-based similarity options defined above.
algo = KNNBaseline(k=1, sim_options=sim_options, verbose=False)
predictions = algo.fit(trainset).test(testset)
# Collect all predicted rows and concatenate once, instead of calling
# pd.concat per prediction (which re-copies the growing frame every time).
predicted_rows = pd.DataFrame(
    [(p.uid, p.iid, p.est) for p in predictions],
    columns=['userID', 'itemID', 'rating'],
)
df_unpivot5_items = pd.concat([df_unpivot, predicted_rows], ignore_index=True)
# Wide user x item view: known ratings plus predictions for the missing cells.
df_unpivot5_items.pivot(index='userID', columns='itemID', values='rating')

itemID,A,B,C,D,E,F
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3.0,4.0,2.0,4.0,1.0,2.19945
2,3.0,4.0,2.0,4.0,1.844148,2.0
3,1.695931,2.0,5.0,5.0,4.844148,5.0
4,3.818825,4.122894,3.956402,4.261502,4.0,4.155852
5,3.0,3.304069,2.0,4.0,4.0,2.19945
6,4.695931,5.0,5.0,5.0,4.844148,5.0
7,1.0,1.304069,2.80055,2.0,2.844148,3.0
8,3.818825,3.967042,3.80055,4.10565,4.0,4.0


## Косинусная мера

In [26]:
# KNNBaseline with k=5 and cosine similarity in place of the default measure.
algo = KNNBaseline(k=5, sim_options={'name': 'cosine'}, verbose=False)
predictions = algo.fit(trainset).test(testset)
# Collect all predicted rows and concatenate once, instead of calling
# pd.concat per prediction (which re-copies the growing frame every time).
predicted_rows = pd.DataFrame(
    [(p.uid, p.iid, p.est) for p in predictions],
    columns=['userID', 'itemID', 'rating'],
)
df_unpivot5_cos = pd.concat([df_unpivot, predicted_rows], ignore_index=True)
# Wide user x item view: known ratings plus predictions for the missing cells.
df_unpivot5_cos.pivot(index='userID', columns='itemID', values='rating')

itemID,A,B,C,D,E,F
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,3.0,4.0,2.0,4.0,1.0,3.572362
2,3.0,4.0,2.0,4.0,2.927003,2.0
3,2.754335,2.0,5.0,5.0,3.270544,5.0
4,3.134947,4.197382,2.134947,4.134947,4.0,3.976639
5,3.0,3.668772,2.0,4.0,4.0,3.714426
6,2.939492,5.0,5.0,5.0,3.360864,5.0
7,1.0,3.442211,2.951617,2.0,2.802029,3.0
8,2.700881,3.765123,3.231273,4.431273,4.0,4.0


## MovieLens

В библиотеке есть встроенные Dataset. Можно воспользоваться ими.

Возьмем Dataset movielens (Подробнее о нем: https://grouplens.org/datasets/movielens/ )

In [27]:
# Replace `data` with the built-in ml-100k dataset. As the output shows, the
# library asks to download it when it is not available locally yet.
data = Dataset.load_builtin('ml-100k')  # load the movie ratings data

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /home/ilya/.surprise_data/ml-100k


In [28]:
# Put the raw rating tuples into a DataFrame for a quick look.
# Note: this rebinds `df`, which previously held the small demo table.
df = pd.DataFrame(
    data.raw_ratings,
    columns=['user', 'item', 'rating', 'timestamp'],
)
df.head()

Unnamed: 0,user,item,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [29]:
# (number of distinct users, number of distinct items)
df['user'].nunique(), df['item'].nunique()

(943, 1682)

In [30]:
# Compare several neighbourhood sizes on ml-100k by mean RMSE over 3 folds.
for k in (1, 3, 5, 7, 20):
    algo = KNNBaseline(k=k, verbose=False)  # silence the fitting log
    cv = cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)
    print(f'{k}NN:', cv['test_rmse'].mean())

1NN: 1.2487798498442524
3NN: 1.040728830349156
5NN: 0.9915394187738299
7NN: 0.9728157413705123
20NN: 0.9409653250460753


## Задание:
Всё задание выполняется на датасете 'ml-100k' (Dataset.load_builtin('ml-100k')).
1. Алгоритмом kNN (k=20, user-based) найдите для фильма с id 181 все неизвестные оценки. В ответе: посчитайте среднее полученных оценок.
2. Для параметров k = [10,20,30,40] найдите лучший item-based алгоритм (по метрике RMSE, кросс-валидация на 3 фолда).
Ответ: укажите k
3. Для лучшего алгоритма из пункта 2 найдите для фильма с id 181 все неизвестные оценки. В ответе: посчитайте среднее полученных оценок.