In [1]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset

In [19]:
ratings_df = pd.read_csv('./preprocessed_data/ratings_small_preprocessed.csv', index_col=0)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
10,1,1371,5.0
11,1,1405,2.0
13,1,2105,8.0
15,1,2193,4.0
16,1,2294,4.0


In [20]:
ratings_df['userId'].info()

<class 'pandas.core.series.Series'>
Int64Index: 44692 entries, 10 to 99997
Series name: userId
Non-Null Count  Dtype
--------------  -----
44692 non-null  int64
dtypes: int64(1)
memory usage: 698.3 KB


### **1. Loading and preprocessing data**

In [21]:
reader = Reader(rating_scale=(0,10))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
data.raw_ratings[:5]

[(1, 1371, 5.0, None),
 (1, 1405, 2.0, None),
 (1, 2105, 8.0, None),
 (1, 2193, 4.0, None),
 (1, 2294, 4.0, None)]

In [22]:
type(data)

surprise.dataset.DatasetAutoFolds

In [23]:
# @TODO: CHANGE IT LATER!!!!
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, test_size=0.25)

In [17]:
type(trainset)

surprise.trainset.Trainset

### **2. Training a model**

In [6]:
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy

KNNalgo = KNNBasic(k=30, min_k=10, verbose=False)
KNNalgo.fit(trainset)
predictions = KNNalgo.test(testset)
MSE = accuracy.mse(predictions)

MSE

MSE: 3.6661


3.6661109882439797

#### Choosing the best parameters

In [7]:
from surprise.model_selection.search import GridSearchCV

param_grid = {'k': range(10,50,10), 'min_k': range(1,10),
              'sim_options': {'name': ['cosine'],
                              'user_based': [False, True]}}

* KNNBasic (approx. 5 min)

In [8]:
gs_KNNBasic = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=3)

gs_KNNBasic.fit(data)

# Best RMSE 
print('*** KNN Basic - results***')
knnbasic_rmse = gs_KNNBasic.best_score['rmse']
print(f' RMSE: {knnbasic_rmse}')

# Optimum parameters
print(gs_KNNBasic.best_params['rmse'])

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

KeyboardInterrupt: 

In [1]:
gs_KNNBasic.best_params['rmse']['k']

NameError: name 'gs_KNNBasic' is not defined

* KNNWithMeans

In [None]:
gs_KNNWithMeans = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=3)

gs_KNNWithMeans.fit(data)

# Best RMSE 
print('*** KNN with means - results ***')
knnmeans_rmse = gs_KNNWithMeans.best_score['rmse']
print(f' RMSE: {knnmeans_rmse}')

# Optimum parameters
print(gs_KNNWithMeans.best_params['rmse'])

* KNNWithZScore

In [None]:
gs_KNNWithZScore = GridSearchCV(KNNWithZScore, param_grid, measures=['rmse'], cv=3)

gs_KNNWithZScore.fit(data)

# Best RMSE 
print('*** KNN with Z score - results ***')
knnzscore_rmse = gs_KNNWithZScore.best_score['rmse']
print(f' RMSE: {knnzscore_rmse}')

# Optimum parameters
print(gs_KNNWithZScore.best_params['rmse'])

### Choosing the best model

In [None]:
RMSE_scores = {gs_KNNBasic: knnbasic_rmse, 
               gs_KNNWithMeans: knnmeans_rmse,
               gs_KNNWithZScore: knnzscore_rmse}

best_model = min(RMSE_scores, key=RMSE_scores.get)
best_params = best_model.best_params['rmse']

print(f'Best model: {best_model}')
print(f'Params: {best_params}')

### **3. Predictions**

In [None]:
k = best_model.best_params['rmse']['k']
min_k = best_model.best_params['rmse']['min_k']
sim_options =  best_model.best_params['rmse']['sim_options']

KNNalgo = KNNWithZScore(k=k, min_k=min_k, sim_options=sim_options)
KNNalgo.fit(trainset)
KNNalgo.predict(testset[0][0], testset[0][1], verbose=True)

In [24]:
KNNalgo = KNNWithZScore(k=50, min_k=10, sim_options={'name': 'cosine', 'user_based': True})
KNNalgo.fit(trainset)
KNNalgo.predict(75, 862, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
user: 75         item: 862        r_ui = None   est = 7.12   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


Prediction(uid=75, iid=862, r_ui=None, est=7.122587189355291, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})