In [1]:
import pandas as pd

from surprise import NormalPredictor
from surprise import SVD
from surprise import SlopeOne
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, BaselineOnly, CoClustering, SVDpp
from surprise.accuracy import rmse
from surprise.model_selection import train_test_split as surprise_train_test_split

from sklearn import pipeline

# shared keyword arguments: a fixed random_state so runs are reproducible
kwargs = {'random_state': 42}

The documentation for scikit-surprise can be found at https://surprise.readthedocs.io/en/stable/

In [2]:
# Load the preprocessed ratings, then discard the 'Unnamed: 0' column
# (presumably an index column saved by an earlier export -- TODO confirm).
ratings = pd.read_csv('../../data/preprocessed/ratings_clean_std_0.csv', sep=',')
ratings = ratings.drop(columns=['Unnamed: 0'])

In [3]:
# Display the loaded ratings frame (columns: user_id, imdbID, rating).
ratings

Unnamed: 0,user_id,imdbID,rating
0,1264,tt0047034,3.5
1,213,tt0304141,2.5
2,593,tt0369436,3.0
3,609,tt1077258,4.0
4,1590,tt0052182,4.0
...,...,...,...
787536,1032,tt0083530,3.0
787537,99,tt0107798,3.0
787538,333,tt0093857,3.0
787539,49,tt0144168,3.0


In [4]:
# The reader carries the rating scale as (lowest, highest) possible rating.
reader = Reader(rating_scale=(0.5, 5))

# load_from_df needs the columns ordered as: user id, item id, rating.
data = Dataset.load_from_df(
    df=ratings.loc[:, ['user_id', 'imdbID', 'rating']],
    reader=reader,
)

# KNN

In [6]:
# KNN-basic Grid Search
# The cross-validation results of every run are accumulated in one CSV file,
# so results from earlier searches are kept next to the newest ones.
results_file = 'gridsearch_knn_basic.csv'
param_grid = {'k': [5, 10, 50, 100, 200], 'min_k': [1, 5, 10], 'sim_options': {'name': ['msd'], 'user_based': [False]}}

# Only the file lookup is guarded: a FileNotFoundError raised anywhere else
# must not be mistaken for "no previous run" and trigger a second search.
try:
    # index_col=0 consumes the index column that to_csv writes; without it
    # every read -> append -> write cycle adds another 'Unnamed: 0' column.
    previous_runs = pd.read_csv(results_file, index_col=0)
except FileNotFoundError:
    # if no previous grid search run available
    print('No File found: Creating new file...')
    previous_runs = None

print("Starting GridSearch")
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse','mae'], cv=5, n_jobs=-3)
gs.fit(data)

# Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

knn_basic = pd.DataFrame(gs.cv_results)
if previous_runs is None:
    grid_search = knn_basic
else:
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    # ignore_index=True renumbers the rows so the saved index stays unique.
    grid_search = pd.concat([previous_runs, knn_basic], ignore_index=True)
grid_search.to_csv(results_file)

No File found: Creating new file...
Starting GridSearch
0.827407567975069
{'k': 100, 'min_k': 1, 'sim_options': {'name': 'msd', 'user_based': False}}


In [7]:
# KNN with Means
# The cross-validation results of every run are accumulated in one CSV file,
# so results from earlier searches are kept next to the newest ones.
results_file = 'gridsearch_knn_means.csv'

# Only the file lookup is guarded: a FileNotFoundError raised anywhere else
# must not be mistaken for "no previous run" and trigger a second search.
try:
    # index_col=0 consumes the index column that to_csv writes; without it
    # every read -> append -> write cycle adds another 'Unnamed: 0' column.
    previous_runs = pd.read_csv(results_file, index_col=0)
    # A previous run exists: search the finer k grid and both similarity
    # measures (presumably refining around an earlier best k -- TODO confirm).
    param_grid = {'k': [75, 80, 85, 90, 95, 105, 110, 115, 120, 125], 'min_k': [1, 5, 10], 'sim_options': {'name': ['msd', 'cosine'], 'user_based': [False]}}
except FileNotFoundError:
    # if no previous grid search run available: start with the coarse grid
    print('No File found: Creating new file...')
    previous_runs = None
    param_grid = {'k': [5, 10, 50, 100, 200], 'min_k': [1, 5, 10], 'sim_options': {'name': ['msd'], 'user_based': [False]}}

print("Starting GridSearch")
gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse','mae'], cv=5, n_jobs=-3)
gs.fit(data)

# Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

knn_means = pd.DataFrame(gs.cv_results)
if previous_runs is None:
    grid_search = knn_means
else:
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    # ignore_index=True renumbers the rows so the saved index stays unique.
    grid_search = pd.concat([previous_runs, knn_means], ignore_index=True)
grid_search.to_csv(results_file)

Starting GridSearch
0.7940151034427301
{'k': 115, 'min_k': 10, 'sim_options': {'name': 'msd', 'user_based': False}}


In [None]:
# KNN with Z-Score Grid Search
# The cross-validation results of every run are accumulated in one CSV file,
# so results from earlier searches are kept next to the newest ones.
results_file = 'gridsearch_knn_Z.csv'

# Only the file lookup is guarded: a FileNotFoundError raised anywhere else
# must not be mistaken for "no previous run" and trigger a second search.
try:
    # index_col=0 consumes the index column that to_csv writes; without it
    # every read -> append -> write cycle adds another 'Unnamed: 0' column.
    previous_runs = pd.read_csv(results_file, index_col=0)
    # A previous run exists: search the larger grid (more k / min_k values,
    # both similarity measures).
    param_grid = {'k': [10, 50, 100, 200], 'min_k': [1, 5, 10], 'sim_options': {'name': ['msd', 'cosine'], 'user_based': [True, False]}}
except FileNotFoundError:
    print('No File found: Creating new file...')
    # if no previous grid search run available: start with the small grid
    previous_runs = None
    param_grid = {'k': [50, 200], 'min_k': [5], 'sim_options': {'name': ['msd'], 'user_based': [True, False]}}

print("Starting GridSearch")
gs = GridSearchCV(KNNWithZScore, param_grid, measures=['rmse','mae'], cv=5)
gs.fit(data)

# Print best param combination
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

knn_Z = pd.DataFrame(gs.cv_results)
if previous_runs is None:
    grid_search = knn_Z
else:
    # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
    # ignore_index=True renumbers the rows so the saved index stays unique.
    grid_search = pd.concat([previous_runs, knn_Z], ignore_index=True)
grid_search.to_csv(results_file)

No File found: Creating new file...
Starting GridSearch
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
