In [1]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd

In [2]:
movies = pd.read_csv('../lecture-1/movies.csv')
ratings = pd.read_csv('../lecture-1/ratings.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [5]:
movies_with_ratings[movies_with_ratings.userId == 2.0].title.unique()

array(['Shawshank Redemption, The (1994)', 'Tommy Boy (1995)',
       'Good Will Hunting (1997)', 'Gladiator (2000)',
       'Kill Bill: Vol. 1 (2003)', 'Collateral (2004)',
       'Talladega Nights: The Ballad of Ricky Bobby (2006)',
       'Departed, The (2006)', 'Dark Knight, The (2008)',
       'Step Brothers (2008)', 'Inglourious Basterds (2009)',
       'Zombieland (2009)', 'Shutter Island (2010)',
       'Exit Through the Gift Shop (2010)', 'Inception (2010)',
       'Town, The (2010)', 'Inside Job (2010)',
       'Louis C.K.: Hilarious (2010)', 'Warrior (2011)',
       'Dark Knight Rises, The (2012)',
       'Girl with the Dragon Tattoo, The (2011)',
       'Django Unchained (2012)', 'Wolf of Wall Street, The (2013)',
       'Interstellar (2014)', 'Whiplash (2014)', 'The Drop (2014)',
       'Ex Machina (2015)', 'Mad Max: Fury Road (2015)',
       'The Jinx: The Life and Deaths of Robert Durst (2015)'],
      dtype=object)

In [6]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [7]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0
2,7.0,Toy Story (1995),4.5
3,15.0,Toy Story (1995),2.5
4,17.0,Toy Story (1995),4.5


In [8]:
ratings.rating.min()

0.5

In [9]:
ratings.rating.max()

5.0

In [10]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [11]:
trainset, testset = train_test_split(data, test_size=.15)

In [12]:
algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f6fc3ad9fd0>

In [13]:
test_pred = algo.test(testset)

In [14]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8925


0.8924753503608442

In [15]:
predict = algo.predict(uid=2, iid='Fight Club (1999)')

In [16]:
predict

Prediction(uid=2, iid='Fight Club (1999)', r_ui=None, est=4.642326159001888, details={'actual_k': 50, 'was_impossible': False})

In [17]:
def analyze_users(uid, iid):
    n = algo.get_neighbors(uid,10)
    wighted = []
    print(n)
    for p in n:
        rating = movies_with_ratings[(movies_with_ratings['userId'] == p) & (movies_with_ratings['title'] == iid)]
        
        if len(rating.values) > 0:
            value = rating.rating.values[0]
            print(p, value, algo.sim[uid,p], algo.sim[uid,p] * value)
            wighted.append((algo.sim[uid,p] * value,algo.sim[uid,p]))
    return wighted
            
    

In [18]:
w = analyze_users(2, 'Fight Club (1999)')

[166, 222, 237, 608, 103, 74, 523, 341, 399, 275]
166 4.5 0.06689862389597584 0.3010438075318913
222 3.5 0.05918916279195915 0.20716206977185703
608 5.0 0.04599324517064704 0.2299662258532352
103 5.0 0.04403181950433818 0.2201590975216909
523 4.5 0.04027870735001999 0.18125418307508995
399 3.0 0.037438773051128506 0.11231631915338552
275 5.0 0.03697284647022765 0.18486423235113825


# HM

### SVD & GridSearch

In [19]:
from surprise.model_selection import GridSearchCV
from surprise import SVD

In [None]:
param_grid = {'n_epochs': [ 20,23], 'lr_all': [ 0.01,0.015,0.02],
              'reg_all': [  0.1, 0.15, 0.2]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

### KNN & GridSearch

In [None]:
from surprise import KNNWithMeans, KNNBasic, KNNBaseline, KNNWithZScore
import numpy as np

In [None]:
benchmark = []
# Iterate over all algorithms
for algorithm in [ KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore]:
    param_grid = {
              'k': [40,50,55,35],
              'sim_options': {'name': ['msd', 'cosine'],
                              'min_support': [10,20,30],
                              'user_based': [True]}
              }
    gs = GridSearchCV(algorithm, param_grid, measures=['rmse', 'mae'], cv=3)

    gs.fit(data)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(gs.cv_results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[1].split('.')[-1].replace(">",'')], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')   

In [None]:
pd.DataFrame(benchmark).set_index('Algorithm')

In [None]:

param_grid = {'k': [ 40, 45, 50],
              'sim_options': {'name': ['pearson_baseline','pearson_baseline','msd'],'min_support':[3,5],
                              'user_based': [False,True]
                             },
              'bsl_options': {'method': ['als','sgd'],
                              'reg_i': [8,10,12],
                              'reg_u': [3,8,10,15,17]
                             }
             }

gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse'], cv=3, return_train_measures=True)
gs.fit(data)

In [None]:

param_grid = {'k': [ 45, 50],
              'sim_options': {'name': ['pearson_baseline','msd'],'min_support':[3,5],
                              'user_based': [True]
                             },
              'bsl_options': {'method': ['als'],
                              'reg_i': [8,10],
                              'reg_u': [0,15]
                             }
             }

gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse'], cv=3, return_train_measures=True)
gs.fit(data)