<a href="https://colab.research.google.com/github/vaguiar/CodingSamples/blob/master/hw2/hw2_collab_filtering_exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, Markdown, HTML
import os

In [2]:
display(Markdown("### Mounting GDrive in Colab" ))
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

#### Mounting GDrive in Colab

Mounted at /content/drive


In [4]:
def get_data(file):
  data = pd.read_csv(f"/content/drive/My Drive/Personalization/hw2/data/ml-latest/{file}")
  display(Markdown(f"{file} has %s columns & %s rows" % ( len(data.columns), len(data.index) ) ))
  return data

display(Markdown("### Reading data"))
movies = get_data("movies.csv")
ratings = get_data("ratings.csv")

#### Reading data

movies.csv has 3 columns & 58098 rows

ratings.csv has 4 columns & 27753444 rows

In [5]:
def get_popularity(ratings_df, id_col):
    # count the number of ratings 
    popularity_df = ratings_df[id_col].value_counts().to_frame().reset_index()
    popularity_df.columns = [id_col, 'count']

    # assign weights
    popularity_df['weights'] = 1/(1-popularity_df['count'].map(popularity_df['count'].value_counts(normalize = True)))

    return popularity_df

def sampling_id(popularity_df, id_col, threshold, num, random_state=123):
    # filter by threshold
    valid_df = popularity_df[popularity_df['count'] >= threshold] 
    # sample by weights 
    sampled_id = valid_df.sample(n = num, weights = 'weights', random_state=random_state)[id_col].tolist()
    
    return sampled_id

In [6]:
# initiate constants
MOVIE_THRESHOLD = 50
USER_THRESHOLD = 50

# get movie rating counts df & sampled movie_ids
popularity_movie_df = get_popularity(ratings, id_col='movieId')
sampled_movieId = sampling_id(popularity_movie_df, id_col='movieId', threshold=MOVIE_THRESHOLD, num=1500)

# get user rating counts df & sampled user_ids
popularity_user_df = get_popularity(ratings, id_col='userId')
sampled_userId = sampling_id(popularity_user_df, id_col='userId', threshold=USER_THRESHOLD, num=25000)

In [9]:
display(Markdown("#### Sampled data"))

# select by sampled movieId
sampled_df = ratings[ratings['movieId'].isin(sampled_movieId)]
# select by sampled userId
sampled_df = sampled_df[sampled_df['userId'].isin(sampled_userId)]

#### Sampled data

In [10]:
sampled_df

Unnamed: 0,userId,movieId,rating,timestamp
2122,31,608,5.0,968886879
2135,31,1240,4.0,968887413
2144,31,1304,3.0,968870967
2154,31,2396,5.0,968886749
2158,31,2716,4.0,968871098
...,...,...,...,...
27752635,283215,276,4.0,862414494
27752642,283215,494,5.0,854982148
27752648,283215,608,3.0,854982147
27752664,283215,788,5.0,854982198


### Surprise - KNN User based recommendations

In [11]:
# Only once per session
!pip install scikit-surprise

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 5.5MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1670927 sha256=584909681cae976de3a9f8b3c8a540442e8ad69f7217f42f4de3f06a640ea73b
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [12]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import KNNWithMeans
from surprise import accuracy

# Step 1 = Data Import & Prep
# ratings_full = get_data("ratings.csv")
ratings_full = sampled_df
ratings = ratings_full.iloc[:20000]

reader = Reader(rating_scale=(0.5,5))

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [13]:
# Create the train test split
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, test_size=0.2, shuffle=True)

In [14]:
## If we had the use data as a training chunk
# # trainsetfull = data.build_full_trainset()

display(Markdown("## Training Set:"))
print('Number of ratings: ', trainset.n_ratings)
print('Number of users: ', trainset.n_users)
print('Number of items: ', trainset.n_items)
print('Global Mean:', trainset.global_mean )

display(Markdown("## Test Set:"))
print('Number of ratings: ', len(testset), '\n')

## Training Set:

Number of ratings:  16000
Number of users:  751
Number of items:  1139
Global Mean: 3.576375


## Test Set:

Number of ratings:  4000 



In [17]:
# Step 2 - Cross-Validation

my_k = 25
my_min_k = 5    # if min_k is not satisfied the global average will be used for estimates. (default 1)
my_sim_option = {
    'name':'pearson', 'user_based':True
    }


algo = KNNWithMeans(
  k = my_k, 
  min_k = my_min_k, 
  sim_options = my_sim_option, 
  verbose = True
  )
  
results = cross_validate(
    algo = algo, 
    data = data, 
    measures=['RMSE', 'MAE', 'MSE'], 
    cv=5, 
    return_train_measures=True
    )
    
print("RMSE: ", results['test_rmse'].mean())
print("MAE: ", results['test_mae'].mean())
print("MSE: ", results['test_mse'].mean())

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE:  0.9236086113338541
MAE:  0.7064122634957551
MSE:  0.8531533330892632


In [71]:
def algo_name(algo):
  return str(algo).split(".")[-1].split("'")[0]

def try_algo(algo_fn, k_val, min_k_val=5, similarity_method='pearson', user_based=True):

  my_k = k_val
  my_min_k = min_k_val
  my_sim_option = {
      'name':similarity_method, 
      'user_based':user_based
      }

  algo = algo_fn(
    k = my_k, 
    min_k = my_min_k, 
    sim_options = my_sim_option, 
    verbose = True
    )
  
  display(Markdown("#### Running '%s' algo (user based=%s) '%s' similarity" %(algo_name(algo_fn), user_based, similarity_method)))

  results = cross_validate(
      algo = algo, 
      data = data, 
      measures=['RMSE', 'MAE', 'MSE'], 
      cv=5, 
      return_train_measures=True
      )

  display(Markdown("#### Results:"))
  print("RMSE: ", results['test_rmse'].mean(), "MAE: ", results['test_mae'].mean(), "MSE: ", results['test_mse'].mean())

  return results

display(Markdown("### A single movie in the sampled dataset has atleast 30 user ratings. Hence while finding computing the KNN similarities, with can use a k with value upto 30. I will use 25"))

### A single movie in the sampled dataset has atleast 30 user ratings. Hence while finding computing the KNN similarities, with can use a k with value upto 30. I will use 25

In [72]:
from surprise import KNNWithMeans, KNNWithZScore, KNNBaseline, KNNBasic

algos = [ KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline ]
sims = ['cosine', 'msd', 'pearson', 'pearson_baseline']
user_based_opts = [ True, False ]
k_val = 25

display(Markdown("## Cross Validating to find best Memory Based option"))
display(Markdown("k-NN based nearest neighbor algorithm with k value of '%s'" %(str(k_val))))

userBasedOp = pd.DataFrame(index=[algo_name(i) for i in algos], columns=sims)
itemBasedOp = pd.DataFrame(index=[algo_name(i) for i in algos], columns=sims)

for algo in algos:
  for sim in sims:
    for user_based_opt in user_based_opts:

      rslt = try_algo(algo, k_val=k_val, similarity_method=sim, user_based=user_based_opt)
      
      if user_based_opt:
        userBasedOp.at[algo_name(algo), sim] = rslt['test_rmse'].mean()
      else:
        itemBasedOp.at[algo_name(algo), sim] = rslt['test_rmse'].mean()

## Cross Validating to find best Memory Based option

k-NN based nearest neighbor algorithm with k value of '25'

#### Running 'KNNBasic' algo (user based=True) 'cosine' similarity

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9658189791740617 MAE:  0.7449374083793476 MSE:  0.9328474221521894


#### Running 'KNNBasic' algo (user based=False) 'cosine' similarity

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9724406576850271 MAE:  0.7530121328023975 MSE:  0.9459737980701538


#### Running 'KNNBasic' algo (user based=True) 'msd' similarity

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9425527625569238 MAE:  0.724565805704881 MSE:  0.8884528358436048


#### Running 'KNNBasic' algo (user based=False) 'msd' similarity

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9502422637865091 MAE:  0.7320446038072256 MSE:  0.9031192363751988


#### Running 'KNNBasic' algo (user based=True) 'pearson' similarity

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9729889461110706 MAE:  0.7555405024553293 MSE:  0.9467570284543292


#### Running 'KNNBasic' algo (user based=False) 'pearson' similarity

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9821537047791814 MAE:  0.7621057579244946 MSE:  0.964687115124671


#### Running 'KNNBasic' algo (user based=True) 'pearson_baseline' similarity

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9607365960633502 MAE:  0.7449063287943292 MSE:  0.9230718900505671


#### Running 'KNNBasic' algo (user based=False) 'pearson_baseline' similarity

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.970406900360922 MAE:  0.7480769915936524 MSE:  0.9419742056528356


#### Running 'KNNWithMeans' algo (user based=True) 'cosine' similarity

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9196033861972787 MAE:  0.7094448064703074 MSE:  0.8458490481477957


#### Running 'KNNWithMeans' algo (user based=False) 'cosine' similarity

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.910833916162361 MAE:  0.7007411137280243 MSE:  0.8297156627088554


#### Running 'KNNWithMeans' algo (user based=True) 'msd' similarity

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9169188233494656 MAE:  0.7062287502295772 MSE:  0.8408790878012296


#### Running 'KNNWithMeans' algo (user based=False) 'msd' similarity

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9050026301957768 MAE:  0.6948317306397798 MSE:  0.8191346813287781


#### Running 'KNNWithMeans' algo (user based=True) 'pearson' similarity

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9290834265644017 MAE:  0.709228925463565 MSE:  0.8633262152175629


#### Running 'KNNWithMeans' algo (user based=False) 'pearson' similarity

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9242276989582328 MAE:  0.7081640468389276 MSE:  0.8544762566084646


#### Running 'KNNWithMeans' algo (user based=True) 'pearson_baseline' similarity

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9231773202558939 MAE:  0.703836754046887 MSE:  0.8524864904609484


#### Running 'KNNWithMeans' algo (user based=False) 'pearson_baseline' similarity

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9250722859559529 MAE:  0.7096979431769541 MSE:  0.855848441430507


#### Running 'KNNWithZScore' algo (user based=True) 'cosine' similarity

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9219727607286561 MAE:  0.7056235324869007 MSE:  0.8501892505044154


#### Running 'KNNWithZScore' algo (user based=False) 'cosine' similarity

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9152903515336404 MAE:  0.7023030682667695 MSE:  0.8378055302564318


#### Running 'KNNWithZScore' algo (user based=True) 'msd' similarity

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9210026708485021 MAE:  0.7064289845049017 MSE:  0.8483435107635253


#### Running 'KNNWithZScore' algo (user based=False) 'msd' similarity

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.912774220759023 MAE:  0.6987529506546996 MSE:  0.8332507614684893


#### Running 'KNNWithZScore' algo (user based=True) 'pearson' similarity

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9251560102036211 MAE:  0.705872356170387 MSE:  0.8562826661970421


#### Running 'KNNWithZScore' algo (user based=False) 'pearson' similarity

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9257984791688971 MAE:  0.7080565653723132 MSE:  0.8573008914127911


#### Running 'KNNWithZScore' algo (user based=True) 'pearson_baseline' similarity

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9256600435850659 MAE:  0.7006135319339135 MSE:  0.8569095916497049


#### Running 'KNNWithZScore' algo (user based=False) 'pearson_baseline' similarity

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9232682242733412 MAE:  0.7056662525315129 MSE:  0.8524751744546595


#### Running 'KNNBaseline' algo (user based=True) 'cosine' similarity

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.8943736790121198 MAE:  0.6883059687238102 MSE:  0.7999983115715468


#### Running 'KNNBaseline' algo (user based=False) 'cosine' similarity

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9002812993545468 MAE:  0.6931943364626592 MSE:  0.8107505720265535


#### Running 'KNNBaseline' algo (user based=True) 'msd' similarity

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.8917833672255258 MAE:  0.684863813807837 MSE:  0.7954554952850247


#### Running 'KNNBaseline' algo (user based=False) 'msd' similarity

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.8931571561786653 MAE:  0.6845424885269991 MSE:  0.797757389109743


#### Running 'KNNBaseline' algo (user based=True) 'pearson' similarity

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9013626812612182 MAE:  0.6933877661714771 MSE:  0.8125132576896819


#### Running 'KNNBaseline' algo (user based=False) 'pearson' similarity

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.9016170132384544 MAE:  0.6932725487344364 MSE:  0.8131346863329915


#### Running 'KNNBaseline' algo (user based=True) 'pearson_baseline' similarity

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.8932172950050787 MAE:  0.6865872124600345 MSE:  0.7980038160250855


#### Running 'KNNBaseline' algo (user based=False) 'pearson_baseline' similarity

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


#### Results:

RMSE:  0.8984952447060446 MAE:  0.6878728580988007 MSE:  0.8073658718013734


In [80]:
userBasedOp

Unnamed: 0,cosine,msd,pearson,pearson_baseline
KNNBasic,0.965819,0.942553,0.972989,0.960737
KNNWithMeans,0.919603,0.916919,0.929083,0.923177
KNNWithZScore,0.921973,0.921003,0.925156,0.92566
KNNBaseline,0.894374,0.891783,0.901363,0.893217


In [73]:
itemBasedOp

Unnamed: 0,cosine,msd,pearson,pearson_baseline
KNNBasic,0.972441,0.950242,0.982154,0.970407
KNNWithMeans,0.910834,0.905003,0.924228,0.925072
KNNWithZScore,0.91529,0.912774,0.925798,0.923268
KNNBaseline,0.900281,0.893157,0.901617,0.898495


In [81]:
# Train model and predicts with train/test data cuts separately.

display(Markdown("#### Our best RMSE is with 'KNNBaseline', UserBased and 'msd' similarity"))

algo = KNNBaseline(
    k=25,
    min_k=5,
    sim_options={
      'name':'msd', 'user_based':True
    },
    verbose=True
)

algo.fit(trainset)
predictions = algo.test(testset)

#### Our best RMSE is with 'KNNBaseline', UserBased and 'msd' similarity

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


In [82]:
accuracy.rmse(predictions)

RMSE: 0.8853


0.8852887569300382

In [86]:
print(testset)

[(5609, 3471, 4.0), (5948, 4850, 2.5), (6021, 1584, 3.0), (1519, 1036, 4.0), (4158, 356, 4.0), (6625, 6333, 2.5), (4311, 7143, 4.5), (4853, 83803, 5.0), (1055, 6709, 4.0), (5247, 6380, 3.5), (3369, 8376, 4.0), (7398, 1845, 4.5), (5356, 788, 3.0), (5609, 8376, 4.0), (2053, 1527, 4.0), (6783, 80463, 3.0), (3665, 1958, 2.0), (5407, 2064, 4.0), (5196, 165551, 3.0), (4365, 4226, 3.5), (5352, 48, 3.0), (8150, 276, 1.0), (2839, 53972, 4.0), (7364, 4103, 3.5), (111, 3799, 2.0), (1293, 2396, 4.0), (351, 5349, 4.5), (314, 1584, 3.0), (2890, 1304, 4.0), (7603, 802, 3.0), (6030, 2716, 4.0), (609, 442, 4.0), (3842, 1240, 4.0), (5418, 3102, 2.5), (443, 2858, 4.0), (7850, 6365, 4.0), (6884, 193, 3.5), (1028, 5349, 3.0), (2463, 8644, 3.0), (7243, 788, 3.0), (1523, 1240, 2.5), (6681, 3450, 4.0), (1933, 7371, 3.5), (702, 33794, 4.5), (7977, 2488, 3.5), (7724, 84152, 5.0), (717, 70286, 4.5), (1757, 1242, 3.0), (2997, 2858, 3.0), (3450, 1269, 3.0), (1909, 922, 5.0), (1961, 3454, 5.0), (6750, 262, 5.0), (6

In [98]:
def test_user_mov(user, movie):
  print(algo.predict(uid = user, iid = movie))
  print("User seen in training:", trainset.knows_user(user))
  print("Movie seen in training:", trainset.knows_item(movie))

In [99]:
test_user_mov(73, 94959)

user: 73         item: 94959      r_ui = None   est = 4.06   {'actual_k': 24, 'was_impossible': False}
User seen in training: True
Movie seen in training: False


In [100]:
test_user_mov(2610, 1584)

user: 2610       item: 1584       r_ui = None   est = 3.57   {'actual_k': 25, 'was_impossible': False}
User seen in training: False
Movie seen in training: False


In [103]:
test_user_mov(6021, 1584)

user: 6021       item: 1584       r_ui = None   est = 4.14   {'actual_k': 25, 'was_impossible': False}
User seen in training: False
Movie seen in training: False
