# Jester dataset: https://goldberg.berkeley.edu/jester-data/

In [1]:
import pandas as pd
import requests
import numpy as np

In [2]:
#URL to download Jester dataset
url = 'https://goldberg.berkeley.edu/jester-data/jester-data-1.zip'

#Send request to download URL
response = requests.get(url)

#Write contents of response to a file
with open('jester_dataset.zip', 'wb') as f:
    f.write(response.content)

#Extract dataset from the downloaded zip file
import zipfile
with zipfile.ZipFile('jester_dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('jester_dataset')

In [3]:
#Load Jester dataset into Pandas DataFrame
df = pd.read_excel('jester_dataset/jester-data-1.xls',header = None)
df = df.sample(n = 2000, random_state = 42)

#Drop the first columns (user IDs) as they are not needed for the analysis
df = df.drop(columns = 0)

#Rename the columns to make them more descriptive
column_names = ['Joke{}'.format(i) for i in range(1, 101)]
df.columns = column_names

#Print the first few rows of the dataset to verify it was loaded correctly
print(df.head(5))

       Joke1  Joke2  Joke3  Joke4  Joke5  Joke6  Joke7  Joke8  Joke9  Joke10  \
17676  99.00  99.00  99.00  99.00  -3.50  99.00  -0.24   7.82  99.00    2.48   
9252   -1.70  -1.12   0.19  -0.34   1.46   0.78  -0.15   0.49  -0.83    1.26   
11401  99.00   4.95  99.00  99.00   5.19  -5.24   7.04  -6.65  99.00   -7.43   
13512   0.68   2.96   3.01  -1.94   2.57   1.41  -2.91   1.50   3.74    2.91   
1675   -0.29  -1.50   0.34   8.93   8.64   9.08  -0.05  -1.26  -3.98    3.88   

       ...  Joke91  Joke92  Joke93  Joke94  Joke95  Joke96  Joke97  Joke98  \
17676  ...    6.41   99.00   99.00   99.00   99.00   99.00    3.79   99.00   
9252   ...    0.83    0.53   -0.10    0.34    1.12    0.87    3.01    0.49   
11401  ...   99.00   99.00   99.00   99.00   99.00   99.00   99.00   99.00   
13512  ...   99.00   99.00   99.00   99.00   99.00    3.06   99.00   99.00   
1675   ...    9.13    8.59    9.17    8.88    8.98    8.98    9.22    9.17   

       Joke99  Joke100  
17676   99.00    99.00  


# User-based Collaborative Filtering
We can use KNN to find the K most similar users to a given user based on their ratings for items. We can then use the ratings of those K similar users to
predict the rating of an item for the given user.
The similarity between users or items is typically measured using a similarity metric such as cosine similarity or Pearson correlation. Onco the similarity
scores are computed, the KNN algorithm selects the K users or items with the highest similarity scores and uses their ratings to make a prediction

In [4]:
!pip install surprise



In [5]:
from surprise import KNNWithMeans, Dataset, Reader
from surprise.model_selection import train_test_split

#load dataset into a Surprise Dataset object
reader = Reader(rating_scale=(-10,10))
data = Dataset.load_from_df(df.stack().reset_index()\
                           .rename(columns={0:'rating','level_0':'user_id','level_1':'item_id'})\
                            [['user_id','item_id','rating']],reader)

#Split dataset into training and test sets
trainset, testset = train_test_split(data, test_size = 0.25)

sim_options = {'name': 'cosine','user_based':True}
model= KNNWithMeans(sim_options = sim_options)
model.fit(trainset)

#Use the model to make preditions on the test set
predictions = model.test(testset)

#Evaluate the accuracy of the model using RMSE
from surprise import accuracy
accuracy.rmse(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 47.4618


47.46178515063383

In [8]:
#Create a Item-based Collaborative Filering model and fit it to the training set
sim_options = {'name': 'cosine', 'user_based': False}
model = KNNWithMeans(sim_options = sim_options)
model.fit(trainset)

#Use the model to make predictions on the test set
predictions = model.test(testset)

#Evaluate the accuracy of the model using RMSE
from surprise import accuracy
accuracy.rmse(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 47.8358


47.83578222632571

# Matrix Factorization

In [None]:
from surprise import SVD

#Set the number of latent factors for the matrix factorization