In [1]:
import pandas as pd
import numpy as np

Import the data set

In [2]:
header = ['user_id','item_id','rating','timestamp']
df = pd.read_csv('ml-100k/u.data',sep='\t',names=header)

In [3]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


Get the user and item counts

In [6]:
n_users = df.user_id.nunique()
n_items = df.item_id.nunique()
print('Number of users = ' + str(n_users) + 
      ' | Number of movies = ' + str(n_items))

Number of users = 943 | Number of movies = 1682


Split the data into training and testing data sets

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
train_data, test_data = train_test_split(df,test_size=0.25)

Create the train and test user-item matrices

In [9]:
train_data_matrix = np.zeros((n_users,n_items))

for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

In [10]:
test_data_matrix = np.zeros((n_users,n_items))

for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

Use Scikit-Learn's pairwise_distances function to calculate cosine similarity.

In [11]:
from sklearn.metrics.pairwise import pairwise_distances

In [12]:
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

Next, we need to normalize the ratings, as different users use the rating scale differently.  For example, one user might give 5 stars to all movies they like and 1 star to all other movies, where another user might give 4 stars to movies they love and 3 stars to movies they didn't actively hate.  In other words, the relative difference in a given user's ratings are more important than the absolute values and we need to account for this in our predictions.

In [15]:
def predict(ratings,similarity,type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        
        ratings_diff = (ratings - mean_user_rating[:,np.newaxis])
        
        pred = mean_user_rating[:,np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
        
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
        
    return pred

In [16]:
item_prediction = predict(train_data_matrix,item_similarity,type='item')
user_prediction = predict(train_data_matrix,user_similarity,type='user')

We can evaluate these predictions using root mean square error.  We want to minimize the root mean square error.  

Since we only want to consider predicted ratings that are in the test data set, we filter out all other elements in the prediction matrix.

In [18]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [26]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    
    return sqrt(mean_squared_error(prediction, ground_truth))

In [25]:
print('User-based CF RMSE: ' + str(rmse(user_prediction,test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction,test_data_matrix)))

User-based CF RMSE: 3.1239719711720833
Item-based CF RMSE: 3.4532449077126826
