In [1]:
import numpy as np
import pandas as pd

In [2]:
header = ['user_id','item_id','rating','timestamp']

In [3]:
df = pd.read_csv('/home/ubuntu/my_data/ml-100k/u.data', sep='\t', names=header)

In [4]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items) 

Number of users = 943 | Number of movies = 1682


In [6]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(df, test_size=0.25)

## MEMORY BASED CF ##

### Create two user-item matrices, one for training and another for testing

In [7]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]  

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [8]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [9]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [10]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')

In [11]:
item_prediction

array([[ 0.36389221,  0.37695858,  0.40368798, ...,  0.44245047,
         0.43236819,  0.43006378],
       [ 0.06938185,  0.07927672,  0.07745292, ...,  0.08137285,
         0.08224604,  0.08206062],
       [ 0.05390415,  0.05695697,  0.0542243 , ...,  0.05307178,
         0.05643256,  0.05662951],
       ..., 
       [ 0.0293968 ,  0.03723037,  0.03679902, ...,  0.04304478,
         0.0422374 ,  0.04300132],
       [ 0.11839587,  0.12649807,  0.13489735, ...,  0.14097401,
         0.13845563,  0.14125099],
       [ 0.1971444 ,  0.1943919 ,  0.21467351, ...,  0.2483854 ,
         0.23916054,  0.24098579]])

In [12]:
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [13]:
user_prediction

array([[ 1.59267241,  0.58830847,  0.47741596, ...,  0.28997517,
         0.28994607,  0.2899085 ],
       [ 1.30753824,  0.28269371,  0.12243182, ..., -0.08503489,
        -0.08373127, -0.08371113],
       [ 1.3387071 ,  0.25802399,  0.10052385, ..., -0.11252787,
        -0.11058766, -0.11054619],
       ..., 
       [ 1.20002526,  0.2313211 ,  0.07720621, ..., -0.12138052,
        -0.12051246, -0.12029686],
       [ 1.36284997,  0.32147417,  0.18921765, ..., -0.0175385 ,
        -0.016972  , -0.01647407],
       [ 1.41487983,  0.38836212,  0.27982316, ...,  0.09642967,
         0.09628141,  0.09655173]])

In [14]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [15]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [16]:
print 'User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix))
print 'Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix))

User-based CF RMSE: 3.12963310081
Item-based CF RMSE: 3.45757809762


## MODEL CF ##

In [17]:
sparsity=round(1.0-len(df)/float(n_users*n_items),3)
print 'The sparsity level of MovieLens100K is ' +  str(sparsity*100) + '%'

The sparsity level of MovieLens100K is 93.7%


In [18]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

In [19]:
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print 'User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix))

User-based CF MSE: 2.72017392955
