# User-based Collaborative Filtering

In [1]:
import math
import numpy as np
from numpy import linalg as LA
from scipy.sparse import coo_matrix
import pandas as pd
from IPython.display import HTML, display

In [2]:
import sys
sys.path.append('scripts')
%run "./scripts/liblecture.py"
from liblecture import displayMovies, display

## Read Data: movies and ratings

Read Movies

In [3]:
movies = pd.read_csv('data/movies_w_imgurl.csv')

Read Rating Data

In [4]:
ratings = pd.read_csv('data/ratings-9_1.csv')
train = ratings[ratings['type'] == 'train'][['userId', 'movieId', 'rating']]
test = ratings[ratings['type'] == 'test'][['userId', 'movieId', 'rating']]

In [5]:
def displayLikedUserMovies(movies, userId, topK):
    display(HTML("<h3>%s</h3><hr>" % userId))
    topKRatings = train[train['userId'] == userId].sort_values(by='rating', ascending=False).head(topK)
    displayMovies(movies, topKRatings.movieId.values, topKRatings.rating.values)

## Convert Ratings to User-Item Sparse Matrix

### Create Index to Id Maps

In [6]:
movieIdToIndex = {}
indexToMovieId = {}
colIdx = 0
for movieId in movies.movieId:
    movieIdToIndex[movieId] = colIdx
    indexToMovieId[colIdx] = movieId
    colIdx += 1

In [7]:
userIdToIndex = {}
indexToUserId = {}
rowIdx = 0
for userId in ratings.userId.unique():
    userIdToIndex[userId] = rowIdx
    indexToUserId[rowIdx] = userId
    rowIdx += 1

### Creat User-Item Sparse Matrix

In [8]:
rows = []
cols = []
vals = []
for row in ratings.itertuples():
    rows.append(userIdToIndex[row.userId])
    cols.append(movieIdToIndex[row.movieId])
    vals.append(row.rating)
coomat = coo_matrix((vals, (rows, cols)), shape=(rowIdx, colIdx))

## Compute User-User Similarities

Compute $l_2$-norm

In [9]:
norms = LA.norm(coomat.toarray(), ord=2, axis=1)

Normalize Row Vectors

In [10]:
np.seterr(divide='ignore', invalid='ignore')
normmat = np.divide(coomat.transpose().toarray(), norms).T

In [11]:
np.nan_to_num(normmat, 0.0)

array([[0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.18, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.12, 0.  , 0.  , ..., 0.  , 0.  , 0.  ]])

Compute Similarities ( = inner product)

In [12]:
sims = pd.DataFrame(data=np.matmul(normmat, normmat.T), index=ratings.userId.unique(), columns=ratings.userId.unique())

## Similarity Example

In [13]:
topK = 6
userId = 33
simUsers = sims.loc[33, :].sort_values(ascending=False).head(11).tail(5)

In [14]:
displayLikedUserMovies(movies, userId, topK)
for index, simUser in simUsers.iteritems():
    displayLikedUserMovies(movies, index, topK)

## User Rating Prediction

In [15]:
userId = 33

### Predict Ratings

In [16]:
ratingDF = pd.DataFrame(data=coomat.toarray(), index=ratings.userId.unique(), columns=movies.movieId.values)
binDF = ratingDF.applymap(lambda x: math.ceil(x/10))

In [17]:
userAvgRatings = pd.DataFrame(data=ratingDF.sum(axis=1).divide(binDF.sum(axis=1)), columns=['avg'])

In [18]:
simUsers = sims.loc[userId, :]
simUsers[userId] = 0

In [19]:
simRatingSums = (ratingDF - binDF.T.multiply(userAvgRatings['avg']).T).T.multiply(simUsers).T.sum(axis=0)
simSums = binDF.T.multiply(simUsers).T.sum(axis=0)
recItemRatings = userAvgRatings.loc[userId].avg + pd.Series(data=simRatingSums.divide(simSums), name='prediction')
recItemRatings.fillna(0, inplace=True)

In [20]:
binDF.T.multiply(userAvgRatings['avg']).T

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,161830,161918,161944,162376,162542,162672,163056,163949,164977,164979
1,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,3.49,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,4.35,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.00,0.0,3.91,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.00,0.0,0.00,0.0,0.0,3.65,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,3.81,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
ratingDF.T.multiply(simUsers).T

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,161830,161918,161944,162376,162542,162672,163056,163949,164977,164979
1,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.00,0.0,0.45,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.00,0.0,0.00,0.0,0.0,0.02,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
669,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
670,0.00,0.0,0.00,0.0,0.0,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Compute Errors (MAE, RMSE)

In [22]:
userTestRatings = pd.DataFrame(data=test[test['userId'] == userId])
temp = userTestRatings.join(recItemRatings.loc[userTestRatings['movieId']], on='movieId')
mae = (temp['rating'] - temp['prediction']).abs().mean()
rmse = math.sqrt((temp['rating'] - temp['prediction']).pow(2).mean())
print(" MAE:", mae)
print("RMSE:", rmse)

#5:
 MAE: 0.5459849184116438


#6:
RMSE: 0.6852932874692779




In [23]:
temp

Unnamed: 0,userId,movieId,rating,prediction
6187,33,1060,4.0,4.02
6198,33,1291,4.0,3.71
6199,33,1347,2.0,3.41
6208,33,1982,4.0,3.49
6212,33,2005,4.0,3.64
6215,33,2064,5.0,4.01
6257,33,3794,4.0,3.3
6292,33,4678,3.0,3.54
6303,33,4974,3.0,2.9
