# **Collaborative Filtering - User based**  




## Simple example
 
*   4 users, 3 items.
*   Users $u_0$  and $u_2$ share the same ratings


In [18]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import heapq

from sklearn.metrics.pairwise import pairwise_distances



In [19]:

#Note that users 0 and 3 have the same ratings
ratings_pd = pd.DataFrame({'m0':[2,3,4,6],'m1':[2,3,np.nan,5],'m2':[5,9,6,1]}, index=['u0','u1','u2','u3'])
ratings = ratings_pd.to_numpy()
ratings_pd

Unnamed: 0,m0,m1,m2
u0,2,2.0,5
u1,3,3.0,9
u2,4,,6
u3,6,5.0,1


In [20]:
#predict(ratings.to_numpy(), user_similarity, type='user')
mean_user_rating = ratings_pd.mean(axis=1).to_numpy().reshape(-1, 1)
mean_user_rating.round(2)

array([[3.],
       [5.],
       [5.],
       [4.]])

In [21]:
ratings_diff = (ratings - mean_user_rating)
ratings_diff[np.isnan(ratings_diff)]=0
ratings_diff.round(2)

array([[-1., -1.,  2.],
       [-2., -2.,  4.],
       [-1.,  0.,  1.],
       [ 2.,  1., -3.]])

In [22]:
from sklearn.metrics.pairwise import pairwise_distances

user_similarity = 1 - pairwise_distances(ratings_diff, metric='cosine')
print(user_similarity.shape)
pd.DataFrame(user_similarity).round(2)

(4, 4)


Unnamed: 0,0,1,2,3
0,1.0,1.0,0.87,-0.98
1,1.0,1.0,0.87,-0.98
2,0.87,0.87,1.0,-0.94
3,-0.98,-0.98,-0.94,1.0


In [23]:
pd.DataFrame(user_similarity.dot(ratings_diff).round(2))

Unnamed: 0,0,1,2
0,-5.83,-3.98,9.81
1,-5.83,-3.98,9.81
2,-5.49,-3.54,9.03
3,5.89,3.95,-9.84


In [24]:
import numpy as np
pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
pred.round(2)

array([[1.48, 1.97, 5.55],
       [3.48, 3.97, 7.55],
       [3.51, 4.04, 7.46],
       [5.51, 5.01, 1.48]])

## Movie Recommender

### Load data

In [25]:
!wget 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
!unzip -o ml-100k.zip

--2021-12-30 19:00:03--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip.1’


2021-12-30 19:00:05 (3.45 MB/s) - ‘ml-100k.zip.1’ saved [4924029/4924029]

Archive:  ml-100k.zip
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating:

In [26]:
ratings_path='ml-100k/u.data'
items_path='ml-100k/u.item'

#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(ratings_path, sep='\t', names=r_cols,encoding='latin-1')

#Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv(items_path, sep='|', names=i_cols,encoding='latin-1')
print(ratings.shape)
print(ratings.head())

(100000, 4)
   user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596


In [27]:
def keep_top_k(arr, k): 
    smallest = heapq.nlargest(k, arr)[-1] 
    arr[arr < smallest] = 0 # replace anything lower than the cut off with 0
    return arr

# For debug purposes - appends rating r for items on behalf of a debug user
def new_user_ratings(items, r, new_user_id):
  to_append=pd.DataFrame(items, columns=['movie_id'])
  to_append[['user_id','rating','unix_timestamp']] = new_user_id,r,0
  return to_append

In [28]:
# ADD debug user:
def addFakeRatings(ratings):
  scifi = [7,38,39,50,62,82,84,89,96,101,109,121,135,141,145,152,164,171,172,175]
  drama = [14,20,36,125,131,133,197,213,237,246,253,268,275,276,278,280,283,286,306,311,371,421,462,483,531,549,553,582,594,643,662,704,714,716,724]

  new_user_id = ratings.user_id.max()+1
  print(new_user_id)
  ratings = ratings.append(new_user_ratings(scifi,1,new_user_id),ignore_index=True)
  ratings = ratings.append(new_user_ratings(drama,5,new_user_id),ignore_index=True)
  ratings[ratings.user_id==new_user_id].shape

addFakeRatings(ratings)

944


In [29]:
def get_top_rated(data_matrix_row, items, k=20):
  srt_idx = np.argsort(-data_matrix_row)
  #print(~np.isnan(data_matrix_row[srt_idx]))
  srt_idx_not_nan = srt_idx[~np.isnan(data_matrix_row[srt_idx])]
  return items['movie title'].iloc[srt_idx_not_nan][:k]  

#get_top_rated(data_matrix[50],items, 20)


def idx2item(items_path):
  idx_to_item = {}
  with open(items_path, 'r', encoding = "ISO-8859-1") as f:
      for line in f.readlines():
          info = line.split('|')
          idx_to_item[info[1].split(' (')[0]] = int(info[0]) - 1
  return idx_to_item

idx_to_movie = idx2item(items_path)
idx_to_movie['Four Rooms']

2

### Create normalized rating matrix

In [30]:
#calculate the number of unique users and movies.
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

#create ranking table - that table is sparse
data_matrix = np.empty((n_users, n_items))
data_matrix[:] = np.nan
for line in ratings.itertuples():
    user = line[1]-1
    movie = line[2]-1
    rating = line[3]
    data_matrix[user, movie] = rating

#calc mean
mean_user_rating = np.nanmean(data_matrix, axis=1).reshape(-1, 1)

ratings_diff = (data_matrix - mean_user_rating)
#replace nan -> 0
ratings_diff[np.isnan(ratings_diff)]=0

### Create user X user similarity matrix

In [31]:
#calculate user x user similarity matrix
user_similarity = 1-pairwise_distances(ratings_diff, metric='cosine')
print(user_similarity.shape)

# For each user (i.e., for each row) keep only k most similar users, set the rest to 0. 
# Note that the user has the highest similarity to themselves.
k=10
user_similarity = np.array([keep_top_k(np.array(arr),k) for arr  in user_similarity])
print(user_similarity.shape)

(943, 943)
(943, 943)


### Generate predicted ratings matrix

In [32]:
# since n-k users have similarity=0, for each user only k most similar users contribute to the predicted ratings
pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
#pred.round(2)

### Recommend a user top K unrated movies

In [33]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(predicted_ratings_row, data_matrix_row, items, k=5):

    predicted_ratings_unrated = predicted_ratings_row[np.isnan(data_matrix_row)]
    #print(predicted_ratings_unrated)

    idx = np.argsort(-predicted_ratings_unrated)
    #print (idx)
    sim_scores = idx[0:k]
    #print(sim_scores)

    # Return top k movies
    return items['movie title'].iloc[sim_scores]

In [None]:
new_user_id = 12
user = new_user_id-1
predicted_ratings_row = pred[user]
data_matrix_row=data_matrix[user]

print("Top rated movies by test user:")
print(get_top_rated(data_matrix_row,items))

print('****** test user - user_prediction ******')
print(get_recommendations(predicted_ratings_row, data_matrix_row, items,k=10))

# **Collaborative Filtering - Item based**  


## Simple example
 
*   4 users, 3 items.
*   Items $m_0$  and $m_2$ share the same ratings


In [35]:

#Note that users 0 and 3 have the same ratings
ratings_pd = pd.DataFrame({'m0':[1,4,3,6],'m1':[5,5,1,3],'m2':[2,np.nan,4,6]}, index=['u0','u1','u2','u3'])
ratings = ratings_pd.to_numpy()
ratings_pd

Unnamed: 0,m0,m1,m2
u0,1,5,2.0
u1,4,5,
u2,3,1,4.0
u3,6,3,6.0


In [36]:
mean_user_rating = ratings_pd.mean(axis=1).to_numpy().reshape(-1, 1)
mean_user_rating.round(2)

array([[2.67],
       [4.5 ],
       [2.67],
       [5.  ]])

In [37]:
ratings_diff = (ratings - mean_user_rating)
ratings_diff[np.isnan(ratings_diff)]=0
ratings_diff.round(2)

raitingItem = ratings_diff
raitingItem[np.isnan(raitingItem)]=0

#raitingItem


In [38]:
from sklearn.metrics.pairwise import pairwise_distances
# I made the fix here:
item_similarity = 1-pairwise_distances(raitingItem.T, metric='cosine')
print(item_similarity.shape)
pd.DataFrame(item_similarity).round(2)

(3, 3)


Unnamed: 0,0,1,2
0,1.0,-0.93,0.7
1,-0.93,1.0,-0.91
2,0.7,-0.91,1.0


In [39]:
pred = mean_user_rating + raitingItem.dot(item_similarity) / np.array([np.abs(item_similarity).sum(axis=1)])
pred.round(2)

array([[1.03, 4.25, 1.15],
       [4.13, 4.84, 4.19],
       [3.74, 1.54, 3.85],
       [6.35, 3.65, 6.35]])

## Movie Recommender

### Load data

In [40]:
!wget 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'
!unzip -o ml-100k.zip

--2021-12-30 19:00:06--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip.2’


2021-12-30 19:00:08 (3.46 MB/s) - ‘ml-100k.zip.2’ saved [4924029/4924029]

Archive:  ml-100k.zip
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating:

In [41]:
ratings_path='ml-100k/u.data'
items_path='ml-100k/u.item'

#Reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(ratings_path, sep='\t', names=r_cols,encoding='latin-1')

#Reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv(items_path, sep='|', names=i_cols,encoding='latin-1')
print(ratings.shape)
print(ratings.head())

#calculate the number of unique users and movies.
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

(100000, 4)
   user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596


In [42]:
def keep_top_k(arr, k): 
    smallest = heapq.nlargest(k, arr)[-1] 
    arr[arr < smallest] = 0 # replace anything lower than the cut off with 0
    return arr

# For debug purposes - appends rating r for items on behalf of a debug user
def new_user_ratings(items, r, new_user_id):
  to_append=pd.DataFrame(items, columns=['movie_id'])
  to_append[['user_id','rating','unix_timestamp']] = new_user_id,r,0
  return to_append

In [43]:
# ADD debug item:

scifi = [7,38,39,50,62,82,84,89,96,101,109,121,135,141,145,152,164,171,172,175]
drama = [14,20,36,125,131,133,197,213,237,246,253,268,275,276,278,280,283,286,306,311,371,421,462,483,531,549,553,582,594,643,662,704,714,716,724]

new_user_id = ratings.user_id.max()+1
print(new_user_id)
for i in range(n_users):
  ratings = ratings.append(new_user_ratings(scifi,1,i),ignore_index=True)
  ratings = ratings.append(new_user_ratings(drama,5,i),ignore_index=True)
ratings[ratings.user_id==new_user_id].shape


944


(0, 4)

In [44]:
def get_top_rated(data_matrix_row, items, k=20):
  srt_idx = np.argsort(-data_matrix_row)
  #print(~np.isnan(data_matrix_row[srt_idx]))
  srt_idx_not_nan = srt_idx[~np.isnan(data_matrix_row[srt_idx])]
  return items['movie title'].iloc[srt_idx_not_nan][:k]  
get_top_rated(data_matrix[50],items, 20)


def idx2item(items_path):
  idx_to_item = {}
  with open(items_path, 'r', encoding = "ISO-8859-1") as f:
      for line in f.readlines():
          info = line.split('|')
          idx_to_item[info[1].split(' (')[0]] = int(info[0]) - 1
  return idx_to_item

idx_to_movie = idx2item(items_path)
#idx_to_movie['Four Rooms']

### Create normalized rating matrix

In [45]:


#create ranking table - that table is sparse
data_matrix = np.empty((n_users, n_items))
data_matrix[:] = np.nan
for line in ratings.itertuples():
    user = line[1]-1
    movie = line[2]-1
    rating = line[3]
    data_matrix[user, movie] = rating

mean_user_rating = np.nanmean(data_matrix, axis=1).reshape(-1, 1)

ratings_diff = (data_matrix - mean_user_rating)
ratings_diff[np.isnan(ratings_diff)]=0

### Create item X item similarity matrix

In [46]:
from sklearn.metrics.pairwise import pairwise_distances

#calculate user x user similarity matrix
item_similarity = 1-pairwise_distances(ratings_diff.T, metric='cosine')
print(item_similarity.shape)
pd.DataFrame(item_similarity).round(2)

# For each item (i.e., for each row) keep only k most similar items, set the rest to 0. 
# Note that the item has the highest similarity to themselves.
k=10
item_similarity = np.array([keep_top_k(np.array(arr),k) for arr  in item_similarity])
print(item_similarity.shape)

(1682, 1682)
(1682, 1682)


### Generate predicted ratings matrix

In [47]:

#pred = mean_user_rating + raitingItem.dot(item_similarity) / np.array([np.abs(item_similarity).sum(axis=1)])

# since n-k users have similarity=0, for each user only k most similar users contribute to the predicted ratings
pred = (mean_user_rating + ratings_diff.dot(item_similarity) / np.array([np.abs(item_similarity).sum(axis=1)])).T
pred.round(2) 
pred.shape

(1682, 943)

In [48]:
pred

array([[3.7158479 , 3.62398182, 3.17592593, ..., 3.81911047, 3.99179341,
        3.46948357],
       [3.21876847, 3.5754717 , 3.17592593, ..., 3.68421053, 3.9453125 ,
        4.22115492],
       [3.71878943, 3.59858833, 3.17592593, ..., 3.68421053, 4.00274289,
        3.55282405],
       ...,
       [3.56949153, 3.59072745, 3.15719992, ..., 3.68421053, 3.89733854,
        3.46948357],
       [3.48247504, 3.5754717 , 3.17592593, ..., 3.68421053, 3.9453125 ,
        3.4642878 ],
       [3.53910776, 3.59605133, 3.17592593, ..., 3.70105866, 3.9453125 ,
        3.43655633]])