In [None]:
# A few functions in this notebook are taken from https://github.com/nzhinusoftcm/review-on-collaborative-filtering

import os

if not (os.path.exists("recsys.zip") or os.path.exists("recsys")):
    !wget https://github.com/nzhinusoftcm/review-on-collaborative-filtering/raw/master/recsys.zip    
    !unzip recsys.zip

--2022-05-07 21:34:40--  https://github.com/nzhinusoftcm/review-on-collaborative-filtering/raw/master/recsys.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/nzhinusoftcm/review-on-collaborative-filtering/master/recsys.zip [following]
--2022-05-07 21:34:40--  https://raw.githubusercontent.com/nzhinusoftcm/review-on-collaborative-filtering/master/recsys.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15312323 (15M) [application/zip]
Saving to: ‘recsys.zip’


2022-05-07 21:34:41 (186 MB/s) - ‘recsys.zip’ saved [15312323/15312323]

Archive:  recsys.zip
   creating: recsys/
  inflating: recsy

In [None]:
# Import the dataset loaders and the libraries used throughout the notebook
from recsys.datasets import ml100k, ml1m
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

import pandas as pd
import numpy as np

In [None]:
# Load the ml-1m dataset through the recsys helper package.
# `ratings` provides the userid / itemid / rating columns used by the cells
# below; `movies` is the item table later joined on `itemid` to show titles.
ratings, movies = ml1m.load()

Download data 100.1%
Successfully downloaded ml-1m.zip 5917549 bytes.
Unzipping the ml-1m.zip zip file ...


In [None]:
# Average rating given by each user (a Series indexed by userid).
umean = ratings['rating'].groupby(ratings['userid']).mean()

In [None]:
# Missing ratings (NaN) are filled with the corresponding item's mean rating

def rating_matrix(ratings, umean=None):
    """Build the dense, user-mean-centred user x item rating matrix.

    Steps:
      1. Pivot ``ratings`` into a users (rows) x items (columns) table.
      2. Fill every missing cell with that item's average observed rating.
      3. Subtract each user's average rating from that user's row.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain the columns ``userid``, ``itemid`` and ``rating``.
    umean : pandas.Series, optional
        Per-user mean rating indexed by ``userid``. When omitted it is
        computed from ``ratings`` itself, so the function no longer depends
        on a notebook-level global being defined beforehand.

    Returns
    -------
    R : numpy.ndarray
        The normalized matrix as a NumPy array.
    df : pandas.DataFrame
        The same values as a DataFrame (index = userid, columns = itemid).
    """
    if umean is None:
        # Same computation the notebook performs for its global `umean`.
        umean = ratings.groupby(by='userid')['rating'].mean()

    # The aggregation is passed as a string (pandas deprecates passing the
    # builtin `sum`). "mean" gives the same value as a sum when every
    # (user, item) pair occurs once, and keeps duplicated pairs on the
    # original rating scale instead of adding them up.
    df = pd.crosstab(
        ratings.userid,
        ratings.itemid,
        values=ratings.rating,
        aggfunc="mean",
    )

    # fill missing values with item's average ratings (column-wise means)
    df = df.fillna(df.mean(axis=0))

    # subtract user's mean ratings to normalize data (aligned on userid)
    df = df.subtract(umean, axis=0)

    # convert our dataframe to numpy array
    R = df.to_numpy()

    return R, df

# generate rating matrix by calling function rating_matrix:
# `R` is the normalized matrix as a NumPy array, `df` the same data as a DataFrame
R, df = rating_matrix(ratings)

In [None]:
# Distinct raw ids present in the ratings, in ascending order.
users = sorted(ratings.userid.unique())
items = sorted(ratings.itemid.unique())

# One label encoder per id space (users and items).
uencoder, iencoder = LabelEncoder(), LabelEncoder()

# Learn the raw id -> integer code mappings.
uencoder.fit(users)
iencoder.fit(items)

LabelEncoder()

In [None]:
class SVD:
    """Rating predictor based on a singular value decomposition of the
    user-mean-centred rating matrix.

    After ``fit``, ``u_factors`` (users x k) and ``i_factors`` (k x items)
    satisfy ``u_factors @ i_factors ~= R``. A prediction is the matching
    entry of that product plus the user's mean rating.

    Parameters
    ----------
    umeam : pandas.Series or array-like
        Mean rating of every user, ordered like the rows of the matrix given
        to ``fit``. (The parameter keeps its original spelling so existing
        callers are unaffected.)
    user_encoder, item_encoder : optional
        Objects exposing ``transform`` / ``inverse_transform`` that map raw
        ids to row / column positions. When omitted, the notebook-level
        ``uencoder`` and ``iencoder`` are used, as before.
    """

    def __init__(self, umeam, user_encoder=None, item_encoder=None):
        # Bug fix: the original ignored this argument and read the
        # notebook-level global `umean` instead.
        self.umean = np.asarray(umeam, dtype=float)

        self.user_encoder = user_encoder
        self.item_encoder = item_encoder

        # init svd resultant matrices
        self.P = np.array([])
        self.S = np.array([])
        self.Qh = np.array([])

        # init users and items latent factors
        self.u_factors = np.array([])
        self.i_factors = np.array([])

    def _encoders(self):
        """Return the (user, item) encoders, falling back to the notebook globals."""
        user_enc = self.user_encoder if self.user_encoder is not None else uencoder
        item_enc = self.item_encoder if self.item_encoder is not None else iencoder
        return user_enc, item_enc

    def fit(self, R, k=None):
        """Factorize the normalized rating matrix ``R``.

        Parameters
        ----------
        R : array-like, shape (n_users, n_items)
            User-mean-centred rating matrix.
        k : int, optional
            Number of singular values / latent factors to keep. ``None``
            (default) keeps all of them, in which case the factors
            reproduce ``R`` up to floating-point error.

        Raises
        ------
        ValueError
            If ``k`` is not between 1 and the number of singular values.
        """
        P, s, Qh = np.linalg.svd(R, full_matrices=False)

        if k is not None:
            if not 1 <= k <= s.shape[0]:
                raise ValueError(
                    "k must be between 1 and {}, got {}".format(s.shape[0], k)
                )
            # singular values come back in descending order, so the leading
            # k components are the most significant ones
            P, s, Qh = P[:, :k], s[:k], Qh[:k, :]

        self.P = P
        self.S = np.diag(s)
        self.Qh = Qh

        # latent factors of users (u_factors) and items (i_factors);
        # scaling by sqrt(s) through broadcasting is equivalent to
        # multiplying by the dense diagonal matrix sqrt(S), without the
        # cost of a full matrix product
        root_s = np.sqrt(s)
        self.u_factors = P * root_s
        self.i_factors = root_s[:, np.newaxis] * Qh

    def predict(self, userid, itemid):
        """Return the predicted rating of ``userid`` for ``itemid`` (raw ids)."""
        user_enc, item_enc = self._encoders()

        # encode user and item ids
        u = user_enc.transform([userid])[0]
        i = item_enc.transform([itemid])[0]

        # the predicted rating is the dot product between the uth row
        # of u_factors and the ith column of i_factors
        r_hat = np.dot(self.u_factors[u, :], self.i_factors[:, i])

        # add the mean rating of user u to the predicted value
        r_hat += self.umean[u]

        return r_hat

    def recommend(self, userid):
        """Rank every item for ``userid``.

        Returns
        -------
        top_items : sequence
            Raw item ids ordered by decreasing predicted rating.
        preds : numpy.ndarray
            The predicted ratings, in the same order as ``top_items``.
        """
        user_enc, item_enc = self._encoders()

        # encode user
        u = user_enc.transform([userid])[0]

        # the dot product between the uth row of u_factors and i_factors returns
        # the predicted value for user u on all items
        predictions = np.dot(self.u_factors[u, :], self.i_factors) + self.umean[u]

        # sort item ids in decreasing order of predictions
        top_idx = np.flip(np.argsort(predictions))

        # decode indices to get their corresponding itemids
        top_items = item_enc.inverse_transform(top_idx)

        # sorted predictions
        preds = predictions[top_idx]

        return top_items, preds

In [None]:
# create our svd model, handing it the per-user mean ratings
svd = SVD(umean)

# fit our model with normalized ratings
# (R is the matrix returned by rating_matrix, with each user's mean subtracted)
svd.fit(R)

In [None]:
# Target user whose ratings we want to estimate.
userid = 1

# Items to score for that user.
items = [1, 3, 6, 47, 50, 70, 101, 110, 151, 157]

# Print one predicted rating per item.
template = 'prediction for userid={} and itemid={} : {}'
for itemid in items:
    r = svd.predict(userid=userid, itemid=itemid)
    print(template.format(userid, itemid, r))

prediction for userid=1 and itemid=1 : 5.000000000000012
prediction for userid=1 and itemid=3 : 3.0167364016736555
prediction for userid=1 and itemid=6 : 3.8787234042553855
prediction for userid=1 and itemid=47 : 4.1064204045734565
prediction for userid=1 and itemid=50 : 4.517106001121699
prediction for userid=1 and itemid=70 : 3.156455142231958
prediction for userid=1 and itemid=101 : 3.869565217391316
prediction for userid=1 and itemid=110 : 4.234957020057329
prediction for userid=1 and itemid=151 : 3.588447653429605
prediction for userid=1 and itemid=157 : 2.7326203208556135


In [None]:
# User for whom we build the recommendation list.
userid = 1

# Every item ranked from highest to lowest predicted rating, with those ratings.
sorted_items, preds = svd.recommend(userid=userid)

# Items this user has already rated.
uitems = ratings.loc[ratings['userid'] == userid, 'itemid'].to_list()

# Positions, within the ranking, of the first 30 items the user has not rated.
# A single boolean mask keeps the ranking order and yields the item ids and
# their predictions from the same positions.
not_rated = ~np.isin(sorted_items, uitems)
top30_idx = np.flatnonzero(not_rated)[:30].tolist()
top30 = sorted_items[top30_idx]
top30_predictions = preds[top30_idx]

# Attach movie titles / genres to the recommended items.
top30 = pd.DataFrame({'itemid': top30, 'predictions': top30_predictions})
List = top30.merge(movies, on='itemid', how='inner')

# show the list
List

Unnamed: 0,itemid,predictions,title,genres
0,3280,5.0,"Baby, The (1973)",Horror
1,3881,5.0,Bittersweet Motel (2000),Documentary
2,3607,5.0,One Little Indian (1973),Comedy|Drama|Western
3,3233,5.0,Smashing Time (1967),Comedy
4,3382,5.0,Song of Freedom (1936),Drama
5,787,5.0,"Gate of Heavenly Peace, The (1995)",Documentary
6,1830,5.0,Follow the Bitch (1998),Comedy
7,3656,5.0,Lured (1947),Crime
8,3172,5.0,Ulysses (Ulisse) (1954),Adventure
9,989,5.0,Schlafes Bruder (Brother of Sleep) (1995),Drama
