In [1]:
from scipy.sparse import coo_matrix
import pandas as pd

In [2]:
dataFile = 'BX-Book-Ratings.csv'

In [3]:
data = pd.read_csv(dataFile, sep=";", header=0, names=["user", "isbn", "rating"], encoding='latin1')

In [4]:
data['user'] = data['user'].astype('category')
data['isbn'] = data['isbn'].astype('category')

In [5]:
R = coo_matrix(data['rating'].astype(float), (data['user'].cat.codes.copy(), data['user'].cat.codes.copy()))

In [6]:
R.shape

(1, 1149780)

In [7]:
len(R.data)

433671

In [8]:
R.data[0]

5.0

In [9]:
R.col[0]

1

In [10]:
R.row[0]

0

In [11]:
M, N = R.shape

In [12]:
K = 3

In [13]:
import numpy as np
P = np.random.rand(M,K)
Q = np.random.rand(K,N)

In [14]:
from numpy.linalg import norm

In [15]:
def error(R, P, Q, lamda=0.02):
    ratings = R.data
    rows = R.row
    cols = R.col
    e = 0
    
    for ui in range(len(ratings)):
        rui = ratings[ui]
        u = rows[ui]
        i = cols[ui]
        if rui>0:
            e=e + pow(rui-np.dot(P[u,:],Q[:,i]), 2)+lamda*(pow(norm(P[u,:]), 2) + pow(norm(Q[:,i]), 2))
            
    return e

In [16]:
error(R,P,Q)

23810273.977402646

In [17]:
rmse = np.sqrt(error(R,P,Q/len(R.data)))

In [18]:
rmse

5150.985432057938

In [25]:
def SGD(R, K, lamda = 0.02, steps=10, gamma=0.001):
    
    M,N = R.shape
    P = np.random.rand(M,K)
    Q = np.random.rand(K,N)
    
    rmse = np.sqrt(error(R,P,Q,lamda)/len(R.data))
    print('Initial RMSE: ' + str(rmse))
    
    for step in range(steps):
        for ui in range(len(R.data)):
            rui = R.data[ui]
            u = R.row[ui]
            i = R.col[ui]
            
            if rui>0:
                eui = rui-np.dot(P[u,:], Q[:,i])
                P[u,:] = P[u, :] + gamma*2*(eui*Q[:,i]-lamda*P[u,:]) 
                Q[:, i] = Q[:, i] + gamma*2*(eui*P[u,:]-lamda*Q[:,i]) 
                
        rmse = np.sqrt(error(R, P, Q, lamda)/len(R.data))
        if rmse < 0.05:
            break
    print("Final RMSE: " + str(rmse))
    return P, Q

In [26]:
(P, Q) = SGD(R, K=2, gamma=0.0007, lamda = 0.01, steps=2)

Initial RMSE: 7.715923207039724
Final RMSE: 2.8238507952631138
