In [1]:
import pandas as pd
import gzip
import json

In [2]:
def parse(path):
  """Lazily yield one decoded JSON value per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The result of ``json.loads`` for each line, in file order.
  """
  # Open inside a context manager so the file handle is released as soon as
  # the generator is exhausted or closed early, instead of staying open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield json.loads(l)

def getDF(path):
  """Load every record produced by ``parse(path)`` into a DataFrame.

  Each record is keyed by its 0-based position in the file, giving one row
  per record with that position as the row label.
  """
  numbered = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered, orient='index')

In [17]:
# Read the gzip-compressed review file into a DataFrame. The path is
# relative, so it is resolved against the kernel's working directory.
df = getDF('data/All_Beauty_5.json.gz')

In [4]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 1, 2016",A3CIUOJXQ5VDQ2,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Shelly F,As advertised. Reasonably priced,Five Stars,1472688000,,
1,5.0,True,"11 14, 2013",A3H7T87S984REU,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",houserules18,Like the oder and the feel when I put it on my...,Good for the face,1384387200,,
2,1.0,True,"08 18, 2013",A3J034YH7UG4KT,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Adam,I bought this to smell nice after I shave. Wh...,Smells awful,1376784000,,
3,5.0,False,"05 3, 2011",A2UEO5XR3598GI,B0000530HU,"{'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...",Rich K,HEY!! I am an Aqua Velva Man and absolutely lo...,Truth is There IS Nothing Like an AQUA VELVA MAN.,1304380800,25.0,
4,5.0,True,"05 6, 2011",A3SFRT223XXWF7,B00006L9LC,{'Size:': ' 200ml/6.7oz'},C. C. Christian,If you ever want to feel pampered by a shampoo...,Bvlgari Shampoo,1304640000,3.0,


In [5]:
# Sorted list of distinct reviewer IDs; a reviewer's position in this list
# serves as its row index in the rating matrices built further down.
reviewers = sorted(df['reviewerID'].unique())

In [6]:
# Sorted list of distinct product IDs (asin); an item's position in this
# list serves as its column index in the rating matrices built further down.
items = sorted(df['asin'].unique())

In [7]:
# Distinct values of the 'overall' rating column, in ascending order.
scores = sorted(df['overall'].unique())

In [8]:
scores

[1.0, 2.0, 3.0, 4.0, 5.0]

In [9]:
len(reviewers),len(items)

(991, 85)

In [37]:
# Turn every review into (row, col, value) coordinates of the user-item matrix.
# Dictionaries give constant-time ID -> position lookups; calling
# ``list.index`` once per review would rescan the whole ID list every time.
reviewer_pos = {rid: pos for pos, rid in enumerate(reviewers)}
item_pos = {asin: pos for pos, asin in enumerate(items)}

# Columns are addressed by name rather than by tuple position, so the cell
# does not depend on the order of the DataFrame's columns.
rows = [reviewer_pos[rid] for rid in df['reviewerID']]  # reviewer index of each review
cols = [item_pos[asin] for asin in df['asin']]          # item index of each review
data = df['overall'].tolist()                           # rating of each review

In [38]:
len(rows)

5269

In [39]:
import numpy as np

In [40]:
# Dense (number of reviewers) x (number of items) array, initialised to zero.
ratings = np.zeros((len(reviewers),len(items)))

In [41]:
ratings.shape

(991, 85)

In [42]:
# Copy each rating into the dense matrix. If the same (row, col) pair occurs
# more than once, the value written last is the one that remains.
for r,c,d in zip(rows,cols,data):
    ratings[int(r),int(c)] = d

In [43]:
ratings[100]

array([0., 0., 0., 0., 0., 5., 5., 0., 0., 0., 0., 0., 0., 5., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 5., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [46]:
import scipy.sparse as sparse
def create_sparse_matrix(data, rows, cols, len_user,len_item):
    """Build a sparse utility matrix in CSC format.

    ``data[k]`` is placed at position ``(rows[k], cols[k])`` of a matrix with
    ``len_user`` rows and ``len_item`` columns. Note that SciPy adds together
    the values of entries that share the same (row, col) coordinates.
    """
    coordinates = (rows, cols)
    dimensions = (len_user, len_item)
    return sparse.csc_matrix((data, coordinates), shape=dimensions)

In [47]:
# Sparse form of the observed ratings. gradient() reads this notebook-level
# name directly instead of receiving it as an argument.
sparse_ratings = create_sparse_matrix(data,rows,cols,len(reviewers),len(items))

In [45]:
def create_embeddings(n, K, seed=None):
    """Create an ``(n, K)`` array of random initial embeddings.

    Values are drawn uniformly from ``[0, 1)`` and scaled by ``6 / K``.

    Args:
        n: Number of embedding vectors (rows).
        K: Length of each embedding vector (columns).
        seed: Optional seed. When given, the draw comes from a private
            ``np.random.RandomState(seed)``, so the result is reproducible
            and NumPy's global random state is left untouched. When omitted,
            the global ``np.random`` state is used.

    Returns:
        A ``numpy.ndarray`` of shape ``(n, K)``.
    """
    source = np.random if seed is None else np.random.RandomState(seed)
    return 6 * source.random_sample((n, K)) / K

In [99]:
def cost(data,rows,cols, emb_user, emb_item):
    """Mean squared error between the given ratings and the model's predictions.

    The observed matrix is built from ``data``/``rows``/``cols`` rather than
    read from a notebook-level variable, so the function scores whichever
    rating set it is handed (for example a held-out split).

    Args:
        data: Observed ratings, one per (row, col) pair.
        rows: Row (user) index of each rating.
        cols: Column (item) index of each rating.
        emb_user: User embedding matrix, one row per user.
        emb_item: Item embedding matrix, one row per item.

    Returns:
        The sum of squared differences between the observed and predicted
        sparse matrices, divided by ``len(data)``.
    """
    n_users, n_items = emb_user.shape[0], emb_item.shape[0]
    # Gather the predictions at the rated positions with a single fancy-index
    # operation instead of a Python-level loop over every rating.
    p_data = predict(emb_user, emb_item)[rows, cols]
    observed = create_sparse_matrix(data, rows, cols, n_users, n_items)
    predicted = create_sparse_matrix(p_data, rows, cols, n_users, n_items)
    return np.sum((observed - predicted).power(2)) / len(data)

In [84]:
def predict(emb_user, emb_item):
    """Return the matrix product of ``emb_user`` with ``emb_item`` transposed.

    For 2-D inputs, entry ``[u, i]`` of the result is the dot product of row
    ``u`` of ``emb_user`` with row ``i`` of ``emb_item``.
    """
    return np.dot(emb_user, emb_item.T)

In [101]:
# Weight given to the previous velocity in the momentum update of the
# training loop; (1 - beta) weights the fresh gradient.
beta = 0.9
# Multiplies the 2*lmbda*embedding term that gradient() adds to each gradient.
lmbda = 0.0002
# Number of columns of each embedding matrix (passed as K to create_embeddings).
k = 10
# Step size applied to the velocity on every iteration.
learning_rate=0.01
# Number of passes of the training loop.
iterations=2000
# Row counts of the user and item embedding matrices.
u_dim = len(reviewers)
i_dim = len(items)

In [49]:
import torch

In [103]:
# Random starting embeddings: one row of length k per reviewer and per item.
# NOTE(review): no random seed is set in the cells shown here, so the
# starting point presumably differs from run to run. Confirm.
emb_user = create_embeddings(u_dim,k)
emb_item = create_embeddings(i_dim,k)

In [104]:
emb_user.shape[0],emb_item.shape[0]

(991, 85)

In [97]:
def gradient(df,rows,cols, emb_user, emb_item):
    """Compute the gradients used to update the user and item embeddings.

    Reads two notebook-level names: ``sparse_ratings`` (the observed ratings)
    and ``lmbda`` (the weight of the ``2*lmbda*embedding`` term). ``df`` is
    used only for its row count, which scales the residual term.

    Returns:
        A ``(grad_user, grad_item)`` pair.
    """
    n_obs = df.shape[0]
    n_users, n_items = emb_user.shape[0], emb_item.shape[0]

    # Predictions at the rated positions only, arranged like sparse_ratings.
    p_data = predict(emb_user, emb_item)[rows, cols]
    predicted = create_sparse_matrix(p_data, rows, cols, n_users, n_items)
    residual = sparse_ratings - predicted

    scale = -2/n_obs
    grad_user = scale*(residual*emb_item) + 2*lmbda*emb_user
    grad_item = scale*(residual.T*emb_user) + 2*lmbda*emb_item
    return grad_user, grad_item

In [105]:
# Gradient descent with momentum on the user and item embeddings.
# The velocity terms start out equal to the gradient at the initial embeddings.
grad_user, grad_item = gradient(df,rows,cols, emb_user, emb_item)
v_user = grad_user
v_item = grad_item
for i in range(iterations):
    grad_user, grad_item = gradient(df,rows,cols, emb_user, emb_item)
    # Blend the previous velocity with the fresh gradient (weights beta and 1-beta).
    v_user = beta*v_user + (1-beta)*grad_user
    v_item = beta*v_item + (1-beta)*grad_item
    # Step against the velocity. emb_user/emb_item are rebound here, so
    # re-running this cell continues from the already-updated embeddings.
    emb_user = emb_user - learning_rate*v_user
    emb_item = emb_item - learning_rate*v_item
    # Report the training error every 50 iterations.
    if(not (i+1)%50):
        print("\niteration", i+1, ":")
        print("train mse:",  cost(data,rows,cols, emb_user, emb_item))


iteration 50 :
train mse: 16.98350604199123

iteration 100 :
train mse: 13.420272090425673

iteration 150 :
train mse: 11.213063711743391

iteration 200 :
train mse: 9.760890983310453

iteration 250 :
train mse: 8.740989938899006

iteration 300 :
train mse: 7.978743301094478

iteration 350 :
train mse: 7.378341677292116

iteration 400 :
train mse: 6.8858506858924455

iteration 450 :
train mse: 6.469685966716291

iteration 500 :
train mse: 6.110368414980986

iteration 550 :
train mse: 5.795160898468652

iteration 600 :
train mse: 5.5152424918544245

iteration 650 :
train mse: 5.264192866717178

iteration 700 :
train mse: 5.037153457614434

iteration 750 :
train mse: 4.830341690379502

iteration 800 :
train mse: 4.640753403457753

iteration 850 :
train mse: 4.46596914819623

iteration 900 :
train mse: 4.304020658226391

iteration 950 :
train mse: 4.153294247876669

iteration 1000 :
train mse: 4.012458300548648

iteration 1050 :
train mse: 3.8804073867422764

iteration 1100 :
train mse: 