In [1]:
import json
import gzip
import numpy as np
filename = 'goodreads_reviews_spoiler.json.gz'

# book_idx / user_idx map an id (as a string) to its dense row / column
# position in A. The special key 'next_idx' holds the next free position,
# which is also the number of distinct ids seen so far.
book_idx = {'next_idx': 0}
user_idx = {'next_idx': 0}

# First pass: read the file and collect (row, col, rating) triples. The matrix
# is allocated afterwards so that its shape comes from the data instead of
# hard-coded dimensions that would have to match the file exactly.
entries = []
with gzip.open(filename, 'rb') as ds:
    for line in ds:
        line = line.rstrip()
        if not line:
            # skip blank lines
            continue
        obj = json.loads(line)
        book_id = str(obj['book_id'])
        user_id = str(obj['user_id'])
        if book_id not in book_idx:
            book_idx[book_id] = book_idx['next_idx']
            book_idx['next_idx'] += 1
        if user_id not in user_idx:
            user_idx[user_id] = user_idx['next_idx']
            user_idx['next_idx'] += 1
        entries.append((book_idx[book_id], user_idx[user_id], obj['rating']))

# Rating matrix: rows are books, columns are users. Cells that are never
# assigned stay 0, so a stored rating of 0 cannot be told apart from a
# missing one.
A = np.zeros((book_idx['next_idx'], user_idx['next_idx']), dtype=np.float64)
# Fill in file order so that, if a (book, user) pair occurs more than once,
# the last record wins.
for row, col, rating in entries:
    A[row, col] = rating
del entries

print(book_idx['next_idx'])
print(user_idx['next_idx'])
print(A.shape)

25475
18892
(25475, 18892)


In [2]:
def get_bjs(A):
    """Split a 2-D array into its per-row nonzero values and column positions.

    Parameters
    ----------
    A : 2-D array-like

    Returns
    -------
    bj_s : list of 1-D numpy arrays
        ``bj_s[r]`` holds the nonzero values of row ``r`` in column order.
    indexes_s : list of lists of int
        ``indexes_s[r]`` holds the column positions of those values.

    Both lists always have exactly ``A.shape[0]`` entries. Rows without any
    nonzero value, including trailing ones, get an empty array / empty list,
    so the result can be indexed by row number.
    """
    A = np.asarray(A)
    n_rows = A.shape[0]
    # np.nonzero reports positions in row-major order, so row_idxes is sorted
    # and each row's entries form one contiguous run.
    row_idxes, col_idxes = np.nonzero(A)
    # bounds[r]:bounds[r + 1] is the run belonging to row r (possibly empty).
    bounds = np.searchsorted(row_idxes, np.arange(n_rows + 1))

    bj_s = []
    indexes_s = []
    for r in range(n_rows):
        cols = col_idxes[bounds[r]:bounds[r + 1]]
        bj_s.append(A[r, cols])
        indexes_s.append(cols.tolist())
    return bj_s, indexes_s

def get_wj(W, indexes, m):
    """Return the rows of ``W`` selected by ``indexes``, in that order.

    Parameters
    ----------
    W : 2-D array
        Matrix to pick rows from.
    indexes : sequence of int
        Row positions to pick; may be empty and may contain repeats.
    m : int
        Number of rows of ``W``. Kept for backward compatibility with existing
        callers; it is no longer needed to perform the selection.

    Returns
    -------
    2-D float array of shape ``(len(indexes), W.shape[1])``.

    The rows are gathered by direct indexing rather than by multiplying with
    a dense one-hot mask. That avoids the ``len(indexes) x m`` temporary and
    the matrix product, and keeps inf/NaN values in unselected rows from
    leaking into the result through ``0 * inf``.
    """
    W = np.asarray(W)
    picked = W[np.asarray(indexes, dtype=np.intp)]
    # The mask-based version always produced a floating-point result; keep that.
    return picked.astype(np.result_type(picked.dtype, np.float64), copy=False)

def compute_loss(W, Z, A, indexes_W, beta):
    """Evaluate the regularised squared-error objective.

    The value is ``||A - P(W Z^T)||_F^2 + beta*||W||_F^2 + beta*||Z||_F^2``,
    where ``P`` keeps only the entries ``(i, j)`` with ``i in indexes_W[j]``
    and sets every other entry of the product to zero.

    Parameters
    ----------
    W : 2-D array, one row per row of ``A``
    Z : 2-D array, one row per column of ``A``
    A : 2-D array
    indexes_W : sequence of sequences of int
        ``indexes_W[j]`` lists the row positions of column ``j`` at which the
        prediction is compared with ``A``. Each position is expected to be
        listed at most once per column.
    beta : float
        Regularisation weight.

    Returns
    -------
    (loss, term1, term2, term3)
        ``term1`` is the data-fit term, ``term2``/``term3`` are the weighted
        squared norms of ``W`` and ``Z``, and ``loss`` is their sum.

    Predictions are evaluated only at the listed positions, so neither the
    full ``W Z^T`` product nor a second dense copy of it is materialised.
    """
    W = np.asarray(W, dtype=np.float64)
    Z = np.asarray(Z, dtype=np.float64)
    A = np.asarray(A, dtype=np.float64)

    fit_sq = 0.0      # sum of (A_ij - w_i . z_j)^2 over the listed entries
    listed_sq = 0.0   # sum of A_ij^2 over the same entries
    for j, rows in enumerate(indexes_W):
        if len(rows) == 0:
            continue
        rows = np.asarray(rows, dtype=np.intp)
        observed = A[rows, j]
        residual = observed - W[rows].dot(Z[j])
        fit_sq += residual.dot(residual)
        listed_sq += observed.dot(observed)

    # Entries of A that are not listed are compared against 0, so they add
    # A_ij^2 each. This is zero when indexes_W covers every nonzero of A; the
    # clamp guards against a tiny negative value caused by rounding.
    unlisted_sq = max(np.einsum('ij,ij->', A, A) - listed_sq, 0.0)

    term1 = fit_sq + unlisted_sq
    term2 = beta * np.linalg.norm(W) ** 2
    term3 = beta * np.linalg.norm(Z) ** 2
    loss = term1 + term2 + term3
    return loss, term1, term2, term3
    

# aj_s[j] / indexes_W[j]: nonzero ratings in column j of A (user j) and the
# rows (books) they sit in. bj_s[i] / indexes_Z[i]: nonzero ratings in row i
# of A (book i) and the columns (users) they sit in.
aj_s, indexes_W = get_bjs(A.transpose())
bj_s, indexes_Z = get_bjs(A)

print(aj_s[0].shape, bj_s[0].shape)

# Hyperparameters
k = 32            # embedding dimension
beta = 1e-4       # ridge regularisation weight
iterations = 50   # number of ALS sweeps
seed = 0          # fixed so the random initialisation is reproducible

# Take the problem size from the data rather than hard-coding it.
m, n = A.shape  # m: number of books (rows), n: number of users (columns)

# The loops below index the per-row lists by position, so they must cover
# every row and every column of A.
assert len(bj_s) == m and len(aj_s) == n, \
    'get_bjs must return one entry per row/column of A'

# Initialize W and Z with entries drawn uniformly from [1, 2)
rng = np.random.RandomState(seed)
W = rng.rand(m, k) + 1
Z = rng.rand(n, k) + 1

# The ridge term is the same for every least-squares solve.
ridge = beta * np.eye(k)

# start ALS algorithm
for iteration in range(iterations):
    # Fix W, optimize Z: for each user solve
    # (Wj^T Wj + beta I) zj = Wj^T aj over the books that user rated.
    for j in range(n):
        aj = aj_s[j]
        Wj = get_wj(W, indexes_W[j], m)
        B = Wj.transpose().dot(Wj) + ridge
        s = aj.transpose().dot(Wj)
        zj = np.linalg.solve(B, s)
        Z[j, :] = zj.squeeze()
    # Fix Z, optimize W: the same solve per book over the users who rated it.
    for j in range(m):
        bj = bj_s[j]
        Zj = get_wj(Z, indexes_Z[j], n)
        B = Zj.transpose().dot(Zj) + ridge
        s = bj.transpose().dot(Zj)
        wj = np.linalg.solve(B, s)
        W[j, :] = wj.squeeze()
    # compute_loss returns the regularisation terms already scaled by beta, so
    # the printed W_norm / Z_norm values are beta * ||.||^2.
    loss, difference, W_norm, Z_norm = compute_loss(W, Z, A, indexes_W, beta)
    print('iteration: {}, loss: {:.4f}, difference: {:.4f}, W_norm: {:.4f}, Z_norm: {:.4f}'.
          format(iteration, loss, difference, W_norm, Z_norm))
    
            

(77,) (34,)
iteration: 0, loss: 430246.3476, difference: 429875.4317, W_norm: 282.7736, Z_norm: 88.1422
iteration: 1, loss: 346581.8163, difference: 346265.0896, W_norm: 259.5847, Z_norm: 57.1420
iteration: 2, loss: 312144.0836, difference: 311839.8627, W_norm: 253.2707, Z_norm: 50.9502
iteration: 3, loss: 291022.4738, difference: 290721.6652, W_norm: 252.6587, Z_norm: 48.1498
iteration: 4, loss: 276018.0653, difference: 275718.4941, W_norm: 252.7211, Z_norm: 46.8501
iteration: 5, loss: 264549.5967, difference: 264249.0559, W_norm: 253.7006, Z_norm: 46.8402
iteration: 6, loss: 255402.2695, difference: 255099.9574, W_norm: 254.9586, Z_norm: 47.3535
iteration: 7, loss: 247862.2036, difference: 247557.6252, W_norm: 256.6140, Z_norm: 47.9645
iteration: 8, loss: 241501.3605, difference: 241194.3330, W_norm: 258.1396, Z_norm: 48.8879
iteration: 9, loss: 236030.8272, difference: 235721.3526, W_norm: 259.5771, Z_norm: 49.8975
iteration: 10, loss: 231263.5012, difference: 230951.8282, W_norm: 2

In [9]:
target_book_id = 2
target_row = book_idx[str(target_book_id)]
target_embedding = W[target_row, :]

# construct a row idx to book id dictionary (list), i.e. invert book_idx;
# the 'next_idx' entry is a counter, not a book id, so it is skipped.
row_to_book_id = [0] * m
print(len(row_to_book_id))
for key, row in book_idx.items():
    if key == 'next_idx':
        continue
    row_to_book_id[row] = key

# find the books whose embeddings are closest (Euclidean distance) to the
# target book. Broadcasting subtracts the target embedding from every row of W
# directly, instead of building an m x m mask and multiplying it with W.
compare_embedding = W - target_embedding
norm_vec = np.linalg.norm(compare_embedding, ord=2, axis=1)
closest_order = np.argsort(norm_vec)
# Six entries are printed: the target itself is at distance 0, which leaves
# five other books.
for row_idx in closest_order[:6]:
    print('book id: {}'.format(row_to_book_id[row_idx]))
                          

25475
book id: 2
book id: 1
book id: 5
book id: 15881
book id: 6
book id: 136251
