1. Reading the Data

In [1]:
import json

def read_data(filename):
    with open(filename, 'r') as file:
        data = [json.loads(line) for line in file]
    return data

train_data = read_data("goodreads_reviews_historybio_train.json")
# print the first 10 line of the data
print(train_data[:10])


[{'user_id': '26d5737b1eaff71248069cde4f590338', 'book_id': '30109111', 'review_id': 'd567c1be612401ee5cbe3da05683561f', 'rating': 5, 'date_added': 'Sun Jun 12 09:38:16 -0700 2016'}, {'user_id': 'e0a970290631fd711484f0d8155f2a06', 'book_id': '7198269', 'review_id': '1fca74b92a06f2cccdc81c1288687495', 'rating': 5, 'date_added': 'Thu May 10 18:37:25 -0700 2012'}, {'user_id': 'cca945e8a7369eeb035afd21527c339b', 'book_id': '32148570', 'review_id': 'e519d8377a10308742dd49f66f4a728a', 'rating': 3, 'date_added': 'Mon May 29 10:32:28 -0700 2017'}, {'user_id': 'd1789d248a75d3cb7c5f16eeee9fe419', 'book_id': '40024', 'review_id': 'a75d9f435773a377fbe81361a1ea19c6', 'rating': 2, 'date_added': 'Mon Jan 26 21:04:34 -0800 2009'}, {'user_id': '819f2797459b579a7782d4bd595e1c36', 'book_id': '3272163', 'review_id': 'cfbc4e10f33bad3bd235f775e1833b2d', 'rating': 3, 'date_added': 'Wed Jun 25 12:12:10 -0700 2014'}, {'user_id': '7d0b0d563843507c71f867720801d84e', 'book_id': '361056', 'review_id': '41556ee650e

## Task 1 [10 points]: Explore biases
(A) [4 points] The global bg bias

In [2]:
def calculate_global_bias(data):
    total_rating = sum([review['rating'] for review in data])
    return total_rating / len(data)

bg = calculate_global_bias(train_data)
print(f"Global bias bg: {bg}")

Global bias bg: 3.7669762808387413


(B) [3 points] The user specific bias of user id= “3913f3be1e8fadc1de34dc49dab06381”

In [5]:
def calculate_user_bias(data, uid, global_bias):
    user_reviews = [review for review in data if review['user_id'] == uid]
    #calculate the average rating of the user
    average_user_rating = sum([review['rating'] for review in user_reviews])/len(user_reviews)
    return average_user_rating - global_bias
user_id = "3913f3be1e8fadc1de34dc49dab06381"

print(f"User bias bu: {calculate_user_bias(train_data, user_id, bg)}")

User bias bu: -0.1139150563489455


(C) [3 points] The item specific bias of book id = “16130”.

In [6]:
def calculate_item_bias(data, book_id, global_bias):
    item_reviews = [review for review in data if review['book_id'] == book_id]
    total_item_bias = sum([review['rating'] for review in item_reviews])/ len(item_reviews)
    return total_item_bias -global_bias if item_reviews else 0

book_id = "16130"
b_item = calculate_item_bias(train_data, book_id, bg)
print(f"Item specific bias for book id {book_id}: {b_item}")

Item specific bias for book id 16130: 0.4562653093753264


## Task 2 [45 points]: Implement the regularized latent factor model without bias using SGD
(A) [30 points] Implement the regularized latent factor model without considering the bias.

In [7]:
import numpy as np

# 1. Initialization
num_users = len(set([d['user_id'] for d in train_data]))
num_items = len(set([d['book_id'] for d in train_data]))
k = 8
P = np.random.normal(size=(num_users, k))
Q = np.random.normal(size=(num_items, k))
eta = 0.01
lambda1 = lambda2 = 0.3
epochs = 10

# Mapping user_ids and book_ids to integer indices for easier array operations
user_map = {user_id: idx for idx, user_id in enumerate(set([d['user_id'] for d in train_data]))}
book_map = {book_id: idx for idx, book_id in enumerate(set([d['book_id'] for d in train_data]))}

# 2. SGD
for epoch in range(epochs):
    np.random.shuffle(train_data)
    for review in train_data:
        i = user_map[review['user_id']]
        j = book_map[review['book_id']]
        r_ij = review['rating']
        e_ij = r_ij - np.dot(Q[j], P[i])
        
        # Update using gradients
        temp_q = Q[j, :]
        Q[j, :] += 2*eta * (e_ij * P[i, :] - lambda1 * Q[j, :])
        P[i, :] += 2*eta * (e_ij * temp_q - lambda2 * P[i, :])
    
    # 3. RMSE Calculation
    squared_errors = []
    for review in train_data:
        i = user_map[review['user_id']]
        j = book_map[review['book_id']]
        r_ij = review['rating']
        squared_errors.append((r_ij - np.dot(Q[j], P[i])) ** 2)
    rmse = np.sqrt(np.mean(squared_errors)/len(train_data)
    print(f"Epoch {epoch+1}, RMSE: {rmse:.4f}")

Epoch 1, RMSE: 3.9371
Epoch 2, RMSE: 3.6894
Epoch 3, RMSE: 3.4364
Epoch 4, RMSE: 3.1195
Epoch 5, RMSE: 2.8181
Epoch 6, RMSE: 2.5578
Epoch 7, RMSE: 2.3350
Epoch 8, RMSE: 2.1458
Epoch 9, RMSE: 1.9842
Epoch 10, RMSE: 1.8458
