1. Reading the Data

In [2]:
import json

def read_data(filename):
    with open(filename, 'r') as file:
        data = [json.loads(line) for line in file]
    return data

train_data = read_data("goodreads_reviews_historybio_train.json")
# print the first 10 line of the data
print(train_data[:10])
# print the number of data
print("The number of data is: ", len(train_data))


[{'user_id': '26d5737b1eaff71248069cde4f590338', 'book_id': '30109111', 'review_id': 'd567c1be612401ee5cbe3da05683561f', 'rating': 5, 'date_added': 'Sun Jun 12 09:38:16 -0700 2016'}, {'user_id': 'e0a970290631fd711484f0d8155f2a06', 'book_id': '7198269', 'review_id': '1fca74b92a06f2cccdc81c1288687495', 'rating': 5, 'date_added': 'Thu May 10 18:37:25 -0700 2012'}, {'user_id': 'cca945e8a7369eeb035afd21527c339b', 'book_id': '32148570', 'review_id': 'e519d8377a10308742dd49f66f4a728a', 'rating': 3, 'date_added': 'Mon May 29 10:32:28 -0700 2017'}, {'user_id': 'd1789d248a75d3cb7c5f16eeee9fe419', 'book_id': '40024', 'review_id': 'a75d9f435773a377fbe81361a1ea19c6', 'rating': 2, 'date_added': 'Mon Jan 26 21:04:34 -0800 2009'}, {'user_id': '819f2797459b579a7782d4bd595e1c36', 'book_id': '3272163', 'review_id': 'cfbc4e10f33bad3bd235f775e1833b2d', 'rating': 3, 'date_added': 'Wed Jun 25 12:12:10 -0700 2014'}, {'user_id': '7d0b0d563843507c71f867720801d84e', 'book_id': '361056', 'review_id': '41556ee650e

## Task 1 [10 points]: Explore biases

In [11]:
def calculate_global_bias(data):
    total_rating = sum([review['rating'] for review in data])
    return total_rating / len(data)

def calculate_sum_and_count(data):
    user_sums = {}
    user_counts = {}
    item_sums = {}
    item_counts = {}
    
    for review in data:
        user_id = review['user_id']
        item_id = review['book_id']
        rating = review['rating']
        
        user_sums[user_id] = user_sums.get(user_id, 0) + rating
        user_counts[user_id] = user_counts.get(user_id, 0) + 1
        
        item_sums[item_id] = item_sums.get(item_id, 0) + rating
        item_counts[item_id] = item_counts.get(item_id, 0) + 1
        
    return user_sums, user_counts, item_sums, item_counts

def calculate_all_user_bias_optimized(user_sums, user_counts, global_bias):
    return {user: (user_sums[user]/user_counts[user]) - global_bias for user in user_sums}

def calculate_all_item_bias_optimized(item_sums, item_counts, global_bias):
    return {item: (item_sums[item]/item_counts[item]) - global_bias for item in item_sums}
# A
bg = calculate_global_bias(train_data)
print(f"Global bias: {bg}")

user_sums, user_counts, item_sums, item_counts = calculate_sum_and_count(train_data)
all_user_biases = calculate_all_user_bias_optimized(user_sums, user_counts, bg)
all_item_biases = calculate_all_item_bias_optimized(item_sums, item_counts, bg)

user_id = "3913f3be1e8fadc1de34dc49dab06381"
book_id = "16130"
# B
print(f"User bias for user {user_id}: {all_user_biases[user_id]}")
# C
print(f"Item bias for book {book_id}: {all_item_biases[book_id]}")

Global bias: 3.7669762808387413
User bias for user 3913f3be1e8fadc1de34dc49dab06381: -0.1139150563489455
Item bias for book 16130: 0.4562653093753264


### Task 2 [45 points]: Implement the regularized latent factor model without bias using SGD
(A) [30 points] Implement the regularized latent factor model without considering the bias.

In [5]:
import numpy as np

def initialization(k, datasets):
    # Extract all unique users and items from all datasets
    all_users = set()
    all_items = set()
    for data in datasets:
        all_users.update([d['user_id'] for d in data])
        all_items.update([d['book_id'] for d in data])
    
    num_users = len(all_users)
    num_items = len(all_items)
    
    P = np.random.normal(scale=0.01, size=(num_users, k))
    Q = np.random.normal(scale=0.01, size=(num_items, k))
    
    user_map = {user_id: idx for idx, user_id in enumerate(all_users)}
    item_map = {book_id: idx for idx, book_id in enumerate(all_items)}
    
    return P, Q, user_map, item_map

def SGD(data, P, Q, user_map, item_map, eta, lambda1, lambda2, epochs):
    for epoch in range(epochs):
        np.random.shuffle(data)
        squared_errors = []
        
        for review in data:
            i = user_map[review['user_id']]
            j = item_map[review['book_id']]
            r_ij = review['rating']
            e_ij = r_ij - np.dot(Q[j], P[i])
            
            # Update using gradients
            temp_q = Q[j, :]
            Q[j, :] += 2 * eta * (e_ij * P[i, :] - lambda1 * Q[j, :])
            P[i, :] += 2 * eta * (e_ij * temp_q - lambda2 * P[i, :])
            
            squared_errors.append(e_ij ** 2)
        
        rmse = np.sqrt(sum(squared_errors) / len(data))
        print(f"Epoch {epoch+1}: RMSE = {rmse}")
    
    return P, Q

def compute_RMSE(data, P, Q, user_map, item_map):
    squared_errors = []
    for review in data:
        i = user_map.get(review['user_id'], None)
        j = item_map.get(review['book_id'], None)
        if i is None or j is None:
            continue
        r_ij = review['rating']
        e_ij = r_ij - np.dot(Q[j], P[i])
        squared_errors.append(e_ij ** 2)
    rmse = np.sqrt(sum(squared_errors) / len(data))
    return rmse

train_data = read_data("goodreads_reviews_historybio_train.json")
validation_data = read_data("goodreads_reviews_historybio_val.json")
test_data = read_data("goodreads_reviews_historybio_test.json")
# (A)
k = 8
eta = 0.01
lambda1 = lambda2 = 0.3
epochs = 10

# Initialization for all datasets (train, validation, test)
P, Q, user_map, item_map = initialization(k, [train_data, validation_data, test_data])

# Report the RMSE on the training data for each epoch
print("Report the RMSE on the training data for each epoch:")
P,Q = SGD(train_data, P, Q, user_map, item_map, eta, lambda1, lambda2, epochs)

Report the RMSE on the training data for each epoch:
Epoch 1: RMSE = 3.9712275565759962
Epoch 2: RMSE = 3.920560461286309
Epoch 3: RMSE = 3.3634407088134237
Epoch 4: RMSE = 2.8056054187920463
Epoch 5: RMSE = 2.4359231907821552
Epoch 6: RMSE = 2.171073773692028
Epoch 7: RMSE = 1.9716202196493424
Epoch 8: RMSE = 1.8148277056015418
Epoch 9: RMSE = 1.688415492674227
Epoch 10: RMSE = 1.5842972397232302


### (B) [15 points] Use SGD to train the latent factor model on the training data for different values of k in {4,8,16}. Pick the model that results in the best RMSE on the validation set and report its RMSE on the test data.


In [8]:
k_values = [4, 8, 16]
best_rmse = float('inf')
best_k = None
for k in k_values:
    P, Q, user_map, item_map = initialization(k, [train_data, validation_data, test_data])
    SGD(train_data, P, Q, user_map, item_map, eta, lambda1, lambda2, epochs)
    
    rmse_val = compute_RMSE(validation_data, P, Q, user_map, item_map)
    print("-------------------------------------------")
    print(f"k={k}: Validation RMSE = {rmse_val}")
    print("-------------------------------------------")

    if rmse_val < best_rmse:
        best_rmse = rmse_val
        best_k = k

print(f"Best k value for Validation set: {best_k}")
print("-------------------------------------------")

# Compute RMSE on test data for the best k value
P_best, Q_best, user_map_best, item_map_best = initialization(best_k, [train_data, validation_data, test_data])
SGD(train_data, P_best, Q_best, user_map_best, item_map_best, eta, lambda1, lambda2, epochs)
rmse_test = compute_RMSE(test_data, P_best, Q_best, user_map_best, item_map_best)
print("-------------------------------------------")
print(f"Test RMSE with best k value: {rmse_test}")

Epoch 1: RMSE = 3.9712380849997726
Epoch 2: RMSE = 3.939066854177271
Epoch 3: RMSE = 3.4277956704025527
Epoch 4: RMSE = 2.848231186744928
Epoch 5: RMSE = 2.4671284941425067
Epoch 6: RMSE = 2.194073526069323
Epoch 7: RMSE = 1.9895334844537524
Epoch 8: RMSE = 1.8297656368639728
Epoch 9: RMSE = 1.700921355294477
Epoch 10: RMSE = 1.5953063723279999
-------------------------------------------
k=4: Validation RMSE = 2.164384796192713
-------------------------------------------
Epoch 1: RMSE = 3.9712195886468415
Epoch 2: RMSE = 3.912394887490233
Epoch 3: RMSE = 3.3443390603790064
Epoch 4: RMSE = 2.793915745824147
Epoch 5: RMSE = 2.4278254130035135
Epoch 6: RMSE = 2.164925273109161
Epoch 7: RMSE = 1.9664348410254582
Epoch 8: RMSE = 1.8103438125324618
Epoch 9: RMSE = 1.6843064691462886
Epoch 10: RMSE = 1.5807749159855573
-------------------------------------------
k=8: Validation RMSE = 2.1622865101297593
-------------------------------------------
Epoch 1: RMSE = 3.971210543962408
Epoch 2: RMS

### Task 3 [45 points]: Implement the regularized latent factor model with bias using SGD
(A) [30 points] Incorporate the bias terms bg, b(user) and b(item) to the latent factor model.

In [13]:
def initialization_bias(k, datasets):
    # Extract all unique users and items from all datasets
    all_users = set()
    all_items = set()
    for data in datasets:
        all_users.update([d['user_id'] for d in data])
        all_items.update([d['book_id'] for d in data])
    
    num_users = len(all_users)
    num_items = len(all_items)
    
    P = np.random.normal(scale=0.01, size=(num_users, k))
    Q = np.random.normal(scale=0.01, size=(num_items, k))
    
    user_map = {user_id: idx for idx, user_id in enumerate(all_users)}
    item_map = {book_id: idx for idx, book_id in enumerate(all_items)}
    
    # Initialize biases based on computed user and item biases
    b_users = {user: all_user_biases.get(user, 0.0) for user in user_map.keys()}
    b_items = {item: all_item_biases.get(item, 0.0) for item in item_map.keys()}
    
    return P, Q, user_map, item_map, b_users, b_items

def SGD_with_bias(data, P, Q, user_map, item_map, eta, lambda1, lambda2, lambda3, lambda4, bg, b_user, b_item, epochs):
    for epoch in range(epochs):
        np.random.shuffle(data)
        for review in data:
            i = user_map[review['user_id']]
            j = item_map[review['book_id']]
            r_ij_actual = review['rating']
            user_bias = b_users.get(review['user_id'], 0.0)
            item_bias = b_items.get(review['book_id'], 0.0)
            r_ij_predicted = bg + user_bias + item_bias + np.dot(Q[j], P[i])
            e_ij = r_ij_actual - r_ij_predicted

            # Update using gradients
            temp_q = Q[j, :]
            Q[j, :] += 2*eta * (e_ij * P[i, :] - lambda1 * Q[j, :])
            P[i, :] += 2*eta * (e_ij * temp_q - lambda2 * P[i, :])

            # Update biases
            b_users[review['user_id']] += 2*eta * (e_ij - lambda3 * b_users[review['user_id']])
            b_items[review['book_id']] += 2*eta * (e_ij - lambda4 * b_items[review['book_id']])

        rmse_train = compute_rmse_bias(P,Q, bg, b_users, b_items, data, user_map, item_map)
        print(f"Epoch {epoch+1}: RMSE = {rmse_train}")
    return P, Q, b_users, b_items
def compute_rmse_bias(P, Q, bg, b_users, b_items, data, user_map, item_map):
    squared_errors = []
    for review in data:
        i = user_map[review['user_id']]
        j = item_map[review['book_id']]
        r_ij = review['rating']
        user_bias = b_users.get(review['user_id'], 0.0)
        item_bias = b_items.get(review['book_id'], 0.0)
        squared_errors.append((r_ij - (bg + user_bias + item_bias + np.dot(Q[j], P[i]))) ** 2)
    rmse = np.sqrt(sum(squared_errors) / len(squared_errors))
    return rmse

lambda1 = lambda2 = lambda3 =lambda4 =0.3
user_id = "3913f3be1e8fadc1de34dc49dab06381"
book_id = "16130"
# bg = calculate_global_bias(train_data)
# b_user = calculate_user_bias(train_data, user_id, bg)
# b_item = calculate_item_bias(train_data, book_id, bg)
P, Q, user_map, item_map, b_users, b_items = initialization_bias(k, [train_data, validation_data, test_data])
print("Start to SGD with bias")
SGD_with_bias(train_data, P, Q, user_map, item_map, eta, lambda1, lambda2, lambda3, lambda4, bg, b_users, b_items, epochs=10)
print("-------------------------------------------")

# After finishing all epoches, report the learned user-specific bias of the user with user id= “3913f3be1e8fadc1de34dc49dab06381” , and the learned item- specific bias of the book with book id = “16130”.
print(f"User bias for user {user_id}: {b_users[user_id]}")
print(f"Item specific bias for book id {book_id}: {b_items[book_id]}")

Start to SGD with bias
Epoch 1: RMSE = 0.9011646626295965
Epoch 2: RMSE = 0.893854481855423
Epoch 3: RMSE = 0.8887689401512124
Epoch 4: RMSE = 0.8854306106361683
Epoch 5: RMSE = 0.882725747261069
Epoch 6: RMSE = 0.8806881257645449
Epoch 7: RMSE = 0.8792244036607717
Epoch 8: RMSE = 0.8771279643236278
Epoch 9: RMSE = 0.8762063416685426
Epoch 10: RMSE = 0.8746691021799325
-------------------------------------------
User bias for user 3913f3be1e8fadc1de34dc49dab06381: 0.020714971569287772
Item specific bias for book id 16130: 0.4697761519984729


(B) [15 points] Similar to Task 2 (B), find the best k in {4, 8, 16} for the model you developed in Task 3 (A) on the validation set, by using RMSE to compare across these models, and apply the best of these models to the test data. Compare the resulting test RMSE with Task 2 (B). Analyse and explain your findings.

In [17]:
k_values = [4, 8, 16]
best_rmse = float('inf')
best_k = None
user_id = "3913f3be1e8fadc1de34dc49dab06381"
book_id = "16130"
bg = calculate_global_bias(train_data)

for k in k_values:
    P, Q, user_map, item_map, b_users, b_items = initialization_bias(k, [train_data, validation_data, test_data])
    print("Start SGD_with_bias with "+str(k)+" for train data")
    print("-------------------------------------------")
    SGD_with_bias(train_data, P, Q, user_map, item_map, eta,lambda1, lambda2, lambda3, lambda4, bg, b_users, b_items, 10)
    rmse_val = compute_rmse_bias(P,Q, bg, b_users, b_items, validation_data ,user_map, item_map)
    print("-------------------------------------------")
    print(f"k={k}: Validation RMSE = {rmse_val}")
    print("-------------------------------------------")
    if rmse_val < best_rmse:
        best_rmse = rmse_val
        best_k = k

print("-------------------------------------------")
print(f"Best k value (Task 3): {best_k}")
print("-------------------------------------------")
# Use the best model for test data
P_best, Q_best, user_map_best, item_map_best,b_users_best, b_items_best = initialization_bias(best_k, [train_data, validation_data, test_data])
print("Start SGD_with_bias whith best k for train data")
P_best, Q_best, b_users, b_items = SGD_with_bias(train_data, P_best, Q_best, user_map_best, item_map_best, eta, lambda1, lambda2, lambda3, lambda4, bg, b_users_best, b_items_best, 10)
rmse_test = compute_rmse_bias(P_best, Q_best, bg, b_users, b_items, test_data, user_map_best, item_map_best)
print("-------------------------------------------")
print(f"Test RMSE with best k value (Task 3): {rmse_test}")

Start SGD_with_bias with 4 for train data
-------------------------------------------
Epoch 1: RMSE = 0.9010579451355839
Epoch 2: RMSE = 0.8936768966616953
Epoch 3: RMSE = 0.8887608725517381
Epoch 4: RMSE = 0.8853275303909918
Epoch 5: RMSE = 0.8826880894968575
Epoch 6: RMSE = 0.880534731585201
Epoch 7: RMSE = 0.878938365012461
Epoch 8: RMSE = 0.8776251687289484
Epoch 9: RMSE = 0.875965960436849
Epoch 10: RMSE = 0.8746998481853213
-------------------------------------------
k=4: Validation RMSE = 1.195111743486199
-------------------------------------------
Start SGD_with_bias with 8 for train data
-------------------------------------------
Epoch 1: RMSE = 0.9011016318671035
Epoch 2: RMSE = 0.8935007035799424
Epoch 3: RMSE = 0.8889571637738545
Epoch 4: RMSE = 0.8851471368418924
Epoch 5: RMSE = 0.8829126739313063
Epoch 6: RMSE = 0.8802978559211005
Epoch 7: RMSE = 0.878731508269502
Epoch 8: RMSE = 0.877275045043425
Epoch 9: RMSE = 0.8758617357234114
Epoch 10: RMSE = 0.8748647581812774
--

**Findings:**

1. **Lower RMSE in Task 3(B) compared to Task 2(B):**
    - The inclusion of bias terms in the latent factor model (Task 3(B)) led to a better fit to the data, resulting in a lower RMSE. This suggests that incorporating user and item-specific biases can capture inherent characteristics in the data, enhancing prediction accuracy.

2. **Higher optimal \( k \) value in Task 2(B) than in Task 3(B):**
    - The model in Task 3(B) required a smaller number of latent factors \( k \) to achieve optimal performance, likely because the added bias terms already captured some intrinsic properties of users and items. This indicates that the bias-enhanced model can achieve comparable or better performance using fewer latent factors.