In [13]:
import numpy as np
import pandas as pd
import json
from sklearn.metrics import mean_squared_error
 # loading data
with open('goodreads_reviews_young_adult_train.json', 'r') as f:
    training_data = [json.loads(line) for line in f]

with open('goodreads_reviews_young_adult_test.json', 'r') as f:
    test_data = [json.loads(line) for line in f]

with open('goodreads_reviews_young_adult_val.json', 'r') as f:
    val_data = [json.loads(line) for line in f]


train_df = pd.DataFrame(training_data)
test_df = pd.DataFrame(test_data)
val_df = pd.DataFrame(val_data)




# Task 1:Explore biases
Calculate the global bias bg, user specific bias b(user)
i and item specific bias b(item)
j on the
training data. Report


In [14]:

#(A) The global bg bias
global_bias = train_df['rating'].mean()
print(f"Global Bias (bg): {global_bias}")

# (B) User-Specific Bias (b_user_i) 91ceb82d91493506532feb02ce751ce7
user_bias = train_df.groupby('user_id')['rating'].mean() - global_bias

print(f"User Specific Bias (b_user_i) for user '91ceb82d91493506532feb02ce751ce7': {user_bias['91ceb82d91493506532feb02ce751ce7']}")


# (C) Item-Specific Bias (b_item_j) 6931234
item_bias = train_df.groupby('item_id')['rating'].mean() - global_bias
print(f"Item Specific Bias (b_item_j) for item '6931234': {item_bias['6931234']}")


Global Bias (bg): 3.7634559326052694
User Specific Bias (b_user_i) for user '91ceb82d91493506532feb02ce751ce7': -0.9974984857967586
Item Specific Bias (b_item_j) for item '6931234': -0.24732690034720495


# Task 2: Regularized Latent Factor Model Without Bias

In [15]:
#for testing with small case 
# train_df = train_df[1:100]
# test_df = test_df[1:100]
# val_df = val_df[1:100]

# (A) Implement the Model

In [16]:
# Initialize parameters
num_users = train_df['user_id'].nunique()
num_items = train_df['item_id'].nunique()
k = 8  # Number of latent factors
# Create user and item mappings
user_map = {user_id: idx for idx, user_id in enumerate(train_df['user_id'].unique())}
item_map = {item_id: idx for idx, item_id in enumerate(train_df['item_id'].unique())}

# Initialize latent factor matrices P and Q
# P = np.random.normal(scale=1/k, size=(num_users, k))
# Q = np.random.normal(scale=1/k, size=(num_items, k))
P = np.random.normal(0, 0.1, (num_users,k))
Q = np.random.normal(0, 0.1, (num_items,k))

# Set SGD hyperparameters
learning_rate = 0.01
lambda_reg = 0.3
epochs = 10

# SGD training
train_errors = []


In [17]:
# SDG loop
for epoch in range(epochs):
    for index, row in train_df.iterrows():
        user_idx = user_map[row['user_id']]
        item_idx = item_map[row['item_id']]
        rating = row['rating']
        
        # Predict rating
        pred = np.dot(P[user_idx], Q[item_idx])
        
        # Calculate error
        error = rating - pred
        
        # Update latent factors
        P[user_idx] += learning_rate * (error * Q[item_idx] - lambda_reg * P[user_idx])
        Q[item_idx] += learning_rate * (error * P[user_idx] - lambda_reg * Q[item_idx])
    
    # Calculate RMSE for the epoch
    train_preds = train_df.apply(lambda row: np.dot(P[user_map[row['user_id']]], Q[item_map[row['item_id']]]), axis=1)
    train_rmse = np.sqrt(mean_squared_error(train_df['rating'], train_preds))
    train_errors.append(train_rmse)
    print(f'Epoch {epoch+1}: Training RMSE = {train_rmse}')


Epoch 1: Training RMSE = 3.7505610651244297
Epoch 2: Training RMSE = 2.6503636110386704
Epoch 3: Training RMSE = 2.223193726480888
Epoch 4: Training RMSE = 1.9783919265263685
Epoch 5: Training RMSE = 1.811568123377354
Epoch 6: Training RMSE = 1.6883776034265505
Epoch 7: Training RMSE = 1.5930787086208438
Epoch 8: Training RMSE = 1.5169876781386962
Epoch 9: Training RMSE = 1.4547805745084432
Epoch 10: Training RMSE = 1.4029569324554707


# (B) Evaluate on Validation and Test sets.

In [18]:
#(B) Evaluate on Validation and Test Sets
def train_and_evaluate(k):
    P = np.random.normal(0, 0.1,size = (num_users,k))
    Q = np.random.normal(0, 0.1,size =  (num_items,k))
    train_errors = []

    for epoch in range(epochs):
        for index, row in train_df.iterrows():
            user_idx = user_map[row['user_id']]
            item_idx = item_map[row['item_id']]
            rating = row['rating']

            # Predict rating
            pred = np.dot(P[user_idx], Q[item_idx])

            # Calculate error
            error = rating - pred

            # Update latent factors
            P[user_idx] += learning_rate * (error * Q[item_idx] - lambda_reg * P[user_idx])
            Q[item_idx] += learning_rate * (error * P[user_idx] - lambda_reg * Q[item_idx])

        # Calculate RMSE on validation data
        val_preds = val_df.apply(lambda row: np.dot(P[user_map.get(row['user_id'], 0)], Q[item_map.get(row['item_id'], 0)]), axis=1)
        val_rmse = np.sqrt(mean_squared_error(val_df['rating'], val_preds))
        train_errors.append(val_rmse)
        print(f'Epoch {epoch+1}: Validation RMSE (k={k}) = {val_rmse}')

    return train_errors[-1]

# Train for k = 4, 8, 16 and select the best model
best_k = None
best_rmse = float('inf')

for k in [4, 8, 16]:
    print(f'Training for k={k}...')
    val_rmse = train_and_evaluate(k)
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        best_k = k

print(f'Best k = {best_k}, Validation RMSE = {best_rmse}')

# Test on the test data with the best k
def evaluate_on_test(P, Q):
    # Calculate RMSE on the test data
    test_preds = test_df.apply(lambda row: np.dot(P[user_map.get(row['user_id'], 0)], Q[item_map.get(row['item_id'], 0)]), axis=1)
    test_rmse = np.sqrt(mean_squared_error(test_df['rating'], test_preds))
    return test_rmse

# Re-train with the best k found
P = np.random.normal(0, 0.1, size=(num_users, best_k))
Q = np.random.normal(0, 0.1, size=(num_items, best_k))

for epoch in range(epochs):
    for index, row in train_df.iterrows():
        user_idx = user_map[row['user_id']]
        item_idx = item_map[row['item_id']]
        rating = row['rating']

        # Predict rating
        pred = np.dot(P[user_idx], Q[item_idx])

        # Calculate error
        error = rating - pred

        # Update latent factors
        P[user_idx] += learning_rate * (error * Q[item_idx] - lambda_reg * P[user_idx])
        Q[item_idx] += learning_rate * (error * P[user_idx] - lambda_reg * Q[item_idx])

# Evaluate on the test data
test_rmse = evaluate_on_test(P, Q)
print(f'Test RMSE with the best k={best_k}: {test_rmse}')

Training for k=4...
Epoch 1: Validation RMSE (k=4) = 3.786554764384929
Epoch 2: Validation RMSE (k=4) = 2.7157270218375205
Epoch 3: Validation RMSE (k=4) = 2.3081480446775196
Epoch 4: Validation RMSE (k=4) = 2.0776809498956985
Epoch 5: Validation RMSE (k=4) = 1.9213612445192276
Epoch 6: Validation RMSE (k=4) = 1.8063130866120114
Epoch 7: Validation RMSE (k=4) = 1.7177995871565068
Epoch 8: Validation RMSE (k=4) = 1.647583782747846
Epoch 9: Validation RMSE (k=4) = 1.5905617271315184
Epoch 10: Validation RMSE (k=4) = 1.5434482268149905
Training for k=8...
Epoch 1: Validation RMSE (k=8) = 3.6922550616223613
Epoch 2: Validation RMSE (k=8) = 2.67867641082997
Epoch 3: Validation RMSE (k=8) = 2.2903563188856046
Epoch 4: Validation RMSE (k=8) = 2.0667460527454025
Epoch 5: Validation RMSE (k=8) = 1.9138029637694876
Epoch 6: Validation RMSE (k=8) = 1.80086743675591
Epoch 7: Validation RMSE (k=8) = 1.7138030895051817
Epoch 8: Validation RMSE (k=8) = 1.6446126199721023
Epoch 9: Validation RMSE (k=8

# Task 3: Regularized Latent Factor Model With Bias

# (A) Incorporate Bias

In [19]:
# (A) Incorporate Bias

# Initialize bias terms
b_user = user_bias.to_dict()
b_item = item_bias.to_dict()

# Initialize P, Q, and re-use b_user and b_item
P = np.random.normal(0, 0.1, size=(num_users, k))
Q = np.random.normal(0, 0.1, size=(num_items, k))
# Update SGD to include bias
for epoch in range(epochs):
    for index, row in train_df.iterrows():
        user_idx = user_map[row['user_id']]
        item_idx = item_map[row['item_id']]
        rating = row['rating']
        
        # Predict rating including biases
        pred = global_bias + b_user.get(row['user_id'], 0) + b_item.get(row['item_id'], 0) + np.dot(P[user_idx], Q[item_idx])
        
        # Calculate error
        error = rating - pred
        
        # Update latent factors
        P[user_idx] += learning_rate * (error * Q[item_idx] - lambda_reg * P[user_idx])
        Q[item_idx] += learning_rate * (error * P[user_idx] - lambda_reg * Q[item_idx])
        
        # Update biases
        b_user[row['user_id']] = b_user.get(row['user_id'], 0) + learning_rate * (error - lambda_reg * b_user.get(row['user_id'], 0))
        b_item[row['item_id']] = b_item.get(row['item_id'], 0) + learning_rate * (error - lambda_reg * b_item.get(row['item_id'], 0))
    
    # Calculate RMSE for the epoch
    train_preds = train_df.apply(lambda row: global_bias + b_user.get(row['user_id'], 0) + b_item.get(row['item_id'], 0) + np.dot(P[user_map[row['user_id']]], Q[item_map[row['item_id']]]), axis=1)
    train_rmse = np.sqrt(mean_squared_error(train_df['rating'], train_preds))
    print(f'Epoch {epoch+1}: Training RMSE with Bias = {train_rmse}')


Epoch 1: Training RMSE with Bias = 0.9657341991065203
Epoch 2: Training RMSE with Bias = 0.9635950206124181
Epoch 3: Training RMSE with Bias = 0.9622577189919499
Epoch 4: Training RMSE with Bias = 0.9612737390774274
Epoch 5: Training RMSE with Bias = 0.9604982965547415
Epoch 6: Training RMSE with Bias = 0.9598619443473199
Epoch 7: Training RMSE with Bias = 0.9593249481274763
Epoch 8: Training RMSE with Bias = 0.9588622432811763
Epoch 9: Training RMSE with Bias = 0.958456965603243
Epoch 10: Training RMSE with Bias = 0.958097216835294


In [20]:
# After training the model, get the learned biases for the specific user and item
user_id = "91ceb82d91493506532feb02ce751ce7"
item_id = "6931234"

learned_user_bias = b_user.get(user_id, 0)
learned_item_bias = b_item.get(item_id, 0)

print(f"Learned user-specific bias for user '{user_id}': {learned_user_bias}")
print(f"Learned item-specific bias for item '{item_id}': {learned_item_bias}")

Learned user-specific bias for user '91ceb82d91493506532feb02ce751ce7': -0.5928744612577722
Learned item-specific bias for item '6931234': -0.1953191661901935


In [21]:
# (B) Evaluate on Validation and Test Sets
# Define the function to train and evaluate the model with bias
def train_and_evaluate_with_bias(k):
    # Initialize P, Q, and biases
    P = np.random.normal(0, 0.1, size=(num_users, k))
    Q = np.random.normal(0, 0.1, size=(num_items, k))
    b_user = user_bias.to_dict()
    b_item = item_bias.to_dict()
    
    train_rmse_list = []
    val_rmse_list = []
    
    for epoch in range(epochs):
        for index, row in train_df.iterrows():
            user_idx = user_map[row['user_id']]
            item_idx = item_map[row['item_id']]
            rating = row['rating']

            # Predict rating including biases
            pred = global_bias + b_user.get(row['user_id'], 0) + b_item.get(row['item_id'], 0) + np.dot(P[user_idx], Q[item_idx])

            # Calculate error
            error = rating - pred

            # Update latent factors
            P[user_idx] += learning_rate * (error * Q[item_idx] - lambda_reg * P[user_idx])
            Q[item_idx] += learning_rate * (error * P[user_idx] - lambda_reg * Q[item_idx])

            # Update biases
            b_user[row['user_id']] = b_user.get(row['user_id'], 0) + learning_rate * (error - lambda_reg * b_user.get(row['user_id'], 0))
            b_item[row['item_id']] = b_item.get(row['item_id'], 0) + learning_rate * (error - lambda_reg * b_item.get(row['item_id'], 0))

        # Calculate RMSE on the training data
        train_preds = train_df.apply(lambda row: global_bias + b_user.get(row['user_id'], 0) + b_item.get(row['item_id'], 0) + np.dot(P[user_map[row['user_id']]], Q[item_map[row['item_id']]]), axis=1)
        train_rmse = np.sqrt(mean_squared_error(train_df['rating'], train_preds))
        train_rmse_list.append(train_rmse)
        
        # Calculate RMSE on the validation data
        val_preds = val_df.apply(lambda row: global_bias + b_user.get(row['user_id'], 0) + b_item.get(row['item_id'], 0) + np.dot(P[user_map.get(row['user_id'], 0)], Q[item_map.get(row['item_id'], 0)]), axis=1)
        val_rmse = np.sqrt(mean_squared_error(val_df['rating'], val_preds))
        val_rmse_list.append(val_rmse)
        
        print(f'Epoch {epoch+1}: Training RMSE (k={k}) = {train_rmse}, Validation RMSE (k={k}) = {val_rmse}')
    
    return train_rmse_list, val_rmse_list

# Train for k = 4, 8, 16 and select the best model
best_k_with_bias = None
best_val_rmse_with_bias = float('inf')

for k in [4, 8, 16]:
    print(f'Training with Bias for k={k}...')
    train_rmse_list, val_rmse_list = train_and_evaluate_with_bias(k)
    final_val_rmse = val_rmse_list[-1]
    
    if final_val_rmse < best_val_rmse_with_bias:
        best_val_rmse_with_bias = final_val_rmse
        best_k_with_bias = k

print(f'Best k with Bias = {best_k_with_bias}, Validation RMSE = {best_val_rmse_with_bias}')

Training with Bias for k=4...
Epoch 1: Training RMSE (k=4) = 0.966552850813979, Validation RMSE (k=4) = 1.1514363187253567
Epoch 2: Training RMSE (k=4) = 0.9648707444055573, Validation RMSE (k=4) = 1.1491906106678165
Epoch 3: Training RMSE (k=4) = 0.9637925628087926, Validation RMSE (k=4) = 1.1472804049374712
Epoch 4: Training RMSE (k=4) = 0.9629781322351195, Validation RMSE (k=4) = 1.145614041398226
Epoch 5: Training RMSE (k=4) = 0.9623229711882263, Validation RMSE (k=4) = 1.1441364370072569
Epoch 6: Training RMSE (k=4) = 0.9617768523434768, Validation RMSE (k=4) = 1.1428089737280727
Epoch 7: Training RMSE (k=4) = 0.961310566673417, Validation RMSE (k=4) = 1.1416037540416124
Epoch 8: Training RMSE (k=4) = 0.9609053555502296, Validation RMSE (k=4) = 1.1405001286126786
Epoch 9: Training RMSE (k=4) = 0.9605483821355886, Validation RMSE (k=4) = 1.1394824243431567
Epoch 10: Training RMSE (k=4) = 0.9602304479540755, Validation RMSE (k=4) = 1.1385384616666023
Training with Bias for k=8...
Ep

In [22]:
# (B) Evaluate the Best Model on the Test Set

# Use the best k found with bias to evaluate on the test set
P = np.random.normal(scale=1/best_k_with_bias, size=(num_users, best_k_with_bias))
Q = np.random.normal(scale=1/best_k_with_bias, size=(num_items, best_k_with_bias))
b_user = user_bias.to_dict()
b_item = item_bias.to_dict()

# Retrain the model using the best k
for epoch in range(epochs):
    for index, row in train_df.iterrows():
        user_idx = user_map[row['user_id']]
        item_idx = item_map[row['item_id']]
        rating = row['rating']

        # Predict rating including biases
        pred = global_bias + b_user.get(row['user_id'], 0) + b_item.get(row['item_id'], 0) + np.dot(P[user_idx], Q[item_idx])

        # Calculate error
        error = rating - pred

        # Update latent factors
        P[user_idx] += learning_rate * (error * Q[item_idx] - lambda_reg * P[user_idx])
        Q[item_idx] += learning_rate * (error * P[user_idx] - lambda_reg * Q[item_idx])

        # Update biases
        b_user[row['user_id']] = b_user.get(row['user_id'], 0) + learning_rate * (error - lambda_reg * b_user.get(row['user_id'], 0))
        b_item[row['item_id']] = b_item.get(row['item_id'], 0) + learning_rate * (error - lambda_reg * b_item.get(row['item_id'], 0))

# Evaluate on the test data
test_preds = test_df.apply(lambda row: global_bias + b_user.get(row['user_id'], 0) + b_item.get(row['item_id'], 0) + np.dot(P[user_map.get(row['user_id'], 0)], Q[item_map.get(row['item_id'], 0)]), axis=1)
test_rmse_with_bias = np.sqrt(mean_squared_error(test_df['rating'], test_preds))

print(f'Test RMSE with Bias = {test_rmse_with_bias}')


Test RMSE with Bias = 1.1359763303486203


## Analysis 

    
Task 2 (A) : From the obersvation we can discover the RMSE have significant chages through epochs from 3.81 drop down to 1.40 which indicates the model are able to learn user-item interactions and reduce prediction errors effectively over time.

Task 2 (B) : In the term of varying k increase the RMSE decrease accordingly as following <br>
- k = 4 : RMSE drop 3.783 to 1.543 over 10 epochs 
- k = 8 : RMSE drop 3.818 to 1.542 over 10 epochs
- k = 16 : RMSE drop 3.643 to 1.541 over 10 epochs
As the result the best fit k is when k equals 16 with best performance of RMSE with 1.541.


Task 3 (A): Following many cycles, the RMSE dropped from 0.9657 to 0.9580. Nonetheless, the RMSE decrease was not substantial, suggesting that the model's performance did not increase during the epochs. This implies that more training is producing diminishing rewards in terms of lowering prediction error, and that the model may have converged.



Task 3 (B) : On the train side the highest K value have the best performance(RMSE = 0.985 when k =16), however in the validation term the smallest k value actually has the best performance(RMSE = 1.138527 when k =4).

## Key Findings

Task 2: The model without bias performed quite well, as predicted, with the validation RMSE dropping for all values of k. However, in comparison to the model with bias, its final RMSEs remained rather high. For k=16, the best validation RMSE was 1.541.


Task 3: By adding bias terms (global, user-specific, and item-specific biases), performance significantly improved. Task 3's training RMSEs were consistently lower than Task 2's, suggesting that the model may provide a better fit for the data. Consistent improvement was also seen in the validation RMSE; for k=4, the best RMSE was 1.138, as opposed to 1.541 in Task 2.


## Choice of K

With a greater number of latent components (k=16) in Task 2, the bias-free model outperformed the others, while in Task 3, the optimal model had fewer latent elements (k=4).
This shows that the model requires fewer latent features to accurately describe the interactions between users and objects after bias words are incorporated. Some of the variability that would normally need more latent components in the model without bias is captured by bias terms.






