In [1]:
import numpy as np
from tqdm import tqdm
import random
import sqlite3

In [2]:
# import dataset from database
def load_data(db_path):
    conn = sqlite3.connect(db_path)
    print("Loaded database")

    c = conn.cursor()
    print("Fetching data ...")
    c.execute('SELECT UserID, ItemID, Rating FROM example_table')
    data = c.fetchall()
    conn.close()

    user_indices = []
    item_indices = []
    ratings_values = []

    max_user_id = 0
    max_item_id = 0

    for user_id, item_id, rating in data:
        user_indices.append(user_id)
        item_indices.append(item_id)
        ratings_values.append(rating)
        #ratings_values.append(int(rating * 2)) # convert ratings to integers
        max_user_id = max(max_user_id, user_id)
        max_item_id = max(max_item_id, item_id)

    user_indices = np.array(user_indices, dtype=np.int32)
    item_indices = np.array(item_indices, dtype=np.int32)
    #ratings_values = np.array(ratings_values, dtype=np.int32)
    ratings_values = np.array(ratings_values, dtype=np.float32)

    print("Max user id:", max_user_id)
    print("Max item id:", max_item_id)

    return user_indices, item_indices, ratings_values, max_user_id, max_item_id

#def normalize_ids(indices):
#    unique_ids, inverse_indices = np.unique(indices, return_inverse=True)
#    id_map = {original_id: idx for idx, original_id in enumerate(unique_ids)}
#    reverse_map = {idx: original_id for idx, original_id in enumerate(unique_ids)}
#    num_unique = len(unique_ids)  # The total number of unique indices
#    return inverse_indices, num_unique, id_map, reverse_map

def normalize_indices(indices):
    unique_ids = np.unique(indices)
    id_to_norm = {id_: i for i, id_ in enumerate(unique_ids)}
    norm_to_id = {i: id_ for i, id_ in enumerate(unique_ids)}
    normalized_indices = np.vectorize(id_to_norm.get)(indices)
    num_unique = len(unique_ids)  # Total number of unique indices
    return normalized_indices, num_unique, id_to_norm, norm_to_id

# split into train and validation sets
def split_data(user_indices, item_indices, ratings, split_ratio=0.9):
    np.random.seed(42)
    indices = np.random.permutation(len(ratings))
    split_point = int(len(ratings) * split_ratio)
    train_idx, val_idx = indices[:split_point], indices[split_point:]
    
    train_data = (user_indices[train_idx], item_indices[train_idx], ratings[train_idx])
    val_data = (user_indices[val_idx], item_indices[val_idx], ratings[val_idx])
    return train_data, val_data

In [3]:
# impor dataset

path_100k = '../../data/dataset1/train_100k.db'
path_20M = '../../data/dataset2/train_20M.db'

u, i, global_ratings, global_max_user_id, global_max_item_id = load_data(path_20M)

# Normalize user and item indices
global_user_indices, global_num_users, user_to_norm, norm_to_user = normalize_indices(u)
global_item_indices, global_num_items, item_to_norm, norm_to_item = normalize_indices(i)

#global_num_users = np.unique(global_user_indices)
#global_num_items = np.unique(global_item_indices)


print("Number of users:", global_num_users)
print("Number of items:", global_num_items)

train_data, val_data = split_data(global_user_indices, global_item_indices, global_ratings, split_ratio=0.9)
all_data, _ = split_data(global_user_indices, global_item_indices, global_ratings, split_ratio=1)

print("Train data size:",train_data[0].size)
print("Validation data size:",val_data[0].size)
print("All data size:",all_data[0].size)

# train_data[0] = user_indices
# train_data[1] = item_indices
# train_data[2] = ratings


Loaded database
Fetching data ...
Max user id: 138493
Max item id: 26744
Number of users: 138493
Number of items: 26690
Train data size: 16753799
Validation data size: 1861534
All data size: 18615333


# MAE and Prediction Code

In [4]:
# MAE and predict methods
def calculate_mae(actual, predicted):
    """
    Parameters:
    - actual_ratings: np.array, the actual ratings.
    - predicted_ratings: np.array, the predicted ratings.
    """
    # calculate the absolute error between actual and predicted ratings
    abs_err = np.abs(actual - predicted)
    
    # calculate the mean of these absolute errors
    mae = np.mean(abs_err)
    
    return mae # /2

# round prediction to nearest 0.5 in range [0.5, 5]
def round_predictions(predictions):
    rounded_predictions = np.round(predictions * 2) / 2
    return np.clip(rounded_predictions, 0.5, 5.0)
    #rounded_predictions = np.round(predictions)
    #return np.clip(rounded_predictions, 1, 10)

def predict(user_features, item_features, user_indices, item_indices):
    predictions = np.array([np.dot(user_features[u], item_features[i]) for u, i in zip(user_indices, item_indices)])
    return predictions

# SGD

In [5]:
def sgd(user_indices, item_indices, ratings, num_users, num_items, num_factors, alpha, beta, iterations):
    # Initialize feature matrices
    np.random.seed(42)
    user_features = np.random.normal(0, 0.1, (num_users, num_factors))
    item_features = np.random.normal(0, 0.1, (num_items, num_factors))

    # SGD updates (only using training data)
    for iteration in range(iterations): #tqdm(range(iterations), desc='SGD iterations', total=iterations):
        for u, i, r in tqdm(zip(user_indices, item_indices, ratings), desc=f'SGD {iteration+1}/{iterations}', total=len(ratings)):
            prediction = np.dot(user_features[u], item_features[i])
            error = r - prediction

            # Update rules for features
            user_features_grad = -2 * error * item_features[i] + beta * user_features[u]
            item_features_grad = -2 * error * user_features[u] + beta * item_features[i]

            user_features[u] -= alpha * user_features_grad
            item_features[i] -= alpha * item_features_grad

    return user_features, item_features

In [8]:
# TEST 100K
num_factors = 60  # Latent factors
alpha = 0.0075      # Learning rate
beta = 0.03      # Regularization
iterations = 10   # Number of iterations

# Run SGD
print(f"Testing with {num_factors} factors and alpha= {alpha}, beta= {beta}")
sgd_user_features, sgd_item_features = sgd(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
sgd_predictions = predict(sgd_user_features, sgd_item_features, val_data[0], val_data[1])
train_pred2 = round_predictions(sgd_predictions)

truth_ratings = val_data[2]
sgd_mae = calculate_mae(truth_ratings, train_pred2)
print(f"MAE:", sgd_mae)
print()

Testing with 60 factors and alpha= 0.0075, beta= 0.03


SGD 1/2: 100%|██████████| 16753799/16753799 [04:52<00:00, 57228.63it/s]
SGD 2/2: 100%|██████████| 16753799/16753799 [04:49<00:00, 57938.57it/s]


MAE: 0.6359013587718516



# ALS

In [5]:
def als(user_indices, item_indices, ratings, num_users, num_items, num_factors, lambda_reg, iterations):
    # Initialize matrices
    np.random.seed(42)
    user_features = np.random.normal(0, 0.1, (num_users, num_factors))
    item_features = np.random.normal(0, 0.1, (num_items, num_factors))
    
    # Precompute user and item interactions
    interaction_matrix = np.zeros((num_users, num_items))
    interaction_matrix[user_indices, item_indices] = ratings

    # Regularization matrix
    lambda_eye = lambda_reg * np.eye(num_factors)

    for iteration in range(iterations):
        # Update user features
        for u in tqdm(range(num_users), desc=f'ALS {iteration+1}/{iterations} (users)', total=num_users):
            item_idx = interaction_matrix[u, :] > 0
            V = item_features[item_idx]
            r_u = interaction_matrix[u, item_idx]
            A_u = V.T @ V + lambda_eye
            b_u = V.T @ r_u
            user_features[u] = np.linalg.solve(A_u, b_u)

        # Update item features
        for i in tqdm(range(num_items), desc=f'ALS {iteration+1}/{iterations} (items)', total=num_items):
            user_idx = interaction_matrix[:, i] > 0
            U = user_features[user_idx]
            r_i = interaction_matrix[user_idx, i]
            A_i = U.T @ U + lambda_eye
            b_i = U.T @ r_i
            item_features[i] = np.linalg.solve(A_i, b_i)

    return user_features, item_features

In [7]:
import gc

iterations = 4
for beta in [0.005, 0.05, 0.1, 0.2, 0.5]:
        for num_factors in [20, 40, 60, 80]:
            print(f"Testing with {num_factors} factors and beta= {beta}")
            als_user_features, als_item_features = als(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, beta, iterations)
            als_predictions = predict(als_user_features, als_item_features, val_data[0], val_data[1])
            als_rounded = round_predictions(als_predictions)

            truth_ratings = val_data[2]
            als_mae = calculate_mae(truth_ratings, als_rounded)
            print(f"MAE:", als_mae)            
            print()
            gc.collect()

Testing with 20 factors and beta= 0.005


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:53<00:00, 2604.46it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:52<00:00, 114.69it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 4981.49it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:43<00:00, 119.41it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 4955.78it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:42<00:00, 120.03it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5006.40it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:41<00:00, 120.28it/s]


MAE: 0.6262611910392182

Testing with 40 factors and beta= 0.005


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3877.09it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:47<00:00, 117.56it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:33<00:00, 4169.32it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:48<00:00, 116.82it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4366.92it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:46<00:00, 118.04it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4409.34it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:45<00:00, 118.18it/s]


MAE: 0.6673176530753668

Testing with 60 factors and beta= 0.005


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:40<00:00, 3449.24it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:49<00:00, 116.07it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:36<00:00, 3840.38it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:50<00:00, 115.75it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3857.91it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:50<00:00, 115.78it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:36<00:00, 3844.10it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:50<00:00, 115.78it/s]


MAE: 0.7021486043231012

Testing with 80 factors and beta= 0.005


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:46<00:00, 2949.77it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:53<00:00, 114.43it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:41<00:00, 3299.25it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:53<00:00, 114.31it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:41<00:00, 3316.72it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:52<00:00, 114.74it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:42<00:00, 3296.85it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:52<00:00, 114.79it/s]


MAE: 0.7391570607896498

Testing with 20 factors and beta= 0.05


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4379.47it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:43<00:00, 119.55it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5079.18it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:43<00:00, 119.47it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5057.18it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:43<00:00, 119.40it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5060.30it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:43<00:00, 119.35it/s]


MAE: 0.6083592886297

Testing with 40 factors and beta= 0.05


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3886.12it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:45<00:00, 118.17it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4419.44it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:45<00:00, 118.23it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4438.82it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:46<00:00, 118.09it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:30<00:00, 4476.72it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:46<00:00, 117.97it/s]


MAE: 0.639984013184825

Testing with 60 factors and beta= 0.05


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:40<00:00, 3417.34it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:48<00:00, 117.03it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3855.65it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:48<00:00, 116.55it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3859.26it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:49<00:00, 116.55it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3869.88it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:49<00:00, 116.48it/s]


MAE: 0.6665980852350804

Testing with 80 factors and beta= 0.05


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:45<00:00, 3034.61it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:52<00:00, 114.83it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:41<00:00, 3312.20it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:53<00:00, 114.36it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:41<00:00, 3329.89it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:51<00:00, 115.05it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:41<00:00, 3303.70it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:51<00:00, 115.24it/s]


MAE: 0.698452459100935

Testing with 20 factors and beta= 0.1


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4377.89it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:42<00:00, 119.77it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5075.88it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:42<00:00, 119.75it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5067.92it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:42<00:00, 119.79it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5060.70it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:43<00:00, 119.53it/s]


MAE: 0.6046027093783943

Testing with 40 factors and beta= 0.1


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3895.77it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:45<00:00, 118.39it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4426.79it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:45<00:00, 118.25it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4432.16it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:45<00:00, 118.41it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4456.71it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:45<00:00, 118.23it/s]


MAE: 0.6344673264092947

Testing with 60 factors and beta= 0.1


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:40<00:00, 3433.78it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:48<00:00, 116.90it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3865.77it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:49<00:00, 116.53it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3860.49it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:49<00:00, 116.21it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3892.51it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:49<00:00, 116.08it/s]


MAE: 0.6604373060067664

Testing with 80 factors and beta= 0.1


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:46<00:00, 2948.70it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:52<00:00, 114.68it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:42<00:00, 3276.40it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:53<00:00, 114.15it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:42<00:00, 3287.89it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:53<00:00, 114.18it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:42<00:00, 3296.08it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:54<00:00, 114.01it/s]


MAE: 0.6929392640693106

Testing with 20 factors and beta= 0.2


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4363.52it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:42<00:00, 119.76it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5054.90it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:43<00:00, 119.60it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5048.99it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:43<00:00, 119.61it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5060.91it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:42<00:00, 119.80it/s]


MAE: 0.6013843958799571

Testing with 40 factors and beta= 0.2


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3882.96it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:45<00:00, 118.34it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4453.96it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:46<00:00, 118.03it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4412.82it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:45<00:00, 118.27it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4400.51it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:45<00:00, 118.29it/s]


MAE: 0.6308899004799268

Testing with 60 factors and beta= 0.2


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:40<00:00, 3438.57it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:48<00:00, 116.74it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3867.88it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:49<00:00, 116.53it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3852.08it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:48<00:00, 116.73it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3874.44it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:49<00:00, 116.46it/s]


MAE: 0.6568883512200153

Testing with 80 factors and beta= 0.2


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:47<00:00, 2945.69it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:52<00:00, 114.67it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:41<00:00, 3311.63it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:53<00:00, 114.17it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:42<00:00, 3287.70it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:54<00:00, 114.01it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:41<00:00, 3311.74it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:54<00:00, 113.84it/s]


MAE: 0.6914952936664063

Testing with 20 factors and beta= 0.5


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4353.63it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:43<00:00, 119.66it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5031.82it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:43<00:00, 119.49it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5078.22it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:43<00:00, 119.36it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:27<00:00, 5043.82it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:43<00:00, 119.27it/s]


MAE: 0.5983860622475872

Testing with 40 factors and beta= 0.5


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3888.78it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:46<00:00, 118.05it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4439.60it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:45<00:00, 118.13it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4422.65it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:46<00:00, 118.04it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:31<00:00, 4409.81it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:46<00:00, 118.02it/s]


MAE: 0.6289192139386119

Testing with 60 factors and beta= 0.5


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:40<00:00, 3419.34it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:48<00:00, 116.62it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:36<00:00, 3833.39it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:49<00:00, 116.33it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3851.69it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:50<00:00, 115.95it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:35<00:00, 3852.45it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:50<00:00, 115.85it/s]


MAE: 0.6564825031398835

Testing with 80 factors and beta= 0.5


ALS 1/4 (users): 100%|██████████| 138493/138493 [00:47<00:00, 2926.54it/s]
ALS 1/4 (items): 100%|██████████| 26690/26690 [03:52<00:00, 114.92it/s]
ALS 2/4 (users): 100%|██████████| 138493/138493 [00:42<00:00, 3285.78it/s]
ALS 2/4 (items): 100%|██████████| 26690/26690 [03:53<00:00, 114.54it/s]
ALS 3/4 (users): 100%|██████████| 138493/138493 [00:41<00:00, 3311.89it/s]
ALS 3/4 (items): 100%|██████████| 26690/26690 [03:53<00:00, 114.52it/s]
ALS 4/4 (users): 100%|██████████| 138493/138493 [00:42<00:00, 3288.36it/s]
ALS 4/4 (items): 100%|██████████| 26690/26690 [03:53<00:00, 114.27it/s]


MAE: 0.6958073287944244



# Test Set and Submission 

In [14]:
# import test set

# load test set
def load_test(db_path):
    conn = sqlite3.connect(db_path)
    print("Loaded database")

    c = conn.cursor()
    print("Fetching data ...")
    c.execute('SELECT UserID, ItemID, TimeStamp FROM example_table')
    data = c.fetchall()
    conn.close()

    test_user_indices, test_item_indices, timestamps = zip(*data)

    return np.array(test_user_indices, dtype=np.int32), np.array(test_item_indices, dtype=np.int32), np.array(timestamps, dtype=np.int32)

def vectorize_indices(indices, mapping):
    missing_ids = [id for id in indices if id not in mapping]
    if missing_ids:
        print(f"Missing IDs: {set(missing_ids)}")
    
    vectorized_indices = np.array([mapping[id] if id in mapping else -1 for id in indices])

    return vectorized_indices, set(missing_ids)

# 20M dataset
test_dir_20M = '../../data/dataset2/test_20M.db'

# 100k dataset
test_dir_100K = '../../data/dataset1/test_100k.db'

   
# Load the dataset (excluding the header if present)
test_user_indices, test_item_indices, timestamps = load_test(test_dir_20M)
test_data = (test_user_indices, test_item_indices, timestamps)

test_user_indices_normalized, missing_users = vectorize_indices(test_user_indices, user_to_norm)
test_item_indices_normalized, missing_items = vectorize_indices(test_item_indices, item_to_norm)

print("Test data size: ", len(test_data[0]))
print("All data size: ", len(all_data[0]))
print("Ratio All Data / Test:", len(test_data[0]) / len(all_data[0]))
print("Ratio Train / Val:", len(val_data[0]) / len(train_data[0]))

print("Num users:", len(np.unique(test_user_indices_normalized)))
print("Num items:", len(np.unique(test_item_indices_normalized)))

Loaded database
Fetching data ...
Missing IDs: {22144, 23296, 24321, 24961, 18693, 20103, 20615, 25353, 18955, 24715, 21904, 16145, 24976, 19731, 12054, 10648, 24985, 21532, 23068, 25894, 22316, 14382, 24244, 20283, 16189, 25663, 25280, 24769, 24642, 26692, 25289, 26569, 25292, 23502, 13906, 20179, 22482, 25691, 24284, 24796, 21598, 23009, 24803, 24294, 23278, 16751, 22127, 22129, 25329, 26357, 15862, 25978, 21499, 23676}
Test data size:  1384930
All data size:  18615333
Ratio All Data / Test: 0.07439727239904868
Ratio Train / Val: 0.11111115753507607
Num users: 138493
Num items: 12864


In [7]:
# Run SGD on all data
#num_factors = 20  # Latent factors
#alpha = 0.0075      # Learning rate
#beta = 0.125       # Regularization
#iterations = 20 

num_factors = 200  # Latent factors
alpha = 0.005      # Learning rate
beta = 0.03      # Regularization
iterations = 100  # Number of iterations

sgd_user_factors, sgd_item_factors = sgd(all_data[0], all_data[1], all_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)




SGD 1/10: 100%|██████████| 18615333/18615333 [05:26<00:00, 57013.58it/s]
SGD 2/10: 100%|██████████| 18615333/18615333 [05:24<00:00, 57356.15it/s]
SGD 3/10: 100%|██████████| 18615333/18615333 [05:22<00:00, 57719.38it/s]
SGD 4/10: 100%|██████████| 18615333/18615333 [05:22<00:00, 57795.14it/s]
SGD 5/10: 100%|██████████| 18615333/18615333 [05:23<00:00, 57576.29it/s]
SGD 6/10: 100%|██████████| 18615333/18615333 [05:25<00:00, 57118.77it/s]
SGD 7/10: 100%|██████████| 18615333/18615333 [05:27<00:00, 56886.83it/s]
SGD 8/10: 100%|██████████| 18615333/18615333 [05:26<00:00, 57006.30it/s]
SGD 9/10: 100%|██████████| 18615333/18615333 [05:28<00:00, 56621.14it/s]
SGD 10/10: 100%|██████████| 18615333/18615333 [05:29<00:00, 56578.55it/s]


In [23]:
def als(user_indices, item_indices, ratings, num_users, num_items, num_factors, lambda_reg, iterations):
    # Initialize matrices
    np.random.seed(42)
    user_features = np.random.normal(0, 0.1, (num_users, num_factors))
    item_features = np.random.normal(0, 0.1, (num_items, num_factors))
    
    # Precompute user and item interactions
    interaction_matrix = np.zeros((num_users, num_items))
    interaction_matrix[user_indices, item_indices] = ratings

    # Regularization matrix
    lambda_eye = lambda_reg * np.eye(num_factors)

    for iteration in range(iterations):
        # Update user features
        for u in tqdm(range(num_users), desc=f'ALS {iteration+1}/{iterations} (users)', total=num_users):
            item_idx = interaction_matrix[u, :] > 0
            V = item_features[item_idx]
            r_u = interaction_matrix[u, item_idx]
            A_u = V.T @ V + lambda_eye
            b_u = V.T @ r_u
            user_features[u] = np.linalg.solve(A_u, b_u)

        # Update item features
        for i in tqdm(range(num_items), desc=f'ALS {iteration+1}/{iterations} (items)', total=num_items):
            user_idx = interaction_matrix[:, i] > 0
            U = user_features[user_idx]
            r_i = interaction_matrix[user_idx, i]
            A_i = U.T @ U + lambda_eye
            b_i = U.T @ r_i
            item_features[i] = np.linalg.solve(A_i, b_i)

    return user_features, item_features

In [24]:
num_factors = 60  # Latent factors
lambda_reg = 0.1 # Regularization
iterations = 1   # Number of iterations


als_user_features, als_item_features = als(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, beta, iterations)

ALS 1/1 (users): 100%|██████████| 138493/138493 [00:58<00:00, 2366.96it/s]
ALS 1/1 (items):  55%|█████▍    | 14597/26690 [02:03<01:43, 117.35it/s]

In [None]:
als_predictions = predict(als_user_features, als_item_features, val_data[0], val_data[1])
als_rounded = round_predictions(als_predictions)

truth_ratings = val_data[2]
als_mae = calculate_mae(truth_ratings, als_rounded)
print(f"MAE:", als_mae)

In [15]:
default_prediction_value = 3 # Midpoint between 0.5 and 5.0, adjust as needed

# Assuming you have a function `make_predictions` that uses your model
predictions = predict(sgd_user_factors, sgd_item_factors, test_user_indices_normalized, test_item_indices_normalized)

# Replace predictions for missing indices with the default value
predictions[test_item_indices_normalized == -1] = default_prediction_value



In [16]:
def realign_predictions(original_user_indices, original_item_indices, normalized_user_indices, normalized_item_indices, predictions, default_value=3.0):
    # Initialize the aligned predictions array with the default value
    aligned_predictions = np.full(original_user_indices.shape, default_value, dtype=float)

    # Create a mapping from normalized indices back to original positions
    # This map should be built from the normalized indices, not from the original indices directly.
    norm_to_orig_map = {}
    for idx, (norm_u, norm_i) in enumerate(zip(normalized_user_indices, normalized_item_indices)):
        if (norm_u, norm_i) not in norm_to_orig_map:  # avoid overriding if multiple original indices map to the same normalized index
            norm_to_orig_map[(norm_u, norm_i)] = idx

    # Use this map to assign predictions to their corresponding original indices
    for idx in range(len(predictions)):
        norm_u = normalized_user_indices[idx]
        norm_i = normalized_item_indices[idx]
        if (norm_u, norm_i) in norm_to_orig_map:
            orig_idx = norm_to_orig_map[(norm_u, norm_i)]
            aligned_predictions[orig_idx] = predictions[idx]

    return aligned_predictions

# Now call this function to realign your predictions
aligned_predictions = realign_predictions(test_user_indices, test_item_indices, test_user_indices_normalized, test_item_indices_normalized, predictions)

# Round the predictions to the nearest 0.5
final_predictions = round_predictions(aligned_predictions)


In [17]:
def save_predictions_to_csv(user_ids, item_ids, predictions, timestamps, filename):
    # Ensure all parts are numpy arrays (in case they are not)
    user_ids = np.array(user_ids)
    item_ids = np.array(item_ids)
    predictions = np.array(predictions)
    timestamps = np.array(timestamps)
    
    # Stack the arrays horizontally
    data_to_save = np.column_stack((user_ids, item_ids, predictions, timestamps))
        
    # Save to CSV
    np.savetxt(filename, data_to_save, delimiter=',', fmt='%d,%d,%.1f,%d')

path = 'Official_submission/results3.csv'
save_predictions_to_csv(test_user_indices, test_item_indices, final_predictions, timestamps, path)
