In [1]:
import numpy as np
from tqdm import tqdm
import random
import sqlite3

In [2]:
# import dataset from database
def load_data(db_path):
    """Load every (UserID, ItemID, Rating) row of ``example_table`` from SQLite.

    Parameters
    ----------
    db_path : str
        Path passed straight to ``sqlite3.connect``.

    Returns
    -------
    tuple
        ``(user_indices, item_indices, ratings_values, max_user_id, max_item_id)``.
        The two index arrays are ``int32``, the ratings array is ``float32``.
        Both maxima are floored at 0, so an empty table yields ``0``.

    Raises
    ------
    sqlite3.Error
        Propagated unchanged when the query fails (e.g. the table is missing);
        the connection is closed first.
    """
    conn = sqlite3.connect(db_path)
    try:
        print("Loaded database")

        c = conn.cursor()
        print("Fetching data ...")
        c.execute('SELECT UserID, ItemID, Rating FROM example_table')
        data = c.fetchall()
    finally:
        # Release the database handle even when the query raises.
        conn.close()

    # Transpose the row tuples into three columns in one pass.
    if data:
        user_column, item_column, rating_column = zip(*data)
    else:
        user_column, item_column, rating_column = (), (), ()

    # Same result as a running max that starts at 0.
    max_user_id = max(0, max(user_column, default=0))
    max_item_id = max(0, max(item_column, default=0))

    user_indices = np.array(user_column, dtype=np.int32)
    item_indices = np.array(item_column, dtype=np.int32)
    ratings_values = np.array(rating_column, dtype=np.float32)

    print("Max user id:", max_user_id)
    print("Max item id:", max_item_id)

    return user_indices, item_indices, ratings_values, max_user_id, max_item_id

def normalize_ids(indices):
    """Re-label arbitrary ids as dense, zero-based positions.

    The dense position of an id is its rank among the sorted unique ids.

    Returns
    -------
    tuple
        ``(dense_indices, num_unique, id_map, reverse_map)`` where
        ``dense_indices`` has one entry per element of ``indices``,
        ``id_map`` maps original id -> dense position and ``reverse_map``
        maps dense position -> original id.
    """
    sorted_ids, dense_indices = np.unique(indices, return_inverse=True)
    positions = range(len(sorted_ids))
    forward_lookup = dict(zip(sorted_ids, positions))
    backward_lookup = dict(zip(positions, sorted_ids))
    return dense_indices, len(sorted_ids), forward_lookup, backward_lookup



# split into train and validation sets
def split_data(user_indices, item_indices, ratings, split_ratio=0.9):
    """Shuffle the rating triples and cut them into train / validation parts.

    The global NumPy RNG is re-seeded with 42 on every call, so the split is
    the same each time for a given input length.

    Returns
    -------
    tuple
        ``(train_data, val_data)``; each is a
        ``(user_indices, item_indices, ratings)`` tuple. The first
        ``int(len(ratings) * split_ratio)`` shuffled rows go to training.
    """
    np.random.seed(42)
    n_ratings = len(ratings)
    shuffled_rows = np.random.permutation(n_ratings)
    cut = int(n_ratings * split_ratio)
    train_rows = shuffled_rows[:cut]
    val_rows = shuffled_rows[cut:]

    columns = (user_indices, item_indices, ratings)
    train_data = tuple(column[train_rows] for column in columns)
    val_data = tuple(column[val_rows] for column in columns)
    return train_data, val_data

In [3]:
# Import the dataset, densify the ids and create the train/validation split.

path_100k = '../../data/dataset1/train_100k.db'
path_20M = '../../data/dataset2/train_20M.db'

global_user_indices, global_item_indices, global_ratings, global_max_user_id, global_max_item_id = load_data(path_20M)

# Normalize user and item indices.
# NOTE: global_user_indices / global_item_indices are rebound here from the raw
# database ids to dense zero-based positions; the *_to_norm / norm_to_* dicts
# are the only way back to the raw ids after this point.
global_user_indices, global_num_users, user_to_norm, norm_to_user = normalize_ids(global_user_indices)
global_item_indices, global_num_items, item_to_norm, norm_to_item = normalize_ids(global_item_indices)

all_data = (global_user_indices, global_item_indices, global_ratings)
train_data, val_data = split_data(global_user_indices, global_item_indices, global_ratings, split_ratio=0.9)

print("Train data size:",train_data[0].size)
print("Validation data size:",val_data[0].size)

# Layout of train_data / val_data / all_data:
# train_data[0] = user_indices (normalized)
# train_data[1] = item_indices (normalized)
# train_data[2] = ratings


Loaded database
Fetching data ...
Max user id: 138493
Max item id: 26744
Train data size: 16753799
Validation data size: 1861534


# MAE and Prediction Code

In [4]:
# MAE and predict methods
def calculate_mae(actual, predicted):
    """Return the mean absolute error between two rating arrays.

    Parameters:
    - actual: np.array, the ground-truth ratings.
    - predicted: np.array, the predicted ratings (broadcast against ``actual``).
    """
    return np.mean(np.abs(actual - predicted))

# round prediction to nearest 0.5 in range [0.5, 5]
def round_predictions(predictions):
    """Snap predictions to the nearest 0.5 step, then clamp to [0.5, 5.0]."""
    half_steps = np.round(predictions * 2)
    snapped = half_steps / 2
    return np.clip(snapped, 0.5, 5.0)

def predict(user_features, item_features, user_indices, item_indices):
    """Predict one rating per (user, item) pair as the dot product of their factors.

    Parameters
    ----------
    user_features, item_features : array-like, shape (n_entities, n_factors)
        Latent factor matrices.
    user_indices, item_indices : array-like of int
        Row positions into the two matrices, paired element-wise. If the
        lengths differ, the longer one is truncated (as ``zip`` would).
        Negative values follow NumPy indexing and select rows from the end,
        so callers must filter out "unknown id" sentinels such as -1.

    Returns
    -------
    np.ndarray, shape (n_pairs,)
        ``predictions[k] = user_features[user_indices[k]] . item_features[item_indices[k]]``.
    """
    user_features = np.asarray(user_features)
    item_features = np.asarray(item_features)
    user_indices = np.asarray(user_indices, dtype=np.intp).ravel()
    item_indices = np.asarray(item_indices, dtype=np.intp).ravel()

    n_pairs = min(user_indices.size, item_indices.size)
    predictions = np.empty(n_pairs, dtype=np.result_type(user_features, item_features))

    # Gather rows in bounded batches: fully vectorized inside a batch, while the
    # two temporary (batch, n_factors) matrices stay small for multi-million-row inputs.
    batch_size = 1 << 18
    for start in range(0, n_pairs, batch_size):
        stop = min(start + batch_size, n_pairs)
        user_rows = user_features[user_indices[start:stop]]
        item_rows = item_features[item_indices[start:stop]]
        predictions[start:stop] = np.einsum('ij,ij->i', user_rows, item_rows)

    return predictions

# SGD

In [5]:
def sgd(user_indices, item_indices, ratings, num_users, num_items, num_factors, alpha, beta, iterations):
    """Factorize the rating triples with plain stochastic gradient descent.

    Both factor matrices are drawn from N(0, 0.1) after re-seeding the global
    NumPy RNG with 42. Each epoch visits the triples in the given order.

    Parameters
    ----------
    user_indices, item_indices, ratings : iterables of equal length
        One (user row, item row, rating) triple per position.
    num_users, num_items, num_factors : int
        Shapes of the two factor matrices.
    alpha : float
        Learning rate.
    beta : float
        Regularization weight applied to the factor being updated.
    iterations : int
        Number of full passes over the triples.

    Returns
    -------
    tuple
        ``(user_features, item_features)``.
    """
    np.random.seed(42)
    user_features = np.random.normal(0, 0.1, (num_users, num_factors))
    item_features = np.random.normal(0, 0.1, (num_items, num_factors))

    total = len(ratings)
    for epoch in range(1, iterations + 1):
        samples = zip(user_indices, item_indices, ratings)
        for u, i, r in tqdm(samples, desc=f'SGD {epoch}/{iterations}', total=total):
            # Row views: the in-place updates below write through to the matrices.
            user_row = user_features[u]
            item_row = item_features[i]

            error = r - np.dot(user_row, item_row)

            # Both gradients are computed before either row is modified.
            user_features_grad = -2 * error * item_row + beta * user_row
            item_features_grad = -2 * error * user_row + beta * item_row

            user_row -= alpha * user_features_grad
            item_row -= alpha * item_features_grad

    return user_features, item_features

In [68]:
import multiprocessing as mp

def initialize_shared_arrays(num_users, num_items, num_factors):
    """Allocate shared-memory factor buffers and return NumPy views onto them.

    The two ``mp.Array('d', ...)`` buffers are stored in the module-level
    globals ``shared_user_features_base`` and ``shared_item_features_base``.
    The returned arrays are zero-filled views of shape
    ``(num_users, num_factors)`` and ``(num_items, num_factors)``.
    """
    global shared_user_features_base, shared_item_features_base

    def _as_matrix(shared_block, num_rows):
        # Wrap the raw ctypes buffer without copying it.
        return np.frombuffer(shared_block.get_obj()).reshape(num_rows, num_factors)

    shared_user_features_base = mp.Array('d', num_users * num_factors)
    shared_item_features_base = mp.Array('d', num_items * num_factors)

    return (
        _as_matrix(shared_user_features_base, num_users),
        _as_matrix(shared_item_features_base, num_items),
    )

def worker_process(chunk, num_factors, alpha, beta, iteration):
    """Run one SGD pass over ``chunk`` against the shared factor buffers.

    Reads the module-level ``shared_user_features_base`` /
    ``shared_item_features_base`` buffers created by
    ``initialize_shared_arrays`` and updates them in place. The views are
    taken through ``get_obj()``, i.e. without acquiring the Array's lock.

    Parameters
    ----------
    chunk : iterable of (user, item, rating)
        Rows may be floats (they are when the chunk is a row block of a 2-D
        float array), so the two indices are cast to ``int`` before use.
    num_factors : int
        Width of each factor row, used to reshape the flat shared buffers.
    alpha, beta : float
        Learning rate and regularization weight.
    iteration : int
        Zero-based epoch number, used only for the progress-bar label.
    """
    user_features = np.frombuffer(shared_user_features_base.get_obj()).reshape(-1, num_factors)
    item_features = np.frombuffer(shared_item_features_base.get_obj()).reshape(-1, num_factors)

    # The file imports the class itself (`from tqdm import tqdm`), so it is
    # called directly; `tqdm.tqdm(...)` would raise AttributeError.
    for u, i, r in tqdm(chunk, desc=f'Iteration {iteration + 1}', position=0, leave=True):
        # NumPy rejects float indices, so normalise them here.
        u, i = int(u), int(i)

        prediction = np.dot(user_features[u], item_features[i])
        error = r - prediction

        user_features_grad = -2 * error * item_features[i] + beta * user_features[u]
        item_features_grad = -2 * error * user_features[u] + beta * item_features[i]

        user_features[u] -= alpha * user_features_grad
        item_features[i] -= alpha * item_features_grad

def parallel_sgd(user_indices, item_indices, ratings, num_users, num_items, num_factors, alpha, beta, iterations):
    """Multi-process variant of ``sgd`` that updates shared-memory factor matrices.

    The triples are split into ``mp.cpu_count()`` contiguous chunks and each
    chunk is handed to ``worker_process`` once per iteration. Workers write
    to the shared buffers without locking.

    NOTE(review): workers find the buffers through module-level globals, which
    presumably requires the "fork" start method — confirm on the target platform.

    Returns
    -------
    tuple
        ``(user_features, item_features)`` — NumPy views onto the shared buffers.
    """
    user_features, item_features = initialize_shared_arrays(num_users, num_items, num_factors)

    # The shared buffers start zeroed. With all-zero factors both gradients are
    # exactly zero and nothing is ever learned, so draw the same N(0, 0.1) start
    # that `sgd` uses. A private RandomState leaves the global RNG untouched.
    rng = np.random.RandomState(42)
    user_features[:] = rng.normal(0, 0.1, (num_users, num_factors))
    item_features[:] = rng.normal(0, 0.1, (num_items, num_factors))

    num_workers = mp.cpu_count()
    # One (n, 3) array of [user, item, rating] rows, built without an
    # intermediate Python list of tuples.
    triples = np.column_stack((user_indices, item_indices, ratings))
    chunks = np.array_split(triples, num_workers)

    # A single pool is reused for every iteration instead of being re-created.
    with mp.Pool(processes=num_workers) as pool:
        for iteration in range(iterations):
            print("Iteration", iteration + 1)
            pool.starmap(worker_process, [(chunk, num_factors, alpha, beta, iteration) for chunk in chunks])

    return user_features, item_features

In [6]:
import gc

# Exhaustive grid search over (num_factors, alpha, beta): each combination is
# trained from scratch on train_data and scored by MAE on val_data.
iterations = 4
for num_factors in [20, 40, 60]:  # Number of latent factors
    for alpha in [0.005, 0.01, 0.015]:  # Learning rate
        for beta in [0.05, 0.1, 0.15]:  # Regularization weight
            print(f"Testing with {num_factors} factors and alpha= {alpha}, beta= {beta}")
            user_features, item_features = sgd(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
            sgd_predictions = predict(user_features, item_features, val_data[0], val_data[1])
            # The MAE is measured on predictions snapped to the rating scale.
            sgd_rounded_predictions = round_predictions(sgd_predictions)

            truth_ratings = val_data[2]
            sgd_mae = calculate_mae(truth_ratings, sgd_rounded_predictions)
            print(f"MAE:", sgd_mae)
            print()
            # Free the previous run's factor matrices before the next one is allocated.
            gc.collect()

Testing with 20 factors and alpha= 0.005, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:52<00:00, 57287.55it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:52<00:00, 57247.86it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:54<00:00, 56956.73it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:51<00:00, 57403.17it/s]


MAE: 0.6251790190240952

Testing with 20 factors and alpha= 0.005, beta= 0.1


SGD 1/4: 100%|██████████| 16753799/16753799 [04:51<00:00, 57408.65it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:50<00:00, 57598.00it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:49<00:00, 57857.52it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:47<00:00, 58303.58it/s]


MAE: 0.6412810617479993

Testing with 20 factors and alpha= 0.005, beta= 0.15


SGD 1/4: 100%|██████████| 16753799/16753799 [04:47<00:00, 58231.33it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:50<00:00, 57753.37it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:03<00:00, 55171.56it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:40<00:00, 49154.25it/s]


MAE: 0.6540758858017097

Testing with 20 factors and alpha= 0.01, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [05:32<00:00, 50350.71it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:32<00:00, 50443.31it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:30<00:00, 50705.10it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:27<00:00, 51085.45it/s]


MAE: 0.6151278998933138

Testing with 20 factors and alpha= 0.01, beta= 0.1


SGD 1/4: 100%|██████████| 16753799/16753799 [05:28<00:00, 50938.36it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:26<00:00, 51280.11it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:27<00:00, 51198.33it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:27<00:00, 51190.92it/s]


MAE: 0.6319145930184461

Testing with 20 factors and alpha= 0.01, beta= 0.15


SGD 1/4: 100%|██████████| 16753799/16753799 [05:26<00:00, 51261.49it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:26<00:00, 51236.21it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:28<00:00, 51075.66it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:17<00:00, 52698.38it/s]


MAE: 0.6472433487650507

Testing with 20 factors and alpha= 0.015, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [05:08<00:00, 54321.18it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:08<00:00, 54325.72it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:09<00:00, 54173.44it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:08<00:00, 54307.19it/s]


MAE: 0.6217423372337008

Testing with 20 factors and alpha= 0.015, beta= 0.1


SGD 1/4: 100%|██████████| 16753799/16753799 [05:07<00:00, 54464.06it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:48<00:00, 58096.46it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:47<00:00, 58292.12it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:48<00:00, 58148.83it/s]


MAE: 0.6368156584838096

Testing with 20 factors and alpha= 0.015, beta= 0.15


SGD 1/4: 100%|██████████| 16753799/16753799 [04:49<00:00, 57901.61it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:04<00:00, 54939.83it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:18<00:00, 52533.53it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:18<00:00, 52571.55it/s]


MAE: 0.6526061302130394

Testing with 40 factors and alpha= 0.005, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [05:18<00:00, 52547.39it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:19<00:00, 52412.15it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:18<00:00, 52623.34it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:19<00:00, 52492.42it/s]


MAE: 0.6225070291490781

Testing with 40 factors and alpha= 0.005, beta= 0.1


SGD 1/4: 100%|██████████| 16753799/16753799 [05:20<00:00, 52334.90it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:19<00:00, 52461.75it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:19<00:00, 52426.51it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:19<00:00, 52409.90it/s]


MAE: 0.6396079792257353

Testing with 40 factors and alpha= 0.005, beta= 0.15


SGD 1/4: 100%|██████████| 16753799/16753799 [05:19<00:00, 52466.87it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:19<00:00, 52356.87it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:20<00:00, 52294.35it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:19<00:00, 52468.45it/s]


MAE: 0.6529690029835609

Testing with 40 factors and alpha= 0.01, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [05:20<00:00, 52221.03it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:20<00:00, 52286.97it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:20<00:00, 52307.04it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:20<00:00, 52289.68it/s]


MAE: 0.6119278509014608

Testing with 40 factors and alpha= 0.01, beta= 0.1


SGD 1/4: 100%|██████████| 16753799/16753799 [05:21<00:00, 52157.98it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:15<00:00, 53072.86it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:01<00:00, 55629.00it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:02<00:00, 55462.39it/s]


MAE: 0.6297029761476288

Testing with 40 factors and alpha= 0.01, beta= 0.15


SGD 1/4: 100%|██████████| 16753799/16753799 [05:01<00:00, 55535.83it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:02<00:00, 55348.60it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:02<00:00, 55472.46it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:58<00:00, 56128.25it/s]


MAE: 0.6458858661727371

Testing with 40 factors and alpha= 0.015, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59015.14it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59099.04it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58942.57it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59014.88it/s]


MAE: 0.6187727433396328

Testing with 40 factors and alpha= 0.015, beta= 0.1


SGD 1/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58911.40it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59010.78it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58927.19it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59115.34it/s]


MAE: 0.6342637308800162

Testing with 40 factors and alpha= 0.015, beta= 0.15


SGD 1/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58848.15it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58827.51it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58875.17it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59034.69it/s]


MAE: 0.6511186473091547

Testing with 60 factors and alpha= 0.005, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58744.92it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58700.89it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58726.37it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58656.21it/s]


MAE: 0.6219212219599535

Testing with 60 factors and alpha= 0.005, beta= 0.1


SGD 1/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58691.30it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58580.29it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58634.29it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58679.36it/s]


MAE: 0.638545146099937

Testing with 60 factors and alpha= 0.005, beta= 0.15


SGD 1/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58625.84it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:46<00:00, 58488.24it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:46<00:00, 58555.80it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58653.56it/s]


MAE: 0.6515441028742961

Testing with 60 factors and alpha= 0.01, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58739.99it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58658.61it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58858.03it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58874.71it/s]


MAE: 0.6108263399970132

Testing with 60 factors and alpha= 0.01, beta= 0.1


SGD 1/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58672.88it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58603.20it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58737.76it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58630.85it/s]


MAE: 0.6290715076920432

Testing with 60 factors and alpha= 0.01, beta= 0.15


SGD 1/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58735.59it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58693.83it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58743.62it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58683.97it/s]


MAE: 0.6452925383044307

Testing with 60 factors and alpha= 0.015, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58698.81it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58690.89it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:46<00:00, 58448.56it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58628.66it/s]


MAE: 0.6176865423892338

Testing with 60 factors and alpha= 0.015, beta= 0.1


SGD 1/4: 100%|██████████| 16753799/16753799 [04:46<00:00, 58476.66it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58603.94it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58691.02it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:46<00:00, 58531.20it/s]


MAE: 0.6332822822467922

Testing with 60 factors and alpha= 0.015, beta= 0.15


SGD 1/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58706.07it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58847.94it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58770.92it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58798.90it/s]


MAE: 0.650725691821906



In [69]:

# Single SGD run with hand-picked hyperparameters, scored on the validation split.
num_factors = 30  # Latent factors
alpha = 0.0075      # Learning rate
beta = 0.125       # Regularization
iterations = 1   # Number of iterations

# Run SGD
sgd_user_features, sgd_item_features = sgd(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
sgd_predictions = predict(sgd_user_features, sgd_item_features, val_data[0], val_data[1])
# Snap the predictions to the rating scale before computing the error.
sgd_rounded_predictions = round_predictions(sgd_predictions)

truth_ratings = val_data[2]
sgd_mae = calculate_mae(truth_ratings, sgd_rounded_predictions)
print(f"MAE:", sgd_mae)
# Result recorded from an earlier run:
#MAE: 0.6496064536022441

In [None]:
# ensemble best SGD models and average predictions


# ALS

In [63]:
#def init_factors(num_factors, size):
#    """ Initialize factors as random normal variables, ensuring small starting values. """
#    np.random.seed(42)
#    return np.random.normal(scale=0.1, size=(size, num_factors))
#
##def update_factors(fixed_factors, ratings_dict, num_factors, lambda_reg):
##    num_entities = fixed_factors.shape[0]
##    new_factors = np.zeros_like(fixed_factors)
##    
##    for i in tqdm(range(num_entities), desc='Updating factors', total=num_entities):
##        A = np.zeros((num_factors, num_factors))
##        b = np.zeros(num_factors)
##        if i in ratings_dict:  # Check if there are ratings for this entity
##            for j, rating in ratings_dict[i].items():
##                A += np.outer(fixed_factors[j], fixed_factors[j])
##                b += rating * fixed_factors[j]
##            A += lambda_reg * np.eye(num_factors)
##            new_factors[i] = np.linalg.solve(A, b)
##        else:
##            new_factors[i] = np.zeros(num_factors)  # No ratings, potentially initialize differently
##    
##    return new_factors
#
#def update_factors(fixed_factors, ratings_dict, num_factors, lambda_reg):
#    num_entities = fixed_factors.shape[0]
#    new_factors = np.zeros_like(fixed_factors)
#
#    for i in tqdm(range(num_entities), desc='Updating factors', total=num_entities):
#        A = np.zeros((num_factors, num_factors))
#        b = np.zeros(num_factors)
#        if i in ratings_dict:  # Ensure 'i' is a valid index for ratings_dict
#            for j, rating in ratings_dict[i].items():
#                # Ensure 'j' is also within the valid range before accessing
#                if j < num_entities:
#                    A += np.outer(fixed_factors[j], fixed_factors[j])
#                    b += rating * fixed_factors[j]
#            A += lambda_reg * np.eye(num_factors)
#            new_factors[i] = np.linalg.solve(A, b)
#        else:
#            new_factors[i] = np.zeros(num_factors)  # Handle entities without ratings
#
#    return new_factors
#
#def als(train_data, num_users, num_items, num_factors, lambda_reg, iterations):
#    user_factors = init_factors(num_factors, num_users)
#    item_factors = init_factors(num_factors, num_items)
#
#    # Convert training data to a dictionary format for fast access
#    user_ratings = {u: {} for u in range(num_users)}
#    item_ratings = {i: {} for i in range(num_items)}
#    for (u, i), r in train_data:
#        user_ratings[u][i] = r
#        item_ratings[i][u] = r
#
#    for iteration in tqdm(range(iterations), desc='ALS Iterations'):
#        user_factors = update_factors(item_factors, user_ratings, num_factors, lambda_reg)
#        item_factors = update_factors(user_factors, item_ratings, num_factors, lambda_reg)
#
#    return user_factors, item_factors

In [59]:
#iterations = 10   # Number of ALS iterations
#for num_factors in [2, 3, 4]:  # Different complexities
#    for lambda_reg in [0.5, 0.75, 1]:  # Different regularization strengths
#        print(f"Testing with {num_factors} factors and lambda_reg= {lambda_reg}")
#        train_dict = [((u, i), r) for u, i, r in zip(train_data[0], train_data[1], train_data[2])]
#        als_user_features, als_item_features = als(train_dict, global_num_users, global_num_items, num_factors, lambda_reg, iterations)
#        als_predictions = predict(als_user_features, als_item_features, val_data[0], val_data[1])
#        als_rounded_predictions = round_predictions(als_predictions)
#
#        truth_ratings = val_data[2]
#        sgd_als = calculate_mae(truth_ratings, als_rounded_predictions)
#        print(f"MAE:", sgd_als)
#        print()

Testing with 2 factors and lambda_reg= 0.5


ALS Iterations: 100%|██████████| 10/10 [00:11<00:00,  1.20s/it]


MAE: 0.7203268190350005

Testing with 2 factors and lambda_reg= 0.75


ALS Iterations: 100%|██████████| 10/10 [00:11<00:00,  1.18s/it]


MAE: 0.7194435243458098

Testing with 2 factors and lambda_reg= 1


ALS Iterations: 100%|██████████| 10/10 [00:12<00:00,  1.21s/it]


MAE: 0.7198851716904052

Testing with 3 factors and lambda_reg= 0.5


ALS Iterations: 100%|██████████| 10/10 [00:12<00:00,  1.25s/it]


MAE: 0.7212653196422657

Testing with 3 factors and lambda_reg= 0.75


ALS Iterations: 100%|██████████| 10/10 [00:12<00:00,  1.22s/it]


MAE: 0.7174561112951309

Testing with 3 factors and lambda_reg= 1


ALS Iterations: 100%|██████████| 10/10 [00:12<00:00,  1.22s/it]


MAE: 0.7178425527216518

Testing with 4 factors and lambda_reg= 0.5


ALS Iterations: 100%|██████████| 10/10 [00:12<00:00,  1.21s/it]


MAE: 0.7257369990062935

Testing with 4 factors and lambda_reg= 0.75


ALS Iterations: 100%|██████████| 10/10 [00:11<00:00,  1.20s/it]


MAE: 0.722369438003754

Testing with 4 factors and lambda_reg= 1


ALS Iterations: 100%|██████████| 10/10 [00:11<00:00,  1.20s/it]

MAE: 0.7208788782157447






In [65]:
# Example usage
#
#num_factors = 15
#lambda_reg = 1
#iterations = 1
#
## Assuming train_data is a list of tuples ((user_id, item_id), rating)
#train_dict = [((u, i), r) for u, i, r in zip(train_data[0], train_data[1], train_data[2])]
#
#als_user_features, als_item_features = als(train_dict, global_num_users, global_num_items, num_factors, lambda_reg, iterations)
#als_predictions = predict(als_user_features, als_item_features, val_data[0], val_data[1])
#als_rounded_predictions = round_predictions(als_predictions)
#
#truth_ratings = val_data[2]
#als_mae = calculate_mae(truth_ratings, als_rounded_predictions)
#print(f"MAE:", als_mae)


Updating factors: 100%|██████████| 26690/26690 [00:27<00:00, 987.18it/s] 
Updating factors: 100%|██████████| 26690/26690 [00:30<00:00, 887.98it/s]  
ALS Iterations: 100%|██████████| 1/1 [00:57<00:00, 57.11s/it]


IndexError: index 41193 is out of bounds for axis 0 with size 26690

# Weighted Predictions


In [25]:
## weighted predictions
#
#for weight_sgd in [0.45, 0.475, 0.5, 0.525, 0.55]:
#    for weight_als in [0.55, 0.525, 0.5, 0.475, 0.45]:
#        #weight_als = 1 - weight_sgd
#        weighted_predictions = (weight_sgd * sgd_rounded_predictions) + (weight_als * als_rounded_predictions)
#        weighted_rounded_predictions = round_predictions(weighted_predictions)
#        weighted_mae = calculate_mae(truth_ratings, weighted_rounded_predictions)
#        print(f"SGD: {weight_sgd}, ALS: {weight_als}, MAE:  ", weighted_mae)
#print()
#
#
## SGD and ALS MAEs
#print("SGD MAE: ", sgd_mae)
#print("ALS MAE: ", als_mae)
#
#
#weight_sgd = 0.5  # Assume SGD has higher validation accuracy
#weight_als = 0.5 # ALS is slightly less accurate
#
## sgd_predictions and als_predictions are arrays of the same shape containing the predicted ratings
#weighted_predictions = (weight_sgd * sgd_rounded_predictions) + (weight_als * als_rounded_predictions)
#weighted_rounded_predictions = round_predictions(weighted_predictions)
#weighted_mae = calculate_mae(truth_ratings, weighted_rounded_predictions)
#
## weighted MAE
#print("Weighted MAE: ", weighted_mae)


SGD: 0.45, ALS: 0.55, MAE:   0.715358286408303
SGD: 0.45, ALS: 0.525, MAE:   0.7314232085679585
SGD: 0.45, ALS: 0.5, MAE:   0.7318648559125538
SGD: 0.45, ALS: 0.475, MAE:   0.7861874792977808
SGD: 0.45, ALS: 0.45, MAE:   0.8148945566964778
SGD: 0.475, ALS: 0.55, MAE:   0.7052556034006846
SGD: 0.475, ALS: 0.525, MAE:   0.715358286408303
SGD: 0.475, ALS: 0.5, MAE:   0.7318648559125538
SGD: 0.475, ALS: 0.475, MAE:   0.7318648559125538
SGD: 0.475, ALS: 0.45, MAE:   0.7861874792977808
SGD: 0.5, ALS: 0.55, MAE:   0.7046483383018659
SGD: 0.5, ALS: 0.525, MAE:   0.7052556034006846
SGD: 0.5, ALS: 0.5, MAE:   0.695925803246108
SGD: 0.5, ALS: 0.475, MAE:   0.7318648559125538
SGD: 0.5, ALS: 0.45, MAE:   0.7324721210113724
SGD: 0.525, ALS: 0.55, MAE:   0.7206580545434471
SGD: 0.525, ALS: 0.525, MAE:   0.7052556034006846
SGD: 0.525, ALS: 0.5, MAE:   0.7052556034006846
SGD: 0.525, ALS: 0.475, MAE:   0.7217621729049354
SGD: 0.525, ALS: 0.45, MAE:   0.7315888263221817
SGD: 0.55, ALS: 0.55, MAE:   0.736

# Test Set and Submission 

In [26]:
# import test set

# 20M dataset: test rows without ratings
test_dir_20M = '../../data/dataset2/test_20Mwithoutratings.csv'

# 100k dataset: test rows without ratings
test_dir_100K = '../../data/dataset1/test_100k_withoutratings.csv'


# Load the dataset
def load_data_np(filepath, dtype='float64'):
    """Load a header-less, comma-separated numeric file as a 2-D array.

    Parameters
    ----------
    filepath : str
        Path of the CSV file; no rows are skipped.
    dtype : data-type, optional
        Element type of the result. Defaults to ``float64``, which holds
        integers up to 2**53 exactly. ``float32`` is only exact up to 2**24
        and would silently round Unix timestamps.

    Returns
    -------
    np.ndarray, shape (n_rows, n_columns)
        Always 2-D, even when the file contains a single row.
    """
    return np.loadtxt(filepath, delimiter=',', skiprows=0, dtype=dtype, ndmin=2)
   
# Load the test set. No rows are skipped, so the file must not start with a header line.
# NOTE(review): this reads the 100k test file while the training cell loads
# path_20M — confirm that both cells point at the same dataset before submitting.
test_data = load_data_np(test_dir_100K)

print("Test data shape: ", test_data.shape)
print("All data size: ", len(all_data[0]))

# Each label names numerator / denominator in the order actually computed.
print("Ratio Test / All Data:", len(test_data) / len(all_data[0]))
print("Ratio Val / Train:", len(val_data[0]) / len(train_data[0]))

Test data shape:  (9430, 3)
All data size:  90570
Ratio All Data / Test: 0.10411836148835155
Ratio Train / Val: 0.1111111111111111


In [27]:
# Translate the raw test-set ids (columns 0 and 1) into the dense training positions.
# Ids that never occurred in the training data map to the sentinel -1.
# WARNING: -1 is a valid NumPy index (it selects the last row), so any consumer
# that indexes a factor matrix with these arrays must handle the sentinel itself.
test_user_indices = np.array([user_to_norm.get(int(user), -1) for user in test_data[:, 0]])
test_item_indices = np.array([item_to_norm.get(int(item), -1) for item in test_data[:, 1]])



In [28]:
# Run SGD on all data
#num_factors = 20  # Latent factors
#alpha = 0.0075      # Learning rate
#beta = 0.125       # Regularization
#iterations = 20 

num_factors = 20
alpha = 0.01
beta = 0.02
iterations = 10

sgd_user_factors, sgd_item_factors = sgd(all_data[0], all_data[1], all_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
sgd_test_predictions = round_predictions(predict(sgd_user_factors, sgd_item_factors, test_user_indices, test_item_indices))

# Run ALS on all data
#num_factors = 3
#lambda_reg = 0.75
#iterations = 10

# Run ALS on all data
num_factors = 2  
lambda_reg = 0.5  
iterations = 10

# NOTE(review): `als` only exists as commented-out code in the ALS section of
# this notebook; it has to be restored there before this cell can run on a
# fresh kernel.
all_data_dict = [((u, i), r) for u, i, r in zip(all_data[0], all_data[1], all_data[2])]
als_user_factors, als_item_factors = als(all_data_dict, global_num_users, global_num_items, num_factors, lambda_reg, iterations)
# Pair each test user with its test *item*. Passing the user indices twice
# indexed the item factors with user positions.
als_test_predictions = round_predictions(predict(als_user_factors, als_item_factors, test_user_indices, test_item_indices))

# weighted predictions
weight_sgd = 0.5  
weight_als = 0.5 

# sgd_test_predictions and als_test_predictions are arrays of the same shape containing the predicted ratings
weighted_test_predictions = round_predictions((weight_sgd * sgd_test_predictions) + (weight_als * als_test_predictions))

# Cold-start rows: a user or item unseen in training carries the -1 sentinel,
# which NumPy indexing silently resolves to the last factor row. Those
# predictions are meaningless, so fall back to the rounded global mean rating.
cold_start_rows = (test_user_indices < 0) | (test_item_indices < 0)
weighted_test_predictions[cold_start_rows] = round_predictions(all_data[2].mean())



SGD iterations: 100%|██████████| 10/10 [00:13<00:00,  1.39s/it]
ALS Iterations: 100%|██████████| 10/10 [00:12<00:00,  1.29s/it]


In [41]:
def revert_to_original_ids(predictions, user_indices, item_indices, norm_to_user, norm_to_item):
    """Build an (n, 3) table of [original user id, original item id, prediction].

    Dense positions are translated back through ``norm_to_user`` /
    ``norm_to_item``. A position missing from its map becomes ``None``,
    which makes the resulting array object-typed.
    """
    user_column = list(map(norm_to_user.get, user_indices))
    item_column = list(map(norm_to_item.get, item_indices))
    return np.column_stack((user_column, item_column, predictions))

# Rows whose user/item index is the -1 "unknown" sentinel come back with None
# ids, because -1 is not a key of norm_to_user / norm_to_item.
final_predictions = revert_to_original_ids(weighted_test_predictions, test_user_indices, test_item_indices, norm_to_user, norm_to_item)


In [42]:
# Quick look at the [user id, item id, predicted rating] columns.
print(final_predictions[:, 0:3])

[[1 84 4.0]
 [1 87 3.5]
 [1 180 4.0]
 ...
 [943 653 4.0]
 [943 673 4.5]
 [943 936 4.0]]


In [43]:
# Element-wise "is None" test that works for both object and numeric arrays.
_is_none = np.frompyfunc(lambda value: value is None, 1, 1)
missing_id_mask = _is_none(final_predictions[:, 0:2]).astype(bool)
missing_prediction_mask = _is_none(final_predictions[:, 2]).astype(bool)

if missing_id_mask.any():
    print("None values found in user/item ID columns.")
if missing_prediction_mask.any():
    print("None values found in prediction column.")

# Take the ids straight from the test file. For rows seen in training they are
# identical to the ids in final_predictions; for cold-start rows this keeps the
# real ids instead of writing -1 into the submission. final_predictions itself
# is left untouched, so the cell can be re-run safely.
submission_ids = test_data[:, 0:2].astype(int)

# A missing prediction (not expected) is written as -1 so the conversion cannot fail.
predicted_ratings = np.where(missing_prediction_mask, -1, final_predictions[:, 2]).astype(float).reshape(-1, 1)
timestamps = test_data[:, 2].reshape(-1, 1).astype(int)

predicted_testset = np.hstack((submission_ids, predicted_ratings, timestamps))

path = 'Optional_Submission/results3.csv'
#np.savetxt(path, predicted_testset, delimiter=",", fmt='%d,%d,%.1f,%d')



None values found in user/item ID columns.
