In [3]:
import numpy as np
from tqdm import tqdm
import random
import sqlite3

In [4]:
# import dataset from database
def load_data(db_path):
    conn = sqlite3.connect(db_path)
    print("Loaded database")

    c = conn.cursor()
    print("Fetching data ...")
    c.execute('SELECT UserID, ItemID, Rating FROM example_table')
    data = c.fetchall()
    conn.close()

    user_indices = []
    item_indices = []
    ratings_values = []

    max_user_id = 0
    max_item_id = 0

    for user_id, item_id, rating in data:
        user_indices.append(user_id)
        item_indices.append(item_id)
        ratings_values.append(rating)
        #ratings_values.append(int(rating * 2)) # convert ratings to integers
        max_user_id = max(max_user_id, user_id)
        max_item_id = max(max_item_id, item_id)

    user_indices = np.array(user_indices, dtype=np.int32)
    item_indices = np.array(item_indices, dtype=np.int32)
    #ratings_values = np.array(ratings_values, dtype=np.int32)
    ratings_values = np.array(ratings_values, dtype=np.float32)

    print("Max user id:", max_user_id)
    print("Max item id:", max_item_id)

    return user_indices, item_indices, ratings_values, max_user_id, max_item_id

#def normalize_ids(indices):
#    unique_ids, inverse_indices = np.unique(indices, return_inverse=True)
#    id_map = {original_id: idx for idx, original_id in enumerate(unique_ids)}
#    reverse_map = {idx: original_id for idx, original_id in enumerate(unique_ids)}
#    num_unique = len(unique_ids)  # The total number of unique indices
#    return inverse_indices, num_unique, id_map, reverse_map

def normalize_indices(indices):
    unique_ids = np.unique(indices)
    id_to_norm = {id_: i for i, id_ in enumerate(unique_ids)}
    norm_to_id = {i: id_ for i, id_ in enumerate(unique_ids)}
    normalized_indices = np.vectorize(id_to_norm.get)(indices)
    num_unique = len(unique_ids)  # Total number of unique indices
    return normalized_indices, num_unique, id_to_norm, norm_to_id

# split into train and validation sets
def split_data(user_indices, item_indices, ratings, split_ratio=0.9):
    np.random.seed(42)
    indices = np.random.permutation(len(ratings))
    split_point = int(len(ratings) * split_ratio)
    train_idx, val_idx = indices[:split_point], indices[split_point:]
    
    train_data = (user_indices[train_idx], item_indices[train_idx], ratings[train_idx])
    val_data = (user_indices[val_idx], item_indices[val_idx], ratings[val_idx])
    return train_data, val_data

In [5]:
# impor dataset

path_100k = '../../data/dataset1/train_100k.db'
path_20M = '../../data/dataset2/train_20M.db'

u, i, global_ratings, global_max_user_id, global_max_item_id = load_data(path_20M)

# Normalize user and item indices
global_user_indices, global_num_users, user_to_norm, norm_to_user = normalize_indices(u)
global_item_indices, global_num_items, item_to_norm, norm_to_item = normalize_indices(i)

#global_num_users = np.unique(global_user_indices)
#global_num_items = np.unique(global_item_indices)


print("Number of users:", global_num_users)
print("Number of items:", global_num_items)

train_data, val_data = split_data(global_user_indices, global_item_indices, global_ratings, split_ratio=0.9)
all_data, _ = split_data(global_user_indices, global_item_indices, global_ratings, split_ratio=1)

print("Train data size:",train_data[0].size)
print("Validation data size:",val_data[0].size)
print("All data size:",all_data[0].size)

# train_data[0] = user_indices
# train_data[1] = item_indices
# train_data[2] = ratings


Loaded database
Fetching data ...
Max user id: 138493
Max item id: 26744
Number of users: 138493
Number of items: 26690
Train data size: 16753799
Validation data size: 1861534
All data size: 18615333


# MAE and Prediction Code

In [13]:
# MAE and predict methods
def calculate_mae(actual, predicted):
    """
    Parameters:
    - actual_ratings: np.array, the actual ratings.
    - predicted_ratings: np.array, the predicted ratings.
    """
    # calculate the absolute error between actual and predicted ratings
    abs_err = np.abs(actual - predicted)
    
    # calculate the mean of these absolute errors
    mae = np.mean(abs_err)
    
    return mae # /2

# round prediction to nearest 0.5 in range [0.5, 5]
def round_predictions(predictions):
    #rounded_predictions = np.round(predictions * 2) / 2
    #return np.clip(rounded_predictions, 0.5, 5.0)
    rounded_predictions = np.round(predictions)
    return np.clip(rounded_predictions, 1, 5)

def predict(user_features, item_features, user_indices, item_indices):
    predictions = np.array([np.dot(user_features[u], item_features[i]) for u, i in zip(user_indices, item_indices)])
    return predictions

# SGD

In [7]:
def sgd(user_indices, item_indices, ratings, num_users, num_items, num_factors, alpha, beta, iterations):
    # Initialize feature matrices
    np.random.seed(42)
    user_features = np.random.normal(0, 0.1, (num_users, num_factors))
    item_features = np.random.normal(0, 0.1, (num_items, num_factors))

    # SGD updates (only using training data)
    for iteration in range(iterations): #tqdm(range(iterations), desc='SGD iterations', total=iterations):
        for u, i, r in tqdm(zip(user_indices, item_indices, ratings), desc=f'SGD {iteration+1}/{iterations}', total=len(ratings)):
            prediction = np.dot(user_features[u], item_features[i])
            error = r - prediction

            # Update rules for features
            user_features_grad = -2 * error * item_features[i] + beta * user_features[u]
            item_features_grad = -2 * error * user_features[u] + beta * item_features[i]

            user_features[u] -= alpha * user_features_grad
            item_features[i] -= alpha * item_features_grad

    return user_features, item_features

In [None]:

import gc

iterations = 4
alpha = 0.0075
num_factors = 60
for beta in [0.01, 0.03, 0.05]:
    for alpha in [0.0025, 0.005, 0.0075]:
        for num_factors in [20, 40, 60]:
            print(f"Testing with {num_factors} factors and alpha= {alpha}, beta= {beta}")
            user_features, item_features = sgd(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
            sgd_predictions = predict(user_features, item_features, val_data[0], val_data[1])
            sgd_rounded_predictions = round_predictions(sgd_predictions)

            truth_ratings = val_data[2]
            sgd_mae = calculate_mae(truth_ratings, sgd_rounded_predictions)
            print(f"MAE:", sgd_mae)
            print()
            gc.collect()

Testing with 20 factors and alpha= 0.0025, beta= 0.01


SGD 1/4: 100%|██████████| 16753799/16753799 [05:09<00:00, 54130.02it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:42<00:00, 48983.79it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:33<00:00, 50262.52it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:32<00:00, 50332.44it/s]


MAE: 0.6380184299615264

Testing with 40 factors and alpha= 0.0025, beta= 0.01


SGD 1/4: 100%|██████████| 16753799/16753799 [05:30<00:00, 50749.35it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:28<00:00, 51057.26it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:26<00:00, 51329.36it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:27<00:00, 51125.13it/s]


MAE: 0.6355709860792228

Testing with 60 factors and alpha= 0.0025, beta= 0.01


SGD 1/4: 100%|██████████| 16753799/16753799 [05:30<00:00, 50761.32it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:30<00:00, 50684.81it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:28<00:00, 50968.96it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:30<00:00, 50716.33it/s]


MAE: 0.6352333613030974

Testing with 20 factors and alpha= 0.005, beta= 0.01


SGD 1/4: 100%|██████████| 16753799/16753799 [05:26<00:00, 51320.37it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:12<00:00, 53670.67it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:04<00:00, 54955.23it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:06<00:00, 54701.74it/s]


MAE: 0.613132502548973

Testing with 40 factors and alpha= 0.005, beta= 0.01


SGD 1/4: 100%|██████████| 16753799/16753799 [05:07<00:00, 54433.22it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:06<00:00, 54606.31it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:03<00:00, 55226.85it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:48<00:00, 58038.66it/s]


MAE: 0.6104580953127904

Testing with 60 factors and alpha= 0.005, beta= 0.01


SGD 1/4: 100%|██████████| 16753799/16753799 [04:49<00:00, 57856.60it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:49<00:00, 57944.23it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:49<00:00, 57894.55it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:10<00:00, 54019.07it/s]


MAE: 0.6103334669149207

Testing with 20 factors and alpha= 0.0075, beta= 0.01


SGD 1/4: 100%|██████████| 16753799/16753799 [05:17<00:00, 52693.02it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:17<00:00, 52797.43it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:16<00:00, 52858.79it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:17<00:00, 52840.81it/s]


MAE: 0.6076633572096991

Testing with 40 factors and alpha= 0.0075, beta= 0.01


SGD 1/4: 100%|██████████| 16753799/16753799 [05:18<00:00, 52647.15it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:19<00:00, 52370.99it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:18<00:00, 52590.12it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:18<00:00, 52597.42it/s]


MAE: 0.6056733317790597

Testing with 60 factors and alpha= 0.0075, beta= 0.01


SGD 1/4: 100%|██████████| 16753799/16753799 [05:19<00:00, 52453.32it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:17<00:00, 52721.97it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:19<00:00, 52511.12it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:17<00:00, 52715.53it/s]


MAE: 0.6062940564072427

Testing with 20 factors and alpha= 0.0025, beta= 0.03


SGD 1/4: 100%|██████████| 16753799/16753799 [05:16<00:00, 52977.21it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:16<00:00, 52855.11it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:17<00:00, 52848.63it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:16<00:00, 53000.18it/s]


MAE: 0.6447322477053871

Testing with 40 factors and alpha= 0.0025, beta= 0.03


SGD 1/4: 100%|██████████| 16753799/16753799 [05:18<00:00, 52639.20it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [05:17<00:00, 52714.64it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:17<00:00, 52799.42it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:10<00:00, 53880.90it/s]


MAE: 0.6422958162461712

Testing with 60 factors and alpha= 0.0025, beta= 0.03


SGD 1/4: 100%|██████████| 16753799/16753799 [05:00<00:00, 55778.90it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:59<00:00, 55909.92it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [05:00<00:00, 55793.06it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [05:00<00:00, 55700.88it/s]


MAE: 0.6414156281862163

Testing with 20 factors and alpha= 0.005, beta= 0.03


SGD 1/4: 100%|██████████| 16753799/16753799 [04:58<00:00, 56118.99it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:54<00:00, 56851.24it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59399.00it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:41<00:00, 59613.54it/s]


MAE: 0.6183309034377025

Testing with 40 factors and alpha= 0.005, beta= 0.03


SGD 1/4: 100%|██████████| 16753799/16753799 [04:41<00:00, 59436.19it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:41<00:00, 59543.49it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59346.86it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:41<00:00, 59484.86it/s]


MAE: 0.6153282722743716

Testing with 60 factors and alpha= 0.005, beta= 0.03


SGD 1/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59001.64it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59083.41it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59145.43it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59050.78it/s]


MAE: 0.6146917005007698

Testing with 20 factors and alpha= 0.0075, beta= 0.03


SGD 1/4: 100%|██████████| 16753799/16753799 [04:41<00:00, 59527.17it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:40<00:00, 59629.97it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59391.51it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:41<00:00, 59506.04it/s]


MAE: 0.6108913401527987

Testing with 40 factors and alpha= 0.0075, beta= 0.03


SGD 1/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59166.18it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59124.30it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59182.09it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59164.70it/s]


MAE: 0.6073646788079079

Testing with 60 factors and alpha= 0.0075, beta= 0.03


SGD 1/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59057.38it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59179.09it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59055.18it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59117.32it/s]


MAE: 0.6063026514691647

Testing with 20 factors and alpha= 0.0025, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59332.95it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:41<00:00, 59453.35it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59384.35it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:41<00:00, 59536.82it/s]


MAE: 0.6512854452295794

Testing with 40 factors and alpha= 0.0025, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59289.42it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59307.01it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59322.35it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59197.16it/s]


MAE: 0.6491447376196191

Testing with 60 factors and alpha= 0.0025, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58948.65it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58931.69it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59114.94it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58984.40it/s]


MAE: 0.6476473166753871

Testing with 20 factors and alpha= 0.005, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:41<00:00, 59557.68it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:40<00:00, 59676.22it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:41<00:00, 59534.73it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:40<00:00, 59677.78it/s]


MAE: 0.6251790190240952

Testing with 40 factors and alpha= 0.005, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59220.46it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59356.62it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59206.11it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59340.56it/s]


MAE: 0.6225070291490781

Testing with 60 factors and alpha= 0.005, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58920.57it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58973.55it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58916.05it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59043.46it/s]


MAE: 0.6219212219599535

Testing with 20 factors and alpha= 0.0075, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59329.75it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59317.98it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:42<00:00, 59269.02it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58757.03it/s]


MAE: 0.6168098460731848

Testing with 40 factors and alpha= 0.0075, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58756.02it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58648.12it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59048.21it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58749.91it/s]


MAE: 0.61372797918276

Testing with 60 factors and alpha= 0.0075, beta= 0.05


SGD 1/4: 100%|██████████| 16753799/16753799 [04:45<00:00, 58670.36it/s]
SGD 2/4: 100%|██████████| 16753799/16753799 [04:47<00:00, 58327.55it/s]
SGD 3/4: 100%|██████████| 16753799/16753799 [04:47<00:00, 58334.56it/s]
SGD 4/4: 100%|██████████| 16753799/16753799 [04:44<00:00, 58827.85it/s]


MAE: 0.6126971089434843



In [None]:
# ensemble best SGD models and average predictions
#Testing with 40 factors and alpha= 0.0075, beta= 0.01
#SGD 1/4: 100%|██████████| 16753799/16753799 [05:18<00:00, 52647.15it/s]
#SGD 2/4: 100%|██████████| 16753799/16753799 [05:19<00:00, 52370.99it/s]
#SGD 3/4: 100%|██████████| 16753799/16753799 [05:18<00:00, 52590.12it/s]
#SGD 4/4: 100%|██████████| 16753799/16753799 [05:18<00:00, 52597.42it/s]
#MAE: 0.6056733317790597

num_factors = 40  # Latent factors
alpha = 0.0075      # Learning rate
beta = 0.01       # Regularization
iterations = 10   # Number of iterations

# Run SGD
print(f"Testing with {num_factors} factors and alpha= {alpha}, beta= {beta}")
sgd_user_features, sgd_item_features = sgd(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
sgd_predictions = predict(sgd_user_features, sgd_item_features, val_data[0], val_data[1])
train_pred1 = round_predictions(sgd_predictions)

truth_ratings = val_data[2]
sgd_mae = calculate_mae(truth_ratings, train_pred1)
print(f"MAE:", sgd_mae)
print()

#Testing with 60 factors and alpha= 0.0075, beta= 0.03
#SGD 1/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59057.38it/s]
#SGD 2/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59179.09it/s]
#SGD 3/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59055.18it/s]
#SGD 4/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59117.32it/s]
#MAE: 0.6063026514691647

num_factors = 60  # Latent factors
alpha = 0.0075      # Learning rate
beta = 0.03      # Regularization
iterations = 10   # Number of iterations

# Run SGD
print(f"Testing with {num_factors} factors and alpha= {alpha}, beta= {beta}")
sgd_user_features, sgd_item_features = sgd(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
sgd_predictions = predict(sgd_user_features, sgd_item_features, val_data[0], val_data[1])
train_pred2 = round_predictions(sgd_predictions)

truth_ratings = val_data[2]
sgd_mae = calculate_mae(truth_ratings, train_pred2)
print(f"MAE:", sgd_mae)
print()

#Testing with 60 factors and alpha= 0.005, beta= 0.01
#SGD 1/8: 100%|██████████| 16753799/16753799 [05:24<00:00, 51693.81it/s]
#SGD 2/8: 100%|██████████| 16753799/16753799 [05:23<00:00, 51835.93it/s]
#SGD 3/8: 100%|██████████| 16753799/16753799 [05:04<00:00, 55013.00it/s]
#SGD 4/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54915.33it/s]
#SGD 5/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54876.24it/s]
#SGD 6/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54828.28it/s]
#SGD 7/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54852.84it/s]
#SGD 8/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54868.92it/s]
#MAE: 0.5960490111918451

num_factors = 60  # Latent factors
alpha = 0.005      # Learning rate
beta = 0.01       # Regularization
iterations = 8   # Number of iterations

# Run SGD
print()
sgd_user_features, sgd_item_features = sgd(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
sgd_predictions = predict(sgd_user_features, sgd_item_features, val_data[0], val_data[1])
train_pred3 = round_predictions(sgd_predictions)

truth_ratings = val_data[2]
sgd_mae = calculate_mae(truth_ratings, train_pred3)
print(f"MAE:", sgd_mae)
print()


Testing with 40 factors and alpha= 0.0075, beta= 0.01


SGD 1/10: 100%|██████████| 16753799/16753799 [04:45<00:00, 58713.27it/s]
SGD 2/10: 100%|██████████| 16753799/16753799 [04:46<00:00, 58505.60it/s]
SGD 3/10: 100%|██████████| 16753799/16753799 [04:46<00:00, 58534.76it/s]
SGD 4/10: 100%|██████████| 16753799/16753799 [04:45<00:00, 58653.03it/s]
SGD 5/10: 100%|██████████| 16753799/16753799 [04:45<00:00, 58664.29it/s]
SGD 6/10: 100%|██████████| 16753799/16753799 [04:46<00:00, 58513.97it/s]
SGD 7/10: 100%|██████████| 16753799/16753799 [04:46<00:00, 58499.12it/s]
SGD 8/10: 100%|██████████| 16753799/16753799 [04:45<00:00, 58671.02it/s]
SGD 9/10: 100%|██████████| 16753799/16753799 [04:46<00:00, 58405.57it/s]
SGD 10/10: 100%|██████████| 16753799/16753799 [04:46<00:00, 58547.85it/s]


MAE: 0.5988714146504979

Testing with 60 factors and alpha= 0.0075, beta= 0.03


SGD 1/10: 100%|██████████| 16753799/16753799 [04:44<00:00, 58787.60it/s]
SGD 2/10: 100%|██████████| 16753799/16753799 [04:46<00:00, 58488.59it/s]
SGD 3/10: 100%|██████████| 16753799/16753799 [04:46<00:00, 58459.48it/s]
SGD 4/10: 100%|██████████| 16753799/16753799 [04:49<00:00, 57954.58it/s]
SGD 5/10: 100%|██████████| 16753799/16753799 [04:47<00:00, 58209.88it/s]
SGD 6/10: 100%|██████████| 16753799/16753799 [04:43<00:00, 59048.83it/s]
SGD 7/10: 100%|██████████| 16753799/16753799 [04:44<00:00, 58958.53it/s]
SGD 8/10: 100%|██████████| 16753799/16753799 [04:44<00:00, 58968.02it/s]
SGD 9/10: 100%|██████████| 16753799/16753799 [04:43<00:00, 59131.80it/s]
SGD 10/10: 100%|██████████| 16753799/16753799 [04:43<00:00, 59195.12it/s]


MAE: 0.5923512543955683




SGD 1/10: 100%|██████████| 16753799/16753799 [04:45<00:00, 58781.14it/s]
SGD 2/10: 100%|██████████| 16753799/16753799 [04:43<00:00, 59038.33it/s]
SGD 3/10: 100%|██████████| 16753799/16753799 [04:44<00:00, 58929.47it/s]
SGD 4/10: 100%|██████████| 16753799/16753799 [04:43<00:00, 59077.22it/s]
SGD 5/10: 100%|██████████| 16753799/16753799 [04:43<00:00, 59027.23it/s]
SGD 6/10: 100%|██████████| 16753799/16753799 [04:43<00:00, 59005.30it/s]
SGD 7/10: 100%|██████████| 16753799/16753799 [04:43<00:00, 59099.67it/s]
SGD 8/10: 100%|██████████| 16753799/16753799 [04:43<00:00, 59040.08it/s]
SGD 9/10: 100%|██████████| 16753799/16753799 [04:44<00:00, 58893.92it/s]
SGD 10/10: 100%|██████████| 16753799/16753799 [04:44<00:00, 58849.34it/s]


MAE: 0.6060552748432207



In [None]:
# TEST 100K
num_factors = 60  # Latent factors
alpha = 0.0075      # Learning rate
beta = 0.03      # Regularization
iterations = 10   # Number of iterations

# Run SGD
print(f"Testing with {num_factors} factors and alpha= {alpha}, beta= {beta}")
sgd_user_features, sgd_item_features = sgd(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
sgd_predictions = predict(sgd_user_features, sgd_item_features, val_data[0], val_data[1])
train_pred2 = round_predictions(sgd_predictions)

truth_ratings = val_data[2]
sgd_mae = calculate_mae(truth_ratings, train_pred2)
print(f"MAE:", sgd_mae)
print()

Testing with 60 factors and alpha= 0.0075, beta= 0.03


SGD 1/2: 100%|██████████| 16753799/16753799 [04:52<00:00, 57228.63it/s]
SGD 2/2: 100%|██████████| 16753799/16753799 [04:49<00:00, 57938.57it/s]


MAE: 0.6359013587718516



In [None]:
# Best Run
#Testing with 60 factors and alpha= 0.0075, beta= 0.03
#SGD 1/10: 100%|██████████| 16753799/16753799 [04:44<00:00, 58787.60it/s]
#SGD 2/10: 100%|██████████| 16753799/16753799 [04:46<00:00, 58488.59it/s]
#SGD 3/10: 100%|██████████| 16753799/16753799 [04:46<00:00, 58459.48it/s]
#SGD 4/10: 100%|██████████| 16753799/16753799 [04:49<00:00, 57954.58it/s]
#SGD 5/10: 100%|██████████| 16753799/16753799 [04:47<00:00, 58209.88it/s]
#SGD 6/10: 100%|██████████| 16753799/16753799 [04:43<00:00, 59048.83it/s]
#SGD 7/10: 100%|██████████| 16753799/16753799 [04:44<00:00, 58958.53it/s]
#SGD 8/10: 100%|██████████| 16753799/16753799 [04:44<00:00, 58968.02it/s]
#SGD 9/10: 100%|██████████| 16753799/16753799 [04:43<00:00, 59131.80it/s]
#SGD 10/10: 100%|██████████| 16753799/16753799 [04:43<00:00, 59195.12it/s]
#MAE: 0.5923512543955683
final_mae = calculate_mae(truth_ratings, train_pred2)
print(f"Final MAE:", final_mae)

Final MAE: 0.6359013587718516


# Test Set and Submission 

In [8]:
# import test set

# load test set
def load_test(db_path):
    conn = sqlite3.connect(db_path)
    print("Loaded database")

    c = conn.cursor()
    print("Fetching data ...")
    c.execute('SELECT UserID, ItemID, TimeStamp FROM example_table')
    data = c.fetchall()
    conn.close()

    test_user_indices, test_item_indices, timestamps = zip(*data)

    return np.array(test_user_indices, dtype=np.int32), np.array(test_item_indices, dtype=np.int32), np.array(timestamps, dtype=np.int32)

def vectorize_indices(indices, mapping):
    missing_ids = [id for id in indices if id not in mapping]
    if missing_ids:
        print(f"Missing IDs: {set(missing_ids)}")
    
    vectorized_indices = np.array([mapping[id] if id in mapping else -1 for id in indices])

    return vectorized_indices, set(missing_ids)

# 20M dataset
test_dir_20M = '../../data/dataset2/test_20M.db'

# 100k dataset
test_dir_100K = '../../data/dataset1/test_100k.db'

   
# Load the dataset (excluding the header if present)
test_user_indices, test_item_indices, timestamps = load_test(test_dir_20M)
test_data = (test_user_indices, test_item_indices, timestamps)

test_user_indices_normalized, missing_users = vectorize_indices(test_user_indices, user_to_norm)
test_item_indices_normalized, missing_items = vectorize_indices(test_item_indices, item_to_norm)

print("Test data size: ", len(test_data[0]))
print("All data size: ", len(all_data[0]))
print("Ratio All Data / Test:", len(test_data[0]) / len(all_data[0]))
print("Ratio Train / Val:", len(val_data[0]) / len(train_data[0]))

print("Num users:", len(np.unique(test_user_indices_normalized)))
print("Num items:", len(np.unique(test_item_indices_normalized)))

Loaded database
Fetching data ...
Missing IDs: {22144, 23296, 24321, 24961, 18693, 20103, 20615, 25353, 18955, 24715, 21904, 16145, 24976, 19731, 12054, 10648, 24985, 21532, 23068, 25894, 22316, 14382, 24244, 20283, 16189, 25663, 25280, 24769, 24642, 26692, 25289, 26569, 25292, 23502, 13906, 20179, 22482, 25691, 24284, 24796, 21598, 23009, 24803, 24294, 23278, 16751, 22127, 22129, 25329, 26357, 15862, 25978, 21499, 23676}
Test data size:  1384930
All data size:  18615333
Ratio All Data / Test: 0.07439727239904868
Ratio Train / Val: 0.11111115753507607
Num users: 138493
Num items: 12864


In [9]:
# Run SGD on all data
#num_factors = 20  # Latent factors
#alpha = 0.0075      # Learning rate
#beta = 0.125       # Regularization
#iterations = 20 

num_factors = 200  # Latent factors
alpha = 0.002      # Learning rate
beta = 0.05      # Regularization
iterations = 140  # Number of iterations

sgd_user_factors, sgd_item_factors = sgd(all_data[0], all_data[1], all_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)




SGD 1/140: 100%|██████████| 18615333/18615333 [06:34<00:00, 47204.41it/s]
SGD 2/140: 100%|██████████| 18615333/18615333 [06:31<00:00, 47590.42it/s]
SGD 3/140: 100%|██████████| 18615333/18615333 [06:33<00:00, 47350.00it/s]
SGD 4/140: 100%|██████████| 18615333/18615333 [06:33<00:00, 47337.55it/s]
SGD 5/140: 100%|██████████| 18615333/18615333 [06:31<00:00, 47532.87it/s]
SGD 6/140: 100%|██████████| 18615333/18615333 [06:31<00:00, 47567.55it/s]
SGD 7/140: 100%|██████████| 18615333/18615333 [06:33<00:00, 47315.03it/s]
SGD 8/140: 100%|██████████| 18615333/18615333 [06:31<00:00, 47580.56it/s]
SGD 9/140: 100%|██████████| 18615333/18615333 [06:32<00:00, 47413.24it/s]
SGD 10/140: 100%|██████████| 18615333/18615333 [06:33<00:00, 47362.21it/s]
SGD 11/140: 100%|██████████| 18615333/18615333 [06:30<00:00, 47714.18it/s]
SGD 12/140: 100%|██████████| 18615333/18615333 [06:31<00:00, 47552.81it/s]
SGD 13/140: 100%|██████████| 18615333/18615333 [06:32<00:00, 47472.23it/s]
SGD 14/140: 100%|██████████| 18615

In [10]:
default_prediction_value = 3 # Midpoint between 0.5 and 5.0, adjust as needed

# Assuming you have a function `make_predictions` that uses your model
predictions = predict(sgd_user_factors, sgd_item_factors, test_user_indices_normalized, test_item_indices_normalized)

# Replace predictions for missing indices with the default value
predictions[test_item_indices_normalized == -1] = default_prediction_value



In [14]:
def realign_predictions(original_user_indices, original_item_indices, normalized_user_indices, normalized_item_indices, predictions, default_value=3.0):
    # Initialize the aligned predictions array with the default value
    aligned_predictions = np.full(original_user_indices.shape, default_value, dtype=float)

    # Create a mapping from normalized indices back to original positions
    # This map should be built from the normalized indices, not from the original indices directly.
    norm_to_orig_map = {}
    for idx, (norm_u, norm_i) in enumerate(zip(normalized_user_indices, normalized_item_indices)):
        if (norm_u, norm_i) not in norm_to_orig_map:  # avoid overriding if multiple original indices map to the same normalized index
            norm_to_orig_map[(norm_u, norm_i)] = idx

    # Use this map to assign predictions to their corresponding original indices
    for idx in range(len(predictions)):
        norm_u = normalized_user_indices[idx]
        norm_i = normalized_item_indices[idx]
        if (norm_u, norm_i) in norm_to_orig_map:
            orig_idx = norm_to_orig_map[(norm_u, norm_i)]
            aligned_predictions[orig_idx] = predictions[idx]

    return aligned_predictions

# Now call this function to realign your predictions
aligned_predictions = realign_predictions(test_user_indices, test_item_indices, test_user_indices_normalized, test_item_indices_normalized, predictions)

# Round the predictions to the nearest 0.5
final_predictions = round_predictions(aligned_predictions)


In [15]:
def save_predictions_to_csv(user_ids, item_ids, predictions, timestamps, filename):
    # Ensure all parts are numpy arrays (in case they are not)
    user_ids = np.array(user_ids)
    item_ids = np.array(item_ids)
    predictions = np.array(predictions)
    timestamps = np.array(timestamps)
    
    # Stack the arrays horizontally
    data_to_save = np.column_stack((user_ids, item_ids, predictions, timestamps))
        
    # Save to CSV
    np.savetxt(filename, data_to_save, delimiter=',', fmt='%d,%d,%.1f,%d')

path = 'Official_submission/results_140_int.csv'
save_predictions_to_csv(test_user_indices, test_item_indices, final_predictions, timestamps, path)
