In [4]:
import numpy as np
from tqdm import tqdm
import random
import sqlite3

In [5]:
# import dataset from database
def load_data(db_path):
    conn = sqlite3.connect(db_path)
    print("Loaded database")

    c = conn.cursor()
    print("Fetching data ...")
    c.execute('SELECT UserID, ItemID, Rating FROM example_table')
    data = c.fetchall()
    conn.close()

    user_indices = []
    item_indices = []
    ratings_values = []

    max_user_id = 0
    max_item_id = 0

    for user_id, item_id, rating in data:
        user_indices.append(user_id)
        item_indices.append(item_id)
        ratings_values.append(rating)
        #ratings_values.append(int(rating * 2)) # convert ratings to integers
        max_user_id = max(max_user_id, user_id)
        max_item_id = max(max_item_id, item_id)

    user_indices = np.array(user_indices, dtype=np.int32)
    item_indices = np.array(item_indices, dtype=np.int32)
    #ratings_values = np.array(ratings_values, dtype=np.int32)
    ratings_values = np.array(ratings_values, dtype=np.float32)

    print("Max user id:", max_user_id)
    print("Max item id:", max_item_id)

    return user_indices, item_indices, ratings_values, max_user_id, max_item_id

def normalize_ids(indices):
    unique_ids, inverse_indices = np.unique(indices, return_inverse=True)
    id_map = {original_id: idx for idx, original_id in enumerate(unique_ids)}
    reverse_map = {idx: original_id for idx, original_id in enumerate(unique_ids)}
    num_unique = len(unique_ids)  # The total number of unique indices
    return inverse_indices, num_unique, id_map, reverse_map



# split into train and validation sets
def split_data(user_indices, item_indices, ratings, split_ratio=0.9):
    np.random.seed(42)
    indices = np.random.permutation(len(ratings))
    split_point = int(len(ratings) * split_ratio)
    train_idx, val_idx = indices[:split_point], indices[split_point:]
    
    train_data = (user_indices[train_idx], item_indices[train_idx], ratings[train_idx])
    val_data = (user_indices[val_idx], item_indices[val_idx], ratings[val_idx])
    return train_data, val_data

In [6]:
# impor dataset

path_100k = '../../data/dataset1/train_100k.db'
path_20M = '../../data/dataset2/train_20M.db'

global_user_indices, global_item_indices, global_ratings, global_max_user_id, global_max_item_id = load_data(path_20M)

# Normalize user and item indices
global_user_indices, global_num_users, user_to_norm, norm_to_user = normalize_ids(global_user_indices)
global_item_indices, global_num_items, item_to_norm, norm_to_item = normalize_ids(global_item_indices)

all_data = (global_user_indices, global_item_indices, global_ratings)
train_data, val_data = split_data(global_user_indices, global_item_indices, global_ratings, split_ratio=0.9)

print("Train data size:",train_data[0].size)
print("Validation data size:",val_data[0].size)

# train_data[0] = user_indices
# train_data[1] = item_indices
# train_data[2] = ratings


Loaded database
Fetching data ...
Max user id: 138493
Max item id: 26744
Train data size: 16753799
Validation data size: 1861534


# MAE and Prediction Code

In [7]:
# MAE and predict methods
def calculate_mae(actual, predicted):
    """
    Parameters:
    - actual_ratings: np.array, the actual ratings.
    - predicted_ratings: np.array, the predicted ratings.
    """
    # calculate the absolute error between actual and predicted ratings
    abs_err = np.abs(actual - predicted)
    
    # calculate the mean of these absolute errors
    mae = np.mean(abs_err)
    
    return mae # /2

# round prediction to nearest 0.5 in range [0.5, 5]
def round_predictions(predictions):
    rounded_predictions = np.round(predictions * 2) / 2
    return np.clip(rounded_predictions, 0.5, 5.0)
    #rounded_predictions = np.round(predictions)
    #return np.clip(rounded_predictions, 1, 10)

def predict(user_features, item_features, user_indices, item_indices):
    predictions = np.array([np.dot(user_features[u], item_features[i]) for u, i in zip(user_indices, item_indices)])
    return predictions

# SGD

In [8]:
def sgd(user_indices, item_indices, ratings, num_users, num_items, num_factors, alpha, beta, iterations):
    # Initialize feature matrices
    np.random.seed(42)
    user_features = np.random.normal(0, 0.1, (num_users, num_factors))
    item_features = np.random.normal(0, 0.1, (num_items, num_factors))

    # SGD updates (only using training data)
    for iteration in range(iterations): #tqdm(range(iterations), desc='SGD iterations', total=iterations):
        for u, i, r in tqdm(zip(user_indices, item_indices, ratings), desc=f'SGD {iteration+1}/{iterations}', total=len(ratings)):
            prediction = np.dot(user_features[u], item_features[i])
            error = r - prediction

            # Update rules for features
            user_features_grad = -2 * error * item_features[i] + beta * user_features[u]
            item_features_grad = -2 * error * user_features[u] + beta * item_features[i]

            user_features[u] -= alpha * user_features_grad
            item_features[i] -= alpha * item_features_grad

    return user_features, item_features

In [7]:
import gc


beta = 0.01
alpha = 0.005
num_factors = 60

iterations = 4
for iterations in [5,6,7,8]:
    print(f"Testing with {num_factors} factors and alpha= {alpha}, beta= {beta}")
    user_features, item_features = sgd(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
    sgd_predictions = predict(user_features, item_features, val_data[0], val_data[1])
    sgd_rounded_predictions = round_predictions(sgd_predictions)

    truth_ratings = val_data[2]
    sgd_mae = calculate_mae(truth_ratings, sgd_rounded_predictions)
    print(f"MAE:", sgd_mae)
    print()
    gc.collect()
    
#0.607952366166828

Testing with 60 factors and alpha= 0.005, beta= 0.01


SGD 1/5: 100%|██████████| 16753799/16753799 [05:24<00:00, 51559.17it/s]
SGD 2/5: 100%|██████████| 16753799/16753799 [05:25<00:00, 51529.58it/s]
SGD 3/5: 100%|██████████| 16753799/16753799 [05:23<00:00, 51757.13it/s]
SGD 4/5: 100%|██████████| 16753799/16753799 [05:24<00:00, 51581.25it/s]
SGD 5/5: 100%|██████████| 16753799/16753799 [05:23<00:00, 51757.38it/s]


MAE: 0.6030598957633866

Testing with 60 factors and alpha= 0.005, beta= 0.01


SGD 1/6: 100%|██████████| 16753799/16753799 [05:24<00:00, 51682.07it/s]
SGD 2/6: 100%|██████████| 16753799/16753799 [05:23<00:00, 51806.54it/s]
SGD 3/6: 100%|██████████| 16753799/16753799 [05:23<00:00, 51774.27it/s]
SGD 4/6: 100%|██████████| 16753799/16753799 [05:24<00:00, 51691.97it/s]
SGD 5/6: 100%|██████████| 16753799/16753799 [05:24<00:00, 51660.46it/s]
SGD 6/6: 100%|██████████| 16753799/16753799 [05:23<00:00, 51847.83it/s]


MAE: 0.5989162701299037

Testing with 60 factors and alpha= 0.005, beta= 0.01


SGD 1/7: 100%|██████████| 16753799/16753799 [05:24<00:00, 51566.63it/s]
SGD 2/7: 100%|██████████| 16753799/16753799 [05:24<00:00, 51641.02it/s]
SGD 3/7: 100%|██████████| 16753799/16753799 [05:24<00:00, 51628.38it/s]
SGD 4/7: 100%|██████████| 16753799/16753799 [05:24<00:00, 51620.49it/s]
SGD 5/7: 100%|██████████| 16753799/16753799 [05:24<00:00, 51583.60it/s]
SGD 6/7: 100%|██████████| 16753799/16753799 [05:23<00:00, 51721.09it/s]
SGD 7/7: 100%|██████████| 16753799/16753799 [05:24<00:00, 51605.75it/s]


MAE: 0.5968792404543779

Testing with 60 factors and alpha= 0.005, beta= 0.01


SGD 1/8: 100%|██████████| 16753799/16753799 [05:24<00:00, 51693.81it/s]
SGD 2/8: 100%|██████████| 16753799/16753799 [05:23<00:00, 51835.93it/s]
SGD 3/8: 100%|██████████| 16753799/16753799 [05:04<00:00, 55013.00it/s]
SGD 4/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54915.33it/s]
SGD 5/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54876.24it/s]
SGD 6/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54828.28it/s]
SGD 7/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54852.84it/s]
SGD 8/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54868.92it/s]


MAE: 0.5960490111918451



In [9]:
#Testing with 60 factors and alpha= 0.0075, beta= 0.03
#SGD 1/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59057.38it/s]
#SGD 2/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59179.09it/s]
#SGD 3/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59055.18it/s]
#SGD 4/4: 100%|██████████| 16753799/16753799 [04:43<00:00, 59117.32it/s]
#MAE: 0.6063026514691647

num_factors = 60  # Latent factors
alpha = 0.0075      # Learning rate
beta = 0.03      # Regularization
iterations = 8   # Number of iterations

# Run SGD
print(f"Testing with {num_factors} factors and alpha= {alpha}, beta= {beta}")
sgd_user_features, sgd_item_features = sgd(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
sgd_predictions = predict(sgd_user_features, sgd_item_features, val_data[0], val_data[1])
train_pred2 = round_predictions(sgd_predictions)

truth_ratings = val_data[2]
sgd_mae = calculate_mae(truth_ratings, train_pred2)
print(f"MAE:", sgd_mae)
print()

#Testing with 60 factors and alpha= 0.005, beta= 0.01
#SGD 1/8: 100%|██████████| 16753799/16753799 [05:24<00:00, 51693.81it/s]
#SGD 2/8: 100%|██████████| 16753799/16753799 [05:23<00:00, 51835.93it/s]
#SGD 3/8: 100%|██████████| 16753799/16753799 [05:04<00:00, 55013.00it/s]
#SGD 4/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54915.33it/s]
#SGD 5/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54876.24it/s]
#SGD 6/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54828.28it/s]
#SGD 7/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54852.84it/s]
#SGD 8/8: 100%|██████████| 16753799/16753799 [05:05<00:00, 54868.92it/s]
#MAE: 0.5960490111918451

num_factors = 60  # Latent factors
alpha = 0.005      # Learning rate
beta = 0.01       # Regularization
iterations = 8   # Number of iterations

# Run SGD
print()
sgd_user_features, sgd_item_features = sgd(train_data[0], train_data[1], train_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
sgd_predictions = predict(sgd_user_features, sgd_item_features, val_data[0], val_data[1])
train_pred3 = round_predictions(sgd_predictions)

truth_ratings = val_data[2]
sgd_mae = calculate_mae(truth_ratings, train_pred3)
print(f"MAE:", sgd_mae)
print()

Testing with 60 factors and alpha= 0.0075, beta= 0.03


SGD 1/8: 100%|██████████| 16753799/16753799 [04:46<00:00, 58550.62it/s]
SGD 2/8: 100%|██████████| 16753799/16753799 [04:45<00:00, 58739.78it/s]
SGD 3/8: 100%|██████████| 16753799/16753799 [04:45<00:00, 58583.86it/s]
SGD 4/8: 100%|██████████| 16753799/16753799 [04:45<00:00, 58654.28it/s]
SGD 5/8: 100%|██████████| 16753799/16753799 [04:44<00:00, 58876.10it/s]
SGD 6/8: 100%|██████████| 16753799/16753799 [04:45<00:00, 58737.90it/s]
SGD 7/8: 100%|██████████| 16753799/16753799 [04:45<00:00, 58601.00it/s]
SGD 8/8: 100%|██████████| 16753799/16753799 [04:44<00:00, 58818.75it/s]


MAE: 0.5927603256239209




SGD 1/8: 100%|██████████| 16753799/16753799 [04:46<00:00, 58562.61it/s]
SGD 2/8: 100%|██████████| 16753799/16753799 [04:45<00:00, 58712.41it/s]
SGD 3/8: 100%|██████████| 16753799/16753799 [04:45<00:00, 58728.13it/s]
SGD 4/8: 100%|██████████| 16753799/16753799 [04:45<00:00, 58732.49it/s]
SGD 5/8: 100%|██████████| 16753799/16753799 [04:45<00:00, 58752.93it/s]
SGD 6/8: 100%|██████████| 16753799/16753799 [04:45<00:00, 58658.41it/s]
SGD 7/8: 100%|██████████| 16753799/16753799 [04:46<00:00, 58523.03it/s]
SGD 8/8: 100%|██████████| 16753799/16753799 [04:47<00:00, 58187.32it/s]


MAE: 0.5960490111918451



In [None]:
# ensemble best SGD models and average predictions


# Weighted Predictions


In [25]:
## weighted predictions
#
#for weight_sgd in [0.45, 0.475, 0.5, 0.525, 0.55]:
#    for weight_als in [0.55, 0.525, 0.5, 0.475, 0.45]:
#        #weight_als = 1 - weight_sgd
#        weighted_predictions = (weight_sgd * sgd_rounded_predictions) + (weight_als * als_rounded_predictions)
#        weighted_rounded_predictions = round_predictions(weighted_predictions)
#        weighted_mae = calculate_mae(truth_ratings, weighted_rounded_predictions)
#        print(f"SGD: {weight_sgd}, ALS: {weight_als}, MAE:  ", weighted_mae)
#print()
#
#
## SGD and ALS MAEs
#print("SGD MAE: ", sgd_mae)
#print("ALS MAE: ", als_mae)
#
#
#weight_sgd = 0.5  # Assume SGD has higher validation accuracy
#weight_als = 0.5 # ALS is slightly less accurate
#
## sgd_predictions and als_predictions are arrays of the same shape containing the predicted ratings
#weighted_predictions = (weight_sgd * sgd_rounded_predictions) + (weight_als * als_rounded_predictions)
#weighted_rounded_predictions = round_predictions(weighted_predictions)
#weighted_mae = calculate_mae(truth_ratings, weighted_rounded_predictions)
#
## weighted MAE
#print("Weighted MAE: ", weighted_mae)


SGD: 0.45, ALS: 0.55, MAE:   0.715358286408303
SGD: 0.45, ALS: 0.525, MAE:   0.7314232085679585
SGD: 0.45, ALS: 0.5, MAE:   0.7318648559125538
SGD: 0.45, ALS: 0.475, MAE:   0.7861874792977808
SGD: 0.45, ALS: 0.45, MAE:   0.8148945566964778
SGD: 0.475, ALS: 0.55, MAE:   0.7052556034006846
SGD: 0.475, ALS: 0.525, MAE:   0.715358286408303
SGD: 0.475, ALS: 0.5, MAE:   0.7318648559125538
SGD: 0.475, ALS: 0.475, MAE:   0.7318648559125538
SGD: 0.475, ALS: 0.45, MAE:   0.7861874792977808
SGD: 0.5, ALS: 0.55, MAE:   0.7046483383018659
SGD: 0.5, ALS: 0.525, MAE:   0.7052556034006846
SGD: 0.5, ALS: 0.5, MAE:   0.695925803246108
SGD: 0.5, ALS: 0.475, MAE:   0.7318648559125538
SGD: 0.5, ALS: 0.45, MAE:   0.7324721210113724
SGD: 0.525, ALS: 0.55, MAE:   0.7206580545434471
SGD: 0.525, ALS: 0.525, MAE:   0.7052556034006846
SGD: 0.525, ALS: 0.5, MAE:   0.7052556034006846
SGD: 0.525, ALS: 0.475, MAE:   0.7217621729049354
SGD: 0.525, ALS: 0.45, MAE:   0.7315888263221817
SGD: 0.55, ALS: 0.55, MAE:   0.736

# Test Set and Submission 

In [26]:
# import test set

# 20M dataset
test_dir_20M = '../../data/dataset2/test_20Mwithoutratings.csv'

# 100k dataset
test_dir_100K = '../../data/dataset1/test_100k_withoutratings.csv'


# Load the dataset
def load_data_np(filepath):
    return np.loadtxt(filepath, delimiter=',', skiprows=0, dtype='float32')
   
# Load the dataset (excluding the header if present)
test_data = load_data_np(test_dir_100K)

print("Test data shape: ", test_data.shape)
print("All data size: ", len(all_data[0]))

print("Ratio All Data / Test:", len(test_data) / len(all_data[0]))
print("Ratio Train / Val:", len(val_data[0]) / len(train_data[0]))

Test data shape:  (9430, 3)
All data size:  90570
Ratio All Data / Test: 0.10411836148835155
Ratio Train / Val: 0.1111111111111111


In [27]:
test_user_indices = np.array([user_to_norm.get(int(user), -1) for user in test_data[:, 0]])
test_item_indices = np.array([item_to_norm.get(int(item), -1) for item in test_data[:, 1]])



In [28]:
# Run SGD on all data
#num_factors = 20  # Latent factors
#alpha = 0.0075      # Learning rate
#beta = 0.125       # Regularization
#iterations = 20 

num_factors = 20
alpha = 0.01
beta = 0.02
iterations = 10

sgd_user_factors, sgd_item_factors = sgd(all_data[0], all_data[1], all_data[2], global_num_users, global_num_items, num_factors, alpha, beta, iterations)
sgd_test_predictions = round_predictions(predict(sgd_user_factors, sgd_item_factors, test_user_indices, test_item_indices))

# Run ALS on all data
#num_factors = 3
#lambda_reg = 0.75
#iterations = 10

# Run ALS on all data
num_factors = 2  
lambda_reg = 0.5  
iterations = 10

all_data_dict = [((u, i), r) for u, i, r in zip(all_data[0], all_data[1], all_data[2])]
als_user_factors, als_item_factors = als(all_data_dict, global_num_users, global_num_items, num_factors, lambda_reg, iterations=10)
als_test_predictions = round_predictions(predict(als_user_factors, als_item_factors, test_user_indices, test_user_indices))

# weighted predictions
weight_sgd = 0.5  
weight_als = 0.5 

# sgd_predictions and als_predictions are arrays of the same shape containing the predicted ratings
weighted_test_predictions = round_predictions((weight_sgd * sgd_test_predictions) + (weight_als * als_test_predictions))



SGD iterations: 100%|██████████| 10/10 [00:13<00:00,  1.39s/it]
ALS Iterations: 100%|██████████| 10/10 [00:12<00:00,  1.29s/it]


In [41]:
def revert_to_original_ids(predictions, user_indices, item_indices, norm_to_user, norm_to_item):
    original_user_ids = [norm_to_user.get(idx) for idx in user_indices]
    original_item_ids = [norm_to_item.get(idx) for idx in item_indices]
    return np.column_stack((original_user_ids, original_item_ids, predictions))

final_predictions = revert_to_original_ids(weighted_test_predictions, test_user_indices, test_item_indices, norm_to_user, norm_to_item)


In [42]:
print(final_predictions[:, 0:3])

[[1 84 4.0]
 [1 87 3.5]
 [1 180 4.0]
 ...
 [943 653 4.0]
 [943 673 4.5]
 [943 936 4.0]]


In [43]:
if np.any(final_predictions[:, 0:2] == None):
    print("None values found in user/item ID columns.")
if np.any(final_predictions[:, 2] == None):
    print("None values found in prediction column.")

# Before converting types, check and replace None values with a default or drop them
for i in range(final_predictions.shape[1]):  # Assuming final_predictions has 3 columns
    final_predictions[:, i] = np.where(final_predictions[:, i] == None, -1, final_predictions[:, i])

# Now try conversion
final_predictions[:, 0:2] = final_predictions[:, 0:2].astype(int)
predicted_ratings = final_predictions[:, 2].reshape(-1, 1).astype(float)
timestamps = test_data[:, 2].reshape(-1, 1).astype(int)

predicted_testset = np.hstack((final_predictions[:, 0:2].astype(int), predicted_ratings, timestamps))

path = 'Optional_Submission/results3.csv'
#np.savetxt(path, predicted_testset, delimiter=",", fmt='%d,%d,%.1f,%d')



None values found in user/item ID columns.
