In [32]:
import numpy as np
from tqdm import tqdm
import random
import sqlite3


In [10]:
# MAE and predict methods
def calculate_mae(actual, predicted):
    """
    Mean absolute error between actual and predicted ratings.

    Parameters:
    - actual: array-like, the actual ratings.
    - predicted: array-like, the predicted ratings (same shape as `actual`,
      or broadcastable against it).

    Returns:
    - The mean of |actual - predicted| as a float. A NaN in either input
      propagates to the result because np.mean does not skip NaNs.
    """
    # Coerce first so plain Python lists work as well as numpy arrays
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    # element-wise absolute error, then its average
    abs_err = np.abs(actual - predicted)
    return np.mean(abs_err)

def round_predictions(predictions):
    """
    Snap predictions to the nearest 0.5 and clamp them to [0.5, 5.0].

    Parameters:
    - predictions: scalar or array-like of raw predicted ratings.

    Returns:
    - Values on the 0.5 grid, limited to the range [0.5, 5.0].

    Exact ties follow np.round's round-half-to-even rule on the doubled value,
    e.g. 1.25 -> 1.0 and 1.75 -> 2.0.
    """
    # Coerce first: `list * 2` would repeat the list instead of doubling values
    half_steps = np.round(np.asarray(predictions, dtype=float) * 2)
    return np.clip(half_steps / 2, 0.5, 5.0)

def predict_rating(user_id, item_id, user_factors, item_factors):
    """
    Predicted rating for one user-item pair.

    Looks up the latent vector stored under `user_id` in `user_factors` and the
    one stored under `item_id` in `item_factors`, and returns their dot product.
    Any lookup error raised by the containers is propagated unchanged.
    """
    return np.dot(user_factors[user_id], item_factors[item_id])

def generate_predictions(validation_dict, user_factors, item_factors):
    """
    Predict every (user_id, item_id) pair in the validation set.

    Parameters:
    - validation_dict: dict mapping (user_id, item_id) -> actual rating.
    - user_factors, item_factors: factor containers passed to predict_rating.

    Returns:
    - (predictions, actual_ratings) as two numpy arrays in the dict's
      iteration order. A prediction of None is stored as NaN; the actual
      rating is recorded either way.
    """
    predicted_column = []
    actual_column = []
    for (user_id, item_id), actual_rating in validation_dict.items():
        estimate = predict_rating(user_id, item_id, user_factors, item_factors)
        # None marks an undefined prediction and is kept as NaN
        predicted_column.append(np.nan if estimate is None else estimate)
        actual_column.append(actual_rating)
    return np.array(predicted_column), np.array(actual_column)

In [8]:
def get_max_users_items(data):
    """
    Return (largest user id + 1, largest item id + 1) over the keys of `data`.

    `data` is a dict keyed by (user_id, item_id) tuples. The +1 makes the
    results usable as array lengths when ids are used directly as indices.
    """
    pairs = data.keys()
    highest_user = max(uid for uid, _ in pairs)
    highest_item = max(iid for _, iid in pairs)
    return highest_user + 1, highest_item + 1

In [9]:
def split_data(data, val_percent=0.1, seed=42):
    """
    Randomly split a ratings dict into training and validation dicts.

    Parameters:
    - data: dict mapping (user_id, item_id) -> rating.
    - val_percent: fraction of entries (0..1) placed in the validation set;
      the validation size is int(len(data) * val_percent).
    - seed: seed for the private random generator. The default of 42
      reproduces the split obtained by random.seed(42) + random.sample.

    Returns:
    - (train_data, val_data), two disjoint dicts that together contain every
      entry of `data`, each in the original key order.

    Raises:
    - ValueError if val_percent is outside [0, 1].
    """
    if not 0 <= val_percent <= 1:
        raise ValueError("val_percent must be between 0 and 1")

    all_keys = list(data.keys())
    val_size = int(len(all_keys) * val_percent)

    # A private generator keeps the split reproducible without reseeding the
    # process-wide `random` state that other code may rely on.
    rng = random.Random(seed)
    val_keys = set(rng.sample(all_keys, val_size))

    train_data = {}
    val_data = {}
    for key in all_keys:
        target = val_data if key in val_keys else train_data
        target[key] = data[key]

    return train_data, val_data

In [5]:
# import database

# Open the SQLite file holding the training ratings (D1 = 100k dataset)
conn = sqlite3.connect('../Specification/D1/train_100k.db')  # 100k training set
#conn = sqlite3.connect('../Specification/D2/train_20M.db')  # 20M training set
print("loaded database")

try:
    c = conn.cursor()

    # Fetch every (UserID, ItemID, Rating) row into memory
    print("fetching data ...")
    c.execute('SELECT UserID, ItemID, Rating FROM example_table')
    data = c.fetchall()
finally:
    # Close the connection even when the query raises
    conn.close()

# Build the (user_id, item_id) -> rating dict and track the largest ids seen
ratings_dict = {}
max_user_id = 0
max_item_id = 0

for user_id, item_id, rating in data:
    if rating > 0:  # Assuming we only care about positive ratings
        ratings_dict[(user_id, item_id)] = rating
        max_user_id = max(max_user_id, user_id)
        max_item_id = max(max_item_id, item_id)

print("Max user id: ", max_user_id)
print("Max item id: ", max_item_id)

# Split the data into training and validation sets
train_data, val_data = split_data(ratings_dict, val_percent=0.1)

print("Total data size: ", len(ratings_dict))
print("Train data size: ", len(train_data))
print("Validation data size: ", len(val_data))

loaded database
fetching data ...
Max user id:  943
Max item id:  1682
Total data size:  90570
Train data size:  81513
Validation data size:  9057


# SGD

In [13]:
def init_factors(num_factors, max_id, seed=42, scale=0.05):
    """
    Build a dict of small random latent vectors keyed by id.

    Parameters:
    - num_factors: length of each latent vector.
    - max_id: largest id to create; keys are 1..max_id inclusive.
    - seed: seed of the private generator. The default reproduces the values
      produced by np.random.seed(42) followed by np.random.normal.
    - scale: standard deviation of the normal draws.

    Returns:
    - dict mapping id -> float64 array of shape (num_factors,).

    Two calls with the same seed return identical vectors for the same id, so
    pass different seeds if independent tables are wanted.
    """
    # A private RandomState gives the same stream as the seeded global one
    # without resetting np.random's global state as a side effect.
    rng = np.random.RandomState(seed)
    factors = {}
    for i in range(1, max_id + 1):  # IDs start at 1
        factors[i] = rng.normal(scale=scale, size=num_factors).astype(np.float64)
    return factors

def sgd_update(user_factors, item_factors, user_id, item_id, actual_rating, alpha, beta, num_factors):
    """
    Apply one SGD step for a single observed rating, in place.

    Parameters:
    - user_factors, item_factors: containers of latent vectors indexed by id.
    - user_id, item_id: the rated pair.
    - actual_rating: the observed rating.
    - alpha: learning rate.
    - beta: L2 regularization strength.
    - num_factors: accepted for interface compatibility; not used here.

    Returns:
    - (user_factors, item_factors), the same containers after the update.
    """
    p_u = user_factors[user_id]
    q_i = item_factors[item_id]
    residual = actual_rating - np.dot(p_u, q_i)

    # Both steps are derived from the pre-update vectors, then limited to
    # [-1, 1] per component to prevent overflow.
    step_user = np.clip(alpha * (residual * q_i - beta * p_u), -1, 1)
    step_item = np.clip(alpha * (residual * p_u - beta * q_i), -1, 1)

    user_factors[user_id] += step_user
    item_factors[item_id] += step_item
    return user_factors, item_factors

def _init_sgd_factors(num_factors, max_id, seed=42, scale=0.05):
    """Return {id: N(0, scale) vector of length num_factors} for ids 1..max_id."""
    rng = np.random.RandomState(seed)
    return {i: rng.normal(scale=scale, size=num_factors) for i in range(1, max_id + 1)}


def matrix_factorization_SGD(train_data, num_factors, alpha, beta, num_epochs):
    """
    Factorize the ratings with plain SGD over every observed rating.

    Parameters:
    - train_data: dict mapping (user_id, item_id) -> rating.
    - num_factors: number of latent factors per user/item.
    - alpha: learning rate.
    - beta: L2 regularization strength.
    - num_epochs: number of full passes over train_data.

    Returns:
    - (user_factors, item_factors): dicts mapping id -> latent vector.

    The factor tables are created by a private initializer rather than the
    notebook-global `init_factors`: that name is rebound by the ALS section
    further down, so resolving it here at call time would make the result
    depend on which cells had been executed. The private initializer yields
    the same values as the dict-based `init_factors` defined in this section
    (seed 42, scale 0.05) for both tables.
    """
    max_user, max_item = get_max_users_items(train_data)
    # these are (max id + 1), i.e. sizes suitable for direct id indexing
    print("Number of users:", max_user, ", Number of items:", max_item)

    user_factors = _init_sgd_factors(num_factors, max_user)
    item_factors = _init_sgd_factors(num_factors, max_item)

    # One pass per epoch, in the dict's iteration order
    for epoch in range(num_epochs):
        for (user_id, item_id), rating in tqdm(train_data.items(), desc=f'SGD {epoch+1}/{num_epochs}', total=len(train_data)):
            user_factors, item_factors = sgd_update(
                user_factors, item_factors, user_id, item_id, rating, alpha, beta, num_factors)

    return user_factors, item_factors

#def create_id_mapping(ids):
#    """ Create a mapping from ID to a continuous range of indices starting at 0. """
#    unique_ids = sorted(set(ids))
#    return {id: idx for idx, id in enumerate(unique_ids)}
#
## Extract all user and item IDs
#user_ids = [uid for uid, _ in train_data.keys()]
#item_ids = [iid for _, iid in train_data.keys()]
#
## Create mappings
#user_mapping = create_id_mapping(user_ids)
#item_mapping = create_id_mapping(item_ids)
#
#def init_factors(num_factors, size):
#    """ Initialize factor matrices with a specified size. """
#    np.random.seed(42)
#    return np.random.normal(scale=0.05, size=(size, num_factors))
#
#def sgd_update(user_factors, item_factors, user_idx, item_idx, actual_rating, alpha, beta):
#    # Calculate prediction and error using indices
#    prediction = np.dot(user_factors[user_idx], item_factors[item_idx])
#    error = actual_rating - prediction
#    
#    # Compute updates directly without clipping
#    user_update = alpha * (error * item_factors[item_idx] - beta * user_factors[user_idx])
#    item_update = alpha * (error * user_factors[user_idx] - beta * item_factors[item_idx])
#
#    # Apply updates directly
#    user_factors[user_idx] += user_update
#    item_factors[item_idx] += item_update
#    
#
#def train_sgd_mini_batches(train_data, num_factors, alpha, beta, num_epochs, batch_size=100000):
#    # Initialize user and item factors
#    user_factors = init_factors(num_factors, len(user_mapping))
#    item_factors = init_factors(num_factors, len(item_mapping))
#
#    for epoch in range(num_epochs):
#        # Shuffle training data at the start of each epoch
#        shuffled_data = list(train_data.items())
#        np.random.shuffle(shuffled_data)
#        # Process each batch
#        for i in tqdm(range(0, len(shuffled_data), batch_size), desc=f'SGD {epoch+1}/{num_epochs}', total=len(shuffled_data) // batch_size):
#            batch_data = shuffled_data[i:i + batch_size]
#            for (user_id, item_id), rating in batch_data:
#                user_idx = user_mapping[user_id]
#                item_idx = item_mapping[item_id]
#                sgd_update(user_factors, item_factors, user_idx, item_idx, rating, alpha, beta)
#
#    return user_factors, item_factors
#
#
#

In [None]:
# optimise sgd hyperparameters

#iterations = 10
#for num_factors in [5, 10, 20]:  # Different complexities
#    for alpha in [0.01, 0.02, 0.03]:
#        for beta in [0.01, 0.02, 0.03]:
#            print(f"Testing with {num_factors} factors and alpha= {alpha}, beta= {beta}")
#            user_factors, item_factors = matrix_factorization_SGD(train_data, num_factors, alpha, beta, iterations)
#            SGD_predictions, truth_ratings = generate_predictions(val_data, user_factors, item_factors)
#            SGD_rounded_predictions = round_predictions(SGD_predictions)
#
#            SGD_mae = calculate_mae(truth_ratings, SGD_rounded_predictions)
#            print("MAE: ", SGD_mae)
#            print()

In [14]:
# Parameters for the SGD validation run
num_factors = 20
alpha = 0.01
# NOTE(review): the final all-data training cell uses beta = 0.02 and the
# commented-out grid search only tried 0.01-0.03 - confirm which is intended.
beta = 0.2
iterations = 10

# Run SGD on the training split, then score it on the validation split
user_factors, item_factors = matrix_factorization_SGD(train_data, num_factors, alpha, beta, iterations)
sgd_predictions, truth_ratings = generate_predictions(val_data, user_factors, item_factors)
# Snap to the 0.5 rating grid before scoring
sgd_rounded_predictions = round_predictions(sgd_predictions)

# sgd_rounded_predictions and sgd_mae are reused by the weighted-blend cell
sgd_mae = calculate_mae(truth_ratings, sgd_rounded_predictions)
print("MAE: ", sgd_mae)

#print(sgd_rounded_predictions.size)

Number of users: 944 , Number of items: 1683


SGD 1/10:   0%|          | 0/81513 [00:00<?, ?it/s]

SGD 2/10:   0%|          | 0/81513 [00:00<?, ?it/s]

SGD 3/10:   0%|          | 0/81513 [00:00<?, ?it/s]

SGD 4/10:   0%|          | 0/81513 [00:00<?, ?it/s]

SGD 5/10:   0%|          | 0/81513 [00:00<?, ?it/s]

SGD 6/10:   0%|          | 0/81513 [00:00<?, ?it/s]

SGD 7/10:   0%|          | 0/81513 [00:00<?, ?it/s]

SGD 8/10:   0%|          | 0/81513 [00:00<?, ?it/s]

SGD 9/10:   0%|          | 0/81513 [00:00<?, ?it/s]

SGD 10/10:   0%|          | 0/81513 [00:00<?, ?it/s]

MAE:  0.7797283868830739


# ALS


In [15]:
def init_factors(num_factors, size, seed=42, scale=0.1):
    """
    Initialize a (size, num_factors) matrix of random normal factors.

    Parameters:
    - num_factors: number of latent factors (columns).
    - size: number of rows, one per id starting at 0.
    - seed: seed of the private generator. The default reproduces the values
      produced by np.random.seed(42) followed by np.random.normal.
    - scale: standard deviation of the normal draws.

    NOTE: running this cell rebinds the notebook-global name `init_factors`,
    replacing the dict-based initializer defined in the SGD section. Any code
    that resolves `init_factors` by name afterwards gets this array version.
    """
    # Private RandomState: same values, no reset of np.random's global state
    rng = np.random.RandomState(seed)
    return rng.normal(scale=scale, size=(size, num_factors))

def update_factors(fixed_factors, ratings_dict, num_factors, lambda_reg, num_entities=None):
    """
    Solve the regularized least-squares problem for one side of ALS.

    Parameters:
    - fixed_factors: (n_fixed, num_factors) array for the side held constant.
    - ratings_dict: dict mapping entity id -> {fixed-side id: rating} for the
      side being solved.
    - num_factors: number of latent factors.
    - lambda_reg: ridge regularization strength.
    - num_entities: number of rows to produce. Defaults to
      max(ratings_dict) + 1, or 0 for an empty dict.

    Returns:
    - (num_entities, num_factors) array. Entities with no ratings keep an
      all-zero row.

    The output is sized from the side being solved, not from `fixed_factors`:
    users and items generally differ in count, and sizing from the fixed side
    drops entities or yields out-of-range indices on the next half-step.
    """
    if num_entities is None:
        num_entities = max(ratings_dict) + 1 if ratings_dict else 0

    fixed_factors = np.asarray(fixed_factors, dtype=float)
    new_factors = np.zeros((num_entities, num_factors))
    ridge = lambda_reg * np.eye(num_factors)

    for i in range(num_entities):
        rated = ratings_dict.get(i)
        if not rated:
            # no observations: the ridge solution is the zero vector
            continue
        rows = fixed_factors[list(rated.keys())]
        values = np.array(list(rated.values()), dtype=float)
        # normal equations: (F^T F + lambda I) x = F^T r
        new_factors[i] = np.linalg.solve(rows.T @ rows + ridge, rows.T @ values)

    return new_factors

def run_als(train_data, num_factors, lambda_reg, iterations):
    """
    Factorize the ratings with alternating least squares.

    Parameters:
    - train_data: dict mapping (user_id, item_id) -> rating.
    - num_factors: number of latent factors.
    - lambda_reg: ridge regularization strength passed to update_factors.
    - iterations: number of alternating user/item updates.

    Returns:
    - (user_factors, item_factors) as arrays indexed directly by id.
    """
    num_users, num_items = get_max_users_items(train_data)

    user_factors = init_factors(num_factors, num_users)
    item_factors = init_factors(num_factors, num_items)

    # The per-user and per-item rating lookups depend only on train_data, so
    # build them once instead of once per iteration.
    user_ratings = {u: {} for u in range(num_users)}
    item_ratings = {i: {} for i in range(num_items)}
    for (u, i), r in train_data.items():
        user_ratings[u][i] = r
        item_ratings[i][u] = r

    for _ in tqdm(range(iterations), desc='ALS iterations'):
        # solve users with items fixed, then items with the new users fixed
        user_factors = update_factors(item_factors, user_ratings, num_factors, lambda_reg)
        item_factors = update_factors(user_factors, item_ratings, num_factors, lambda_reg)

    return user_factors, item_factors

In [None]:
# optimise als hyperparameters

#iterations = 10   # Number of ALS iterations
#for num_factors in [1, 2, 3]:  # Different complexities
#    for lambda_reg in [0.3, 0.4, 0.5, 0.6]:  # Different regularization strengths
#        print(f"Testing with {num_factors} factors and lambda_reg= {lambda_reg}")
#        user_factors, item_factors = run_als(train_data, num_factors, lambda_reg, iterations=10)
#        ALS_predictions, truth_ratings = generate_predictions(val_data, user_factors, item_factors)
#        ALS_rounded_predictions = round_predictions(ALS_predictions)
#
#        ALS_mae = calculate_mae(truth_ratings, ALS_rounded_predictions)
#        print("MAE: ", ALS_mae)
#        print()


In [17]:
# best hyperparameter run: ALS on the training split, scored on validation

num_factors = 2  # Latent factors
lambda_reg = 0.5  # Regularization strength
iterations = 10   # Number of ALS iterations

# NOTE: this rebinds user_factors / item_factors / truth_ratings from the SGD run
user_factors, item_factors = run_als(train_data, num_factors, lambda_reg, iterations)
ALS_predictions, truth_ratings = generate_predictions(val_data, user_factors, item_factors)
# Snap to the 0.5 rating grid before scoring
ALS_rounded_predictions = round_predictions(ALS_predictions)

# ALS_rounded_predictions and ALS_mae are reused by the weighted-blend cell
ALS_mae = calculate_mae(truth_ratings, ALS_rounded_predictions)
print("MAE: ", ALS_mae)

#print(ALS_rounded_predictions.size)


ALS 0/10:   0%|          | 0/81513 [00:00<?, ?it/s]

ALS 1/10:   0%|          | 0/81513 [00:00<?, ?it/s]

ALS 2/10:   0%|          | 0/81513 [00:00<?, ?it/s]

ALS 3/10:   0%|          | 0/81513 [00:00<?, ?it/s]

ALS 4/10:   0%|          | 0/81513 [00:00<?, ?it/s]

ALS 5/10:   0%|          | 0/81513 [00:00<?, ?it/s]

ALS 6/10:   0%|          | 0/81513 [00:00<?, ?it/s]

ALS 7/10:   0%|          | 0/81513 [00:00<?, ?it/s]

ALS 8/10:   0%|          | 0/81513 [00:00<?, ?it/s]

ALS 9/10:   0%|          | 0/81513 [00:00<?, ?it/s]

MAE:  0.7118251076515403


# Weighted Predictions

In [18]:
# weighted predictions: search blend weights, then score the chosen blend

# Every (weight_sgd, weight_als) pair is tried, so most pairs do not sum to 1
# (e.g. 0.45 + 0.45 = 0.9), which scales the blend down before re-rounding.
# The blend combines the already-rounded SGD and ALS predictions.
for weight_sgd in [0.45, 0.475, 0.5, 0.525, 0.55]:
    for weight_als in [0.55, 0.525, 0.5, 0.475, 0.45]:
        #weight_als = 1 - weight_sgd
        weighted_predictions = (weight_sgd * sgd_rounded_predictions) + (weight_als * ALS_rounded_predictions)
        weighted_rounded_predictions = round_predictions(weighted_predictions)
        weighted_mae = calculate_mae(truth_ratings, weighted_rounded_predictions)
        print(f"SGD: {weight_sgd}, ALS: {weight_als}, MAE:  ", weighted_mae)
print()


# SGD and ALS MAEs
print("SGD MAE: ", sgd_mae)
print("ALS MAE: ", ALS_mae)


# Equal-weight blend.
# NOTE(review): the saved outputs show SGD MAE ~0.78 and ALS MAE ~0.71, i.e.
# ALS is the more accurate model on validation - earlier comments here
# claimed the opposite.
weight_sgd = 0.5  # weight applied to the rounded SGD predictions
weight_als = 0.5 # weight applied to the rounded ALS predictions

# sgd_rounded_predictions and ALS_rounded_predictions come from the same
# validation dict, so they are combined element-wise
weighted_predictions = (weight_sgd * sgd_rounded_predictions) + (weight_als * ALS_rounded_predictions)
weighted_rounded_predictions = round_predictions(weighted_predictions)
weighted_mae = calculate_mae(truth_ratings, weighted_rounded_predictions)

# weighted MAE
print("Weighted MAE: ", weighted_mae)




SGD: 0.45, ALS: 0.55, MAE:   0.7123771668322845
SGD: 0.45, ALS: 0.525, MAE:   0.7574251959810092
SGD: 0.45, ALS: 0.5, MAE:   0.7579220492436789
SGD: 0.45, ALS: 0.475, MAE:   0.8082146406094733
SGD: 0.45, ALS: 0.45, MAE:   0.8380810422877333
SGD: 0.475, ALS: 0.55, MAE:   0.7119355194876891
SGD: 0.475, ALS: 0.525, MAE:   0.7123771668322845
SGD: 0.475, ALS: 0.5, MAE:   0.7578668433256045
SGD: 0.475, ALS: 0.475, MAE:   0.7578668433256045
SGD: 0.475, ALS: 0.45, MAE:   0.8081042287733246
SGD: 0.5, ALS: 0.55, MAE:   0.7103897537816054
SGD: 0.5, ALS: 0.525, MAE:   0.7121563431599868
SGD: 0.5, ALS: 0.5, MAE:   0.7029921607596334
SGD: 0.5, ALS: 0.475, MAE:   0.7578668433256045
SGD: 0.5, ALS: 0.45, MAE:   0.7596334327039859
SGD: 0.525, ALS: 0.55, MAE:   0.7106657833719775
SGD: 0.525, ALS: 0.525, MAE:   0.7121563431599868
SGD: 0.525, ALS: 0.5, MAE:   0.7121563431599868
SGD: 0.525, ALS: 0.475, MAE:   0.7576460196533068
SGD: 0.525, ALS: 0.45, MAE:   0.7579772551617533
SGD: 0.55, ALS: 0.55, MAE:   0.

# Test Set and submission

In [15]:
# import test set

# 20M dataset: path to the D2 test CSV (ratings withheld); not loaded in this cell
test_dir_20M = '../Specification/D2/test_20M_withoutratings.csv'

# 100k dataset: path to the D1 test CSV (ratings withheld); this is the one loaded below
test_dir_100K = '../Specification/D1/test_100k_withoutratings.csv'

# Load the dataset
def load_data_np(filepath):
    """
    Load a comma-separated numeric file into a 2-D float64 array.

    No rows are skipped (skiprows=0), so the file must not contain a header.

    float64 is used because float32 represents integers exactly only up to
    2**24; larger integer values - such as the third column, which other
    cells in this notebook treat as a timestamp - would be silently rounded
    and then written back wrong in the submission file.
    """
    return np.loadtxt(filepath, delimiter=',', skiprows=0, dtype='float64')
   
# Load the 100k test set (load_data_np reads every row; no header is skipped)
test_data = load_data_np(test_dir_100K)

print("Test data shape: ", test_data.shape)
print("All data size: ", len(ratings_dict))

# Each label names the ratio in the order it is computed (numerator / denominator)
print("Ratio Test / All Data:", len(test_data) / len(ratings_dict))
print("Ratio Val / Train:", len(val_data) / len(train_data))

Test data shape:  (9430, 3)
All data size:  90570
Ratio All Data / Test: 0.10411836148835155
Ratio Train / Val: 0.1111111111111111


In [16]:
def generate_testset_predicitons(test_data, user_factors, item_factors):
    """
    Predict a rating for every row of the test set.

    Each row of `test_data` is unpacked into three values; the first two are
    cast to int and used as user and item ids, the third is ignored (assumed
    to be a timestamp column - confirm against the CSV).

    Returns a numpy array with one prediction per row, in row order.
    """
    return np.array([
        predict_rating(int(raw_user), int(raw_item), user_factors, item_factors)
        for raw_user, raw_item, _ in test_data
    ])

In [17]:
# Run SGD on all data (training + validation) for the final submission
num_factors = 20
alpha = 0.01
# NOTE(review): the SGD validation run earlier in the notebook used beta = 0.2,
# not 0.02 - confirm which value the reported MAE and this model should share.
beta = 0.02
iterations = 10

sgd_user_factors, sgd_item_factors = matrix_factorization_SGD(ratings_dict, num_factors, alpha, beta, iterations)
sgd_test_predictions = round_predictions(generate_testset_predicitons(test_data, sgd_user_factors, sgd_item_factors))

# Run ALS on all data; these assignments rebind num_factors and iterations
num_factors = 2  
lambda_reg = 0.5  
iterations = 10   

# NOTE(review): iterations is passed as the literal 10 rather than the
# variable above; the two agree today but can drift apart.
als_user_factors, als_item_factors = run_als(ratings_dict, num_factors, lambda_reg, iterations=10)
als_test_predictions = round_predictions(generate_testset_predicitons(test_data, als_user_factors, als_item_factors))


# weighted predictions: equal-weight blend, matching the validation cell
weight_sgd = 0.5  
weight_als = 0.5 

# sgd_test_predictions and als_test_predictions are already rounded and have
# one entry per test row, so they are combined element-wise and re-rounded
weighted_test_predictions = round_predictions((weight_sgd * sgd_test_predictions) + (weight_als * als_test_predictions))




Number of users: 944 , Number of items: 1683


SGD iterations: 100%|██████████| 10/10 [00:12<00:00,  1.29s/it]
ALS iterations: 100%|██████████| 10/10 [00:11<00:00,  1.19s/it]


In [18]:
# Assemble the submission table: ids, predicted rating, then the remaining
# column(s) of the test set (the timestamp), in that left-to-right order
predicted_testset = complete_data = np.concatenate(
    [
        test_data[:, :2],                          # UserID and ItemID columns
        weighted_test_predictions.reshape(-1, 1),  # Predicted ratings as a column
        test_data[:, 2:],                          # Timestamp column
    ],
    axis=1,
)

# Write one CSV row per test entry: int, int, rating with one decimal, int
path = 'Optional_Submission/test.csv'
np.savetxt(path, predicted_testset, delimiter=",", fmt='%d,%d,%.1f,%d')