In [1]:
import numpy as np

train_dir = '../Specification/D2/train_20M_withratings.csv'
test_dir = '../Specification/D2/test_20M_withoutratings.csv'

# Load both CSV files as float32 arrays. skip_header=0 means no leading line is
# skipped, so the files are parsed from their very first row.
train_data = np.genfromtxt(train_dir, delimiter=',', dtype='float32',  skip_header=0)
test_data = np.genfromtxt(test_dir, delimiter=',',  dtype='float32', skip_header=0)

print(train_data.shape)
print(test_data.shape)

# Fixed seed so the in-place row shuffle (and therefore the split below) is repeatable.
np.random.seed(42)
np.random.shuffle(train_data)

# 90:10 train/validation split: the first 90% of the shuffled rows are kept for training
split_index = int(len(train_data) * 0.9)

# training and validation sets
train_subset = train_data[:split_index]
val_data = train_data[split_index:]
# NOTE: train_data is rebound to the training portion only from here on.
train_data = train_subset

print(val_data.shape)


(18615333, 4)
(1384930, 3)
(1861534, 4)


In [2]:
def calculate_mae(actual, predicted):
    """
    Mean absolute error between two sets of ratings.

    Parameters:
    - actual: np.array, the ground-truth ratings.
    - predicted: np.array, the predicted ratings (must broadcast against `actual`).

    Returns:
    - The mean of |actual - predicted| over all elements.
    """
    return np.mean(np.abs(actual - predicted))


In [7]:
# Splits a ratings table into its user / item / rating columns.
def create_sparse_representation(data):
    """
    Return the first three columns of `data` as separate 1-D arrays.

    Columns 0 and 1 (user and item IDs) are converted to int; column 2
    (ratings) is returned as-is. The IDs are passed through exactly as stored:
    no re-indexing is performed, so if the IDs are not already usable as array
    indices the caller has to build its own old-ID -> new-index mapping.
    """
    def id_column(col):
        # Integer copy of one ID column.
        return data[:, col].astype(int)

    return id_column(0), id_column(1), data[:, 2]

# Split the training rows into parallel user-ID / item-ID / rating arrays.
users, items, ratings = create_sparse_representation(train_data)


In [8]:
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    """
    Fit user and item latent factors to observed ratings with plain SGD.

    Parameters:
    - R: 2-D array of observations. Column 0 is the 1-based user ID, column 1
      the 1-based item ID and column 2 the rating; further columns are ignored.
    - P: float array of shape (num_users, K) with the user factors.
      It is updated IN PLACE.
    - Q: float array of shape (num_items, K) with the item factors.
      It is updated IN PLACE (through a transposed view).
    - K: number of latent factors. Not read by this function (the shapes of P
      and Q determine it); kept so existing callers keep working.
    - steps: number of full passes over R.
    - alpha: learning rate.
    - beta: L2 regularisation weight.

    Returns:
    - (P, Q) in the same (rows = users/items, columns = factors) layout that
      was passed in.
    """
    Q = Q.T  # work on a (K, num_items) view; writes still land in the caller's Q
    for step in range(steps):
        for i in range(len(R)):
            # IDs in R are 1-based, array rows are 0-based
            u = int(R[i, 0]) - 1
            item = int(R[i, 1]) - 1
            r = R[i, 2]
            e = r - np.dot(P[u, :], Q[:, item])

            # Both gradient steps must be taken from the SAME point, so keep
            # the user vector as it was before it gets overwritten below.
            p_before = P[u, :].copy()

            # Update user and item latent feature matrices
            P[u, :] += alpha * (2 * e * Q[:, item] - beta * P[u, :])
            Q[:, item] += alpha * (2 * e * p_before - beta * Q[:, item])
    return P, Q.T

# Adjusting the predict_rating function as well
def predict_rating(P, Q, user, item):
    """
    Predict one rating on the 0.5 .. 5.0 half-star scale.

    `user` and `item` are 1-based IDs; row `user - 1` of P and row `item - 1`
    of Q are multiplied together, the result is limited to [0.5, 5.0] and then
    rounded to the nearest multiple of 0.5.
    """
    raw_score = np.dot(P[user - 1, :], Q[item - 1, :].T)

    # Limit to the valid rating range first (upper bound, then lower bound) ...
    bounded = min(5.0, raw_score)
    bounded = max(0.5, bounded)

    # ... then snap to half-star steps.
    return round(bounded * 2) / 2

# Copy of val_data with one extra zero-filled column (index 4) that receives the prediction.
val_pred = np.hstack((val_data, np.zeros((len(val_data), 1))))  # Add a column for predRating

# NOTE(review): P and Q are not assigned anywhere above this point in the
# notebook, and matrix_factorization is defined but never called. On a fresh
# kernel this loop therefore fails with a NameError; when it does run, it uses
# whatever P and Q happen to be left in the kernel. TODO: initialise and train
# P and Q before this loop.
# Then, populate the predRating column
for i in range(len(val_data)):
    # Columns 0 and 1 are read as the user and item IDs of this row.
    user, item = int(val_pred[i, 0]), int(val_pred[i, 1])
    # Call predict_rating with the user and item, ensuring P and Q are accessible
    pred_rating = predict_rating(P, Q, user, item)
    val_pred[i, 4] = pred_rating  # Column index 4 for predRating



In [9]:
# MAE between column 2 of val_pred and column 4 (the predRating column filled in above).
print(calculate_mae(val_pred[:,2],val_pred[:,4]))

2.389442524283736


# SGD

In [None]:
# Dataset locations.
#   train files: userID, itemID, rating, timestamp
#   test files:  userID, itemID, timestamp
DATASET_PATHS = {
    # 20M dataset
    '20M': ('../Specification/D2/train_20M_withratings.csv',
            '../Specification/D2/test_20M_withoutratings.csv'),
    # 100k dataset
    '100k': ('../Specification/D1/train_100k_withratings.csv',
             '../Specification/D1/test_100k_withoutratings.csv'),
}

# Pick the dataset here. Previously both pairs of paths were assigned one after
# the other, so the 20M paths were silently overwritten by the 100k ones; the
# effective choice ('100k') is now explicit.
DATASET = '100k'

train_dir, test_dir = DATASET_PATHS[DATASET]

# Load the dataset
def load_data_np(filepath):
    """Read a comma-separated numeric file into a float32 array.

    No leading rows are skipped (skiprows=0), so every line of the file must
    be numeric.
    """
    csv_options = dict(delimiter=',', skiprows=0, dtype='float32')
    return np.loadtxt(filepath, **csv_options)
   
# Load the train and test files. load_data_np reads with skiprows=0, so no
# header line is skipped.
train_data = load_data_np(train_dir)
test_data = load_data_np(test_dir)

print(train_data.shape)
print(test_data.shape)

# split train data into train and validation sets
def split_train_val(data, val_ratio=0.1):
    """Randomly split the rows of `data` into a training and a validation part.

    The global NumPy RNG is re-seeded with 42 on every call, so the same input
    always produces the same split. `int(len(data) * val_ratio)` rows go to the
    validation part, the rest to the training part.

    Returns:
    - (train_rows, val_rows)
    """
    np.random.seed(42)
    order = np.random.permutation(len(data))
    n_val = int(len(data) * val_ratio)

    # The first n_val shuffled positions form the validation part.
    val_part = data[order[:n_val]]
    train_part = data[order[n_val:]]
    return train_part, val_part

# Hold out 10% of the rows for validation; train_data is rebound to the remaining rows.
train_data, val_data = split_train_val(train_data, val_ratio=0.1)

print(val_data.shape)

# Preview the first five training rows.
print(train_data[:5])

In [None]:
def init_matrices(num_users, num_items, num_factors):
    """Create the initial user (P) and item (Q) factor matrices.

    Both matrices are filled with normally distributed values of standard
    deviation 1 / num_factors, drawn from the global NumPy RNG after seeding it
    with 42. Each matrix gets one row more than the count passed in
    (num_users + 1 and num_items + 1 rows respectively).

    Returns:
    - (P, Q) as float64 arrays with `num_factors` columns.
    """
    np.random.seed(42)  # For reproducibility
    spread = 1. / num_factors

    def draw(count):
        # One (count + 1, num_factors) float64 matrix from the seeded stream.
        return np.asarray(
            np.random.normal(scale=spread, size=(count + 1, num_factors)),
            dtype=np.float64,
        )

    # P must be drawn before Q so the random stream is consumed in the same order.
    P = draw(num_users)
    Q = draw(num_items)
    return P, Q

In [None]:
# single thread sgd
def sgd(train_data, P, Q, num_factors, alpha, beta, iterations):
    """
    Single-threaded SGD training of the user (P) and item (Q) factor matrices.

    Parameters:
    - train_data: 2-D array; column 0 is the 1-based user ID, column 1 the
      1-based item ID and column 2 the rating. Further columns are ignored.
      The rows are shuffled IN PLACE at the start of every epoch.
    - P: float array (users x factors), updated IN PLACE.
    - Q: float array (items x factors), updated IN PLACE (via a transposed view).
    - num_factors: not read by this function (the shapes of P and Q determine
      it); kept so existing callers keep working.
    - alpha: learning rate.
    - beta: regularisation weight.
    - iterations: number of passes (epochs) over train_data.

    Returns:
    - (P, Q) in the layout that was passed in. If a factor becomes NaN/inf,
      a message is printed and training stops immediately, returning the
      matrices as they are at that point.
    """
    # Progress bar is optional: fall back to plain iteration when tqdm is missing.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable, **_unused):
            return iterable

    Q = Q.T  # Transpose for easier multiplication
    for iteration in range(iterations):
        np.random.shuffle(train_data)  # Shuffle the ratings to prevent order effects
        for row in progress(train_data, desc=f'{iteration+1}/{iterations}:', total=len(train_data)):
            # IDs are 1-based, array rows are 0-based
            user = int(row[0]) - 1
            item = int(row[1]) - 1
            rating = row[2]
            prediction = np.dot(P[user], Q[:, item])
            e = (rating - prediction)

            # Check for large values
            if np.any(np.abs(P[user]) > 1e5) or np.any(np.abs(Q[:, item]) > 1e5):
                print(f"Large values detected at iteration {iteration}, user {user}, item {item}")

            # Both gradient steps are taken from the same point: keep the user
            # vector as it was before it is overwritten.
            p_before = P[user].copy()
            P[user] += alpha * (e * Q[:, item] - beta * p_before)
            Q[:, item] += alpha * (e * p_before - beta * Q[:, item])

            # Only the two vectors touched above can have changed, so checking
            # them is enough (scanning all of P and Q per rating is far too slow).
            if not (np.all(np.isfinite(P[user])) and np.all(np.isfinite(Q[:, item]))):
                print(f"Overflow/NaN detected at iteration {iteration}, user {user}, item {item}")
                # Stop training altogether: further epochs would only spread the NaNs.
                return P, Q.T
    return P, Q.T

In [None]:
from multiprocessing import cpu_count, Pool

def sgd_chunk(args):
    """
    Run one SGD pass over a chunk of ratings (worker function for Pool.map).

    Parameters:
    - args: tuple (ratings, P, Q, num_factors, alpha, beta).
      `ratings` rows hold the 1-based user ID, the 1-based item ID and the
      rating in columns 0-2; further columns are ignored. `num_factors` is
      unpacked but not used. P and Q are updated in place on whatever copy
      this function receives.

    Returns:
    - (P, Q) after the pass, with Q in the (items x factors) layout it came in.
    """
    ratings, P, Q, num_factors, alpha, beta = args
    Q = Q.T  # (factors x items) view; writes go through to the Q that was passed in
    for row in ratings:
        # IDs are 1-based, array rows are 0-based
        user = int(row[0]) - 1
        item = int(row[1]) - 1
        rating = row[2]
        e = rating - np.dot(P[user], Q[:, item])

        # Both gradient steps are taken from the same point: keep the user
        # vector as it was before it is overwritten.
        p_before = P[user].copy()
        P[user] += alpha * (e * Q[:, item] - beta * p_before)
        Q[:, item] += alpha * (e * p_before - beta * Q[:, item])
    return P, Q.T

def sgd_parallel(ratings, P, Q, num_factors, alpha, beta, iterations):
    """
    Data-parallel SGD: each worker trains on one chunk, results are averaged.

    Per iteration every chunk of `ratings` is sent to `sgd_chunk` together with
    the current P and Q; the P and Q matrices returned by the workers are then
    averaged element-wise to form the P and Q for the next iteration.

    Parameters:
    - ratings: 2-D array of rating rows as expected by `sgd_chunk`. The rows
      are shuffled IN PLACE once, before training starts.
    - P, Q: initial user / item factor matrices.
    - num_factors, alpha, beta: forwarded unchanged to `sgd_chunk`.
    - iterations: number of train-and-average rounds.

    Returns:
    - (P, Q) after the last round. With no ratings the inputs are returned
      unchanged.
    """
    if len(ratings) == 0:
        # Nothing to train on (and nothing to split into chunks).
        return P, Q

    # Progress bar is optional: fall back to plain iteration when tqdm is missing.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable, **_unused):
            return iterable

    np.random.shuffle(ratings)
    num_processes = cpu_count()
    # At least one row per chunk, otherwise range() below gets a zero step.
    chunk_size = max(1, len(ratings) // num_processes)
    # The chunks do not change between iterations, so build them once.
    chunks = [ratings[i:i + chunk_size] for i in range(0, len(ratings), chunk_size)]

    # The context manager shuts the worker processes down even if a round fails.
    with Pool(num_processes) as pool:
        for iteration in progress(range(iterations), desc='Training Progress', total=iterations):
            results = pool.map(sgd_chunk, [(chunk, P, Q, num_factors, alpha, beta) for chunk in chunks])
            P_parts, Q_parts = zip(*results)
            P = np.mean(P_parts, axis=0)
            Q = np.mean(Q_parts, axis=0)
    return P, Q


In [None]:
# parameters
# The factor matrices are indexed by ID, so size them from the largest ID that
# occurs in ANY split. Using the training split alone leaves a user or item
# that only appears in the validation/test data without a row, which surfaces
# as an IndexError at prediction time.
id_sources = (train_data, val_data, test_data)
num_users = int(max(split[:, 0].max() for split in id_sources))  # user IDs are in the first column
num_items = int(max(split[:, 1].max() for split in id_sources))  # item IDs are in the second column

print(f"Number of Users: {num_users}, Number of Items: {num_items}")

num_factors = 20  # Number of latent factors
alpha = 0.01  # Learning rate
beta = 0.01  # Regularization parameter
iterations = 10  # Number of SGD iterations
workers = 4  # not read by the single-threaded sgd call below

P, Q = init_matrices(num_users, num_items, num_factors)

P, Q = sgd(train_data, P, Q, num_factors, alpha, beta, iterations)
# Parallel alternative:
#P, Q = sgd_parallel(train_data, P, Q, num_factors, alpha, beta, iterations)

In [None]:
def round_predictions(predictions):
    """Snap predictions to the nearest multiple of 0.5, then limit them to [0.5, 5.0]."""
    half_steps = np.round(2 * predictions)  # count of half-point steps
    return np.clip(half_steps / 2, 0.5, 5.0)

def predict_rating(user_index, item_index, P, Q):
    """Predict the rating of one user for one item as a dot product.

    `user_index` and `item_index` are 1-based; row `user_index - 1` of P is
    multiplied with row `item_index - 1` of Q. The result is returned as
    computed, without any rounding or clipping.
    """
    user_vector = P[user_index - 1]
    item_vector = Q[item_index - 1]
    return np.dot(user_vector, item_vector)

def generate_predictions(validation_set, P, Q):
    """
    Predict a raw rating for every row of `validation_set`.

    Parameters:
    - validation_set: 2-D array whose column 0 holds 1-based user IDs and
      column 1 holds 1-based item IDs. Any further columns (rating, timestamp)
      are ignored, so rating-less rows can be passed as well.
    - P: user factor matrix (users x factors).
    - Q: item factor matrix (items x factors).

    Returns:
    - 1-D array with one prediction per row: the dot product of row
      `user - 1` of P and row `item - 1` of Q. Values are not rounded or
      clipped. An empty input gives an empty array.
    """
    rows = np.asarray(validation_set)
    if rows.size == 0:
        return np.array([])

    # IDs are 1-based, array rows are 0-based
    user_rows = rows[:, 0].astype(np.int64) - 1
    item_rows = rows[:, 1].astype(np.int64) - 1

    n_rows = len(user_rows)
    predictions = np.empty(n_rows, dtype=np.result_type(P, Q))

    # Work in batches so the gathered factor rows stay small even for
    # millions of ratings.
    batch = 1 << 18
    for start in range(0, n_rows, batch):
        stop = start + batch
        # Row-wise dot product of the selected user and item vectors.
        predictions[start:stop] = np.einsum(
            'ij,ij->i', P[user_rows[start:stop]], Q[item_rows[start:stop]]
        )
    return predictions

# Score the validation split: raw dot-product predictions are snapped to
# half-point steps within [0.5, 5.0] before they are compared with the ratings.
predictions = generate_predictions(val_data, P, Q)
rounded_predictions = round_predictions(predictions)
truth_ratings = val_data[:, 2]  # column 2 holds the actual rating

mae = calculate_mae(truth_ratings, rounded_predictions)
print("MAE: ", mae)