In [1]:
import numpy as np


# dataset locations
train_dir = '../Specification/D1/train_100k_withratings.csv'
test_dir = '../Specification/D1/test_100k_withoutratings.csv'

# Load the datasets
# userid, itemid, rating, and timestamp
train_data = np.genfromtxt(train_dir, delimiter=',', skip_header=0)

#userid, itemid, and timestamp 
test_data = np.genfromtxt(test_dir, delimiter=',', skip_header=0)

#print(train_data.shape, test_data.shape)
print(train_data[:5])

np.random.seed(42)

np.random.shuffle(train_data)

# split 80:20 OR 90:10
split_index = int(len(train_data) * 0.9)

# training and validation sets
train_subset = train_data[:split_index]
val_data = train_data[split_index:]
train_data = train_subset

print(f"Training data size: {train_subset.shape}")
print(f"Validation data size: {val_data.shape}")

print(f"Test data size: {test_data.shape}")




[[1.00000000e+00 1.00000000e+00 3.00000000e+00 8.81250949e+08]
 [1.00000000e+00 1.10000000e+01 2.00000000e+00 8.81251577e+08]
 [1.00000000e+00 9.30000000e+01 4.00000000e+00 8.81251843e+08]
 [1.00000000e+00 2.22000000e+02 5.00000000e+00 8.81251820e+08]
 [1.00000000e+00 2.92000000e+02 3.00000000e+00 8.81251911e+08]]
Training data size: (81513, 4)
Validation data size: (9057, 4)
Test data size: (9430, 3)


In [2]:
def calculate_mae(actual, predicted):
    """
    Parameters:
    - actual_ratings: np.array, the actual ratings.
    - predicted_ratings: np.array, the predicted ratings.
    """
    # calculate the absolute error between actual and predicted ratings
    abs_err = np.abs(actual - predicted)
    
    # calculate the mean of these absolute errors
    mae = np.mean(abs_err)
    
    return mae


In [3]:
#set timestamps equal to correct column
timestamps = train_data[:, 3]

# min and max timestamps
min_time = np.min(timestamps)
max_time = np.max(timestamps)

# convert seconds to years
time_range_years = (max_time - min_time) / (60*60*24*365) 

# calculate suitable number of bins
N = max(int(time_range_years * 12), 1)  # At least 1 bin
#N = 10

# edges for each bin
bin_edges = np.linspace(min_time, max_time, N+1)

# add tiemstamps to bins
timestamp_bins = np.digitize(timestamps, bin_edges, right=True)



In [4]:
# Array dimensions
num_users = int(np.max(train_data[:, 0]))
num_items = int(np.max(train_data[:, 1]))

# Initialize matrices with an additional dimension for timestamp bins
user_item_time_matrix = np.zeros((num_users, num_items, N))
item_user_time_matrix = np.zeros((num_items, num_users, N))

# Populate the user-item matrix with timestamp bins
for i, entry in enumerate(train_data):
    user_id, item_id, rating, _ = entry
    bin_index = timestamp_bins[i] - 1  # Adjusting index if necessary
    user_item_time_matrix[int(user_id)-1, int(item_id)-1, bin_index] = rating

# The item-user "matrix" involves swapping the user and item dimensions but keeping the bin index the same
for i, entry in enumerate(train_data):
    user_id, item_id, rating, _ = entry
    bin_index = timestamp_bins[i] - 1  # Adjusting index if necessary
    item_user_time_matrix[int(item_id)-1, int(user_id)-1, bin_index] = rating

print("User-Item Matrix shape:", user_item_time_matrix.shape)
print("Item-User Matrix shape:", item_user_time_matrix.shape)


User-Item Matrix shape: (943, 1682, 7)
Item-User Matrix shape: (1682, 943, 7)


In [5]:
def cosine_similarity_3d(matrix):
    # Number of time bins
    N = matrix.shape[2]
    
    # Initialize cosine similarity matrix
    num_entities = matrix.shape[0]  # Number of users or items
    cosine_sim = np.zeros((num_entities, num_entities))
    
    # Calculate cosine similarity for each time bin and average
    for n in range(N):
        # Extract the matrix for the current time bin
        matrix_bin = matrix[:, :, n]
        
        # Calculate the dot product
        dot_product = np.dot(matrix_bin, matrix_bin.T)
        
        # Calculate the norm
        norm = np.linalg.norm(matrix_bin, axis=1)
        
        # Avoid division by zero
        norm[norm == 0] = 1
        
        # Calculate the outer product of the norms
        norm_outer = np.outer(norm, norm)
        
        # Calculate cosine similarity for this bin
        cosine_sim_bin = dot_product / norm_outer
        
        # Replace any NaN values with 0
        cosine_sim_bin = np.nan_to_num(cosine_sim_bin)
        
        # Accumulate the results
        cosine_sim += cosine_sim_bin
    
    # Average the accumulated similarities
    cosine_sim /= N
    
    return cosine_sim

user_time_sim_matrix = cosine_similarity_3d(user_item_time_matrix)
item_time_sim_matrix = cosine_similarity_3d(item_user_time_matrix)


In [6]:
def predict_rating_item_time(user_id, item_id, item_time_sim_matrix, user_item_time_matrix, timestamp, bin_edges, k=9, default_rating=3):
    # Determine the time bin for the given timestamp
    bin_index = np.digitize(timestamp, bin_edges) - 1  # Adjusting bin index if necessary
    
    # Validate indices
    if user_id >= user_item_time_matrix.shape[0] or item_id >= item_time_sim_matrix.shape[0] or bin_index >= user_item_time_matrix.shape[2]:
        return default_rating
    
    # Get ratings for the user in the identified time bin
    user_ratings = user_item_time_matrix[user_id, :, bin_index]
    
    # Get the similarities for the target item with all other items, averaged over time as before
    item_similarities = item_time_sim_matrix[item_id, :]
    
    # Identify items that have been rated by the user in this time bin
    rated_by_user = user_ratings > 0
    
    # Filter the similarities and ratings for items rated by the user
    similarities = item_similarities[rated_by_user]
    ratings = user_ratings[rated_by_user]
    
    # Select the top-k most similar items, if k is specified
    if k > 0 and len(similarities) > k:
        top_k_indices = np.argsort(similarities)[-k:]
        similarities = similarities[top_k_indices]
        ratings = ratings[top_k_indices]
    
    # Compute the weighted sum of ratings
    if similarities.size > 0 and np.sum(similarities) > 0:
        weighted_sum = np.dot(similarities, ratings)
        sum_of_similarities = np.sum(similarities)
        predicted_rating = weighted_sum / sum_of_similarities
        return np.clip(round(predicted_rating * 2) / 2, 0.5, 5.0)
    else:
        return default_rating


predictions = []
for user_id, item_id, _, timestamp in val_data:
    user_id, item_id = int(user_id)-1, int(item_id)-1  # Adjusting for 0-based indexing
    pred_rating = predict_rating_item_time(user_id, item_id, item_time_sim_matrix, user_item_time_matrix, timestamp, bin_edges)
    predictions.append(pred_rating)

predictions_array = np.array(predictions).reshape(-1, 1)
val_pred_item = np.hstack((val_data, predictions_array))

print(calculate_mae(val_pred_item[:,2],val_pred_item[:,4]))

0.7490891023517721


## ATTEMPT 2

In [7]:
## user-item matrix

# Array dimensions 
num_users = int(np.max(train_data[:, 0]))  
num_items = int(np.max(train_data[:, 1]))  

# Initialize matrices
user_item_matrix = np.zeros((num_users, num_items))
item_user_matrix = np.zeros((num_items, num_users))

# Populate the user-item matrix
for entry in train_data:
    user_id, item_id, rating, _ = entry
    user_item_matrix[int(user_id)-1, int(item_id)-1] = rating

# The item-user matrix is the transpose of the user-item matrix
item_user_matrix = user_item_matrix.T

print("User-Item Matrix shape:", user_item_matrix.shape)
print("Item-User Matrix shape:", item_user_matrix.shape)
print(user_item_matrix[37, 706])

User-Item Matrix shape: (943, 1682)
Item-User Matrix shape: (1682, 943)
2.0


In [8]:
##cosine similarity
def cosine_similarity(matrix):

    # Calculate the dot product of given matrix and its transpose
    dot_product = np.dot(matrix, matrix.T)

    # Calculate the norm of each vector in the matrix
    norm = np.linalg.norm(matrix, axis=1)

    # Calculate the outer product of the norms
    norm_outer = np.outer(norm, norm)

    # Calculate cosine similarity
    cosine_sim = dot_product / norm_outer
    # Replace any NaN values with 0
    cosine_sim = np.nan_to_num(cosine_sim)

    return cosine_sim

user_sim_matrix = cosine_similarity(user_item_matrix)
item_sim_matrix = cosine_similarity(item_user_matrix)



  cosine_sim = dot_product / norm_outer


In [9]:
def recommend_popular(user_id, iu_matrix, N=10):
    # Calculate the sum of ratings for each item
    item_popularity = np.sum(iu_matrix, axis=1)

    # Get the indices of the top N popular items
    popular_items = np.argsort(item_popularity)[-N:]

    # If the user has already rated some of the popular items, remove them from the list
    rated_items = np.where(user_item_matrix[user_id, :] != 0)[0]
    recommendations = [item for item in popular_items if item not in rated_items]

    return recommendations

In [10]:
def predict_rating_item(user_id, item_id, item_similarity, user_item_matrix, k=5, default_rating=3):
      
    # Validate indices
    if user_id >= user_item_matrix.shape[0] or item_id >= item_similarity.shape[0]:
        return default_rating
    
    # Get all ratings by the user
    user_ratings = user_item_matrix[user_id, :]
    
    # Get the similarities for the target item with all other items
    item_similarities = item_similarity[item_id, :]
    
    # Identify items that have been rated by the user
    rated_by_user = user_ratings > 0
    
    # Filter the similarities and ratings for items rated by the user
    similarities = item_similarities[rated_by_user]
    ratings = user_ratings[rated_by_user]
    
    # Select the top-k most similar items, if k is specified
    if k > 0:
        if len(similarities) > k:
            top_k_indices = np.argsort(similarities)[-k:]
            similarities = similarities[top_k_indices]
            ratings = ratings[top_k_indices]
    
    # Compute the weighted sum of ratings
    if similarities.size > 0 and np.sum(similarities) > 0:
        weighted_sum = np.dot(similarities, ratings)
        sum_of_similarities = np.sum(similarities)
        predicted_rating = weighted_sum / sum_of_similarities
        return np.clip(round(predicted_rating * 2) / 2, 0.5, 5.0)
    else:
        return default_rating




predictions = []
for user_id, item_id, _, _ in val_data:
    user_id, item_id = int(user_id)-1, int(item_id)-1  # Adjusting for 0-based indexing
    pred_rating = predict_rating_item(user_id, item_id, item_sim_matrix, user_item_matrix)
    predictions.append(pred_rating)

predictions_array = np.array(predictions).reshape(-1, 1)
val_pred_item = np.hstack((val_data, predictions_array))

print(calculate_mae(val_pred_item[:,2],val_pred_item[:,4]))

0.7533399580435023


In [11]:
## user based prediction
#                                                                              28                                           0.025
def predict_rating_user(user_id, item_id, user_similarity, user_item_matrix, k=22, default_rating=3.0, similarity_threshold=0.025):
    # Guard against out-of-bound indices
    if user_id >= user_similarity.shape[0] or item_id >= user_item_matrix.shape[1]:
        return default_rating
    
    # Check if the user has rated any items yet
    if np.count_nonzero(user_item_matrix[user_id, :]) == 0:
        # This is a new user, recommend a popular item
        recommendations = recommend_popular(user_id, user_item_matrix.T)
        if item_id in recommendations:
            # If the item is among the recommended items, return a high rating
            return 5
        else:
            # Otherwise, return the default rating
            return default_rating
    
    # Calculate user biases
    user_mean_ratings = np.true_divide(user_item_matrix.sum(1), (user_item_matrix != 0).sum(1))
    user_bias = np.nan_to_num(user_mean_ratings - np.mean(user_mean_ratings))  # Subtract global average
    
    similarities = user_similarity[user_id, :]
    ratings = user_item_matrix[:, item_id] - user_bias  # Apply bias correction to ratings
    
    # Apply similarity threshold and get the k most similar users
    valid_indices = (ratings != -user_bias) & (similarities > similarity_threshold)
    valid_similarities = similarities[valid_indices]
    valid_ratings = ratings[valid_indices]
    
    # Get the indices of the k most similar users
    k_similar_users = np.argsort(-valid_similarities)[:k]
    k_similarities = valid_similarities[k_similar_users]
    k_ratings = valid_ratings[k_similar_users]
    
    if k_similarities.size == 0 or np.sum(k_similarities) == 0:
        return default_rating
    
    weighted_sum = np.dot(k_similarities, k_ratings)
    sum_of_weights = np.sum(k_similarities)
    
    predicted_rating = weighted_sum / sum_of_weights + user_bias[user_id]  # Reapply user's bias to prediction
    return np.clip(round(predicted_rating * 2) / 2, 0.5, 5.0)



predictions = []
for user_id, item_id, _, _ in val_data:
    user_id, item_id = int(user_id)-1, int(item_id)-1  # Adjusting for 0-based indexing
    pred_rating = predict_rating_user(user_id, item_id, user_sim_matrix, user_item_matrix)
    predictions.append(pred_rating)

predictions_array = np.array(predictions).reshape(-1, 1)
val_pred_user = np.hstack((val_data, predictions_array))

print(calculate_mae(val_pred_user[:,2],val_pred_user[:,4]))


0.7376614773103677


In [12]:
                                                                                                                 #0.5                   0.5
def predict_rating_hybrid(user_id, item_id, user_similarity, item_time_sim_matrix, user_item_matrix, user_item_time_matrix, timestamp, user_based_weight=0.5, item_based_weight=0.5, default_rating=3):
    
    # Validate indices to avoid out-of-bounds access
    if user_id >= user_item_matrix.shape[0] or item_id >= user_item_matrix.shape[1]:
        return default_rating
    
    # Obtain predictions from both models
    user_based_rating = predict_rating_user(user_id, item_id, user_similarity, user_item_matrix, default_rating=default_rating)
    item_based_rating = predict_rating_item_time(user_id, item_id, item_time_sim_matrix, user_item_time_matrix, timestamp, bin_edges)
    
    # Calculate the weighted average of the two predictions
    weighted_rating = (user_based_weight * user_based_rating + item_based_weight * item_based_rating) / (user_based_weight + item_based_weight)
    
    return np.clip(round(weighted_rating * 2) / 2, 0.5, 5.0)


predictions = []
for user_id, item_id, _, timestamp in val_data:
    user_id, item_id = int(user_id)-1, int(item_id)-1  # Adjusting for 0-based indexing
    pred_rating = predict_rating_hybrid(user_id, item_id, user_sim_matrix, item_time_sim_matrix, user_item_matrix, user_item_time_matrix, timestamp)
    predictions.append(pred_rating)

predictions_array = np.array(predictions).reshape(-1, 1)
val_pred_hybrid = np.hstack((val_data, predictions_array))

print(calculate_mae(val_pred_hybrid[:,2],val_pred_hybrid[:,4]))


# 0.7209


0.6952081263111406
