In [3]:
import numpy as np
import pandas as pd

In [9]:
## import Dataset
# test: userid, itemid, timestamp (seconds from 1970)
# train: userid, itemid, rating (float), timestamp (seconds from 1970)

train_dir = 'Specification/D1/train_100k_withratings.csv'
test_dir = 'Specification/D1/test_100k_withoutratings.csv'

# Load datasets as pandas DataFrames with corresponding column headings
train_df = pd.read_csv(train_dir, names=['UserID', 'ItemID', 'Rating', 'Timestamp'])
test_df = pd.read_csv(train_dir, names=['UserID', 'ItemID', 'Timestamp'])

# Adding PredRating column and filling with random values within specified range
train_df['PredRating'] = np.random.choice(np.arange(0.5, 5.5, 0.5), size=len(train_df))
test_df['PredRating'] = np.random.choice(np.arange(0.5, 5.5, 0.5), size=len(train_df))

#train_df['PredRating'].min()

In [10]:
def calculate_mae(actual, predicted):
    """
    Parameters:
    - actual_ratings: np.array, the actual ratings.
    - predicted_ratings: np.array, the predicted ratings.
    """
    # calculate the absolute error between actual and predicted ratings
    abs_err = np.abs(actual - predicted)
    
    # calculate the mean of these absolute errors
    mae = np.mean(abs_err)
    
    return mae

In [18]:
## calculating MAE

# MAE calculated as average of absolute difference between ground ratings and fixed average of 3.53
fixed_average = 3.53
mae_fixed = (train_df['Rating'] - fixed_average).abs().mean()
print("Fixed: ", mae_fixed)

# MAE calculated as average of absolute difference between ground ratings and predicted ratings
mae_pred = (train_df['Rating'] - train_df['PredRating']).abs().mean()
print("Pred: ", mae_pred)

print(calculate_mae(train_df['Rating'], fixed_average))


Fixed:  0.9441494976261456
Pred:  1.6126531964226565
0.9441494976261456


In [12]:
## user-item matrix 

# create user-item matrix from train dataframe
user_item_matrix = train_df.pivot(index='UserID', columns='ItemID', values='Rating')

#fill unrated with 0 (base level)
user_item_matrix = user_item_matrix.fillna(0)

#user_item_matrix.shape, user_item_matrix.head()

In [13]:
## cosine similarity

def cosine_similarity(matrix):
    # Normalize the matrix by subtracting the mean rating for each user (or item)
    norm_matrix = matrix.sub(matrix.mean(axis=1), axis='index').fillna(0)
    
    # Compute the dot product between items (or users)
    dot_product = np.dot(norm_matrix, norm_matrix.T)
    
    # Compute the magnitude (Euclidean norm) for each item (or user)
    magnitude = np.sqrt(np.diag(dot_product))
    
    # Compute cosine similarity: dot_product / (magnitude_i * magnitude_j)
    # Avoid division by zero by adding a small value (epsilon) to magnitudes
    epsilon = 1e-9
    cosine_similarity = dot_product / (magnitude[:, None] + epsilon) / (magnitude[None, :] + epsilon)
    
    return pd.DataFrame(cosine_similarity, index=matrix.index, columns=matrix.index)

# Compute cosine similarity for user-based
user_cos_sim = cosine_similarity(user_item_matrix)

# Transpose the user-item matrix to get an item-user matrix for item-based similarity
item_user_matrix = user_item_matrix.T

# Compute cosine similarity for item-based
item_cos_sim = cosine_similarity(item_user_matrix)

# Display the cosine similarity matrix for items
display(user_cos_sim)

UserID,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.014734,0.092760,0.120365,-0.009088,0.054936,0.033352,0.033152,0.148516,0.096982,...,-0.014201,-0.017424,0.091402,0.055472,0.005608,0.077352,0.084427,0.152267,-0.012180,0.019079
2,0.014734,1.000000,0.088723,0.033545,0.052706,0.072540,0.064995,0.198186,0.062794,0.058409,...,0.035142,0.097720,0.073481,0.031396,0.166274,0.116445,0.093414,0.110633,0.064512,0.146535
3,0.092760,0.088723,1.000000,0.189723,-0.009821,0.224263,0.136323,0.143984,0.181299,0.170653,...,-0.024648,0.092504,0.045672,0.025250,0.284135,0.013372,0.025336,0.069339,-0.024062,0.096094
4,0.120365,0.033545,0.189723,1.000000,-0.027147,0.190020,0.157348,0.140152,0.243964,0.182757,...,0.004026,0.120934,0.021787,0.026818,0.239205,0.064009,0.098189,0.122419,0.007584,0.074120
5,-0.009088,0.052706,-0.009821,-0.027147,1.000000,-0.019211,-0.015574,0.104600,-0.001411,-0.025280,...,0.096632,0.130332,0.073061,0.157658,-0.021519,-0.010747,0.101710,0.066120,0.345203,-0.005886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.077352,0.116445,0.013372,0.064009,-0.010747,0.119801,0.054834,0.083571,0.087780,0.071555,...,-0.016793,0.007368,0.167934,0.088532,0.089363,1.000000,0.208990,0.223011,-0.014403,0.086361
940,0.084427,0.093414,0.025336,0.098189,0.101710,0.082673,0.159545,0.077592,0.110373,0.105243,...,0.048925,0.016852,0.209571,0.195721,0.077028,0.208990,1.000000,0.207755,0.100442,0.219661
941,0.152267,0.110633,0.069339,0.122419,0.066120,0.173472,0.084981,0.166087,0.197781,0.118276,...,0.006715,0.078365,0.213456,0.076022,0.113454,0.223011,0.207755,1.000000,0.033369,0.083713
942,-0.012180,0.064512,-0.024062,0.007584,0.345203,0.009944,0.004471,0.036985,-0.006126,0.010245,...,0.080843,0.090994,0.055711,0.111659,-0.028839,-0.014403,0.100442,0.033369,1.000000,-0.007888


In [14]:
def predict_item_based_rating(user_id, item_id, item_similarity_df, ratings_df):
    # Get all items rated by this user
    rated_items = ratings_df.loc[user_id, ratings_df.loc[user_id, :] > 0].index
    
    # Get similarity scores for the rated items with the target item
    sim_scores = item_similarity_df.loc[item_id, rated_items]
    
    # Get the user's ratings for those items
    user_ratings = ratings_df.loc[user_id, rated_items]
    
    # Calculate the weighted average
    weighted_sum = np.dot(user_ratings, sim_scores)
    sum_of_weights = sim_scores.sum()
    
    # Predicted rating
    pred_rating = weighted_sum / (sum_of_weights + 1e-9)  # Adding a small value to avoid division by zero
    
    return pred_rating

# Note: This function is for conceptual understanding.
# Actual implementation needs adjustment to handle cases where 'item_id' or 'user_id' might not exist in your DataFrame or similarity matrix.


In [15]:
def predict_user_based_rating(user_id, item_id, user_similarity_df, ratings_df):
    # Get all users who rated the target item
    users_who_rated_item = ratings_df.loc[:, item_id][ratings_df.loc[:, item_id] > 0].index
    
    # Get similarity scores for the target user with users who rated the item
    sim_scores = user_similarity_df.loc[user_id, users_who_rated_item]
    
    # Get those users' ratings for the target item
    item_ratings = ratings_df.loc[users_who_rated_item, item_id]
    
    # Calculate the weighted average
    weighted_sum = np.dot(item_ratings, sim_scores)
    sum_of_weights = sim_scores.sum()
    
    # Predicted rating
    pred_rating = weighted_sum / (sum_of_weights + 1e-9)  # Adding a small value to avoid division by zero
    
    return pred_rating


In [16]:
# Example of applying item-based prediction (for demonstration, not optimized for large datasets)
train_df['PredRatingItem'] = train_df.apply(
    lambda row: predict_item_based_rating(
        row['UserID'], 
        row['ItemID'], 
        item_cos_sim, 
        user_item_matrix), 
    axis=1)

In [9]:
# Example of applying user-based prediction
train_df['PredRatingUser'] = train_df.apply(
    lambda row: predict_user_based_rating(
        row['UserID'], 
        row['ItemID'], 
        user_cos_sim, 
        user_item_matrix), 
    axis=1)

In [12]:
# Round predicted ratings to the nearest 0.5, and ensure they are within 0.5 to 5.0 range
train_df['PredRatingItem'] = train_df['PredRatingItem'].apply(lambda x: min(max(round(x * 2) / 2, 0.5), 5.0))
train_df['PredRatingUser'] = train_df['PredRatingUser'].apply(lambda x: min(max(round(x * 2) / 2, 0.5), 5.0))


In [13]:
display(train_df)

Unnamed: 0,UserID,ItemID,Rating,Timestamp,PredRating,PredRatingItem,PredRatingUser
0,1,1,3.0,881250949,2.0,3.5,4.0
1,1,11,2.0,881251577,0.5,3.5,3.5
2,1,93,4.0,881251843,4.5,3.5,3.5
3,1,222,5.0,881251820,1.5,4.0,4.0
4,1,292,3.0,881251911,5.0,3.5,3.5
...,...,...,...,...,...,...,...
90565,943,210,4.0,875048952,3.5,4.0,4.0
90566,943,356,4.0,875049038,3.5,4.0,4.0
90567,943,453,4.0,875049077,3.5,4.0,3.5
90568,943,874,5.0,875048996,4.0,4.5,4.0


In [17]:
# MAE calculated as average of absolute difference between ground ratings and predicted ratings
mae_pred = (train_df['Rating'] - train_df['PredRatingItem']).abs().mean()
print("Item: ", mae_pred)

# MAE calculated as average of absolute difference between ground ratings and predicted ratings
mae_pred = (train_df['Rating'] - train_df['PredRatingUser']).abs().mean()
print("User: ", mae_pred)

Item:  0.7346114303226375


KeyError: 'PredRatingUser'