In [1]:
# I have used these 4 libraries.
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import math, joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
def Intersection(lst1, lst2):
    return list(set(lst1).intersection(lst2))

In [3]:
def predict_rating_UB(userId, itemId, train_pt_df, sm_df, K):
    # Check if a completly new movie comes in my testing dataset.
    try:
        users_similarities = sm_df[userId]   # cosine similarities of user 'userId' with all other users.
        users_ratings = train_pt_df[itemId]  # ratings of all users for item 'itemId'
    except:
        return -1   # That movie doesn't exit in my training dataset. Ignore this case.
    
    # Consider only highest K item similarities. (Here, similarity of the same item will also not considered).
    d = dict(users_similarities) # {itemId : item_similarity}
    d_sorted = {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}
    drop_rest_users = list(d_sorted.keys())
    drop_rest_users = drop_rest_users[K:]
    users_ratings = users_ratings.drop(drop_rest_users)
    users_similarities = users_similarities.drop(drop_rest_users)
    
    # Not consider users who haven't rated the movie 'itemId'
    drop_indices = users_ratings[users_ratings == 0].index
    users_ratings = users_ratings.drop(drop_indices)
    users_similarities = users_similarities.drop(drop_indices)
    
    global coverage
    # If I encountered with coverage problem i.e, no similar users exist after threshold. Then take aveage rating of the movie.
    if len(users_similarities) == 0 or len(users_ratings) == 0:
        coverage = coverage + 1
        return -1
    else:
        l = [x for x in list(train_pt_df.loc[userId]) if x != 0]
        if len(l)==0:
            ru = 2.5
        else:
            ru = sum(l)/len(l)
        
        num = 0
        for user, sim in dict(users_similarities).items():
            rji = train_pt_df[itemId][user]
            l = [x for x in list(train_pt_df.loc[user]) if x != 0]
            rj = sum(l)/len(l) 
            num += sim * (rji - rj)
        
        den = sum([abs(sim) for sim in users_similarities])
        
        if math.isnan(ru + (num/den)) ==  False:
            return (ru + (num/den))
        else:
            return -1

In [4]:
# This function takes the training & testing dataframes and return the MAE.
def UB_MAE(train_df, test_df, K):
    # pivot the training dataframe.
    train_pt_df = pd.pivot_table(train_df, values='Rating', index='userId', columns='itemId')
    # Replace the NA values with 0   (Note: I observed -> No user in the whole dataset have rated 0 to any movie)
    train_pt_df = train_pt_df.fillna(0)
    # Calculate the cosine similarities of user-user.
    sm_df = pd.DataFrame(np.corrcoef(train_pt_df), index=train_pt_df.index, columns=train_pt_df.index)
    # Not consider the similarity of same user while predicting the rating. eg: similarity of user 1 with user 1.
    np.fill_diagonal(sm_df.values, 0)
    
    d = {}
    for user in train_df['userId'].unique().tolist():
        d[user] = train_df[train_df['userId']==user]['itemId'].tolist()
        
    UserCount = {user:len(d[user]) for user in list(d.keys())}
    
    threshold = {}
    for user in list(d.keys()):
        other_users = list(set(list(d.keys())).difference([user]))
        l = []
        for other_u in other_users:
            l.append(len(Intersection(d[user], d[other_u])))
        threshold[user] = sum(l)/len(l)

    for i in tqdm(sm_df.index):
        for j in sm_df.columns:
            y = len(Intersection(d[i], d[j]))
            if y >= threshold[i] and y >= threshold[j]:
                num = 2 * len(Intersection(d[i], d[j]))
                den = len(d[i]) + len(d[j])
                sm_df[i][j] = sm_df[i][j] * (num/den) 
            else:
                pi = abs(y - UserCount[i])
                pj = abs(y - UserCount[j])
                sm_df[i][j] = ((pi/UserCount[i]) * sm_df[i][j]) + ((pj/UserCount[j]) * sm_df[i][j])
    
    
    # Actual Ratings of testing dataframe.
    actual_ratings  = list(test_df['Rating'])
    # Start predicting the ratings of testing dataframe.
    predicted_ratings = []
    for userId, itemId in tqdm(zip(test_df['userId'], test_df['itemId'])):
        predicted_ratings.append(predict_rating_UB(userId, itemId, train_pt_df, sm_df, K))
    
    # Ignore the case when predicted rating is -1. (Because no such users are available to predict the rating. #Coverage_Problem)
    new_actual_ratings = []
    new_predicted_ratings = []
    for i in range(0, len(predicted_ratings)):
        if predicted_ratings[i] <= 0:
            continue
        new_actual_ratings.append(actual_ratings[i])
        new_predicted_ratings.append(predicted_ratings[i])
    
    # return the MAE between Actual Ratings & Predicted Ratings.
    return mean_absolute_error(new_actual_ratings, new_predicted_ratings)

In [5]:
df = joblib.load('./Dataset/finefoods.pkl')
df = df.rename(columns={'rating': 'Rating'})

d = (dict(enumerate(df['userId'].unique())))
new_d = {}
new_d = {value:key for key, value in d.items()}
df['userId'] = df['userId'].map(new_d)

d = (dict(enumerate(df['itemId'].unique())))
new_d = {}
new_d = {value:key for key, value in d.items()}
df['itemId'] = df['itemId'].map(new_d)

train, test = train_test_split(df[(df['userId']<2000) & (df['itemId']<10000)], test_size=0.2)

coverage = 0
mae_value = UB_MAE(train, test, 800)  # Returning MAE for each fold.
print("MAE =", mae_value, ", Coverage = " + str(100-((coverage*100)/test.shape[0])) + "%")

100%|██████████████████████████████████████████████████████████████████████████████| 1709/1709 [04:01<00:00,  7.09it/s]
1016it [00:14, 69.47it/s]

MAE = 0.49947657921027194 , Coverage = 91.43700787401575%



