# Movielens Recommender System

## 0. Load Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load datasets
links_df = pd.read_csv("data/links.csv")
movies_df = pd.read_csv("data/movies.csv")
ratings_df = pd.read_csv("data/ratings.csv")
# tags_df = pd.read_csv("data/tags.csv")

# Merge ratings with movies on 'movieId'
df = pd.merge(ratings_df, movies_df, on='movieId', how='left')

# Merge the result with links on 'movieId'
df = pd.merge(df, links_df, on='movieId', how='left')

# # Merge the result with tags on 'movieId' and 'userId'
# df = pd.merge(df, tags_df, on=['movieId', 'userId'], how='left')

# # Rename columns for clarity
# df.rename(columns={
#     'timestamp_x': 'timestamp',
#     'timestamp_y': 'timestamp_tag'
# }, inplace=True)

# Drop N/A's
df.dropna(inplace=True)

# Save the merged DataFrame to a CSV file
df.to_csv("data/merged.csv", index=False)

In [3]:
# Load the merged DataFrame
df = pd.read_csv("data/merged.csv")
print("Initial shape:", df.shape)

# Define thresholds
MIN_RATINGS_PER_USER  = 5
MIN_RATINGS_PER_MOVIE = 5

# Discard all users with < MIN_RATINGS_PER_USER
df_dense = df.groupby('userId').filter(
    lambda user_ratings: len(user_ratings) >= MIN_RATINGS_PER_USER
)

# Discard all movies with < MIN_RATINGS_PER_MOVIE
df_dense = df_dense.groupby('movieId').filter(
    lambda movie_ratings: len(movie_ratings) >= MIN_RATINGS_PER_MOVIE
)
print("After cold filtering:", df_dense.shape)

# Define thresholds
NUM_USERS   = 500
NUM_MOVIES  = 500

# Choose a random set of users
users = np.random.RandomState(42).choice(
    df_dense['userId'].unique(), size=NUM_USERS, replace=False
)

# Restrict dataframe to those users
df_u = df_dense[df_dense['userId'].isin(users)]

# Choose a random set of movies from those
movies = np.random.RandomState(42).choice(
    df_u['movieId'].unique(), size=NUM_MOVIES, replace=False
)

# Restrict dataframe to those users
df_sample = df_u[df_u['movieId'].isin(movies)]
print("Final sample:", df_sample.shape)

Initial shape: (100823, 8)
After cold filtering: (90274, 8)
Final sample: (10583, 8)


In [4]:
# Check sparsity
n_users  = df_sample['userId'].nunique()
n_movies = df_sample['movieId'].nunique()
n_ratings = len(df_sample)
sparsity = 1 - (n_ratings / (n_users * n_movies))
print(f"Sparsity: {sparsity:.2%}")

Sparsity: 95.76%


## 1. Non-Personalised

In [5]:
def top_n_count(df, N=10, target_user=None):
    # By count of ratings
    counts = df['movieId'].value_counts()

    # Return top N movieIds and scores
    ids = counts.nlargest(N).index.tolist()
    scores = counts.nlargest(N).values.tolist()
    return ids, scores


# Testing
top_count = top_n_count(df_sample)
print("Top 10 by volume:", top_count)

Top 10 by volume: ([318, 480, 2959, 1198, 592, 380, 590, 595, 316, 2329], [264, 199, 181, 167, 151, 149, 137, 124, 114, 110])


In [6]:
def top_n_likes(df, N=10, threshold=4, target_user=None):
    # By count of likes (e.g. rating ≥4)
    likes = df[df['rating'] >= threshold]
    like_counts = likes['movieId'].value_counts()

    # Return top N movieIds and scores
    ids = like_counts.nlargest(N).index.tolist()
    scores = like_counts.nlargest(N).values.tolist()
    return ids, scores


# Testing
top_likes = top_n_likes(df_sample)
print("Top 10 by likes:",  top_likes)

Top 10 by likes: ([318, 2959, 1198, 480, 2329, 590, 595, 380, 1617, 5418], [228, 150, 136, 125, 90, 84, 69, 65, 61, 60])


In [7]:
def average_rating(df, N=10, target_user=None):
    # Calculate average ratings for each movie
    average_ratings = df.groupby('movieId')['rating'].mean()

    # Return top N movieIds and scores
    ids = average_ratings.nlargest(N).index.tolist()
    scores = average_ratings.nlargest(N).values.tolist()
    return ids, scores


# Testing
averages = average_rating(df_sample)
print("Top 10 by average rating:", averages)

Top 10 by average rating: ([56921, 1104, 318, 55721, 1209, 176, 5690, 942, 1280, 2959], [5.0, 4.5625, 4.412878787878788, 4.375, 4.366666666666666, 4.363636363636363, 4.363636363636363, 4.333333333333333, 4.285714285714286, 4.265193370165746])


In [8]:
def average_rating_normalized(df, N=10, target_user=None):
    # Compute user averages
    user_avg = df.groupby('userId')['rating'].mean().rename('user_avg')
    df = df.merge(user_avg, on='userId')

    # Compute predicted score S(u, i) for all (user, item) pairs
    predictions = []

    for _, row in df.iterrows():
        u = row['userId']
        i = row['movieId']
        user_avg_row = row['user_avg']

        # Get all ratings for item i
        item_ratings = df[df['movieId'] == i]

        # Check for cold start problem
        if item_ratings.empty:
            pred = user_avg_row
        else:
            deviation_sum = (item_ratings['rating'] - item_ratings['user_avg']).sum()
            normalized_score = deviation_sum / len(item_ratings)
            pred = user_avg_row + normalized_score

        predictions.append((i, pred))

    # Create a DataFrame for predictions
    pred_df = pd.DataFrame(predictions, columns=['movieId', 'predicted'])

    # Calculate normalized average ratings for each movie
    average_ratings_normalized = pred_df.groupby('movieId')['predicted'].mean()

    # Return top N movieIds and scores
    ids = average_ratings_normalized.nlargest(N).index.tolist()
    scores = average_ratings_normalized.nlargest(N).values.tolist()
    return ids, scores


# Testing
normalized_averages = average_rating_normalized(df_sample)
print("Top 10 by normalized average rating:", normalized_averages)

Top 10 by normalized average rating: ([56921, 1104, 318, 55721, 1209, 176, 5690, 942, 1280, 2959], [5.0, 4.5625, 4.412878787878788, 4.375, 4.366666666666666, 4.363636363636363, 4.363636363636363, 4.333333333333333, 4.285714285714286, 4.265193370165746])


In [9]:
def average_rating_weighted(df, N=10, m=10, target_user=None):
    # Compute average rating for each movie (U(j)) and number of votes (v)
    average_ratings_weighted = df.groupby('movieId')['rating'].agg(['mean', 'count']).rename(columns={'mean': 'U', 'count': 'v'})

    # Compute overall mean rating across all movies (C)
    C = df['rating'].mean()

    # Compute WR(j) for each movie
    average_ratings_weighted['WR'] = (
        (average_ratings_weighted['v'] / (average_ratings_weighted['v'] + m)) * average_ratings_weighted['U'] +
        (m / (average_ratings_weighted['v'] + m)) * C
    )

    # Return top N movieIds and scores
    ids = average_ratings_weighted['WR'].nlargest(N).index.tolist()
    scores = average_ratings_weighted['WR'].nlargest(N).values.tolist()
    return ids, scores


# Testing
weighted_averages = average_rating_weighted(df_sample)
print("Top 10 by weighted average rating:", weighted_averages)

Top 10 by weighted average rating: ([318, 2959, 2329, 1104, 1198, 68157, 1201, 1209, 1617, 68954], [4.380834570799747, 4.226956399995448, 4.1862389366594215, 4.167256630735796, 4.162986849712603, 4.07742283413574, 4.054206653508883, 4.033946895965228, 4.009326070320117, 3.9927630266570073])


## 2. Collaborative Filtering

In [10]:
def user_based_pearson_cf(df, target_user, N=10, k=30, min_common=3, shrink=10):
    # Build user–item matrix
    mat = df.pivot_table(
        index="userId", columns="movieId", values="rating", aggfunc="mean"
    ).astype(float)

    # Raise error if the target user isn’t in the matrix
    if target_user not in mat.index:
        raise ValueError(f"userId {target_user!r} not found.")

    # Compute user–user pair-wise Pearson similarities
    demeaned = mat.sub(mat.mean(axis=1), axis=0)
    sim = demeaned.T.corr(method="pearson", min_periods=min_common).fillna(0.0)

    # Predict ratings for unrated items
    unrated_items = mat.columns[mat.loc[target_user].isna()]
    r_u_bar = mat.loc[target_user].mean()
    preds = {}

    for item in unrated_items:
        # Users who have rated this item
        neighbours = mat.index[mat[item].notna()]
        if neighbours.empty:
            continue

        # Get similarities
        sims = sim.loc[target_user, neighbours]

        # Check top k most similar neighbours
        top_k = sims.abs().nlargest(k).index
        sims_k = sims.loc[top_k]
        ratings_k = mat.loc[top_k, item]
        means_k = mat.loc[top_k].mean(axis=1)

        # Compute the prediction
        numer = ((ratings_k - means_k) * sims_k).sum()
        denom = sims_k.abs().sum() + shrink

        # Save the prediction
        preds[item] = r_u_bar + numer / denom if denom else r_u_bar

    # Return top N movieIds and scores
    if not preds:
        return []
    ids = pd.Series(preds).nlargest(N).index.tolist()
    scores = pd.Series(preds).nlargest(N).values.tolist()
    return ids, scores


# Testing
# Get recommendations for the top 3 most popular users
top_users = df_sample['userId'].value_counts().nlargest(3).index.tolist()
for user in top_users:
    cf_recommendations = user_based_pearson_cf(df_sample, target_user=user)
    print(f"Top 10 for user {user} user-pearson collaborative filtering:", cf_recommendations)

Top 10 for user 599 user-pearson collaborative filtering: ([8873, 3037, 1273, 37741, 1284, 1292, 57669, 30749, 4033, 265], [2.9330772779188106, 2.9278815241204943, 2.8962773767283827, 2.8798134078740287, 2.8779474510380116, 2.8647858009245306, 2.8638890622035706, 2.863466145556261, 2.8610388522397066, 2.8585657367523143])
Top 10 for user 474 user-pearson collaborative filtering: ([50872, 58998, 88163, 96079, 3347, 112175, 1218, 68954, 1280, 942], [3.6430345556346655, 3.589038201421255, 3.5865206542130994, 3.5843209904963773, 3.5707109058564517, 3.5701991324205746, 3.5617317326710918, 3.5562435471559968, 3.5484698631700557, 3.546661304870535])
Top 10 for user 448 user-pearson collaborative filtering: ([1219, 2160, 318, 1104, 1273, 910, 2761, 5690, 3037, 1280], [3.5247841133412545, 3.495546211330362, 3.435245931440617, 3.3871915815418685, 3.3489492356408777, 3.3414431801341364, 3.3266106129488726, 3.322713939986797, 3.3108559747180095, 3.2904041622631963])


In [11]:
def user_based_cosine_cf(df, target_user, N=10, k=30, shrink=10):
    # Build user–item matrix
    mat = df.pivot_table(
        index="userId", columns="movieId", values="rating", aggfunc="mean"
    ).astype(float)

    # Raise error if the target user isn’t in the matrix
    if target_user not in mat.index:
        raise ValueError(f"userId {target_user!r} not in data.")

    # Compute user–user cosine similarity on filled mean-centered data
    X = mat.fillna(0.0).values
    sim = pd.DataFrame(
        cosine_similarity(X),
        index=mat.index,
        columns=mat.index
    ).astype(float)

    # Predict ratings for unrated items
    preds = {}
    unrated = mat.columns[mat.loc[target_user].isna()]

    for item in unrated:
        # Users who have rated this item
        neighbours = mat.index[mat[item].notna()]
        if neighbours.empty:
            continue

        # Get similarities
        scores = sim.loc[target_user, neighbours]
        topk = scores.abs().nlargest(k)
        if topk.empty:
            continue
        
        # Compute the prediction
        r_k = mat.loc[topk.index, item]
        w_k = topk
        denom = w_k.abs().sum() + shrink
        if denom>0:
            preds[item] = (w_k * r_k).sum() / denom

    # Return top N movieIds and scores
    if not preds:
        return []
    ids = pd.Series(preds).nlargest(N).index.tolist()
    scores = pd.Series(preds).nlargest(N).values.tolist()
    return ids, scores


# Testing
# Get recommendations for the top 3 most popular users
top_users = df_sample['userId'].value_counts().nlargest(3).index.tolist()
for user in top_users:
    cf_recommendations = user_based_cosine_cf(df_sample, target_user=user)
    print(f"Top 10 for user {user} user-cosine collaborative filtering:", cf_recommendations)

Top 10 for user 599 user-cosine collaborative filtering: ([590, 3114, 54001, 4246, 57669, 30749, 58998, 5971, 2712, 36], [2.151211079674275, 2.1216005726762144, 2.1072852605736925, 2.089417818690414, 2.046854474286677, 1.958850931593125, 1.9500214180229551, 1.8956318668841179, 1.8880968236240874, 1.8857553590673468])
Top 10 for user 474 user-cosine collaborative filtering: ([68954, 50872, 2288, 68157, 63082, 8641, 2916, 51662, 57669, 72998], [2.0597623106720504, 2.0218136249239613, 2.001852902972266, 2.0001903387572226, 1.991285389265404, 1.9696086965175268, 1.9678651002455771, 1.9576384888373273, 1.8403136061179313, 1.8378384003223331])
Top 10 for user 448 user-cosine collaborative filtering: ([318, 2329, 1219, 54001, 63082, 595, 4246, 5816, 2288, 34405], [2.4723388100160566, 2.338285607183104, 2.196653027967335, 2.038349337781408, 2.027184775443459, 2.0000461709066695, 1.9889006226637929, 1.9760914947074786, 1.9744752792824494, 1.9707683436127947])


In [12]:
def item_based_pearson_cf(df, target_user, N=10, k=30, min_common=3, shrink=10):
    # Build user–item matrix
    mat = df.pivot_table(
        index="userId", columns="movieId", values="rating", aggfunc="mean"
    ).astype(float)

    # Raise error if the target user isn’t in the matrix
    if target_user not in mat.index:
        raise ValueError(f"userId {target_user!r} not found.")

    # Compute user–user pair-wise Pearson similarities
    item_means = mat.mean(axis=0)
    demeaned = mat.sub(item_means, axis=1)
    sim = demeaned.corr(method="pearson", min_periods=min_common).fillna(0.0)

    # Predict ratings for unrated items
    preds = {}
    seen = mat.loc[target_user].dropna().index
    candidates = mat.columns[mat.loc[target_user].isna()]

    for item in candidates:
        # Check if the item exists in similarity matrix
        if item not in sim.index:
            continue

        # Users who have rated this item
        neighbours = seen.intersection(sim.index)
        if neighbours.empty:
            continue

        # Check top k most similar neighbours
        sims = sim.loc[item, neighbours]
        top_k = sims.abs().nlargest(k).index
        sims_k = sims.loc[top_k]

        # Compute the prediction
        r_k = mat.loc[target_user, top_k]
        m_k = item_means[top_k]
        numer = ((r_k - m_k) * sims_k).sum()
        denom = sims_k.abs().sum() + shrink

        # Save the prediction
        preds[item] = item_means[item] + (numer / denom if denom else 0.0)

    # Return top N movieIds and scores
    if not preds:
        return []
    ids = pd.Series(preds).nlargest(N).index.tolist()
    scores = pd.Series(preds).nlargest(N).values.tolist()
    return ids, scores


# Testing
# Get recommendations for the top 3 most popular users
top_users = df_sample['userId'].value_counts().nlargest(3).index.tolist()
for user in top_users:
    cf_recommendations = item_based_pearson_cf(df_sample, target_user=user)
    print(f"Top 10 for user {user} item-pearson collaborative filtering:", cf_recommendations)

Top 10 for user 599 item-pearson collaborative filtering: ([56921, 55721, 942, 1177, 1280, 3727, 176, 3675, 1284, 26131], [5.0, 4.411377957432644, 4.336990195619395, 4.334824385461091, 4.324046520293536, 4.284842633332316, 4.201429919648416, 4.1956583414804935, 4.182649478469705, 4.175])
Top 10 for user 474 item-pearson collaborative filtering: ([56921, 1280, 942, 55721, 64716, 176, 86, 3727, 1273, 3347], [5.0, 4.352742779612574, 4.348774112952073, 4.325085267368846, 4.318202681099249, 4.2952247099847485, 4.283362006954494, 4.207690186281015, 4.1940931756621955, 4.1393331730247045])
Top 10 for user 448 item-pearson collaborative filtering: ([56921, 1104, 5690, 176, 64716, 1280, 942, 55721, 3675, 1131], [5.0, 4.666210304264679, 4.5877214096895305, 4.450772092781765, 4.423323040618656, 4.393861277437147, 4.35999203610821, 4.3408260074708656, 4.2972779262087855, 4.292543011469858])


In [13]:
def item_based_cosine_cf(df, target_user, N=10, k=30, shrink=10):
    # Build user–item matrix
    mat = df.pivot_table(
        index="userId", columns="movieId", values="rating", aggfunc="mean"
    ).astype(float)

    # Raise error if the target user isn’t in the matrix
    if target_user not in mat.index:
        raise ValueError(f"userId {target_user!r} not in data.")

    # Compute user–user cosine similarity on filled mean-centered data
    X = mat.fillna(0.0).T.values
    sim = pd.DataFrame(
        cosine_similarity(X),
        index=mat.columns,
        columns=mat.columns
    ).astype(float)

    # Collect the user’s existing ratings
    user_ratings = mat.loc[target_user].dropna()
    if user_ratings.empty:
        return []

    # Predict ratings for unrated items
    preds = {}
    candidates = mat.columns[mat.loc[target_user].isna()]
    
    for item in candidates:
        # Get similarities
        scores = sim.loc[item, user_ratings.index]
        
        # Pick top k most similar neighbours
        topk = scores.abs().nlargest(k)
        if topk.empty:
            continue

        # Compute the prediction
        r_j = user_ratings[topk.index]
        w_j = topk
        denom = w_j.abs().sum() + shrink
        if denom > 0:
            preds[item] = (w_j * r_j).sum() / denom

    # Return top N movieIds and scores
    if not preds:
        return []
    ids = pd.Series(preds).nlargest(N).index.tolist()
    scores = pd.Series(preds).nlargest(N).values.tolist()
    return ids, scores


# Testing
# Get recommendations for the top 3 most popular users
top_users = df_sample['userId'].value_counts().nlargest(3).index.tolist()
for user in top_users:
    cf_recommendations = item_based_cosine_cf(df_sample, target_user=user)
    print(f"Top 10 for user {user} item-cosine collaborative filtering:", cf_recommendations)

Top 10 for user 599 item-cosine collaborative filtering: ([54001, 57669, 5299, 590, 3114, 8376, 4246, 30749, 100163, 71254], [1.6642694750768543, 1.647378946949287, 1.6388093363047247, 1.6239976969605672, 1.6203418819026274, 1.6189647355083898, 1.577606014196386, 1.5532642525137776, 1.5511647389846237, 1.5412309576633139])
Top 10 for user 474 item-cosine collaborative filtering: ([51662, 34162, 8641, 2916, 50872, 63082, 72998, 68954, 1391, 68157], [2.1416924833491886, 2.059880415828726, 2.019286116767599, 2.0086108981427495, 2.001171037360891, 1.9689186320279444, 1.947360892119922, 1.9248969723524472, 1.9232270589073828, 1.9023300824742582])
Top 10 for user 448 item-cosine collaborative filtering: ([318, 1219, 5816, 2329, 920, 4246, 54001, 595, 2712, 63082], [2.0106448558043075, 1.9760723396665953, 1.966469578618982, 1.9510499072779741, 1.8494640344197988, 1.847628252693416, 1.837478900947019, 1.8331124753965806, 1.8314845234335542, 1.8271448459033282])


## 3. Content-Based Filtering

In [14]:
def content_based_cosine_f(df, target_user, N=10):
    # Build movie-feature matrix from genres
    movies = (
        df[['movieId','genres']]
        .drop_duplicates()
        .assign(genres_list=lambda d: d['genres'].str.split('|'))
    )
    
    # Encode genres as binary features
    mlb = MultiLabelBinarizer()
    genre_mat = mlb.fit_transform(movies['genres_list'])

    # Apply TF-IDF transformation
    tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)
    tfidf_mat = tfidf.fit_transform(genre_mat)

    # Create DataFrame with movie features
    features = pd.DataFrame(
        tfidf_mat.toarray(),
        index=movies['movieId'],
        columns=mlb.classes_
    )

    # Build user profile as weighted average of their rated-movie features
    user_hist = (
        df[df['userId'] == target_user]
          .loc[:, ['movieId','rating']]
          .dropna(subset=['rating'])
    )
    if user_hist.empty:
        return []

    # Align features & ratings
    user_feats = features.loc[user_hist['movieId']].values
    ratings = user_hist['rating'].values.reshape(-1,1)

    # Weighted average as singular profile vector
    user_profile = (user_feats * ratings).sum(axis=0) / ratings.sum()

    # Score all unseen movies
    seen = set(user_hist['movieId'])
    candidates = [m for m in features.index if m not in seen]
    if not candidates:
        return []

    # Compute cosine similarity
    cand_feat = features.loc[candidates].values
    sims = cosine_similarity(cand_feat, user_profile.reshape(1,-1)).flatten()
    sims_series = pd.Series(sims, index=candidates)
    
    # Return top N movieIds and scores
    if sims_series.empty:
        return []
    ids = sims_series.nlargest(N).index.tolist()
    scores = sims_series.nlargest(N).values.tolist()
    return ids, scores


# Testing
# Get recommendations for the top 3 most popular users
top_users = df_sample['userId'].value_counts().nlargest(3).index.tolist()
for user in top_users:
    cf_recommendations = content_based_cosine_f(df_sample, target_user=user)
    print(f"Top 10 for user {user} content-cosine filtering:", cf_recommendations)

Top 10 for user 599 content-cosine filtering: ([171, 1734, 2926, 2374, 3565, 2431, 1292, 3506, 2469, 57669], [0.7032472819670628, 0.7032472819670628, 0.7032472819670628, 0.7032472819670628, 0.7032472819670628, 0.7032472819670628, 0.7032472819670628, 0.7032472819670628, 0.7032472819670628, 0.6965647155175801])
Top 10 for user 474 content-cosine filtering: ([171, 1734, 2926, 3565, 45440, 3506, 71464, 32291, 8529, 60950], [0.7738129901920858, 0.7738129901920858, 0.7738129901920858, 0.7738129901920858, 0.7738129901920858, 0.7738129901920858, 0.7738129901920858, 0.7738129901920858, 0.7364707038361644, 0.7364707038361644])
Top 10 for user 448 content-cosine filtering: ([5628, 171, 345, 1734, 2926, 371, 2374, 3565, 1810, 2431], [0.774533181110697, 0.7089108535014519, 0.7089108535014519, 0.7089108535014519, 0.7089108535014519, 0.7089108535014519, 0.7089108535014519, 0.7089108535014519, 0.7089108535014519, 0.7089108535014519])


## 4. Matrix Factorisation

In [15]:
def matrix_factorisation_svd(df, target_user, N=10, k=30, random_state=42):
    # Build user–item matrix
    R = (
        df.pivot_table(
            index="userId", columns="movieId", values="rating", aggfunc="mean"
        )
        .astype(float)
    )

    # Raise error if the target user isn’t in the matrix
    if target_user not in R.index:
        raise ValueError(f"userId {target_user!r} not found in data.")

    # Demean by user means
    user_means = R.mean(axis=1)
    R_demeaned = R.sub(user_means, axis=0).fillna(0.0)

    # Apply Model
    model = TruncatedSVD(n_components=k, random_state=random_state)
    user_factors = model.fit_transform(R_demeaned.values)
    item_factors = model.components_

    # Approximate ratings and add back means
    R_hat = np.dot(user_factors, item_factors)
    R_hat += user_means.values.reshape(-1, 1)

    # Save predictions
    preds_df = pd.DataFrame(R_hat, index=R.index, columns=R.columns)

    # Pick top N for the target user
    user_pred = preds_df.loc[target_user]
    seen = R.loc[target_user].dropna().index
    top_by_matrix_factorisation_svd = (
        user_pred.drop(seen).nlargest(N).index.tolist()
    )

    # Return top N movieIds and scores
    if not top_by_matrix_factorisation_svd:
        return []
    ids = top_by_matrix_factorisation_svd
    scores = user_pred[top_by_matrix_factorisation_svd].values.tolist()
    return ids, scores


# Testing
# Get recommendations for the top 3 most popular users
top_users = df_sample['userId'].value_counts().nlargest(3).index.tolist()
for user in top_users:
    cf_recommendations = matrix_factorisation_svd(df_sample, target_user=user)
    print(f"Top 10 for user {user} SVD matrix factorisation:", cf_recommendations)

Top 10 for user 599 SVD matrix factorisation: ([36, 8376, 158238, 85780, 5299, 58998, 1273, 3101, 135887, 1209], [3.282265291160939, 3.107580173886322, 2.9714740359334533, 2.946639824379959, 2.9461875345683652, 2.9223610716231545, 2.919973436955726, 2.9199166676819868, 2.914709773036582, 2.9040818547510185])
Top 10 for user 474 SVD matrix factorisation: ([68319, 145, 2720, 2916, 1805, 432, 72998, 89904, 1370, 45662], [3.706906045971289, 3.665372201287005, 3.6543426950746696, 3.65060924563688, 3.625348077251128, 3.6191021257611764, 3.617514592185122, 3.6025255185745277, 3.572679384833827, 3.5480552649327115])
Top 10 for user 448 SVD matrix factorisation: ([2761, 1499, 30749, 1094, 1049, 56782, 7254, 69, 52, 2080], [3.318459603432919, 3.3156189970986922, 3.293660858921625, 3.290801225391084, 3.251077493582337, 3.2489460699071766, 3.2408184325359217, 3.238063748636572, 3.2364996498544603, 3.2355534287986223])


In [16]:
def matrix_factorisation_nmf(df, target_user, N=10, k=30, random_state=42, max_iter=500):
    # Build user–item matrix
    R = (
        df.pivot_table(
            index="userId", columns="movieId", values="rating", aggfunc="mean"
        )
        .fillna(0.0)
    )

    # Raise error if the target user isn’t in the matrix
    if target_user not in R.index:
        raise ValueError(f"userId {target_user!r} not found in data.")

    # Apply Model
    model = NMF(n_components=k, init="random", random_state=random_state, max_iter=max_iter, tol=1e-4)
    user_factors = model.fit_transform(R.values)
    item_factors = model.components_

    # Reconstruct ratings matrix
    R_hat = np.dot(user_factors, item_factors)

    # Save predictions
    preds_df = pd.DataFrame(R_hat, index=R.index, columns=R.columns)

    # Pick top N for the target user
    user_pred = preds_df.loc[target_user]
    seen = R.loc[target_user].to_numpy().nonzero()[0]
    seen_ids = R.loc[target_user][R.loc[target_user] > 0].index
    top_by_matrix_factorisation_nmf = (
        user_pred.drop(seen_ids).nlargest(N).index.tolist()
    )

    # Return top N movieIds and scores
    if not top_by_matrix_factorisation_nmf:
        return []
    ids = top_by_matrix_factorisation_nmf
    scores = user_pred[top_by_matrix_factorisation_nmf].values.tolist()
    return ids, scores


# Testing
# Get recommendations for the top 3 most popular users
top_users = df_sample['userId'].value_counts().nlargest(3).index.tolist()
for user in top_users:
    cf_recommendations = matrix_factorisation_nmf(df_sample, target_user=user)
    print(f"Top 10 for user {user} NMF matrix factorisation:", cf_recommendations)



Top 10 for user 599 NMF matrix factorisation: ([8376, 4448, 57669, 2944, 1209, 4246, 54001, 8781, 2712, 2139], [2.71569923831463, 2.5093419892325084, 2.4125014902415245, 2.409982495305774, 2.3987092084288633, 2.2792379749161853, 2.105383963041863, 2.0922070714483354, 2.064199731476649, 1.9559231099035692])




Top 10 for user 474 NMF matrix factorisation: ([32291, 98124, 4275, 176371, 47423, 6659, 37857, 719, 45440, 34162], [1.018492376650457, 0.9337489949401748, 0.9236707448682046, 0.9217502803673463, 0.9048629311314674, 0.9036430411911786, 0.9004096859508648, 0.8514016580878903, 0.8354584223514891, 0.831252463391934])
Top 10 for user 448 NMF matrix factorisation: ([58998, 3100, 2329, 1805, 2193, 1257, 4008, 4041, 1094, 2761], [3.3043630074406436, 3.2289566327039494, 2.3112305349762154, 2.2460252100402993, 2.196552001977241, 2.100459887183775, 2.0380718913627813, 1.900379984976353, 1.875305711458508, 1.8187769677547803])




In [17]:
def matrix_factorisation_als(df, target_user, N=10, factors=30, regularization=0.1, iterations=15, alpha=1.0):
    # Build id mappings
    unique_users = df['userId'].unique()
    unique_items = df['movieId'].unique()
    user2idx = {u: i for i, u in enumerate(unique_users)}
    item2idx = {m: i for i, m in enumerate(unique_items)}
    idx2item = {i: m for m, i in item2idx.items()}

    # Raise error if the target user isn’t in the matrix
    if target_user not in user2idx:
        raise ValueError(f"userId {target_user!r} not found in data.")

    # Build item-user confidence matrix
    rows = df['movieId'].map(item2idx).to_numpy()
    cols = df['userId'].map(user2idx).to_numpy()
    data = (1.0 + alpha * df['rating'].astype(float)).to_numpy()
    item_user_csr = coo_matrix(
        (data, (rows, cols)),
        shape=(len(unique_items), len(unique_users))
    ).tocsr()

    # Apply model
    model = AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        calculate_training_loss=False
    )
    model.fit(item_user_csr)

    # Get raw scores
    uidx = user2idx[target_user]
    user_vec = model.user_factors[uidx]
    item_vecs = model.item_factors
    scores_all = item_vecs.dot(user_vec)

    # Mask out already-rated items   
    seen_items = df.loc[df['userId'] == target_user, 'movieId'].unique()
    seen_idx = [item2idx[m] for m in seen_items if m in item2idx]
    for i in seen_idx:
        if 0 <= i < scores_all.shape[0]:
            scores_all[i] = -np.inf

    # Pick Top N
    top_idx = np.argpartition(-scores_all, N)[:N]
    top_idx = top_idx[np.argsort(-scores_all[top_idx])]

    # Return top N movieIds and scores
    ids = [idx2item[i] for i in top_idx]
    scores = scores_all[top_idx].tolist()
    return ids, scores


# Testing
# Get recommendations for the top 3 most popular users
top_users = df_sample['userId'].value_counts().nlargest(3).index.tolist()
for user in top_users:
    cf_recommendations = matrix_factorisation_als(df_sample, target_user=user)
    print(f"Top 10 for user {user} ALS matrix factorisation:", cf_recommendations)

  check_blas_config()
100%|██████████| 15/15 [00:00<00:00, 129.24it/s]


Top 10 for user 599 ALS matrix factorisation: ([79185, 113378, 590, 611, 1202, 2720, 4448, 2944, 118900, 3249], [0.3258478343486786, 0.2731245756149292, 0.22364316880702972, 0.16128499805927277, 0.1462622582912445, 0.144228994846344, 0.13965092599391937, 0.13898852467536926, 0.13785482943058014, 0.13633930683135986])


100%|██████████| 15/15 [00:00<00:00, 130.40it/s]


Top 10 for user 474 ALS matrix factorisation: ([7845, 2926, 37857, 5009, 51077, 89774, 688, 68157, 111113, 145], [0.875626266002655, 0.5513898134231567, 0.41076260805130005, 0.4005529582500458, 0.3783547282218933, 0.36520931124687195, 0.35562241077423096, 0.35036346316337585, 0.2767240107059479, 0.26729315519332886])


100%|██████████| 15/15 [00:00<00:00, 131.54it/s]

Top 10 for user 448 ALS matrix factorisation: ([3469, 2245, 1284, 2390, 920, 96610, 1359, 2937, 42, 2208], [0.8502200841903687, 0.8072934746742249, 0.6354320645332336, 0.5123622417449951, 0.4474309980869293, 0.40943294763565063, 0.40674248337745667, 0.4015004634857178, 0.38561147451400757, 0.3722541630268097])





## 5. Hybrid

In [18]:
# Z normalization
def _zscore(s: pd.Series) -> pd.Series:
    return (s - s.mean()) / (s.std(ddof=0) + 1e-9)

# Min-Max normalization
def _minmax01(s: pd.Series) -> pd.Series:
    rng = s.max() - s.min()
    return (s - s.min()) / (rng + 1e-9)

In [19]:
def hybrid_recommender_cf_aw(df, target_user, N=10, alpha=0.8, min_interactions=3, k=30, shrink=10, m=10):
    # Collect user history
    interactions = df[df["userId"] == target_user]
    seen_items: set[int] = set(interactions["movieId"].unique())
    seen_count = len(seen_items)

    # Popularity prior – Bayesian weighted rating for every movie
    n_movies = df["movieId"].nunique()
    prior_ids, prior_scores = average_rating_weighted(df, N=n_movies, m=m)
    prior_series = pd.Series(prior_scores, index=prior_ids, name="prior")

    # Cold‑start path – rely entirely on the prior  
    if seen_count < min_interactions:
        top = prior_series[~prior_series.index.isin(seen_items)].nlargest(N)
        return pd.DataFrame(
            {
                "movieId": top.index,
                "score": _minmax01(top.values),
                "cf_raw": np.nan,
                "prior_raw": top.values,
                "alpha_used": 0.0,
                "seen_count": seen_count,
            }
        )

    # Personalised CF scores  
    cf_ids, cf_scores = user_based_cosine_cf(
        df,
        target_user,
        N=max(1000, n_movies),
        k=k,
        shrink=shrink,
    )
    cf_series = pd.Series(cf_scores, index=cf_ids, name="cf")

    # Blend the two models
    candidates = prior_series.index.union(cf_series.index).difference(seen_items)
    cf_aligned = cf_series.reindex(candidates)
    prior_aligned = prior_series.reindex(candidates)

    # Normalise to share scale before mixing
    cf_norm = _zscore(cf_aligned.fillna(cf_aligned.mean()))
    prior_norm = _zscore(prior_aligned)

    # Mix the two models
    final = alpha * cf_norm + (1 - alpha) * prior_norm
    final = _minmax01(final)

    # Return top N movieIds and scores
    ids = final.nlargest(N).index.tolist()
    scores = final.nlargest(N).values.tolist()
    return ids, scores


# Testing
# Get recommendations for the top 3 most popular users
top_users = df_sample['userId'].value_counts().nlargest(3).index.tolist()
for user in top_users:
    cf_recommendations = hybrid_recommender_cf_aw(df_sample, target_user=user)
    print(f"Top 10 for user {user} hybrid collaborative filtering & avg. weighted recommender:", cf_recommendations)

Top 10 for user 599 hybrid collaborative filtering & avg. weighted recommender: ([54001, 590, 3114, 57669, 4246, 30749, 5971, 58998, 36, 55247], [0.9999999997528908, 0.9955276320390531, 0.985216444213922, 0.9694779495811998, 0.9426606995665555, 0.907980097689071, 0.9011205141597508, 0.8959691155170222, 0.8734377184638255, 0.8398130154433585])
Top 10 for user 474 hybrid collaborative filtering & avg. weighted recommender: ([68954, 68157, 50872, 2288, 63082, 8641, 2916, 57669, 51662, 72998], [0.9999999997516398, 0.9902203888405727, 0.9662082209061752, 0.9471296192304157, 0.9318292646940572, 0.9188857240010104, 0.8961714287289245, 0.8919081103638996, 0.8907411838895203, 0.8434340979479239])
Top 10 for user 448 hybrid collaborative filtering & avg. weighted recommender: ([318, 2329, 1219, 54001, 63082, 595, 2288, 34405, 910, 2761], [0.9999999997906448, 0.9261191436281038, 0.8488071190428258, 0.7882645791332755, 0.7604490908960007, 0.7522426783144225, 0.7519766752608774, 0.7455249379704997,

In [20]:
def hybrid_recommender(df, target_user, cold_model=average_rating_weighted, warm_model=user_based_cosine_cf, N=10, alpha=0.8, min_interactions=3):
    # Collect user history
    interactions = df[df["userId"] == target_user]
    seen_items: set[int] = set(interactions["movieId"].unique())
    seen_count = len(seen_items)

    # Apply cold model
    n_movies = df["movieId"].nunique()
    prior_ids, prior_scores = cold_model(df, N=n_movies)
    prior_series = pd.Series(prior_scores, index=prior_ids, name="prior")

    # Cold‑start path – rely entirely on the prior  
    if seen_count < min_interactions:
        print(f"Cold Start Detected: User {target_user} has only {seen_count} interactions, using prior model.")
        top = prior_series[~prior_series.index.isin(seen_items)].nlargest(N)

        # Return top N movieIds and scores
        ids = top.index.tolist()
        scores = top.values.tolist()
        return ids, scores

    # Apply warm model
    cf_ids, cf_scores = warm_model(df, target_user, N=max(1000, n_movies))
    cf_series = pd.Series(cf_scores, index=cf_ids, name="cf")

    # Blend the two models
    candidates = prior_series.index.union(cf_series.index).difference(seen_items)
    cf_aligned = cf_series.reindex(candidates)
    prior_aligned = prior_series.reindex(candidates)

    # Normalise to share scale before mixing
    cf_norm = _zscore(cf_aligned.fillna(cf_aligned.mean()))
    prior_norm = _zscore(prior_aligned)

    # Mix the two models
    final = alpha * cf_norm + (1 - alpha) * prior_norm
    final = _minmax01(final)

    # Return top N movieIds and scores
    ids = final.nlargest(N).index.tolist()
    scores = final.nlargest(N).values.tolist()
    return ids, scores


# Testing
# Get recommendations for the top 3 most popular users
top_users = df_sample['userId'].value_counts().nlargest(3).index.tolist()
for user in top_users:
    cf_recommendations = hybrid_recommender(df_sample, target_user=user, cold_model=average_rating, warm_model=item_based_cosine_cf)
    print(f"Top 10 for user {user} hybrid dynamic recommender:", cf_recommendations)

Top 10 for user 599 hybrid dynamic recommender: ([54001, 57669, 590, 3114, 5299, 8376, 4246, 30749, 58998, 71254], [0.9999999997506893, 0.9823387113646082, 0.9449131360730677, 0.9432169491996957, 0.9191060133212328, 0.8999491411466705, 0.887026415460161, 0.8729325451947673, 0.8405997531116943, 0.8285963807767025])
Top 10 for user 474 hybrid dynamic recommender: ([51662, 34162, 8641, 50872, 2916, 63082, 68954, 68157, 72998, 8873], [0.9999999997779269, 0.9363152836191435, 0.9238612857046477, 0.9219221938432781, 0.9054808072867694, 0.8890070277876002, 0.8740077904367726, 0.8650392080002691, 0.8624987614596469, 0.823127722936592])
Top 10 for user 448 hybrid dynamic recommender: ([318, 1219, 2329, 5816, 54001, 4246, 595, 920, 63082, 2944], [0.9999999997783844, 0.9491770069566781, 0.94458473570229, 0.9165068576064693, 0.8457133373119707, 0.8296251636238859, 0.826785585757196, 0.8248208666770616, 0.8234808440503364, 0.8063041193445082])


## 6. Evaluation

In [21]:
### Train-test split function
def make_train_test(df, test_size=0.2):
    # Hold out the most recent X% of interactions
    df_sorted = df.sort_values("timestamp")
    cutoff = int(len(df) * (1 - test_size))
    train = df_sorted.iloc[:cutoff]
    test  = df_sorted.iloc[cutoff:]
    return train.reset_index(drop=True), test.reset_index(drop=True)


### Top N evaluation function
def evaluate_top_n(df_train, df_test, model_func, N=10):
    # Build ground‐truth list
    gt = df_test.groupby('userId')['movieId'].apply(list).to_dict()
    
    # Calculate metrics
    precisions, recalls, mrrs = [], [], []
    train_users = set(df_train['userId'].unique())
    
    # Process each user in the test set
    for user, actual in gt.items():
        if not actual or user not in train_users:
            continue
            
        try:
            # Get recommendations
            recs_and_scores = model_func(df_train, target_user=user, N=N)
            
            # Handle both return types (older functions might just return IDs)
            if isinstance(recs_and_scores, tuple) and len(recs_and_scores) == 2:
                recs, scores = recs_and_scores
            else:
                continue
                
            if not recs:
                continue
                
            # Calculate ranking metrics
            precisions.append(len(set(recs[:N]) & set(actual)) / N)
            recalls.append(len(set(recs[:N]) & set(actual)) / len(actual))
            
            # MRR calculation
            rank = next((i+1 for i, r in enumerate(recs[:N]) if r in actual), 0)
            mrrs.append(1/rank if rank > 0 else 0)
            
        except Exception:
            # Skip problematic users
            continue
    
    # Return metrics dictionary
    return {
        f'Precision@{N}': np.nanmean(precisions) if precisions else np.nan,
        f'Recall@{N}': np.nanmean(recalls) if recalls else np.nan,
        f'MRR@{N}': np.nanmean(mrrs) if mrrs else np.nan,
    }


### RMSE evaluation function
def evaluate_rmse(df_train, df_test, model_func, N=10):
    # Create a dictionary mapping (userId, movieId) to rating
    test_ratings = df_test.set_index(['userId', 'movieId'])['rating'].to_dict()
    
    # Initialize a list to collect errors
    all_errors = []
    train_users = set(df_train['userId'].unique())
    
    # Process each user in the test set
    for user in df_test['userId'].unique():
        if user not in train_users:
            continue
            
        try:
            # Get recommendations and scores
            recs_and_scores = model_func(df_train, target_user=user, N=N)
            
            # Handle different return types
            if isinstance(recs_and_scores, tuple) and len(recs_and_scores) == 2:
                recs, scores = recs_and_scores
            else:
                continue
                
            if not recs:
                continue
                
            # Find items that exist in both recommendations and test set
            user_errors = []
            for i, item in enumerate(recs):
                if (user, item) in test_ratings:
                    actual = test_ratings[(user, item)]
                    predicted = scores[i]  
                    user_errors.append((actual - predicted) ** 2)
            
            if user_errors:  # Only add if we have overlapping items
                all_errors.extend(user_errors)
                
        except Exception:
            # Skip problematic users
            continue
    
    # Return RMSE
    return np.sqrt(np.mean(all_errors)) if all_errors else np.nan


### Model evaluation function
def evaluate_models(df_train, df_test, models, N=10):
    # Initialize results dictionary
    results = {}
    
    # Iterate over each model
    for model_name, model_func in models.items():
        print(f"Evaluating {model_name}...")
        
        try:
            # Evaluate top-N metrics
            topn_metrics = evaluate_top_n(df_train, df_test, model_func, N=N)
            
            # Evaluate RMSE (if applicable)
            rmse = evaluate_rmse(df_train, df_test, model_func, N=N)
            
            # Store results
            results[model_name] = {
                'TopN': topn_metrics,
                'RMSE': rmse
            }
            
            # Display results
            metrics_str = ', '.join([f"{k}: {v:.4f}" for k, v in topn_metrics.items()])
            print(f"{model_name} - {metrics_str}, RMSE: {rmse:.4f}")
            
        except Exception as e:
            print(f"Error evaluating {model_name}: {e}")
            results[model_name] = {'error': str(e)}
    
    return results

In [22]:
# Define models to evaluate
models = {
    # Non-personalized models
    'Top N Count': top_n_count,
    'Top N Likes': top_n_likes,
    'Average Rating': average_rating,
    'Average Rating Normalized': average_rating_normalized,
    'Average Rating Weighted': average_rating_weighted,
    # Personalized models
    'User CF Pearson': user_based_pearson_cf,
    'User CF Cosine': user_based_cosine_cf,
    'Item CF Pearson': item_based_pearson_cf,
    'Item CF Cosine': item_based_cosine_cf,
    'Content CF Cosine': content_based_cosine_f,
    'Matrix Factorisation SVD': matrix_factorisation_svd,
    'Matrix Factorisation NMF': matrix_factorisation_nmf,
    'Matrix Factorisation ALS': matrix_factorisation_als,
    'Hybrid CF & Avg Weighted': hybrid_recommender_cf_aw
}

# Split data into train and test sets
df_train, df_test = make_train_test(df_sample)

In [23]:
# Evaluate models
results_10 = evaluate_models(df_train, df_test, models, N=10)

Evaluating Top N Count...
Top N Count - Precision@10: 0.0357, Recall@10: 0.1081, MRR@10: 0.1811, RMSE: 160.6179
Evaluating Top N Likes...
Top N Likes - Precision@10: 0.0429, Recall@10: 0.1111, MRR@10: 0.1994, RMSE: 119.8845
Evaluating Average Rating...
Average Rating - Precision@10: 0.0286, Recall@10: 0.0985, MRR@10: 0.0400, RMSE: 1.6000
Evaluating Average Rating Normalized...
Average Rating Normalized - Precision@10: 0.0286, Recall@10: 0.0985, MRR@10: 0.0400, RMSE: 1.6000
Evaluating Average Rating Weighted...
Average Rating Weighted - Precision@10: 0.0643, Recall@10: 0.1233, MRR@10: 0.2024, RMSE: 0.3744
Evaluating User CF Pearson...
User CF Pearson - Precision@10: 0.0571, Recall@10: 0.1109, MRR@10: 0.2381, RMSE: 1.6648
Evaluating User CF Cosine...
User CF Cosine - Precision@10: 0.1214, Recall@10: 0.1860, MRR@10: 0.4054, RMSE: 2.4447
Evaluating Item CF Pearson...
Item CF Pearson - Precision@10: 0.0214, Recall@10: 0.0896, MRR@10: 0.0312, RMSE: 1.7690
Evaluating Item CF Cosine...
Item CF



Matrix Factorisation NMF - Precision@10: 0.0429, Recall@10: 0.0387, MRR@10: 0.1865, RMSE: 3.1826
Evaluating Matrix Factorisation ALS...


100%|██████████| 15/15 [00:00<00:00, 150.46it/s]
100%|██████████| 15/15 [00:00<00:00, 92.83it/s]
100%|██████████| 15/15 [00:00<00:00, 143.93it/s]
100%|██████████| 15/15 [00:00<00:00, 144.11it/s]
100%|██████████| 15/15 [00:00<00:00, 154.68it/s]
100%|██████████| 15/15 [00:00<00:00, 151.32it/s]
100%|██████████| 15/15 [00:00<00:00, 147.19it/s]
100%|██████████| 15/15 [00:00<00:00, 146.88it/s]
100%|██████████| 15/15 [00:00<00:00, 157.88it/s]
100%|██████████| 15/15 [00:00<00:00, 157.84it/s]
100%|██████████| 15/15 [00:00<00:00, 151.80it/s]
100%|██████████| 15/15 [00:00<00:00, 152.82it/s]
100%|██████████| 15/15 [00:00<00:00, 154.95it/s]
100%|██████████| 15/15 [00:00<00:00, 151.21it/s]
100%|██████████| 15/15 [00:00<00:00, 156.13it/s]
100%|██████████| 15/15 [00:00<00:00, 156.52it/s]
100%|██████████| 15/15 [00:00<00:00, 156.95it/s]
100%|██████████| 15/15 [00:00<00:00, 155.95it/s]
100%|██████████| 15/15 [00:00<00:00, 161.53it/s]
100%|██████████| 15/15 [00:00<00:00, 153.54it/s]
100%|██████████| 15/1

Matrix Factorisation ALS - Precision@10: 0.0286, Recall@10: 0.0238, MRR@10: 0.0476, RMSE: 3.7432
Evaluating Hybrid CF & Avg Weighted...
Hybrid CF & Avg Weighted - Precision@10: 0.1000, Recall@10: 0.1765, MRR@10: 0.4355, RMSE: 3.3867


In [24]:
# Evaluate models
results_5 = evaluate_models(df_train, df_test, models, N=5)

Evaluating Top N Count...
Top N Count - Precision@5: 0.0429, Recall@5: 0.0972, MRR@5: 0.1607, RMSE: 180.0356
Evaluating Top N Likes...
Top N Likes - Precision@5: 0.0571, Recall@5: 0.0903, MRR@5: 0.1905, RMSE: 142.0416
Evaluating Average Rating...
Average Rating - Precision@5: 0.0000, Recall@5: 0.0000, MRR@5: 0.0000, RMSE: nan
Evaluating Average Rating Normalized...
Average Rating Normalized - Precision@5: 0.0000, Recall@5: 0.0000, MRR@5: 0.0000, RMSE: nan
Evaluating Average Rating Weighted...
Average Rating Weighted - Precision@5: 0.0571, Recall@5: 0.0903, MRR@5: 0.1786, RMSE: 0.3446
Evaluating User CF Pearson...
User CF Pearson - Precision@5: 0.1000, Recall@5: 0.1079, MRR@5: 0.2381, RMSE: 1.7792
Evaluating User CF Cosine...
User CF Cosine - Precision@5: 0.1429, Recall@5: 0.1354, MRR@5: 0.3952, RMSE: 2.2955
Evaluating Item CF Pearson...
Item CF Pearson - Precision@5: 0.0143, Recall@5: 0.0102, MRR@5: 0.0143, RMSE: 3.0000
Evaluating Item CF Cosine...
Item CF Cosine - Precision@5: 0.0714,



Matrix Factorisation NMF - Precision@5: 0.0571, Recall@5: 0.0218, MRR@5: 0.1786, RMSE: 2.4230
Evaluating Matrix Factorisation ALS...


100%|██████████| 15/15 [00:00<00:00, 151.19it/s]
100%|██████████| 15/15 [00:00<00:00, 155.57it/s]
100%|██████████| 15/15 [00:00<00:00, 152.79it/s]
100%|██████████| 15/15 [00:00<00:00, 135.41it/s]
100%|██████████| 15/15 [00:00<00:00, 154.80it/s]
100%|██████████| 15/15 [00:00<00:00, 145.89it/s]
100%|██████████| 15/15 [00:00<00:00, 158.77it/s]
100%|██████████| 15/15 [00:00<00:00, 155.79it/s]
100%|██████████| 15/15 [00:00<00:00, 159.48it/s]
100%|██████████| 15/15 [00:00<00:00, 159.95it/s]
100%|██████████| 15/15 [00:00<00:00, 156.91it/s]
100%|██████████| 15/15 [00:00<00:00, 155.92it/s]
100%|██████████| 15/15 [00:00<00:00, 157.80it/s]
100%|██████████| 15/15 [00:00<00:00, 158.47it/s]
100%|██████████| 15/15 [00:00<00:00, 159.35it/s]
100%|██████████| 15/15 [00:00<00:00, 163.69it/s]
100%|██████████| 15/15 [00:00<00:00, 162.83it/s]
100%|██████████| 15/15 [00:00<00:00, 157.24it/s]
100%|██████████| 15/15 [00:00<00:00, 163.28it/s]
100%|██████████| 15/15 [00:00<00:00, 156.55it/s]
100%|██████████| 15/

Matrix Factorisation ALS - Precision@5: 0.0286, Recall@5: 0.0060, MRR@5: 0.0238, RMSE: 3.7765
Evaluating Hybrid CF & Avg Weighted...
Hybrid CF & Avg Weighted - Precision@5: 0.1833, Recall@5: 0.1557, MRR@5: 0.4236, RMSE: 3.3452


## END