In [None]:
# from section1 task 9
def get_total_ratings(item_list, ni_dict):
    """Add up the rating counts of the listed items.

    Entries of ``item_list`` that are not keys of ``ni_dict`` are skipped.
    Returns 0 when nothing matches.
    """
    running = 0
    for key in item_list:
        if key not in ni_dict:
            continue
        running = running + ni_dict[key]
    return running

In [None]:
# from section1 task 9
def quick_sort(arr):
    """Sort records in ascending order of their second field (index 1).

    A list with fewer than two records is returned as-is; otherwise a new
    list is built by partitioning around the middle record.
    """
    if len(arr) < 2:
        return arr

    pivot_key = arr[len(arr) // 2][1]

    # Three-way partition: records tied with the pivot are never recursed on.
    smaller = [rec for rec in arr if rec[1] < pivot_key]
    equal = [rec for rec in arr if rec[1] == pivot_key]
    larger = [
        rec for rec in arr
        if not (rec[1] < pivot_key or rec[1] == pivot_key)
    ]

    return quick_sort(smaller) + equal + quick_sort(larger)

In [None]:
# from section1 task 14
def manual_intersection_size(set1, set2):
    """Count the elements of ``set1`` that equal some element of ``set2``.

    Each element of ``set1`` contributes at most once, however many matches
    it has in ``set2``.
    """
    matches = 0
    for candidate in set1:
        if any(candidate == other for other in set2):
            matches += 1
    return matches


In [None]:
# helper function 
# Merge Sort implementation to sort by count
def merge_sort(arr):
    """Return the records of ``arr`` ordered by ascending count (index 1).

    The input is split in half, each half is sorted recursively and the two
    sorted halves are combined with ``merge``. Inputs of length 0 or 1 are
    returned unchanged.
    """
    size = len(arr)
    if size < 2:
        return arr

    half = size // 2
    return merge(merge_sort(arr[:half]), merge_sort(arr[half:]))

def merge(left, right):
    """Merge two lists already sorted by count (index 1) into one sorted list.

    When counts tie, the record from ``left`` is emitted first.
    """
    merged = []
    li, ri = 0, 0
    n_left, n_right = len(left), len(right)

    while li < n_left and ri < n_right:
        take_left = left[li][1] <= right[ri][1]
        if take_left:
            merged.append(left[li])
            li += 1
        else:
            merged.append(right[ri])
            ri += 1

    # At most one of the two tails still holds records.
    merged += left[li:]
    merged += right[ri:]
    return merged


In [2]:
# ---- Helper functions ----

def common_items(u1_dict, u2_dict):
    """Return the keys of ``u1_dict`` that also appear in ``u2_dict``.

    The result follows the iteration order of ``u1_dict``. Membership is
    tested with ``in``, which is a hash lookup when ``u2_dict`` is a dict or
    set, so the whole call is linear in the size of ``u1_dict`` instead of
    scanning ``u2_dict`` once per key.
    """
    return [item for item in u1_dict if item in u2_dict]

In [None]:

# user_user cosine similarity
def cosine_similarity(u1_dict, u2_dict):
    """Cosine similarity between two rating dictionaries, rounded to 4 places.

    The dot product runs over the keys returned by ``common_items``, while
    each norm is taken over all of that dictionary's own values. Returns 0.0
    when nothing is shared or when either squared norm is zero.
    """
    shared = common_items(u1_dict, u2_dict)
    if not shared:
        return 0.0

    dot = 0.0
    for key in shared:
        dot += u1_dict[key] * u2_dict[key]

    squares_1 = 0.0
    for value in u1_dict.values():
        squares_1 += value * value

    squares_2 = 0.0
    for value in u2_dict.values():
        squares_2 += value * value

    if squares_1 == 0 or squares_2 == 0:
        return 0.0

    return round(dot / ((squares_1**0.5) * (squares_2**0.5)), 4)


In [None]:
# helper function
def top_20_percent_desc(sorted_list):
    """Return the last 20% of an ascending ``(user, score)`` list, best first.

    At least one entry is requested even for short lists. Scores in the
    returned list are rounded to 2 decimals; the input list is not modified.
    """
    keep = max(1, int(len(sorted_list) * 0.2))
    # The tail of the ascending input holds the highest scores.
    return [(uid, round(sim, 2)) for uid, sim in reversed(sorted_list[-keep:])]


In [None]:
# ---- Helper function to predict rating ----
def predict_rating(target_user, item, top_sim_users, user_ratings):
    """Predict a rating for ``item`` as a similarity-weighted average.

    Only neighbours from ``top_sim_users`` (pairs of ``(user, similarity)``)
    that have a rating for ``item`` in ``user_ratings`` contribute. The
    weighted sum is divided by the sum of absolute similarities. Returns the
    prediction rounded to 2 decimals, or ``None`` when that divisor is zero.
    ``target_user`` is accepted but not used in the computation.
    """
    weighted_sum = 0.0
    weight_total = 0.0

    for neighbour, weight in top_sim_users:
        neighbour_ratings = user_ratings.get(neighbour, {})
        if item not in neighbour_ratings:
            continue
        weighted_sum += weight * neighbour_ratings[item]
        weight_total += abs(weight)

    if weight_total == 0:
        return None

    return round(weighted_sum / weight_total, 2)


In [None]:
# Mean-Centered Cosine Similarity function
def mean_centered_cosine(u1_dict, u2_dict, u1_mean, u2_mean):
    """Cosine of the mean-centred ratings two users gave to shared items.

    Each rating is offset by the supplied mean of its user; the dot product
    and both squared norms are accumulated over the shared items only.
    Returns 0.0 when nothing is shared or when either squared norm is zero.
    The result is not rounded.
    """
    shared = common_items(u1_dict, u2_dict)
    if not shared:
        return 0.0

    dot = 0.0
    squares_1 = 0.0
    squares_2 = 0.0
    for key in shared:
        offset_1 = u1_dict[key] - u1_mean
        offset_2 = u2_dict[key] - u2_mean
        dot += offset_1 * offset_2
        squares_1 += offset_1 ** 2
        squares_2 += offset_2 ** 2

    if squares_1 == 0 or squares_2 == 0:
        return 0.0

    magnitude = (squares_1 ** 0.5) * (squares_2 ** 0.5)
    return dot / magnitude


In [None]:
def pearson_correlation(u1_dict, u2_dict, u1_mean, u2_mean):
    """Pearson-style correlation of two users over their shared items.

    Deviations are taken from the means supplied by the caller (``u1_mean``
    and ``u2_mean``), not from means recomputed over the shared items.

    Returns the coefficient rounded to 2 decimals, or 0.0 when fewer than two
    items are shared or when the denominator is zero (no variation for at
    least one user on the shared items).
    """
    com = common_items(u1_dict, u2_dict)
    if len(com) < 2:  # PCC undefined or zero with less than 2 common items
        return 0.0

    # Only the squared deviations and their cross product are needed.
    sum1_sq = sum2_sq = sum_prod = 0.0
    for item in com:
        dev1 = u1_dict[item] - u1_mean
        dev2 = u2_dict[item] - u2_mean
        sum1_sq += dev1 ** 2
        sum2_sq += dev2 ** 2
        sum_prod += dev1 * dev2

    denominator = (sum1_sq * sum2_sq) ** 0.5
    if denominator == 0:
        return 0.0

    return round(sum_prod / denominator, 2)

In [None]:
def mean_center(item_dict):
    """Subtract the mean rating from every value of ``item_dict``.

    Returns a new dict with the same keys; the input is left untouched. An
    empty input yields an empty dict instead of dividing by zero.
    """
    if not item_dict:
        return {}

    mean_rating = sum(item_dict.values()) / len(item_dict)
    return {u: rating - mean_rating for u, rating in item_dict.items()}


In [None]:
def common_users(i1_dict, i2_dict):
    """List the keys of ``i1_dict`` that are also present in ``i2_dict``.

    Order follows the iteration order of ``i1_dict``.
    """
    return [user for user in i1_dict if user in i2_dict]


In [None]:
def cosine_similarity_items(i1_dict_raw, i2_dict_raw):
    """Cosine similarity of two mean-centred item rating vectors, 2 decimals.

    Both inputs are first centred with ``mean_center``. The dot product uses
    the users returned by ``common_users``; each norm uses every value of the
    corresponding centred vector. Returns 0.0 when no user is shared or when
    either norm is zero.
    """
    centred_1 = mean_center(i1_dict_raw)
    centred_2 = mean_center(i2_dict_raw)

    shared = common_users(centred_1, centred_2)
    if not shared:
        return 0.0

    dot = 0.0
    for user in shared:
        dot += centred_1[user] * centred_2[user]

    length_1 = sum(v*v for v in centred_1.values()) ** 0.5
    length_2 = sum(v*v for v in centred_2.values()) ** 0.5
    if length_1 == 0 or length_2 == 0:
        return 0.0

    cosine = dot / (length_1 * length_2)
    return round(cosine, 2)


In [None]:
def pcc_similarity_items(i1_dict, i2_dict):
    """Pearson correlation of two items over the users from ``common_users``.

    Means are computed over those shared users only. Returns the coefficient
    rounded to 2 decimals, or 0.0 when fewer than two users are shared or
    when either item's shared ratings have no variation.
    """
    shared = common_users(i1_dict, i2_dict)
    n_shared = len(shared)
    if n_shared < 2:
        return 0.0

    xs = [i1_dict[u] for u in shared]
    ys = [i2_dict[u] for u in shared]
    x_bar = sum(xs) / n_shared
    y_bar = sum(ys) / n_shared

    covariance = sum((a - x_bar) * (b - y_bar) for a, b in zip(xs, ys))
    spread_x = sum((a - x_bar) ** 2 for a in xs) ** 0.5
    spread_y = sum((b - y_bar) ** 2 for b in ys) ** 0.5

    if spread_x == 0 or spread_y == 0:
        return 0.0

    return round(covariance / (spread_x * spread_y), 2)


# Functions

Function 1: Count raters per item (manual)

In [None]:
def compute_num_raters(df):
    """Count how many rows of ``df`` refer to each movie.

    Reads the ``'movieId'`` column row by row and returns a dict mapping the
    integer movie id to its number of rows.
    """
    counts = {}
    for _, record in df.iterrows():
        movie = int(record['movieId'])
        counts[movie] = counts.get(movie, 0) + 1
    return counts



Function 2: Compute average rating per item (manual)

In [None]:
def compute_avg_rating(df):
    """Compute the mean rating of every movie in ``df``.

    Reads the ``'movieId'`` and ``'rating'`` columns row by row and returns a
    dict mapping the integer movie id to its average rating as a float.
    """
    totals = {}  # movie id -> [running sum of ratings, number of ratings]

    for _, record in df.iterrows():
        movie = int(record['movieId'])
        value = float(record['rating'])
        if movie not in totals:
            totals[movie] = [0.0, 0]
        entry = totals[movie]
        entry[0] += value
        entry[1] += 1

    return {movie: float(total / n) for movie, (total, n) in totals.items()}



Function 3: Compute rating standard deviation per item (manual)

In [None]:
import math

def compute_std_rating(df, avg_rating):
    """Population standard deviation of the ratings of every movie.

    ``avg_rating`` must hold the mean rating for each movie id found in
    ``df``. Movies with a single rating get 0; the others get the square
    root of the mean squared deviation from their mean (divisor ``n``).
    """
    ratings_by_movie = {}
    for _, record in df.iterrows():
        movie = int(record['movieId'])
        score = float(record['rating'])
        ratings_by_movie.setdefault(movie, []).append(score)

    spreads = {}
    for movie, scores in ratings_by_movie.items():
        center = avg_rating[movie]
        count = len(scores)

        if count == 1:
            # A single rating is reported as having no variation.
            spreads[movie] = 0
        else:
            squared_dev = 0
            for score in scores:
                squared_dev += (score - center) ** 2
            spreads[movie] = math.sqrt(squared_dev / count)

    return spreads


function 4: Build the item feature vector

In [None]:
def build_item_features(num_raters, avg_rating, std_rating):
    """Assemble ``[num_raters, avg_rating, std_rating]`` for every item.

    Iterates the keys of ``num_raters``, converts each to ``int`` and uses
    that integer both as the output key and to look up all three inputs.
    """
    vectors = {}
    for raw_id in num_raters:
        key = int(raw_id)
        vectors[key] = [num_raters[key], avg_rating[key], std_rating[key]]
    return vectors


function 5: Normalize the feature vectors using Z-score

In [None]:
def z_score_normalize(item_features, feature_means, feature_stds):
    """
    Standardise every feature position with its Z-score.

    For item i and feature f: z = (x_{i,f} - mean_f) / std_f.
    A feature whose standard deviation is 0 is mapped to 0.0 for every item.
    Returns a new dictionary with the normalized vectors.
    """
    scaled = {}
    for item, vector in item_features.items():
        scaled[item] = [
            0.0 if feature_stds[j] == 0
            else (value - feature_means[j]) / feature_stds[j]
            for j, value in enumerate(vector)
        ]
    return scaled


function 6: Compute mean of each feature

In [None]:
def compute_feature_means(item_features):
    """Average each feature position over all items.

    The vector length is taken from the first item. Returns a list with one
    mean per feature position.
    """
    vectors = item_features.values()
    width = len(next(iter(vectors)))
    n_items = len(item_features)

    totals = [0.0] * width
    for vector in vectors:
        for j in range(width):
            totals[j] += vector[j]

    return [total / n_items for total in totals]


function 7: Compute standard deviation of each feature

In [None]:
import math

def compute_feature_stds(item_features, feature_means):
    """Population standard deviation of each feature position.

    ``feature_means`` supplies the mean of each feature and fixes how many
    features are processed. The summed squared deviations are divided by the
    number of items before the square root is taken.
    """
    width = len(feature_means)
    n_items = len(item_features)

    squared = [0.0] * width
    for vector in item_features.values():
        for j, center in enumerate(feature_means):
            squared[j] += (vector[j] - center) ** 2

    return [math.sqrt(total / n_items) for total in squared]


function 8: Compute Euclidean distance

In [None]:
import math

def euclidean_distance(vec1, vec2):
    """
    Straight-line (L2) distance between two vectors.

    Components are paired with ``zip``, so extra trailing components of the
    longer vector are ignored.
    """
    squared_gaps = ((p - q) ** 2 for p, q in zip(vec1, vec2))
    return math.sqrt(sum(squared_gaps))


function 9: Initialize centroids randomly

In [None]:
import random

def initialize_centroids(data, K):
    """
    Pick K entries of ``data`` without replacement as starting centroids.

    Input: data = dict {item_id: feature_vector}
    Output: list of K feature vectors chosen by ``random.sample``
    """
    candidates = [*data.values()]
    return random.sample(candidates, K)


function 10: Assign items to nearest centroid

In [None]:
def assign_clusters(data, centroids):
    """
    Map every item to the index of its closest centroid.

    Distances come from ``euclidean_distance``; on a tie the lowest centroid
    index wins.
    Returns: dict {movieId: cluster_index}
    """
    nearest = {}
    for movieId, vector in data.items():
        gaps = [euclidean_distance(vector, centre) for centre in centroids]
        nearest[movieId] = min(range(len(gaps)), key=gaps.__getitem__)
    return nearest


function 11: Update centroids

In [None]:
def update_centroids(data, assignments, K):
    """
    Recompute each centroid as the mean of the items assigned to it.

    A cluster that received no items gets a copy of a randomly chosen item
    vector from ``data`` instead. Returns a list of K centroid vectors.
    """
    width = len(next(iter(data.values())))
    sums = [[0.0] * width for _ in range(K)]
    sizes = [0] * K

    for movieId, cluster in assignments.items():
        vector = data[movieId]
        bucket = sums[cluster]
        for j in range(width):
            bucket[j] += vector[j]
        sizes[cluster] += 1

    result = []
    for cluster in range(K):
        if sizes[cluster] > 0:
            result.append([total / sizes[cluster] for total in sums[cluster]])
        else:
            # Empty cluster: restart it from a random data point.
            result.append(list(random.choice(list(data.values()))))
    return result


function 12: Compute WCSS (within-cluster sum of squares)

In [None]:
def compute_wcss(data, assignments, centroids):
    """
    Total within-cluster sum of squares.

    Adds up the squared ``euclidean_distance`` between every assigned item
    and the centroid of the cluster it is assigned to.
    """
    total = 0.0
    for movieId, cluster in assignments.items():
        gap = euclidean_distance(data[movieId], centroids[cluster])
        total += gap ** 2
    return total


function 13: Compute silhouette score (manual)

In [None]:
def compute_silhouette_score(data, assignments, K):
    """
    Compute the mean silhouette coefficient manually.

    For each item i:
      a(i) = average distance to the other items of its own cluster
      b(i) = smallest average distance to the items of any other cluster
      s(i) = (b(i) - a(i)) / max(a(i), b(i))

    Following the standard definition, an item that is alone in its cluster
    gets s(i) = 0. When fewer than two clusters are populated (including
    empty input) b(i) does not exist and 0.0 is returned.

    data: dict {item_id: feature_vector}
    assignments: dict {item_id: cluster_index}
    K: accepted for interface compatibility; the clusters actually present
       in ``assignments`` are what is scored.
    Returns: average silhouette over all items.
    """
    from collections import defaultdict

    cluster_items = defaultdict(list)
    for movieId, cluster_index in assignments.items():
        cluster_items[cluster_index].append(movieId)

    # Without a second populated cluster there is nothing to compare against;
    # this guard also avoids dividing by zero on empty input.
    if len(cluster_items) < 2:
        return 0.0

    silhouette_scores = []

    for movieId, cluster_index in assignments.items():
        members = cluster_items[cluster_index]

        # Singleton cluster: s(i) is defined as 0 rather than (b - 0) / b = 1.
        if len(members) == 1:
            silhouette_scores.append(0.0)
            continue

        current_vec = data[movieId]

        # a(i): average distance to the other members of the same cluster
        a_i = sum(
            euclidean_distance(current_vec, data[other])
            for other in members
            if other != movieId
        ) / (len(members) - 1)

        # b(i): smallest average distance to any other cluster
        b_i = min(
            sum(euclidean_distance(current_vec, data[other]) for other in items) / len(items)
            for other_index, items in cluster_items.items()
            if other_index != cluster_index
        )

        scale = max(a_i, b_i)
        silhouette_scores.append(0.0 if scale == 0 else (b_i - a_i) / scale)

    return sum(silhouette_scores) / len(silhouette_scores)


function 14: K-means usage function

In [None]:
def k_means_clustering(data, K, max_iters=100):
    """
    Perform K-means clustering manually.

    Alternates assignment and centroid update until no centroid moves by
    1e-6 or more, or until ``max_iters`` updates have been made. The
    assignments are then computed once more from the centroids being
    returned, so the returned assignments, WCSS and silhouette always
    describe those centroids, including when the iteration budget runs out
    or ``max_iters`` is 0.

    Returns: centroids, assignments, WCSS, silhouette
    """
    centroids = initialize_centroids(data, K)

    for _ in range(max_iters):
        assignments = assign_clusters(data, centroids)
        new_centroids = update_centroids(data, assignments, K)

        # stop if centroids do not change
        converged = all(
            euclidean_distance(new_centroids[i], centroids[i]) < 1e-6
            for i in range(K)
        )
        if converged:
            break

        centroids = new_centroids

    # Re-derive assignments from the final centroids: after the last update
    # the loop's assignments refer to the previous centroids, and with
    # max_iters == 0 they were never computed at all.
    assignments = assign_clusters(data, centroids)

    wcss = compute_wcss(data, assignments, centroids)
    silhouette = compute_silhouette_score(data, assignments, K)
    return centroids, assignments, wcss, silhouette


function 15: Plot Elbow curve

In [None]:
import matplotlib.pyplot as plt

def plot_elbow_wcss(metrics_df):
    """
    Draw WCSS against K (the elbow curve) and show the figure.

    Input: metrics_df with columns ['K', 'WCSS']
    """
    k_values = metrics_df['K']

    _, ax = plt.subplots(figsize=(8,5))
    ax.plot(k_values, metrics_df['WCSS'], 'o-', color='blue', linewidth=2)
    ax.set_title("Elbow Method - WCSS vs K")
    ax.set_xlabel("Number of Clusters K")
    ax.set_ylabel("WCSS")
    ax.set_xticks(k_values)  # one tick per evaluated K
    ax.grid(True)
    plt.show()


function 16: Plot Silhouette scores

In [None]:
def plot_silhouette(metrics_df):
    """
    Draw the silhouette score against K and show the figure.

    Input: metrics_df with columns ['K', 'Silhouette']
    """
    k_values = metrics_df['K']

    _, ax = plt.subplots(figsize=(8,5))
    ax.plot(k_values, metrics_df['Silhouette'], 'o-', color='green', linewidth=2)
    ax.set_title("Silhouette Score vs K")
    ax.set_xlabel("Number of Clusters K")
    ax.set_ylabel("Silhouette Score")
    ax.set_xticks(k_values)  # one tick per evaluated K
    ax.grid(True)
    plt.show()


function 17: Compute average number of raters per cluster

In [None]:
def average_num_raters_per_cluster(assignments, num_raters):
    """
    Average the rater counts of the items inside each cluster.

    assignments: dict {item_id: cluster_index}
    num_raters: dict {item_id: num_raters}
    Returns: dict {cluster_index: avg_num_raters}
    """
    totals = {}  # cluster -> [summed rater counts, number of items]

    for movieId, cluster in assignments.items():
        if cluster not in totals:
            totals[cluster] = [0, 0]
        entry = totals[cluster]
        entry[0] += num_raters[movieId]
        entry[1] += 1

    return {
        cluster: raters / items
        for cluster, (raters, items) in totals.items()
    }


function 18: Classify clusters (popular/niche/long-tail)

In [None]:
def classify_clusters(avg_raters_per_cluster):
    """
    Label each cluster by where its average rater count falls in the range
    spanned by all clusters:
    - 'popular item': at or above min + 0.66 * range
    - 'long-tail item': at or below min + 0.33 * range
    - 'niche item': anything in between
    Returns: dict {cluster_index: category}
    """
    averages = list(avg_raters_per_cluster.values())
    highest = max(averages)
    lowest = min(averages)
    spread = highest - lowest

    def label(avg):
        # The popular test runs first, so it wins when the range is 0.
        if avg >= lowest + 0.66 * spread:
            return 'popular item'
        if avg <= lowest + 0.33 * spread:
            return 'long-tail item'
        return 'niche item'

    return {
        cluster: label(avg)
        for cluster, avg in avg_raters_per_cluster.items()
    }


function 19: Visualize distribution of items across clusters

In [None]:
def plot_items_per_cluster(assignments):
    """
    Show a bar chart of how many items each cluster contains.

    assignments: dict {item_id: cluster_index}
    """
    from collections import Counter
    import matplotlib.pyplot as plt

    sizes = Counter(assignments.values())
    labels = sorted(sizes)
    heights = [sizes[label] for label in labels]

    _, ax = plt.subplots(figsize=(8,5))
    ax.bar(labels, heights, color='skyblue')
    ax.set_xlabel("Cluster Index")
    ax.set_ylabel("Number of Items")
    ax.set_title("Distribution of Items Across Clusters")
    ax.set_xticks(labels)  # one tick per cluster index
    plt.show()


function 20: Plot distribution of number of raters per cluster

In [None]:
def plot_raters_distribution_manual(df, cluster_col='cluster', raters_col='num_raters'):
    """Overlay one histogram of ``raters_col`` per cluster and show the figure.

    Clusters are the distinct values of ``cluster_col``, drawn in ascending
    order and labelled 'Cluster <value>' in the legend.
    """
    import matplotlib.pyplot as plt

    cluster_ids = df[cluster_col].unique()  # actual cluster IDs present in df
    cluster_ids.sort()

    _, ax = plt.subplots(figsize=(10,6))

    for cluster in cluster_ids:
        in_cluster = df[cluster_col] == cluster
        ax.hist(df.loc[in_cluster, raters_col], alpha=0.5, label=f'Cluster {cluster}')

    ax.set_xlabel('Number of Raters')
    ax.set_ylabel('Count of Items')
    ax.set_title('Distribution of Raters per Cluster')
    ax.legend()
    plt.show()


function 21: Cluster popularity summary

In [None]:
def cluster_popularity_summary_manual(df, cluster_col='cluster', raters_col='num_raters'):
    """Summarise item count and rater totals for every cluster.

    Returns a list with one dict per distinct value of ``cluster_col``, in
    ascending order, holding the keys 'cluster', 'num_items',
    'total_raters' and 'avg_raters'.
    """
    cluster_ids = df[cluster_col].unique()
    cluster_ids.sort()

    def summarise(cluster):
        raters = df.loc[df[cluster_col] == cluster, raters_col]
        return {
            'cluster': cluster,
            'num_items': len(raters),
            'total_raters': raters.sum(),
            'avg_raters': raters.mean(),
        }

    return [summarise(cluster) for cluster in cluster_ids]


function 22: Head vs Tail distribution

In [None]:
def analyze_head_tail_distribution_manual(df, cluster_col='cluster', raters_col='num_raters', head_percent=0.2):
    """Count, per cluster, how many items fall in the head and in the tail.

    Items are ranked by ``raters_col`` in descending order; the first
    ``int(len(df) * head_percent)`` rows form the head and the rest the tail.
    Returns {cluster: {'head': count, 'tail': count}} with clusters in
    ascending order.
    """
    ranked = df.sort_values(raters_col, ascending=False).reset_index(drop=True)
    cutoff = int(len(df) * head_percent)

    head_clusters = ranked.iloc[:cutoff][cluster_col]
    tail_clusters = ranked.iloc[cutoff:][cluster_col]

    # Use the cluster values actually present in the data.
    cluster_ids = df[cluster_col].unique()
    cluster_ids.sort()

    return {
        cluster: {
            'head': len(head_clusters[head_clusters == cluster]),
            'tail': len(tail_clusters[tail_clusters == cluster]),
        }
        for cluster in cluster_ids
    }


function 23: get cluster of a target item

In [None]:
def get_item_cluster(movieId, assignments):
    """
    Look up the cluster a given item was assigned to.

    movieId: the item to check
    assignments: dictionary mapping item id -> cluster
    """
    cluster = assignments[movieId]
    return cluster


function 24: Adjusted Cosine Similarity Function

In [None]:
def adjusted_cosine_similarity(item1_ratings, item2_ratings, user_avg_ratings):
    """
    Adjusted Cosine similarity between two items.

    Each rating is offset by the average of the user who gave it, and only
    users who rated both items are considered. Returns 0 when no user rated
    both items or when either item has zero total squared offset.

    item1_ratings, item2_ratings: dictionaries {user_id: rating}
    user_avg_ratings: dictionary {user_id: avg rating}
    """
    raters_of_both = set(item1_ratings.keys()).intersection(set(item2_ratings.keys()))
    if not raters_of_both:
        return 0

    dot = 0
    squares_1 = 0
    squares_2 = 0
    for user in raters_of_both:
        baseline = user_avg_ratings[user]
        offset_1 = item1_ratings[user] - baseline
        offset_2 = item2_ratings[user] - baseline
        dot += offset_1 * offset_2
        squares_1 += offset_1 ** 2
        squares_2 += offset_2 ** 2

    if squares_1 == 0 or squares_2 == 0:
        return 0

    return dot / ((squares_1 ** 0.5) * (squares_2 ** 0.5))


function 25: Function to select top N% similar items

In [None]:
def select_top_similar(sim_dict, top_percent=0.2):
    """
    Keep the highest-similarity fraction of a similarity dictionary.

    sim_dict: {item_id: similarity_value}
    top_percent: fraction of entries to keep (at least one entry is requested)
    Returns a dict ordered from most to least similar.
    """
    ranked = sorted(sim_dict, key=sim_dict.get, reverse=True)
    keep = max(1, int(len(ranked) * top_percent))
    return {item: sim_dict[item] for item in ranked[:keep]}


function 26: Predict rating for a user for a target item

In [None]:
def predict_rating(user_id, target_item, similar_items, ratings_data, user_avg_ratings):
    """
    Item-based prediction: the user's average plus a similarity-weighted
    average of the user's deviations on the similar items they rated.

    user_id: target user
    target_item: target item (accepted but not used in the computation)
    similar_items: dict {item_id: similarity}
    ratings_data: dict {item_id: {user_id: rating}}
    user_avg_ratings: dict {user_id: avg rating}

    Falls back to the user's average when the sum of absolute similarities
    of the contributing items is zero.
    """
    weighted_dev = 0
    weight_total = 0

    for neighbour_item, weight in similar_items.items():
        item_ratings = ratings_data[neighbour_item]
        if user_id not in item_ratings:
            continue
        weighted_dev += weight * (item_ratings[user_id] - user_avg_ratings[user_id])
        weight_total += abs(weight)

    baseline = user_avg_ratings[user_id]
    if weight_total == 0:
        return baseline

    return baseline + weighted_dev / weight_total


function 27: create per-user rating dictionary

In [None]:
def build_user_item_ratings(assignments_optimal, average_ri, avg_rating):
    """
    Build a synthetic per-item rating table in which every user is given the
    item's average rating for every item.

    assignments_optimal: iterable of item ids (its keys when it is a dict)
    average_ri: Series indexed by user_id; only its index is used here
    avg_rating: dict {item_id: average rating}

    Returns:
    ratings_data = {item_id: {user_id: rating}}
    """
    users = average_ri.to_dict()
    return {
        item: {user_id: avg_rating[item] for user_id in users}
        for item in assignments_optimal
    }


function 29: Predict ratings using non-clustering item CF

In [None]:
def predict_rating_non_cluster(user_id, target_item, ratings_data, user_avg_ratings):
    """
    Item-based CF prediction without clustering.

    Scores every other item in ``ratings_data`` against ``target_item`` with
    ``adjusted_cosine_similarity``, keeps the top 20% via
    ``select_top_similar`` and hands them to ``predict_rating``.
    """
    sim_dict = {
        other_item: adjusted_cosine_similarity(
            ratings_data[other_item],
            ratings_data[target_item],
            user_avg_ratings
        )
        for other_item in ratings_data
        if other_item != target_item
    }

    neighbours = select_top_similar(sim_dict, top_percent=0.2)
    return predict_rating(user_id, target_item, neighbours, ratings_data, user_avg_ratings)


function 30: compute prediction errors

In [None]:
def compute_prediction_errors(actual_ratings, predicted_ratings):
    """
    Compute prediction error (actual - predicted) for each user-item pair.

    actual_ratings: {user_id: {item_id: actual_rating}}
    predicted_ratings: {user_id: {item_id: predicted_rating}}

    Returns: {user_id: {item_id: error}}. The error is ``None`` when no
    actual rating is available, whether the item or the whole user is
    missing from ``actual_ratings``.
    """
    errors = {}
    for user, user_predictions in predicted_ratings.items():
        # A user with predictions but no recorded ratings is treated the
        # same way as a missing item instead of raising KeyError.
        user_actuals = actual_ratings.get(user, {})

        user_errors = {}
        for item, predicted in user_predictions.items():
            actual = user_actuals.get(item)
            user_errors[item] = None if actual is None else actual - predicted
        errors[user] = user_errors
    return errors


function 31: Identify long-tail items


In [None]:
def get_long_tail_items(num_raters, percentile=20):
    """Return the ids of the least-rated items.

    Items are ordered by ascending rater count and the first ``percentile``
    percent of them (at least one is requested) are returned as a list.
    """
    ascending = sorted(num_raters, key=num_raters.get)
    keep = max(1, int(len(ascending) * percentile / 100))
    return ascending[:keep]



function 32: Count similar items for a target item

In [None]:
def count_similar_items(target_item, similarity_dict, top_percent=0.2):
    """
    Number of neighbours ``select_top_similar`` keeps for a target item.

    target_item: item_id
    similarity_dict: dict {item_id: {other_item: similarity}}
    top_percent: fraction to select
    """
    neighbours = similarity_dict[target_item]
    return len(select_top_similar(neighbours, top_percent))


function 33: Compute reliability for a set of items

In [None]:
def average_abs_error_items(errors, items):
    """
    Mean absolute error over a subset of items, across all users.

    errors: {user_id: {item_id: error}}
    items: list of item_ids

    Entries that are missing or ``None`` are ignored. Returns ``None`` when
    no error value was found at all.
    """
    abs_sum = 0
    n_errors = 0

    for per_user in errors.values():
        for item in items:
            err = per_user.get(item)
            if err is None:
                continue
            abs_sum += abs(err)
            n_errors += 1

    if n_errors > 0:
        return abs_sum / n_errors
    return None


 Function 34: Reduction in similarity computations

In [None]:
# ------------------------------
# Function: Compute similarity reduction due to clustering
# ------------------------------
def compute_similarity_reduction(cluster_item_counts):
    """
    Fraction of pairwise similarity computations saved by clustering.

    cluster_item_counts: list of number of items per cluster
    Returns 1 - (pairs within clusters / pairs among all items). When there
    are fewer than two items in total there are no pairs to save, so 0.0 is
    returned instead of dividing by zero.
    """
    total_items = sum(cluster_item_counts)
    total_pairs_no_cluster = total_items * (total_items - 1) / 2
    if total_pairs_no_cluster == 0:
        return 0.0

    total_pairs_clustered = sum(n * (n - 1) / 2 for n in cluster_item_counts)
    return 1 - (total_pairs_clustered / total_pairs_no_cluster)


Function 35: Compute speedup factor

In [None]:
# ------------------------------
# Function: Compute speedup factor compared to non-clustering CF
# ------------------------------
def compute_speedup_factor(sim_computations_no_cluster, sim_computations_clustered):
    """
    Ratio of similarity computations without clustering to those with it.

    sim_computations_no_cluster: total similarity computations without clustering
    sim_computations_clustered: total similarity computations with clustering
    Returns the speedup factor, or infinity when the clustered count is 0.
    """
    if sim_computations_clustered != 0:
        return sim_computations_no_cluster / sim_computations_clustered
    return float('inf')  # nothing left to compute after clustering


Function 36: Compare speedup for item vs user clustering

In [None]:
# ------------------------------
# Function: Compare speedup factor between item-based and user-based clustering
# ------------------------------
def _pairwise_speedup(cluster_counts):
    """Speedup in pair comparisons for one clustering: all pairs / in-cluster pairs."""
    total = sum(cluster_counts)
    pairs_no_cluster = total * (total - 1) / 2
    pairs_clustered = sum(n * (n - 1) / 2 for n in cluster_counts)
    if pairs_clustered == 0:
        # No within-cluster pairs (e.g. every cluster has one member):
        # report infinity, as compute_speedup_factor does, not a crash.
        return float('inf')
    return pairs_no_cluster / pairs_clustered


def compare_speedup_item_user(item_cluster_counts, user_cluster_counts):
    """
    Speedup factors of item-based and user-based clustering.

    item_cluster_counts: list of number of items per item-cluster
    user_cluster_counts: list of number of users per user-cluster
    Returns (speedup_items, speedup_users). A speedup is infinity when the
    clustering leaves no within-cluster pairs to compare.
    """
    speedup_items = _pairwise_speedup(item_cluster_counts)
    speedup_users = _pairwise_speedup(user_cluster_counts)
    return speedup_items, speedup_users


Function 37: Compute average prediction error per cluster

In [None]:
# ------------------------------
# Function 11a: Average prediction error per cluster
# ------------------------------
def compute_avg_error_per_cluster(df_items, clustering_cf_errors):
    """
    Average the prediction errors of the items inside each cluster.

    Parameters:
        df_items: DataFrame with columns ['movieId', 'cluster', ...]
        clustering_cf_errors: dict {movieId: error_value} of prediction errors

    Returns:
        dict {cluster_id: avg_error}; the value is None for a cluster none
        of whose items has an entry in ``clustering_cf_errors``.
    """
    averages = {}
    for cluster_id in df_items['cluster'].unique():
        in_cluster = df_items['cluster'] == cluster_id
        known = [
            clustering_cf_errors[movie]
            for movie in df_items.loc[in_cluster, 'movieId']
            if movie in clustering_cf_errors
        ]
        averages[cluster_id] = sum(known) / len(known) if known else None
    return averages


Function 38: Group clusters by size

In [None]:
# ------------------------------
# Function 11b: Group clusters by size
# ------------------------------
def group_clusters_by_size(df_items):
    """
    Number of rows of ``df_items`` that belong to each cluster.

    Returns:
        dict {cluster_id: num_items}
    """
    per_cluster = df_items.groupby('cluster')
    return per_cluster.size().to_dict()


Function 39: Analyze accuracy vs cluster size

In [None]:
# ------------------------------
# Function 11c: Analyze relationship between cluster size and prediction error
# ------------------------------
def accuracy_vs_cluster_size(cluster_sizes, cluster_errors):
    """
    Pair every cluster's size with its average prediction error.

    Clusters without an entry in ``cluster_errors`` get None as their error.

    Returns:
        list of tuples [(cluster_id, size, avg_error)], smallest cluster first
    """
    rows = [
        (cluster_id, size, cluster_errors.get(cluster_id, None))
        for cluster_id, size in cluster_sizes.items()
    ]
    rows.sort(key=lambda row: row[1])  # ascending cluster size
    return rows


Function 40: Count items per cluster

In [None]:
def group_clusters_by_size(df_items):
    """Count the non-null 'movieId' entries of each cluster.

    Returns a dict {cluster_id: num_items}.
    """
    movie_ids_by_cluster = df_items.groupby("cluster")["movieId"]
    return movie_ids_by_cluster.count().to_dict()

Function 41: Compute avg rating-prediction error per cluster


In [None]:
def compute_avg_error_per_cluster(df_items, item_errors):
    """Mean prediction error of the items in each cluster.

    Each 'movieId' is mapped to its value in ``item_errors`` and the results
    are averaged per 'cluster'. ``df_items`` itself is not modified.
    Returns a dict {cluster_id: avg_error}.
    """
    per_item_error = df_items["movieId"].map(item_errors)
    with_error = df_items.assign(error=per_item_error)
    return with_error.groupby("cluster")["error"].mean().to_dict()

Function 42: Combine cluster sizes & avg errors


In [None]:
def accuracy_vs_cluster_size(cluster_sizes, cluster_errors):
    """Combine cluster sizes and average errors into one table.

    Returns a list of ``(cluster_id, size, avg_error)`` tuples ordered by
    cluster id; ``avg_error`` is None for clusters missing from
    ``cluster_errors``.
    """
    rows = [
        (cid, size, cluster_errors.get(cid, None))
        for cid, size in cluster_sizes.items()
    ]
    rows.sort(key=lambda row: row[0])  # ascending cluster id
    return rows