In [None]:
import pandas as pd
import numpy as np
import math
import heapq
from collections import defaultdict
import time


In [None]:
def load_and_preprocess(ratings_path, movies_path, test_ratio=0.2, seed=42):
    """
    Loads MovieLens ratings and movies data, cleans the ratings,
    and randomly splits them into train and test sets.

    Parameters
    ----------
    ratings_path : str or path-like
        CSV file of ratings, read with ``pd.read_csv``.
    movies_path : str or path-like
        CSV file of movie metadata, read with ``pd.read_csv`` and returned
        exactly as loaded (no cleaning is applied to it).
    test_ratio : float, default 0.2
        Probability with which each rating row is assigned to the test set.
        The assignment is an independent random draw per row, so the realised
        split sizes are only approximately ``test_ratio`` of the data.
    seed : int or None, default 42
        Seed of the private random generator that drives the split.

    Returns
    -------
    tuple of (train_df, test_df, movies_df)
        ``train_df`` and ``test_df`` are independent copies of disjoint row
        subsets of the cleaned ratings; they keep the original row labels.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not within [0, 1].
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio!r}")

    # Remove rows with missing values and exact duplicate rows.
    # Chained (non-inplace) calls keep the cleaning step free of hidden mutation.
    ratings_df = pd.read_csv(ratings_path).dropna().drop_duplicates()
    movies_df = pd.read_csv(movies_path)

    # Random train-test split. A private RandomState produces the same stream
    # as np.random.seed(seed) + np.random.rand(...), but does not reseed the
    # process-wide NumPy generator behind the caller's back.
    rng = np.random.RandomState(seed)
    in_train = rng.rand(len(ratings_df)) < (1 - test_ratio)

    # Explicit copies so later edits to either split never alias the other
    # (and never trigger SettingWithCopyWarning).
    train_df = ratings_df[in_train].copy()
    test_df = ratings_df[~in_train].copy()

    return train_df, test_df, movies_df


In [None]:
# Input CSV locations for the ratings and movie-metadata files
ratings_csv_path = "/content/sample_data/ratings.csv"
movies_csv_path = "/content/sample_data/movies.csv"

# Load, clean and split the ratings using the function's default split settings
train_df, test_df, movies_df = load_and_preprocess(ratings_csv_path, movies_csv_path)

# Report how many rating rows ended up in each split
print("Train size:", train_df.shape[0])
print("Test size:", test_df.shape[0])


Train size: 80764
Test size: 20072


In [None]:
def build_rating_maps(ratings_df):
    """
    Builds two hash tables from a ratings table:
    1. user_ratings_map: user → {movie: rating}: For each user, stores all movies they rated with ratings
    2. movie_ratings_map: movie → {user: rating}: For each movie, stores all users who rated it with ratings

    Parameters
    ----------
    ratings_df : pd.DataFrame
        Must contain the columns ``userId``, ``movieId`` and ``rating``.
        Any other columns are ignored.

    Returns
    -------
    tuple of (user_ratings_map, movie_ratings_map)
        Both are ``defaultdict(dict)`` keyed by Python ``int`` IDs with
        Python ``float`` ratings. If the same (user, movie) pair occurs more
        than once, the last occurrence wins.
    """
    user_ratings_map = defaultdict(dict)
    movie_ratings_map = defaultdict(dict)

    # Walk the three needed columns in lockstep instead of using iterrows():
    # iterrows() builds a Series per row (slow) and upcasts a mixed int/float
    # row to float64, which silently corrupts integer IDs above 2**53.
    rating_rows = zip(
        ratings_df["userId"],
        ratings_df["movieId"],
        ratings_df["rating"],
    )

    for raw_user, raw_movie, raw_rating in rating_rows:
        user_id = int(raw_user)
        movie_id = int(raw_movie)
        rating = float(raw_rating)

        # Store rating in both directions
        user_ratings_map[user_id][movie_id] = rating
        movie_ratings_map[movie_id][user_id] = rating

    return user_ratings_map, movie_ratings_map


In [None]:
# Index the training ratings in both directions: user→movies and movie→users
user_ratings_map, movie_ratings_map = build_rating_maps(ratings_df=train_df)

# Each map has one key per distinct user / movie seen in the training set
for label, rating_map in (
    ("Number of users:", user_ratings_map),
    ("Number of movies:", movie_ratings_map),
):
    print(label, len(rating_map))



Number of users: 610
Number of movies: 8985


In [None]:
# Take the first user in the map and preview up to five of their ratings
sample_user = next(iter(user_ratings_map))
sample_user_ratings = list(user_ratings_map[sample_user].items())
print("Sample user:", sample_user)
print("Movies rated by user (movie_id, rating):")
print(sample_user_ratings[:5])
print('\n')

# Take the first movie in the map and preview up to five users who rated it
sample_movie = next(iter(movie_ratings_map))
sample_movie_ratings = list(movie_ratings_map[sample_movie].items())
print("Sample movie ID:", sample_movie)
print("Users who rated this movie (user_id, rating):")
print(sample_movie_ratings[:5])


Sample user: 1
Movies rated by user (movie_id, rating):
[(1, 4.0), (6, 4.0), (47, 5.0), (50, 5.0), (70, 3.0)]


Sample movie ID: 1
Users who rated this movie (user_id, rating):
[(1, 4.0), (5, 4.0), (7, 4.5), (15, 2.5), (21, 3.5)]


In [None]:
def compute_cosine_similarity(movie_a, movie_b, movie_ratings_map, min_common_users=5):
    """
    Computes cosine similarity between two movies
    using ratings from users who rated both movies.

    Parameters
    ----------
    movie_a, movie_b : hashable
        Keys into ``movie_ratings_map``.
    movie_ratings_map : mapping
        movie → {user: rating}. A movie that is absent from the mapping is
        treated as having no ratings; the mapping is never modified, even if
        it is a ``defaultdict``.
    min_common_users : int, default 5
        Minimum number of users who must have rated both movies.

    Returns
    -------
    float
        ``0.0`` if fewer than ``min_common_users`` users rated both movies or
        if either co-rating vector has zero magnitude; otherwise the cosine of
        the two rating vectors restricted to the common users.

    Notes
    -----
    Both the dot product and the magnitudes are taken over the common users
    only and on raw (un-centred) ratings, so when all ratings are positive the
    score is always positive and tends to sit close to 1.
    """
    # .get() instead of [] so that looking up an unknown movie does not insert
    # an empty entry into a defaultdict-backed map.
    ratings_a = movie_ratings_map.get(movie_a, {})
    ratings_b = movie_ratings_map.get(movie_b, {})

    # Cheap early exit: the overlap can never exceed the smaller rating set,
    # so skip building the sets when the threshold is already unreachable.
    if min(len(ratings_a), len(ratings_b)) < min_common_users:
        return 0.0

    # Find users who rated both movies
    common_users = set(ratings_a) & set(ratings_b)

    # Ignore movie pairs with too few co-ratings
    if len(common_users) < min_common_users:
        return 0.0

    # Compute dot product of rating vectors
    numerator = sum(ratings_a[u] * ratings_b[u] for u in common_users)

    # Compute magnitudes of rating vectors
    magnitude_a = math.sqrt(sum(ratings_a[u] ** 2 for u in common_users))
    magnitude_b = math.sqrt(sum(ratings_b[u] ** 2 for u in common_users))

    # Avoid division by zero
    if magnitude_a == 0 or magnitude_b == 0:
        return 0.0

    return numerator / (magnitude_a * magnitude_b)


In [None]:
def build_movie_similarity_graph(
    movie_ratings_map,
    min_similarity=0.1,
    top_k_neighbors=50,
    min_common_users=5
):
    """
    Builds a sparse item–item similarity graph:
    movie → [(similarity_score, similar_movie)]

    Parameters
    ----------
    movie_ratings_map : mapping
        movie → {user: rating}.
    min_similarity : float, default 0.1
        A pair is stored only if its similarity is at least this value.
    top_k_neighbors : int, default 50
        Maximum number of neighbours kept per movie.
    min_common_users : int, default 5
        Forwarded to ``compute_cosine_similarity``; pairs with fewer common
        raters score 0.0.

    Returns
    -------
    defaultdict(list)
        For every movie with at least one stored neighbour, a list of
        ``(similarity, movie_id)`` tuples sorted from largest to smallest,
        truncated to ``top_k_neighbors`` entries. Each stored edge appears in
        the lists of both endpoints before truncation.
    """
    similarity_graph = defaultdict(list)
    movie_ids = list(movie_ratings_map.keys())

    # A movie rated by fewer than min_common_users users can never share
    # enough raters with any other movie, so every pair involving it scores
    # 0.0. When 0.0 would be rejected anyway (min_similarity > 0), drop such
    # movies up front instead of paying for all their pairwise comparisons.
    # The relative order of the remaining movies is preserved, so the result
    # is identical to the exhaustive loop.
    if min_similarity > 0:
        movie_ids = [
            movie_id for movie_id in movie_ids
            if len(movie_ratings_map[movie_id]) >= min_common_users
        ]

    # Compare every remaining pair of movies exactly once
    movie_count = len(movie_ids)
    for i in range(movie_count):
        movie_a = movie_ids[i]
        for j in range(i + 1, movie_count):
            movie_b = movie_ids[j]
            similarity = compute_cosine_similarity(
                movie_a,
                movie_b,
                movie_ratings_map,
                min_common_users
            )

            # Store only meaningful similarities, as an undirected edge
            if similarity >= min_similarity:
                similarity_graph[movie_a].append((similarity, movie_b))
                similarity_graph[movie_b].append((similarity, movie_a))

    # Keep only Top-K similar movies per movie (keys are only reassigned,
    # never added, so iterating the dict here is safe)
    for movie_id in similarity_graph:
        similarity_graph[movie_id] = heapq.nlargest(
            top_k_neighbors,
            similarity_graph[movie_id]
        )

    return similarity_graph


In [None]:
# Measure time taken to build item–item similarity graph (offline step).
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
movie_similarity_graph = build_movie_similarity_graph(movie_ratings_map)
elapsed_seconds = time.perf_counter() - start_time
print("Similarity graph built in", round(elapsed_seconds, 2), "seconds")


Similarity graph built in 95.58 seconds


In [None]:
# Show similar movies for one movie
# Create a mapping from movieId to movie title
movie_id_to_title = dict(zip(movies_df["movieId"], movies_df["title"]))

print("Sample movie ID:", sample_movie)
print("Sample movie title:", movie_id_to_title.get(sample_movie, "Unknown"))

print("\nTop similar movies:")
# .get() rather than [] so that inspecting a movie with no neighbours does not
# insert an empty list into the defaultdict-backed graph.
top_neighbors = movie_similarity_graph.get(sample_movie, [])[:5]
for sim_score, movie_id in top_neighbors:
    # Same "Unknown" fallback as for the sample movie's own title above
    movie_title = movie_id_to_title.get(movie_id, "Unknown")
    print(f"Similarity: {round(sim_score, 3)} | Movie ID: {movie_id} | Title: {movie_title}")


Sample movie ID: 1
Sample movie title: Toy Story (1995)

Top similar movies:
Similarity: 0.998 | Movie ID: 112138 | Title: 22 Jump Street (2014)
Similarity: 0.998 | Movie ID: 158238 | Title: The Nice Guys (2016)
Similarity: 0.998 | Movie ID: 4855 | Title: Dirty Harry (1971)
Similarity: 0.997 | Movie ID: 56587 | Title: Bucket List, The (2007)
Similarity: 0.997 | Movie ID: 93510 | Title: 21 Jump Street (2012)
