In [None]:
# Capstone 8 – Recommendation System
# Content-Based Filtering using MovieLens Dataset

import pandas as pd
import matplotlib.pyplot as plt

# Read the three raw MovieLens tables (paths are relative to the working
# directory the script/notebook is run from).
ratings = pd.read_csv("../data/raw/ratings.csv")
movies = pd.read_csv("../data/raw/movies.csv")
tags = pd.read_csv("../data/raw/tags.csv")

# Preview the first rows of each table, one table after another.
print(ratings.head(), movies.head(), tags.head(), sep="\n")

# Table dimensions as (rows, columns).
print("Ratings shape:", ratings.shape)
print("Movies shape:", movies.shape)
print("Tags shape:", tags.shape)

# Number of distinct users and distinct movies that appear in the ratings.
print("Unique users:", ratings['userId'].nunique())
print("Unique movies:", ratings['movieId'].nunique())

# Histogram of the rating values: ten bins, bars outlined in black.
ratings['rating'].hist(bins=10, edgecolor='black')
# Label the axes and title the plot in a single call on the current axes.
plt.gca().set(
    xlabel="Rating",
    ylabel="Count",
    title="Distribution of Movie Ratings",
)
plt.show()

# Feature creation: turn the pipe-delimited genre string into a list of genres.
# A missing/empty genre string becomes [] rather than [''], so that movies
# without genres do not all share a spurious '' "genre" and score as identical.
movies['genres'] = (
    movies['genres']
    .fillna('')
    .astype(str)
    .apply(lambda text: text.split('|') if text else [])
)

# Similarity function
def jaccard_similarity(a, b):
    """Return the Jaccard index of two iterables, treated as sets.

    The result is ``|A ∩ B| / |A ∪ B|``. When both inputs are empty the
    ratio is undefined, and ``0.0`` is returned instead.
    """
    left = set(a)
    right = set(b)
    union = left | right
    # An empty union means both inputs were empty; avoid dividing by zero.
    if not union:
        return 0.0
    shared = left & right
    return len(shared) / len(union)

# Recommendation function
def recommend_movies(title, top_n=5):
    """Return the movies whose genres best match those of ``title``.

    Reads the module-level ``movies`` DataFrame and compares its ``genres``
    column (an iterable of genre names per row) against the seed movie's
    genres using ``jaccard_similarity``.

    Args:
        title: Exact value of the ``title`` column identifying the seed movie.
        top_n: Number of recommendations to keep from the top of the ranking.

    Returns:
        A list of ``(title, score)`` tuples ordered by descending score, where
        ``score`` is the Jaccard similarity rounded to three decimals. Rows
        with the same title as the seed are excluded; ties keep the row
        order of ``movies``.

    Raises:
        IndexError: If no row of ``movies`` has the given title.
    """
    seed_matches = movies.loc[movies['title'] == title, 'genres']
    if seed_matches.empty:
        # Same exception type the bare ``.iloc[0]`` lookup would raise, but
        # with a message that names the actual problem.
        raise IndexError(f"No movie titled {title!r} in the movies table")
    # If several rows share the title, the first one is used as the seed.
    seed_genres = seed_matches.iloc[0]

    # Walk only the two columns that are needed; iterrows() would build a
    # full Series object for every row.
    scores = [
        (other_title, round(jaccard_similarity(seed_genres, other_genres), 3))
        for other_title, other_genres in zip(movies['title'], movies['genres'])
        if other_title != title
    ]

    # sorted() is stable, so equally scored movies stay in table order.
    return sorted(scores, key=lambda pair: pair[1], reverse=True)[:top_n]

# Example run: the five best genre matches for Toy Story.
recommendations = recommend_movies(title="Toy Story (1995)", top_n=5)
# Bare expression so a notebook cell displays the result.
recommendations
