In [46]:
"""
Movie Recommendation System
Author: Vansh
Description:
    This notebook implements a movie recommendation system that combines
    collaborative filtering on user ratings with a fuzzy title search
    (TF-IDF cosine similarity). It uses the MovieLens 100k dataset.
"""

# ========= Step 0: Imports =========
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display

# ========= Step 1: Load Dataset =========
# Folder holding the dataset files. The trailing slash matters because file
# names are appended with plain string concatenation.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/ml-100k/ml-100k/"

# Load movies
# The file is read as pipe-delimited with no header row, so all 24 columns are
# named here and only the three that are used are parsed via `usecols`.
# Reading them directly (instead of slicing a wider frame afterwards) yields an
# independent DataFrame, so adding columns to `movies` later cannot trigger
# pandas' SettingWithCopyWarning.
# NOTE(review): "Year" presumably holds the release-date string from u.item
# rather than a bare year — confirm against the dataset README.
movies = pd.read_csv(
    dataset_path + "u.item",
    sep="|",
    encoding="latin-1",
    header=None,
    names=[
        "ID", "Name", "Year", "video_release_date", "IMDb_URL",
        "unknown", "Action", "Adventure", "Animation", "Children",
        "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
        "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
        "Sci-Fi", "Thriller", "War", "Western"
    ],
    # Keep only useful columns
    usecols=["ID", "Name", "Year"],
)

# Clean movie titles for searching
def clean_title(title):
    """Normalize a movie title for text matching.

    Removes every character other than ASCII letters, digits and spaces, then
    lower-cases the result.

    Args:
        title: Raw title. Non-string values (e.g. ``None`` or a NaN coming
            from a missing cell) are treated as an empty title.

    Returns:
        The cleaned, lower-cased title string.
    """
    if not isinstance(title, str):
        # re.sub would raise TypeError on None/NaN; an empty title simply
        # matches nothing downstream.
        return ""
    return re.sub(r"[^a-zA-Z0-9 ]", "", title).lower()

# Normalized titles: the text the search index is built from
movies["clean_name"] = movies["Name"].map(clean_title)

# ========= Step 2: TF-IDF Vectorizer =========
# Index single words and two-word phrases of every cleaned title
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(movies["clean_name"])

# ========= Step 3: Load Ratings =========
# Tab-separated file; column names are supplied here, so the first line of the
# file is read as data rather than as a header.
ratings = pd.read_csv(
    f"{dataset_path}u.data",
    delimiter="\t",
    header=None,
    names=["user_id", "ID", "rating", "timestamp"],
)

# ========= Step 4: Functions =========
def search(title, top_n=5, threshold=0.2):
    """Fuzzy search for movie titles.

    The query is cleaned with ``clean_title``, vectorized with the fitted
    TF-IDF ``vectorizer`` and compared against every indexed title by cosine
    similarity.

    Args:
        title: Free-text query.
        top_n: Maximum number of matches to return. Values larger than the
            number of indexed movies are clamped; zero or negative values
            yield an empty result.
        threshold: Minimum cosine similarity a match needs to be kept.

    Returns:
        A copy of the matching rows of ``movies`` with an added
        ``similarity`` column, sorted by similarity (highest first) and
        re-indexed from 0. Empty when nothing reaches ``threshold``.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # np.argpartition raises ValueError when kth is out of bounds, and a
    # top_n of 0 would turn the slice into [-0:], i.e. "everything" — so
    # clamp the requested count to what is actually available.
    k = max(0, min(top_n, similarity.size))
    if k == 0:
        indices = np.empty(0, dtype=np.intp)
    else:
        # Unordered positions of the k largest similarities; sorted below.
        indices = np.argpartition(similarity, -k)[-k:]

    results = movies.iloc[indices].copy()
    results["similarity"] = similarity[indices]
    results = results.sort_values("similarity", ascending=False)
    results = results[results["similarity"] >= threshold].reset_index(drop=True)
    return results

def find_similar_movies(movie_id, top_n=10):
    """Collaborative filtering recommendations.

    "Similar users" are the users who rated ``movie_id`` above 3. Each movie
    those users also rated above 3 is scored as

        share of similar users who liked it
        -----------------------------------------------------------
        share of all users (who liked any candidate) who liked it

    so a high score means the movie is disproportionately popular among
    people who liked ``movie_id``.

    Args:
        movie_id: Value of the ``ID`` column identifying the seed movie.
        top_n: Number of highest-scoring candidates to keep.

    Returns:
        DataFrame with columns ``score``, ``Name``, ``Year``, ``avg_rating``
        (mean of all ratings, rounded to one decimal) and ``rating_count``.
        The ``top_n`` best-scoring movies are selected first and then ordered
        by ``avg_rating`` descending; the index runs from 1. The seed movie
        itself may be among the rows. The frame is empty when nobody rated
        ``movie_id`` above 3.
    """
    # Only ratings above 3 count as "liked"; filter once and reuse.
    liked = ratings[ratings["rating"] > 3]

    # Users who liked this movie
    similar_users = liked.loc[liked["ID"] == movie_id, "user_id"].unique()
    if len(similar_users) == 0:
        return pd.DataFrame(columns=["score", "Name", "Year", "avg_rating", "rating_count"])

    # Movies liked by these users, as a fraction of the similar users
    similar_user_recs = liked.loc[liked["user_id"].isin(similar_users), "ID"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    # Popularity of the same movies among everyone who liked any of them
    all_users = liked[liked["ID"].isin(similar_user_recs.index)]
    all_user_recs = all_users["ID"].value_counts() / all_users["user_id"].nunique()

    rec_scores = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_scores.columns = ["similar", "all"]
    rec_scores["score"] = rec_scores["similar"] / rec_scores["all"]
    rec_scores = rec_scores.sort_values("score", ascending=False)

    # Merge with movie details
    recs = rec_scores.head(top_n).merge(movies, left_index=True, right_on="ID")

    # Add avg rating and count. Aggregate only the recommended movies in a
    # single pass instead of grouping the whole ratings table twice per call.
    rec_ratings = ratings.loc[ratings["ID"].isin(recs["ID"]), ["ID", "rating"]]
    stats = rec_ratings.groupby("ID")["rating"].agg(["mean", "count"])
    recs["avg_rating"] = recs["ID"].map(stats["mean"]).round(1)
    recs["rating_count"] = recs["ID"].map(stats["count"])

    # Sort by avg_rating and reset index to start at 1
    recs = recs.sort_values("avg_rating", ascending=False)
    recs.index = np.arange(1, len(recs) + 1)

    return recs[["score", "Name", "Year", "avg_rating", "rating_count"]]

# ========= Step 5: Interactive Widget =========
# Free-text box for the movie title to look up
movie_input = widgets.Text(
    description="Movie:",
    placeholder="Type a movie name...",
    value="",
    continuous_update=True,
)

# Slider for the lowest average rating to show: 0.0 to 5.0 in 0.1 steps,
# starting at 3.0
min_rating_slider = widgets.FloatSlider(
    value=3.0,
    min=0.0,
    max=5.0,
    step=0.1,
    description="Min Rating:",
)

# Container the search callback renders its results into
output_area = widgets.Output()

# Callback for interactive search
def on_type(change):
    """Refresh the results shown in ``output_area`` for the typed title.

    Looks up the best title match for the query, fetches recommendations for
    it and displays a table whose first row is the matched movie followed by
    the recommendations whose average rating is at least the slider value.

    Args:
        change: Mapping whose ``'new'`` entry is the current query text.

    Returns:
        None. Everything is written to ``output_area``.
    """
    with output_area:
        output_area.clear_output()
        title = change['new']
        min_rating = min_rating_slider.value

        if len(title) < 2:
            print("Type at least 2 letters to search.")
            return

        results = search(title, top_n=5)
        if results.empty:
            print(f"No match found for '{title}'")
            return

        # Top match as selected movie
        selected_movie = results.iloc[0]
        movie_id = selected_movie["ID"]

        # Recommendations
        recs = find_similar_movies(movie_id)

        # Row describing the selected movie. Its ratings are looked up once,
        # and built-in round() is used because it also accepts a plain float
        # NaN (the mean of an empty selection).
        movie_ratings = ratings.loc[ratings["ID"] == movie_id, "rating"]
        selected_movie_row = pd.DataFrame({
            "score": [np.nan],
            "Name": [selected_movie["Name"]],
            "Year": [selected_movie["Year"]],
            "avg_rating": [round(movie_ratings.mean(), 1)],
            "rating_count": [movie_ratings.count()]
        })

        # Drop the selected movie from the recommendations and apply the
        # minimum-rating filter to the recommendations only. Filtering after
        # the concat would also remove the selected movie whenever its own
        # average is below the slider, hiding which title was matched.
        keep = (recs["Name"] != selected_movie["Name"]) & (recs["avg_rating"] >= min_rating)
        recs = recs[keep]

        # Include the selected movie at the top
        recs = pd.concat([selected_movie_row, recs], ignore_index=True)
        recs.index = np.arange(1, len(recs)+1)

        display(recs)

# Bind callbacks
def _rerun_search_on_slider(_change):
    """Re-run the search with the current query text when the slider moves."""
    on_type({"new": movie_input.value})


movie_input.observe(on_type, names="value")
min_rating_slider.observe(_rerun_search_on_slider, names="value")

# Display widget
display(movie_input, min_rating_slider, output_area)


Text(value='', description='Movie:', placeholder='Type a movie name...')

FloatSlider(value=3.0, description='Min Rating:', max=5.0)

Output()