In [12]:
import os
import numpy as np
import pandas as pd
import pickle
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix, save_npz, load_npz
import warnings
warnings.filterwarnings("ignore")


In [13]:
DATA_ROOT = Path(r"C:\Users\balki\Downloads\ml-32m\ml-32m")
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(exist_ok=True)

print("Loading data...")
movies = pd.read_csv(DATA_ROOT / "movies.csv")
ratings = pd.read_csv(
    DATA_ROOT / "ratings.csv",
    dtype={"userId": np.int32, "movieId": np.int32, "rating": np.float32},
    usecols=["userId", "movieId", "rating"]
)

print(f"Movies: {movies.shape}, Ratings: {ratings.shape}")

Loading data...
Movies: (87585, 3), Ratings: (32000204, 3)


In [14]:
movies["movieId"] = movies["movieId"].astype(np.int32)
ratings["movieId"] = ratings["movieId"].astype(np.int32)
ratings["userId"] = ratings["userId"].astype(np.int32)

# Extract primary genre (first genre listed)
movies["primary_genre"] = movies["genres"].apply(
    lambda x: x.split("|")[0] if pd.notna(x) and len(x) > 0 else "Unknown"
)

# Create enhanced content_text with better weighting
movies["genres"] = movies["genres"].fillna("Unknown")
movies["title"] = movies["title"].fillna("")

# Weight genres higher in content_text for better matching
movies["content_text"] = (
    movies["title"].astype(str) + " " +
    (movies["genres"].str.replace("|", " ") + " " + movies["genres"].str.replace("|", " "))
).str.lower().str.strip()

print("Data prepared successfully!")
print(f"Sample content_text: {movies.iloc[0]['content_text']}")

Data prepared successfully!
Sample content_text: toy story (1995) adventure animation children comedy fantasy adventure animation children comedy fantasy


In [17]:
print("\nBuilding advanced TF-IDF vectorizer...")
tfv = TfidfVectorizer(
    max_features=3000,
    min_df=3,  # Increased to reduce noise
    max_df=0.90,
    ngram_range=(1, 2),
    stop_words="english",
    sublinear_tf=True,
    strip_accents="unicode"
)
tfidf_matrix = tfv.fit_transform(movies["content_text"])
print(f"‚úÖ TF-IDF matrix shape: {tfidf_matrix.shape}")



Building advanced TF-IDF vectorizer...
‚úÖ TF-IDF matrix shape: (87585, 3000)


In [18]:
print("\nBuilding collaborative filtering data...")

# Create mappings
unique_movie_ids = np.sort(ratings["movieId"].unique())
unique_user_ids = np.sort(ratings["userId"].unique())

movieid_to_idx = {mid: i for i, mid in enumerate(unique_movie_ids)}
userid_to_idx = {uid: i for i, uid in enumerate(unique_user_ids)}
idx_to_movieid = {i: mid for mid, i in movieid_to_idx.items()}

n_items = len(unique_movie_ids)
n_users = len(unique_user_ids)

# Build sparse matrix (items x users)
rows = ratings["movieId"].map(movieid_to_idx).to_numpy()
cols = ratings["userId"].map(userid_to_idx).to_numpy()
data = ratings["rating"].to_numpy(dtype=np.float32)

item_user_matrix = csr_matrix((data, (rows, cols)), shape=(n_items, n_users))

# Normalize for better similarity
item_user_matrix_normalized = item_user_matrix.copy()
item_squared = item_user_matrix_normalized.multiply(item_user_matrix_normalized)
item_norms = np.sqrt(np.array(item_squared.sum(axis=1)).reshape(-1))
item_norms[item_norms == 0] = 1.0

print(f"‚úÖ Item-user matrix: {item_user_matrix.shape}")



Building collaborative filtering data...
‚úÖ Item-user matrix: (84432, 200948)


In [19]:
print("\nBuilding popularity scores...")

pop = ratings.groupby("movieId").agg(
    count=("rating", "size"),
    mean_rating=("rating", "mean"),
    std_rating=("rating", "std")
).reset_index()

pop = pop.merge(
    movies[["movieId", "title", "primary_genre", "genres"]],
    on="movieId",
    how="left"
)

# Better popularity score: rating √ó log(count) √ó confidence
pop["popularity_score"] = (
    pop["mean_rating"] ** 1.5 *  # Emphasize high ratings
    np.log1p(pop["count"]) *      # Log of count
    (1 + 0.5 * (1 - pop["std_rating"].fillna(1) / 5))  # Confidence boost
)

pop_sorted = pop.sort_values(
    ["popularity_score", "mean_rating"],
    ascending=False
).reset_index(drop=True)

print("Top 10 popular movies:")
for idx, row in pop_sorted.head(10).iterrows():
    print(f"  {idx+1}. {row['title']} ({row['primary_genre']}) - Rating: {row['mean_rating']:.2f}, Count: {row['count']}")



Building popularity scores...
Top 10 popular movies:
  1. Shawshank Redemption, The (1994) (Crime) - Rating: 4.40, Count: 102929
  2. Godfather, The (1972) (Crime) - Rating: 4.32, Count: 66440
  3. Usual Suspects, The (1995) (Crime) - Rating: 4.27, Count: 67750
  4. Pulp Fiction (1994) (Comedy) - Rating: 4.20, Count: 98409
  5. Fight Club (1999) (Action) - Rating: 4.23, Count: 77332
  6. Schindler's List (1993) (Drama) - Rating: 4.24, Count: 73849
  7. Matrix, The (1999) (Action) - Rating: 4.16, Count: 93808
  8. Silence of the Lambs, The (1991) (Crime) - Rating: 4.15, Count: 90330
  9. Godfather: Part II, The (1974) (Crime) - Rating: 4.26, Count: 43111
  10. Dark Knight, The (2008) (Action) - Rating: 4.17, Count: 59334


In [20]:
def find_movie(title_query):
    """Find movie by title (case-insensitive, partial match)"""
    query = title_query.lower().strip()

    # Try exact match
    exact = movies[movies["title"].str.lower() == query]
    if not exact.empty:
        return exact.index[0]

    # Try partial match - prioritize exact title match
    partial = movies[movies["title"].str.lower().str.contains(query, regex=False, na=False)]
    if not partial.empty:
        return partial.index[0]

    return None

def recommend_content_based(movie_idx, topk=10):
    """Content-based recommendation using TF-IDF with genre consideration"""
    if movie_idx is None or movie_idx >= len(movies):
        return pd.DataFrame()

    query_movie = movies.iloc[movie_idx]
    query_genre = query_movie["primary_genre"]

    target_vec = tfidf_matrix[movie_idx]

    # Compute cosine similarity
    similarities = cosine_similarity(target_vec, tfidf_matrix)[0]

    # Get top candidates (3x to filter by genre)
    top_indices = np.argsort(similarities)[::-1][1:topk*3+1]

    # Prefer same genre but allow others with high similarity
    genre_bonus = np.array([
        1.2 if movies.iloc[i]["primary_genre"] == query_genre else 1.0
        for i in top_indices
    ])

    adjusted_scores = similarities[top_indices] * genre_bonus

    # Get final top-k
    final_indices = top_indices[np.argsort(-adjusted_scores)][:topk]

    results = movies.iloc[final_indices][["movieId", "title", "genres", "primary_genre"]].copy()
    results["similarity_score"] = similarities[final_indices]
    results["genre"] = results["primary_genre"]

    return results[["movieId", "title", "genre", "similarity_score"]].reset_index(drop=True)

def recommend_cf_advanced(movie_idx, topk=10, min_common_users=5):
    """Advanced CF with better filtering"""
    if movie_idx is None or movie_idx >= len(movies):
        return pd.DataFrame()

    movie_id = int(movies.iloc[movie_idx]["movieId"])
    query_genre = movies.iloc[movie_idx]["primary_genre"]

    if movie_id not in movieid_to_idx:
        return pd.DataFrame()

    row_idx = movieid_to_idx[movie_id]
    target_vec = item_user_matrix[row_idx]

    # Compute similarity
    similarities = cosine_similarity(target_vec, item_user_matrix)[0]

    # Count common users with significant ratings
    target_users = (target_vec > 2.5).astype(int)
    all_users = (item_user_matrix > 2.5).astype(int)

    common_high_ratings = target_users.dot(all_users.T).toarray()[0]

    # Filter: min common users, similarity > threshold
    threshold = 0.05
    valid_mask = (common_high_ratings >= min_common_users) & (similarities > threshold)
    valid_indices = np.where(valid_mask)[0]

    # Remove self
    valid_indices = valid_indices[valid_indices != row_idx]

    if len(valid_indices) == 0:
        return pd.DataFrame()

    # Sort by similarity
    valid_indices = valid_indices[np.argsort(-similarities[valid_indices])][:topk]

    # Get movie IDs and build results
    rec_movie_ids = [idx_to_movieid[i] for i in valid_indices]
    rec_movies = movies[movies["movieId"].isin(rec_movie_ids)][["movieId", "title", "primary_genre"]].copy()
    rec_movies["cf_score"] = similarities[valid_indices]

    # Reorder by CF score
    rec_movies = rec_movies.sort_values("cf_score", ascending=False)
    rec_movies["genre"] = rec_movies["primary_genre"]

    return rec_movies[["movieId", "title", "genre", "cf_score"]].reset_index(drop=True)

def recommend_hybrid(movie_idx, topk=10, content_weight=0.5, cf_weight=0.5):
    """Hybrid recommendation combining content + CF with genre awareness"""
    if movie_idx is None:
        return pd.DataFrame()

    query_genre = movies.iloc[movie_idx]["primary_genre"]

    # Get both recommendations
    content_recs = recommend_content_based(movie_idx, topk=topk*2)
    cf_recs = recommend_cf_advanced(movie_idx, topk=topk*2)

    if content_recs.empty and cf_recs.empty:
        return pd.DataFrame()

    # Normalize scores
    if not content_recs.empty:
        content_recs["norm_score"] = (
            content_recs["similarity_score"] / content_recs["similarity_score"].max()
        )
        content_recs.rename(columns={"norm_score": "norm_content"}, inplace=True)
    else:
        content_recs["norm_content"] = 0.0

    if not cf_recs.empty:
        cf_recs["norm_score"] = cf_recs["cf_score"] / cf_recs["cf_score"].max()
        cf_recs.rename(columns={"norm_score": "norm_cf"}, inplace=True)
    else:
        cf_recs["norm_cf"] = 0.0

    # Combine recommendations
    all_recs = pd.concat([
        content_recs[["movieId", "title", "genre", "norm_content"]],
        cf_recs[["movieId", "title", "genre", "norm_cf"]]
    ], ignore_index=True)

    all_recs = all_recs.groupby("movieId").agg({
        "title": "first",
        "genre": "first",
        "norm_content": "max",
        "norm_cf": "max"
    }).reset_index()

    # Fill NaN with 0
    all_recs["norm_content"] = all_recs["norm_content"].fillna(0)
    all_recs["norm_cf"] = all_recs["norm_cf"].fillna(0)

    # Genre bonus (20% boost for same genre)
    genre_bonus = all_recs["genre"].apply(
        lambda g: 1.2 if g == query_genre else 1.0
    )

    # Hybrid score
    all_recs["hybrid_score"] = (
        (content_weight * all_recs["norm_content"] +
         cf_weight * all_recs["norm_cf"]) * genre_bonus
    )

    # Remove query movie
    query_movie_id = int(movies.iloc[movie_idx]["movieId"])
    all_recs = all_recs[all_recs["movieId"] != query_movie_id]

    # Get top-k
    all_recs = all_recs.sort_values("hybrid_score", ascending=False).head(topk)

    return all_recs[["movieId", "title", "genre", "hybrid_score"]].reset_index(drop=True)

def get_popular_by_genre(genre=None, topk=10):
    """Get popular movies, optionally filtered by genre"""
    if genre is None:
        return pop_sorted.head(topk)[["movieId", "title", "primary_genre", "mean_rating", "count"]].reset_index(drop=True)

    genre_filtered = pop_sorted[pop_sorted["primary_genre"] == genre]
    if genre_filtered.empty:
        return pop_sorted.head(topk)[["movieId", "title", "primary_genre", "mean_rating", "count"]].reset_index(drop=True)

    return genre_filtered.head(topk)[["movieId", "title", "primary_genre", "mean_rating", "count"]].reset_index(drop=True)


In [21]:
print("\n" + "="*60)
print("TESTING ADVANCED RECOMMENDATIONS")
print("="*60)

test_movies = ["Toy Story (1995)", "The Matrix (1999)", "Pulp Fiction (1994)"]

for test_title in test_movies:
    print(f"\nüé¨ Query: {test_title}")
    idx = find_movie(test_title)

    if idx is None:
        print(f"  ‚ùå Movie not found")
        continue

    query_movie = movies.iloc[idx]
    print(f"   Genre: {query_movie['primary_genre']}")
    print(f"   All Genres: {query_movie['genres']}")

    print(f"\n  üìñ Content-Based (Same Genre Preferred):")
    content_recs = recommend_content_based(idx, topk=5)
    for i, row in content_recs.iterrows():
        print(f"    {i+1}. {row['title']} ({row['genre']}) [Score: {row['similarity_score']:.3f}]")

    print(f"\n  ü§ù Collaborative Filtering:")
    cf_recs = recommend_cf_advanced(idx, topk=5)
    for i, row in cf_recs.iterrows():
        print(f"    {i+1}. {row['title']} ({row['genre']}) [Score: {row['cf_score']:.3f}]")

    print(f"\n  ‚≠ê Hybrid Recommendations (BEST):")
    hybrid_recs = recommend_hybrid(idx, topk=5, content_weight=0.5, cf_weight=0.5)
    for i, row in hybrid_recs.iterrows():
        print(f"    {i+1}. {row['title']} ({row['genre']}) [Score: {row['hybrid_score']:.3f}]")

    print(f"\n  üèÜ Popular Movies in Same Genre ({query_movie['primary_genre']}):")
    pop_recs = get_popular_by_genre(query_movie['primary_genre'], topk=5)
    for i, row in pop_recs.iterrows():
        print(f"    {i+1}. {row['title']} (Rating: {row['mean_rating']:.2f}, Count: {row['count']})")



TESTING ADVANCED RECOMMENDATIONS

üé¨ Query: Toy Story (1995)
   Genre: Adventure
   All Genres: Adventure|Animation|Children|Comedy|Fantasy

  üìñ Content-Based (Same Genre Preferred):
    1. Toy Story 2 (1999) (Adventure) [Score: 0.858]
    2. Toy Story Toons: Small Fry (2011) (Adventure) [Score: 0.836]
    3. Toy Story Toons: Hawaiian Vacation (2011) (Adventure) [Score: 0.834]
    4. Boxtrolls, The (2014) (Adventure) [Score: 0.790]
    5. Tangled: Before Ever After (2017) (Adventure) [Score: 0.790]

  ü§ù Collaborative Filtering:
    1. Star Wars: Episode IV - A New Hope (1977) (Action) [Score: 0.575]
    2. Forrest Gump (1994) (Comedy) [Score: 0.562]
    3. Jurassic Park (1993) (Action) [Score: 0.545]
    4. Back to the Future (1985) (Adventure) [Score: 0.542]
    5. Toy Story 2 (1999) (Adventure) [Score: 0.540]

  ‚≠ê Hybrid Recommendations (BEST):
    1. Toy Story 2 (1999) (Adventure) [Score: 1.150]
    2. Toy Story Toons: Small Fry (2011) (Adventure) [Score: 0.585]
    3. To

In [22]:
print("\n" + "="*60)
print("SAVING MODELS")
print("="*60)

# Save movies
with open(MODEL_DIR / "movies_df.pkl", "wb") as f:
    pickle.dump(movies[["movieId", "title", "genres", "content_text", "primary_genre"]], f)
print("‚úÖ Saved movies_df.pkl")

# Save TF-IDF vectorizer
with open(MODEL_DIR / "tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfv, f)
print("‚úÖ Saved tfidf_vectorizer.pkl")

# Save TF-IDF matrix
save_npz(MODEL_DIR / "tfidf_matrix.npz", tfidf_matrix)
print("‚úÖ Saved tfidf_matrix.npz")

# Save mappings
mappings = {
    "movieid_to_idx": movieid_to_idx,
    "idx_to_movieid": idx_to_movieid,
    "userid_to_idx": userid_to_idx,
    "item_norms": item_norms
}
with open(MODEL_DIR / "mappings.pkl", "wb") as f:
    pickle.dump(mappings, f)
print("‚úÖ Saved mappings.pkl")

# Save popularity data
with open(MODEL_DIR / "pop_sorted.pkl", "wb") as f:
    pickle.dump(pop_sorted, f)
print("‚úÖ Saved pop_sorted.pkl")

# Save item-user matrix
save_npz(MODEL_DIR / "item_user_matrix.npz", item_user_matrix)
print("‚úÖ Saved item_user_matrix.npz")

print("\n" + "="*60)
print("‚úÖ ALL MODELS SAVED SUCCESSFULLY!")
print("Ready for production Streamlit app!")
print("="*60)


SAVING MODELS
‚úÖ Saved movies_df.pkl
‚úÖ Saved tfidf_vectorizer.pkl
‚úÖ Saved tfidf_matrix.npz
‚úÖ Saved mappings.pkl
‚úÖ Saved pop_sorted.pkl
‚úÖ Saved item_user_matrix.npz

‚úÖ ALL MODELS SAVED SUCCESSFULLY!
Ready for production Streamlit app!
