In [3]:
# ðŸŽ¬ Movie Recommendation System (2025 Version)
# Author: Yash Kumar Mehta

# --- STEP 1: Import Required Libraries ---
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# --- STEP 2: Load Datasets ---
# Make sure you have these CSVs in the same folder: movies.csv, credits.csv
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

# --- STEP 3: Merge the two datasets ---
movies = movies.merge(credits, on='title')

# --- STEP 4: Select only the useful columns ---
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# --- STEP 5: Handle Missing Values ---
movies.dropna(inplace=True)

# --- STEP 6: Convert JSON-like columns into proper Python objects ---
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

# --- STEP 7: Extract top 3 cast members ---
def convert_cast(obj):
    L = []
    for i in ast.literal_eval(obj)[:3]:
        L.append(i['name'])
    return L

movies['cast'] = movies['cast'].apply(convert_cast)

# --- STEP 8: Extract director name ---
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

movies['crew'] = movies['crew'].apply(fetch_director)

# --- STEP 9: Tokenize Overview ---
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# --- STEP 10: Remove spaces in names for uniformity ---
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

# --- STEP 11: Combine all tags into one column ---
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# --- STEP 12: Create a new dataframe with only required columns ---
new_df = movies[['movie_id', 'title', 'tags']]

# --- STEP 13: Convert list of tags to a single string ---
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

# --- STEP 14: Convert all text to lowercase ---
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

# --- STEP 15: Feature Extraction (Vectorization) ---
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()

# --- STEP 16: Similarity Matrix using Cosine Similarity ---
similarity = cosine_similarity(vectors)

# --- STEP 17: Save Data for Streamlit App ---
pickle.dump(new_df, open('movie_list.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))

print("âœ… movie_list.pkl and similarity.pkl created successfully!")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


âœ… movie_list.pkl and similarity.pkl created successfully!
