In [None]:
# ========================================
# MOVIE RECOMMENDER SYSTEM (Content-Based)
# ========================================

# Import Required Libraries
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os
# Print the full path of every file found beneath the Kaggle input directory.
kaggle_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for kaggle_path in kaggle_paths:
    print(kaggle_path)

# Read both CSV files from the current working directory.
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv')

# Join the two tables on 'title', keep only the columns used to build the
# recommender features, and discard every row that has a missing value.
# NOTE(review): joining on 'title' will pair rows across films that share a
# title, if any exist in the data - confirm whether an id-based join is wanted.
feature_columns = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
movies = movies.merge(credits, on='title')[feature_columns].dropna()

# Helper Function to Convert Stringified JSON
def convert(text):
    """Return the 'name' value of every entry in a stringified list of dicts.

    ``text`` is parsed with ``ast.literal_eval``; the names are returned in
    the order the entries appear.
    """
    parsed_entries = ast.literal_eval(text)
    return [entry['name'] for entry in parsed_entries]

# Replace the raw genre and keyword strings with lists of names
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

# Extract Top 3 Cast Members
def convert_cast(text, limit=3):
    """Return the 'name' of the first ``limit`` entries of a stringified list.

    Args:
        text: String that ``ast.literal_eval`` parses into an iterable of
            dicts, each carrying a 'name' key.
        limit: Maximum number of names to return (default 3). A value of
            zero or less yields an empty list.

    Returns:
        A list with at most ``limit`` names, in the order they appear.
    """
    names = []
    for member in ast.literal_eval(text):
        # Stop as soon as enough names are collected instead of walking the
        # remainder of the list without using it.
        if len(names) >= limit:
            break
        names.append(member['name'])
    return names

# Swap each raw cast string for its list of leading cast-member names
raw_cast = movies['cast']
movies['cast'] = raw_cast.apply(convert_cast)

# Extract Director Name
def fetch_director(text):
    """Return the names of all entries whose 'job' equals 'Director'.

    ``text`` is parsed with ``ast.literal_eval`` into a sequence of dicts;
    matching names are returned in the order they appear.
    """
    crew_members = ast.literal_eval(text)
    return [
        member['name']
        for member in crew_members
        if member['job'] == 'Director'
    ]

# Swap each raw crew string for the list of director names it contains
raw_crew = movies['crew']
movies['crew'] = raw_crew.apply(fetch_director)

# Clean Names (remove spaces)
def collapse(L):
    """Return a new list with every space removed from each string in ``L``."""
    squeezed = []
    for item in L:
        squeezed.append(item.replace(" ", ""))
    return squeezed

# Strip spaces inside every name so multi-word names become single tokens
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies[column] = movies[column].apply(collapse)

# Process Overview (split into words)
movies['overview'] = movies['overview'].apply(lambda summary: summary.split())

# Combine All Features into 'tags' by concatenating the per-row lists in order
tag_sources = ['overview', 'genres', 'keywords', 'cast', 'crew']
combined_tags = movies[tag_sources[0]]
for column in tag_sources[1:]:
    combined_tags = combined_tags + movies[column]
movies['tags'] = combined_tags

# Create Final Clean Dataset
# reset_index(drop=True) renumbers the rows 0..n-1. Rows were dropped earlier
# (dropna), so the inherited index labels can have gaps, whereas the similarity
# matrix built below is addressed by row position. Renumbering keeps label-based
# lookups (e.g. `.index[0]`) in step with positions, both here and for anyone
# who loads the pickled frame later.
new = movies[['movie_id', 'title', 'tags']].reset_index(drop=True)

# Join words into a single space-separated string per movie
new['tags'] = new['tags'].apply(" ".join)

# Convert Text Data into Vectors (bag-of-words counts, capped at 5000 terms,
# with English stop words removed)
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(new['tags']).toarray()

# Compute Cosine Similarity between every pair of rows of `vector`
similarity = cosine_similarity(vector)

# Recommendation Function
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Looks ``movie`` up by exact title in the module-level ``new`` frame and
    ranks every other row by its score in the module-level ``similarity``
    matrix. If the title is absent, prints a "not found" message and returns.

    Args:
        movie: Exact title to look up in ``new['title']``. When several rows
            share the title, the first one is used.
        top_n: Number of recommendations to print (default 5). Values of
            zero or less print no titles.

    Returns:
        None. All results are written to stdout.
    """
    titles = new['title'].tolist()
    try:
        # Use the row *position*, not the index label: ``similarity`` is
        # addressed by position, and the frame's labels can have gaps when
        # rows were dropped upstream, so a label may select the wrong row
        # or fall outside the matrix.
        position = titles.index(movie)
    except ValueError:
        print("Movie not found in database.")
        return

    # Leave the query row out explicitly rather than assuming it sorts first;
    # rows with identical tags tie with it at the top score.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity[position])
        if pos != position
    ]
    # list.sort is stable, so equal scores keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    print(f"\n🎥 Recommended movies similar to '{movie}':\n")
    for pos, _score in candidates[:max(top_n, 0)]:
        print(titles[pos])

# Example Test
recommend('Gandhi')

# Save Models for Later Use
# The context managers flush and close each file even if pickling raises,
# so no half-written file is left behind an open handle.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)



🎥 Recommended movies similar to 'Gandhi':

Gandhi, My Father
The Wind That Shakes the Barley
A Passage to India
Guiana 1838
Ramanujan
