In [2]:
import ast
import json
import string

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie metadata and the credits tables from CSV files; the paths
# are relative, so both files must be in the current working directory.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')




In [3]:
# Show the column labels of both raw tables, movies first, one per line.
print(movies.columns, credits.columns, sep='\n')


Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')
Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')


In [4]:
# Join each movie row to its credits row (movies.id == credits.movie_id).
df = pd.merge(movies, credits, left_on='id', right_on='movie_id')

# Show the merged column labels.
print(df.columns)

# Both inputs carry a 'title' column, so the merge suffixes them; when the
# left-hand 'title_x' is present, expose it under the plain name 'title'.
if 'title_x' in df:
    df['title'] = df['title_x']




Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'movie_id', 'title_y', 'cast', 'crew'],
      dtype='object')


In [5]:
# Preprocessing: turn stringified list columns into real Python objects
def parse_json_column(column):
    """Evaluate each string cell as a Python literal; other cells become [].

    NOTE(review): a later cell defines a function with this same name,
    which replaces this one once that cell has run.
    """
    def _evaluate(cell):
        if not isinstance(cell, str):
            return []
        return ast.literal_eval(cell)

    return column.apply(_evaluate)

In [6]:
import ast

def parse_json_column(column):
    """Parse every cell of a Series of JSON-like strings into a list.

    Parameters
    ----------
    column : pandas.Series
        Cells are expected to be strings holding a serialized list
        (for example a list of dictionaries); other cell types are allowed.

    Returns
    -------
    pandas.Series
        One list per input cell:

        * the decoded list when the string is valid JSON, or a valid Python
          literal, that evaluates to a list;
        * ``[]`` when the cell is not a string, or when it is a Python
          literal that evaluates to something other than a list;
        * the whitespace-separated words of the string when it cannot be
          decoded at all.
    """
    def parse_cell(x):
        if not isinstance(x, str):
            return []

        # Try real JSON first: ast.literal_eval rejects the JSON tokens
        # null / true / false, which would otherwise push valid cells into
        # the whitespace-split fallback below.
        try:
            decoded = json.loads(x)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            decoded = None
        if isinstance(decoded, list):
            return decoded

        # Not a JSON list: fall back to Python-literal syntax
        # (e.g. single-quoted strings).
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError):
            # Not parseable at all: treat the cell as plain text.
            # TypeError covers literals such as a dict with an unhashable key.
            return x.split()
        return parsed if isinstance(parsed, list) else []

    return column.apply(parse_cell)

# For each list-valued metadata column, parse the raw cells and flatten them
# into one space-separated string: the 'name' of every entry when the parsed
# list starts with a dictionary, otherwise the list items themselves.
def _names_as_text(items):
    starts_with_dict = isinstance(items, list) and len(items) > 0 and isinstance(items[0], dict)
    if starts_with_dict:
        return ' '.join([entry['name'] for entry in items])
    return ' '.join(items)

for _column in ('genres', 'keywords', 'cast', 'crew'):
    df[_column] = parse_json_column(df[_column]).apply(_names_as_text)


In [7]:
def clean_text(text):
    """Normalize a text value.

    Strings are lowercased and stripped of every character listed in
    ``string.punctuation``; any non-string value yields ''.
    """
    if not isinstance(text, str):
        return ''
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return text.lower().translate(strip_punctuation)
    

In [8]:
# Normalize every text column with clean_text, one column at a time.
for _text_column in ('genres', 'keywords', 'cast', 'crew', 'overview'):
    df[_text_column] = df[_text_column].apply(clean_text)

In [9]:
# Build the combined feature text: genres, keywords, cast, crew and overview
# joined by single spaces, in that order.
_soup = df['genres']
for _part in ('keywords', 'cast', 'crew', 'overview'):
    _soup = _soup + ' ' + df[_part]
df['soup'] = _soup

In [10]:
# Replace missing values in the combined text with empty strings.
df['soup'] = df['soup'].where(df['soup'].notna(), '')


In [11]:
# TF-IDF vectorizer configured to drop English stop words and to cap the
# vocabulary at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
# Learn the vocabulary from the combined text and vectorize it in one step;
# the result has one row per entry of df['soup'].
tfidf_matrix = tfidf.fit_transform(df['soup'])


In [12]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix
# (with the second argument omitted, the matrix is compared with itself).
cosine_sim = cosine_similarity(tfidf_matrix)

In [13]:
def recommend_movie(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up; matching against ``df['title']`` ignores
        case. When several rows share the title, the first one is used.
    cosine_sim : 2-D array-like, optional
        Similarity matrix whose row ``i`` holds the scores of the ``i``-th
        row of ``df`` against every row. Defaults to the module-level
        ``cosine_sim`` as it was when this function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    numpy.ndarray or str
        Titles ordered from most to least similar (ties keep row order),
        never including the queried row itself; or a "not found" message
        string when no row matches ``title``.
    """
    # Convert the title to lowercase
    title = title.lower()

    # Lowercase the stored titles once and locate matches by POSITION, so the
    # lookup agrees with the positional indexing of cosine_sim and .iloc even
    # if df does not carry a default 0..n-1 index.
    titles_lower = df['title'].str.lower()
    matches = np.flatnonzero((titles_lower == title).to_numpy())

    # Ensure the title is in the dataset
    if matches.size == 0:
        return f"Movie titled '{title}' not found in the dataset."

    idx = int(matches[0])

    # Rank every row by similarity, highest first; the stable sort keeps the
    # original row order among equal scores.
    scores = np.asarray(cosine_sim[idx])
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried row explicitly rather than assuming it sorts first:
    # with tied scores (duplicate feature text, or an all-zero vector) another
    # row can come first, and slicing off position 0 would then discard a real
    # match and recommend the query itself.
    movie_indices = ranked[ranked != idx][:top_n]

    # Return the most similar movie titles
    return df['title'].iloc[movie_indices].values


In [14]:
# Example query: fetch the recommendations for one title and print them.
query_title = 'fight club'
recommended_movies = recommend_movie(query_title)
print(recommended_movies)

['Jurassic World' 'The Curious Case of Benjamin Button' 'Se7en'
 'L.A. Confidential' '15 Minutes' 'Contact' 'Ant-Man'
 'Batman v Superman: Dawn of Justice' 'Armageddon'
 'Austin Powers in Goldmember']
