In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("tmdb_5000_movies.csv")  
print(df.head())
print(df.info())


      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [3]:
print(df.isnull().sum())

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64


In [5]:
df['budget'].fillna(df['budget'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['budget'].fillna(df['budget'].median(), inplace=True)


In [6]:
print(df.duplicated().sum())  # Count duplicate rows
df.drop_duplicates(inplace=True)  # Remove duplicates

0


In [7]:
import pandas as pd

df = pd.read_csv("tmdb_5000_movies.csv")

print(df.columns)


Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


In [8]:
import ast  # For parsing JSON-like strings

def extract_genres(genre_str):
    genres = ast.literal_eval(genre_str)  # Convert string to Python object
    return [genre['name'] for genre in genres]  # Extract genre names

df['genres'] = df['genres'].apply(extract_genres)
print(df[['title', 'genres']].head())

                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                          genres  
0  [Action, Adventure, Fantasy, Science Fiction]  
1                   [Adventure, Fantasy, Action]  
2                     [Action, Adventure, Crime]  
3               [Action, Crime, Drama, Thriller]  
4           [Action, Adventure, Science Fiction]  


In [9]:
def extract_keywords(keyword_str):
    keywords = ast.literal_eval(keyword_str)
    return [keyword['name'] for keyword in keywords]

df['keywords'] = df['keywords'].apply(extract_keywords)
print(df[['title', 'keywords']].head())

                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                            keywords  
0  [culture clash, future, space war, space colon...  
1  [ocean, drug abuse, exotic island, east india ...  
2  [spy, based on novel, secret agent, sequel, mi...  
3  [dc comics, crime fighter, terrorist, secret i...  
4  [based on novel, mars, medallion, space travel...  


In [18]:
df_movies = pd.read_csv("tmdb_5000_movies.csv")
df_credits = pd.read_csv("tmdb_5000_credits.csv")

# Merge both datasets on movie ID
df = df_movies.merge(df_credits, left_on='id', right_on='movie_id', how='inner')

# Drop duplicate 'movie_id' column
df.drop(columns=['movie_id'], inplace=True)

# Display merged dataset
print(df.columns)
print(df.head())



Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'title_y', 'cast', 'crew'],
      dtype='object')
      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206

In [None]:
import ast

# Ensure 'cast' and 'crew' are properly parsed
df['cast'] = df['cast'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
df['crew'] = df['crew'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

# Function to extract top 3 actors
def extract_cast(cast_list):
    return [actor['name'] for actor in cast_list[:3]]

# Function to extract the director
def extract_director(crew_list):
    directors = [person['name'] for person in crew_list if person['job'] == 'Director']
    return directors[0] if directors else None

# Apply extraction
df['cast'] = df['cast'].apply(extract_cast)
df['director'] = df['crew'].apply(extract_director)

# Display the updated dataset
print(df[['title', 'cast', 'director']].head())


In [21]:
# Combine all text-based features into one column
df['combined_features'] = df['genres'].apply(lambda x: ' '.join(x)) + ' ' + \
                          df['keywords'].apply(lambda x: ' '.join(x)) + ' ' + \
                          df['cast'].apply(lambda x: ' '.join(x)) + ' ' + \
                          df['director'].apply(lambda x: str(x))

print(df[['original_title', 'combined_features']].head())


                             original_title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                   combined_features  
0  [ { " i d " :   2 8 ,   " n a m e " :   " A c ...  
1  [ { " i d " :   1 2 ,   " n a m e " :   " A d ...  
2  [ { " i d " :   2 8 ,   " n a m e " :   " A c ...  
3  [ { " i d " :   2 8 ,   " n a m e " :   " A c ...  
4  [ { " i d " :   2 8 ,   " n a m e " :   " A c ...  


In [23]:
def weighted_rating(x, m, C):
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + m) * R) + (m / (m + v) * C)

# Compute minimum votes required for consideration
m = df['vote_count'].quantile(0.90)  # Only consider top 10% of movies
C = df['vote_average'].mean()  # Average rating across all movies

df['score'] = df.apply(lambda x: weighted_rating(x, m, C), axis=1)

# Recommend top movies
top_movies = df.sort_values('score', ascending=False)[['original_title', 'score']].head(10)
print(top_movies)


                                     original_title     score
1881                       The Shawshank Redemption  8.059258
662                                      Fight Club  7.939256
65                                  The Dark Knight  7.920020
3232                                   Pulp Fiction  7.904645
96                                        Inception  7.863239
3337                                  The Godfather  7.851236
95                                     Interstellar  7.809479
809                                    Forrest Gump  7.803188
329   The Lord of the Rings: The Return of the King  7.727243
1990                        The Empire Strikes Back  7.697884


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Convert text data into numerical format
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Compute cosine similarity between movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get movie recommendations
def recommend_movies(movie_title, df, cosine_sim):
    indices = pd.Series(df.index, index=df['original_title']).drop_duplicates()
    idx = indices[movie_title]  # Get index of the movie

    # Get similarity scores and sort them
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]  # Top 10 similar

    # Get movie indices
    movie_indices = [i[0] for i in sim_scores]

    return df['original_title'].iloc[movie_indices]

# Example: Recommend movies similar to "Avatar"
print(recommend_movies("Avatar", df, cosine_sim))


2403                          Aliens
1804    Snow White: A Tale of Terror
94           Guardians of the Galaxy
2060              Out of the Furnace
2361                   The Ice Storm
47           Star Trek Into Darkness
2138                         Copycat
3474                       The Words
1892                      The Losers
2106                        Drumline
Name: original_title, dtype: object


In [33]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

# Load the ratings dataset
reader = Reader(rating_scale=(0, 10))
ratings = pd.read_csv("tmdb_5000_ratings.csv")  # Assuming ratings file exists
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Train a collaborative filtering model
svd = SVD()
cross_validate(svd, data, cv=5)

# Fit model on entire dataset
trainset = data.build_full_trainset()
svd.fit(trainset)

# Predict rating for a given user and movie
pred = svd.predict(uid=1, iid=2)
print(pred.est)  # Estimated rating


ModuleNotFoundError: No module named 'surprise'