# Movie Recommendation System using Content-based Filtering on the TMDB Dataset

## 1. Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

## 2. Load and Explore the Dataset

In [2]:
# Load the dataset
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

# Display the first few rows of each dataset
print(movies.head(3))
print(credits.head(3))

# Display dataset shapes
print(movies.shape)
print(credits.shape)

# Display movie columns
print(movies.columns)

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "name": "spy"}, {"id": 818, "name...                en   

                             original_title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   

        

## 3. Merge Datasets and Select Relevant Columns

In [3]:
# Merge movies and credits datasets
movies = movies.merge(credits, on='title')

# Select relevant columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

## 4. Data Preprocessing

In [4]:
# Check for null values
print(movies.isnull().sum())

# Drop rows with null values
movies.dropna(inplace=True)

# Convert stringified columns to lists
from ast import literal_eval

features = ['genres', 'keywords', 'cast', 'crew']
for feature in features:
    movies[feature] = movies[feature].apply(literal_eval)

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64


## 5. Extract Director Information

In [5]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return [i['name']]
    return []

movies['director'] = movies['crew'].apply(get_director)

## 6. Extract Top 3 Actors

In [6]:
def convert_cast(cast_list):
    return [i['name'] for i in cast_list[:3]]

movies['cast'] = movies['cast'].apply(convert_cast)

## 7. Extract Names from Genres and Keywords

In [7]:
def get_name(x):
    return [i['name'] for i in x] if isinstance(x, list) else []

for feature in ['genres', 'keywords']:
    movies[feature] = movies[feature].apply(get_name)

## 8. Clean and Standardize Data

In [8]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    elif isinstance(x, str):
        return str.lower(x.replace(" ", ""))
    else:
        return ''

for feature in ['genres', 'keywords', 'cast', 'director']:
    movies[feature] = movies[feature].apply(clean_data)

## 9. Create Tags Column

In [9]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['director']

## 10. Create Final Dataset

In [10]:
df = movies[['movie_id', 'title', 'tags']]

# Join tags into a single string
df['tags'] = df['tags'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x: ' '.join(x))


## 11. Apply Stemming

In [11]:
ps = PorterStemmer()

def stem(text):
    return [ps.stem(word) for word in text.split()]

df['tags'] = df['tags'].apply(stem)
df['tags'] = df['tags'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(stem)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x: ' '.join(x))


## 12. Create TF-IDF Matrix

In [12]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['tags'])

## 13. Calculate Cosine Similarity

In [13]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

## 14. Create Recommendation Function

In [14]:
def recommend(movie):
    idx = df[df['title'] == movie].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

## 15. Test Recommendation Function

In [15]:
print(recommend('The Dark Knight'))

3                         The Dark Knight Rises
428                              Batman Returns
119                               Batman Begins
3861    Batman: The Dark Knight Returns, Part 2
299                              Batman Forever
1362                                     Batman
1363                                     Batman
9            Batman v Superman: Dawn of Justice
210                              Batman & Robin
879                         Law Abiding Citizen
Name: title, dtype: object


## 16. Save Model and Data

In [16]:
pickle.dump(cosine_sim, open('cosine_sim.pkl', 'wb'))
pickle.dump(df, open('movielist.pkl', 'wb'))