In [1]:
import numpy as np
import pandas as pd
import ast
import nltk
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer

In [2]:
# Read the two TMDB CSV files into DataFrames; both paths are resolved
# relative to the notebook's working directory.
movies, credits = (
    pd.read_csv(csv_path)
    for csv_path in ("tmdb_5000_movies.csv", "tmdb_5000_credits.csv")
)

In [3]:
# Inner-join the movie rows with the credits rows on their shared 'title' column.
# NOTE(review): nothing here guarantees titles are unique; if either file
# repeats a title, the join pairs every matching row with every other one.
# Consider joining on an id column instead — TODO confirm against the CSV schema.
movies = movies.merge(right=credits, how='inner', on='title')

In [4]:
# Keep only the id, the title and the five columns that are later combined into tags.
movies = movies.loc[
    :,
    ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew'],
]

In [5]:
# Preprocess genres column
def convert(obj):
    """Parse a stringified list of dicts and return every entry's ``'name'``.

    ``obj`` is evaluated with ``ast.literal_eval``; the result must be an
    iterable of mappings that each contain a ``'name'`` key. The names are
    returned as a list, in the order they appear in ``obj``.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

# Replace each raw genres string with the list of names extracted by convert().
movies['genres'] = movies['genres'].map(convert)

In [6]:
# Replace each raw keywords string with the list of names extracted by convert().
movies['keywords'] = movies['keywords'].map(convert)

In [7]:
# Preprocess cast column
def convert_cast(obj, limit=3):
    """Return the names of the first ``limit`` entries of a stringified cast list.

    Parameters
    ----------
    obj : str
        Text that ``ast.literal_eval`` parses into a list of dicts, each
        containing a ``'name'`` key.
    limit : int, default 3
        Maximum number of names to keep. The default reproduces the
        previously hard-coded cut-off of three.

    Returns
    -------
    list
        At most ``limit`` names, in the order they appear in ``obj``.
    """
    # Slicing before the lookup means entries past the cut-off are never
    # touched, exactly as the old counter-and-break loop behaved.
    return [member['name'] for member in ast.literal_eval(obj)[:limit]]

# Replace each raw cast string with the short list of names from convert_cast().
movies['cast'] = movies['cast'].map(convert_cast)

In [8]:
# Preprocess crew column
def fetch_director(obj):
    """Return the first crew member whose job is ``'Director'``, as a list.

    ``obj`` is evaluated with ``ast.literal_eval`` and must yield an iterable
    of mappings with ``'job'`` and ``'name'`` keys. The result is a
    one-element list holding that member's name, or an empty list when no
    entry has the job ``'Director'``.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            # Only the first match is wanted, so stop scanning here.
            return [member['name']]
    return []

# Replace each raw crew string with the (at most one-element) director list.
movies['crew'] = movies['crew'].map(fetch_director)

In [9]:
# Wrap each overview string in a one-element list so it can be concatenated
# with the other list-valued columns; anything that is not a string
# (e.g. a missing value) becomes an empty list.
movies['overview'] = movies['overview'].map(
    lambda text: [] if not isinstance(text, str) else [text]
)

In [10]:
# Start the output frame from an independent copy of the id and title columns,
# so adding columns to it never touches `movies`.
new_df = movies.loc[:, ['movie_id', 'title']].copy()

In [11]:
# Build the tags for each movie: concatenate the five list-valued columns
# row by row, then lower-case every element (coerced to str first).
new_df['tags'] = (
    movies['overview']
    + movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
).apply(lambda parts: [str(part).lower() for part in parts])

In [None]:
# Stem the tags
# nltk.word_tokenize needs NLTK's Punkt tokenizer data. Older NLTK releases
# load it from the 'punkt' package while recent releases look for 'punkt_tab',
# so fetch both to keep this cell runnable on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')
ps = PorterStemmer()
# Split every tag into word tokens and reduce each token to its Porter stem,
# flattening the result into a single list of stems per movie.
new_df['tags'] = new_df['tags'].apply(lambda x: [ps.stem(word) for tag in x for word in nltk.word_tokenize(tag)])

In [13]:
# Collapse each movie's list of stems into one space-separated string,
# the input format CountVectorizer expects.
new_df['tags'] = new_df['tags'].map(" ".join)

In [14]:
# Turn the tag strings into bag-of-words count vectors, dropping English
# stop words and keeping at most 5000 vocabulary terms.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(new_df['tags'])
# Densify the sparse count matrix into a plain NumPy array.
vectors = tag_counts.toarray()

In [15]:
# Pairwise cosine similarity between every pair of rows in `vectors`;
# entry [i, j] compares movie i with movie j.
similarity = cosine_similarity(X=vectors)

In [16]:
# Function to recommend movies
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``new_df['title']``. When several rows
        share the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Notes
    -----
    Reads the module-level ``new_df`` and ``similarity`` objects. If the
    title is not present, a short message is printed and the function
    returns without raising.
    """
    # Find the movie by row *position*: `similarity` is indexed positionally,
    # so an index label is only correct while new_df keeps a default 0..n-1 index.
    movie_pos = next(
        (pos for pos, title in enumerate(new_df['title']) if title == movie),
        None,
    )
    if movie_pos is None:
        print(f"Movie {movie!r} was not found in the dataset.")
        return

    ranked = sorted(
        enumerate(similarity[movie_pos]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Drop the query movie explicitly instead of assuming it sorts first:
    # another row with an identical tag vector ties at the same score and
    # may be ranked ahead of it.
    neighbours = [pos for pos, _ in ranked if pos != movie_pos][:top_n]

    titles = new_df['title']
    for pos in neighbours:
        print(titles.iloc[pos])

In [17]:
# Save the data and models
# Each file is opened in a `with` block so its handle is flushed and closed
# as soon as the dump finishes, rather than being left open until the
# garbage collector happens to reclaim it.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(new_df, movies_file)
with open('movie_dict.pkl', 'wb') as movie_dict_file:
    pickle.dump(new_df.to_dict(), movie_dict_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [18]:
# Print the recommendations for 'Spectre'.
recommend(movie='Spectre')

Quantum of Solace
Never Say Never Again
Dr. No
Skyfall
From Russia with Love


In [20]:
# Print the recommendations for 'Interstellar'.
recommend(movie='Interstellar')

Silent Running
2001: A Space Odyssey
Planet of the Apes
Lost in Space
Spaceballs


In [21]:
# Print the recommendations for 'The Martian'.
recommend(movie='The Martian')

Space Pirate Captain Harlock
Gravity
Armageddon
Red Planet
John Carter
