In [2]:
import numpy as np
import pandas as pd

In [None]:
# Load the four raw CSV files into DataFrames, in the same order as the
# names on the left-hand side.
movie_df, credits_df, keywords_df, links_df = (
    pd.read_csv(csv_name)
    for csv_name in ('movies_metadata.csv', 'credits.csv', 'keywords.csv', 'links.csv')
)

In [None]:
movie_df.head()

In [None]:
movie_df.columns

In [None]:
credits_df.head()

In [None]:
links_df.head()

In [None]:
keywords_df.head()

In [9]:
# Expose the TMDB identifier under the name `id`, the key the later merges join on.
links_df = links_df.rename({"tmdbId": "id"}, axis="columns")

In [None]:
links_df.head()

In [None]:
links_df["id"].dtypes

In [None]:
links_df.describe()

In [None]:
links_df.isnull().sum()

In [14]:
# Drop links without an `id`, then cast the key to int.
# Written as one chained expression so no column is assigned on the frame
# returned by dropna() (that pattern can raise SettingWithCopyWarning on
# some pandas versions).
links_df = links_df.dropna(subset=['id']).astype({'id': int})

In [15]:
def _with_integer_id(frame):
    """Return a new frame whose ``id`` column is a clean integer key.

    Values of ``id`` that cannot be parsed as numbers are coerced to NaN,
    the corresponding rows are dropped, and the remaining ids are cast to int.
    The input frame is not modified.
    """
    numeric_id = pd.to_numeric(frame['id'], errors='coerce')
    return (
        frame.assign(id=numeric_id)
        .dropna(subset=['id'])
        .astype({'id': int})
    )


# All four tables are joined on `id` later, so give them the same key dtype.
movie_df = _with_integer_id(movie_df)
keywords_df = _with_integer_id(keywords_df)
credits_df = _with_integer_id(credits_df)
links_df = _with_integer_id(links_df)


In [16]:
# Inner-join keywords, credits and links onto the movie table, one after
# another, always on the shared `id` key.
for right_df in (keywords_df, credits_df, links_df):
    movie_df = movie_df.merge(right_df, on='id', how='inner')

In [None]:
movie_df.head(1)

In [None]:
movie_df.shape

In [19]:
# Keep only the columns that the rest of the notebook reads.
kept_columns = [
    'imdb_id',
    'title',
    'overview',
    'genres',
    'keywords',
    'cast',
    'crew',
    'poster_path',
]
movie_df = movie_df[kept_columns]

In [None]:
# DataFrame.info() prints its summary and returns None, so there is nothing
# useful to bind to a variable.
movie_df.info()

In [None]:
movie_df.isnull().sum()

In [22]:
# Drop every row that has a missing value in any kept column, then renumber
# the rows 0..n-1. Without the reset the index keeps gaps, so a row's label
# no longer equals its position; the similarity matrix built later is
# addressed by row position.
movie_df = movie_df.dropna().reset_index(drop=True)

In [None]:
# DataFrame.info() prints its summary and returns None, so there is nothing
# useful to bind to a variable.
movie_df.info()

In [None]:
movie_df.head()

In [25]:
import ast

In [26]:
def convert(obj):
    """Parse a stringified list of dicts and return every entry's ``'name'``.

    ``obj`` is evaluated with ``ast.literal_eval``; the names are returned in
    their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [27]:
# Turn each stringified genre list into a plain list of genre names.
movie_df['genres'] = movie_df['genres'].map(convert)

In [28]:
# Turn each stringified keyword list into a plain list of keyword names.
movie_df['keywords'] = movie_df['keywords'].map(convert)

In [None]:
movie_df.head()

In [None]:
# Peek at the raw cast value of the first row. Positional access is used
# because rows were dropped earlier, so the label 0 is not guaranteed to exist.
movie_df['cast'].iloc[0]

In [31]:
def converter(obj, limit=3):
    """Return the ``'name'`` of the first ``limit`` entries of a stringified list.

    Parameters
    ----------
    obj : str
        String form of a list of dicts, parsed with ``ast.literal_eval``.
    limit : int, default 3
        Maximum number of names to return. The default keeps the original
        behaviour of taking the first three entries; a value of 0 or less
        yields an empty list.

    Returns
    -------
    list
        Up to ``limit`` names, in their original order.
    """
    names = []
    for position, entry in enumerate(ast.literal_eval(obj)):
        # Stop before touching entries beyond the requested count.
        if position >= limit:
            break
        names.append(entry['name'])
    return names

In [32]:
# Reduce each stringified cast list to the names of its first three entries.
movie_df['cast'] = movie_df['cast'].map(converter)

In [None]:
movie_df.head()

In [34]:
def fetch_director(obj):
    """Return a one-element list with the first director's name, or ``[]``.

    ``obj`` is a stringified list of crew dicts (parsed with
    ``ast.literal_eval``); the first entry whose ``'job'`` equals
    ``'Director'`` wins.
    """
    for member in ast.literal_eval(obj):
        if member['job'] != 'Director':
            continue
        return [member['name']]
    return []

In [35]:
# Replace each stringified crew list with a list holding only the first director.
movie_df['crew'] = movie_df['crew'].map(fetch_director)

In [None]:
movie_df.head()

In [37]:
# Split every overview on whitespace so it becomes a list of tokens, like the
# other tag columns.
movie_df['overview'] = movie_df['overview'].map(str.split)

In [None]:
movie_df.head()

In [39]:
def _strip_spaces(names):
    """Return ``names`` with every space removed from each item."""
    return [name.replace(" ", "") for name in names]


# Collapse multi-word names into single tokens in each list-valued column.
for column in ('genres', 'crew', 'keywords', 'cast'):
    movie_df[column] = movie_df[column].apply(_strip_spaces)

In [None]:
movie_df.head()

In [41]:
# Concatenate the per-row token lists, in this order, into one `tags` list.
tag_sources = ['overview', 'genres', 'keywords', 'cast', 'crew']
combined_tags = movie_df[tag_sources[0]]
for source in tag_sources[1:]:
    combined_tags = combined_tags + movie_df[source]
movie_df['tags'] = combined_tags

In [None]:
movie_df.head()

In [43]:
# Take an independent copy of the columns needed for recommendations. The
# `tags` column of `df` is reassigned in later cells; without .copy() those
# assignments target a slice of movie_df and can raise SettingWithCopyWarning.
df = movie_df[['imdb_id' , 'title' , 'tags','poster_path']].copy()

In [None]:
# Show only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Join each row's token list into one space-separated string.
df['tags'] = df['tags'].map(" ".join)

In [None]:
# Lower-case every tag string.
df['tags'] = df['tags'].map(str.lower)

In [47]:
import nltk
from nltk.stem.porter import PorterStemmer
# Module-level Porter stemmer instance; stem() below calls ps.stem on each token.
ps = PorterStemmer()

In [48]:
def stem(text):
    """Stem each whitespace-separated token of ``text`` with the module-level
    stemmer ``ps`` and return the stems joined by single spaces."""
    return " ".join(ps.stem(token) for token in text.split())

In [None]:
# Replace every tag string with its stemmed form.
df['tags'] = df['tags'].map(stem)

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
# Fit a TF-IDF vectorizer on the tag strings and transform them in one step.
# X is the resulting document-term matrix; X.shape[0] is later used as the
# side length of the similarity matrix.
# NOTE(review): the tags were already stemmed in an earlier cell, while
# stop_words='english' matches unstemmed words; stemmed forms of some stop
# words may therefore slip through this filter. Confirm whether intended.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['tags'])

In [52]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Compute the full N x N cosine-similarity matrix in row batches and write it
# straight into a disk-backed float32 array.
N = X.shape[0]
cos_sim = np.memmap("cos_sim.dat", dtype="float32", mode="w+", shape=(N, N))

batch_size = 10000
for i in range(0, N, batch_size):
    print(f"Computing batch {i} to {min(i+batch_size, N)}")
    # Slicing clamps at N, so the final (shorter) batch needs no special case.
    batch_rows = slice(i, i + batch_size)
    sim = cosine_similarity(X[batch_rows], X)
    cos_sim[batch_rows, :] = sim.astype(np.float32)
    # Release the dense batch before computing the next one.
    del sim

# Push any buffered writes out to the file.
cos_sim.flush()
print("Cosine similarity matrix saved to .dat file.")

In [2]:
def recommend(movie_title, n=5):
    """Return the titles of the ``n`` movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in the module-level ``df``. If several rows
        share the title, the first one is used.
    n : int, default 5
        Number of recommendations to return; values of 0 or less give ``[]``.

    Returns
    -------
    list
        Titles ordered from most to least similar. An empty list (after
        printing a message) when the title is not in ``df``.

    Notes
    -----
    Reads the similarity matrix from ``cos_sim.dat``, which must already
    exist with shape ``(len(df), len(df))``.
    """
    # Row *positions* of the matching titles. The similarity matrix is laid
    # out by row position, so the DataFrame index label must not be used as
    # the offset: the two differ whenever the index is not 0..n-1.
    matches = np.flatnonzero((df['title'] == movie_title).to_numpy())
    if matches.size == 0:
        print(f"'{movie_title}' not found in dataset.")
        return []

    row = int(matches[0])
    cos_sim = np.memmap("cos_sim.dat", dtype="float32", mode="r", shape=(len(df), len(df)))
    sim_scores = cos_sim[row]

    # Rank all rows by descending similarity, then remove the query row
    # explicitly. It is not guaranteed to be ranked first when another row
    # ties with it (e.g. a duplicated movie).
    ranked = np.argsort(sim_scores)[::-1]
    top_indices = ranked[ranked != row][:max(n, 0)]
    recommended_titles = df['title'].iloc[top_indices].tolist()
    return recommended_titles

In [None]:
recommend('Seven')