In [13]:
#import dependencies
import pandas as pd
#read datasets into dataframe
df_cred = pd.read_csv("/Users/suvirsingh/Downloads/movies_dataset/tmdb_5000_credits.csv")
df_mov = pd.read_csv("/Users/suvirsingh/Downloads/movies_dataset/tmdb_5000_movies.csv")

In [14]:
#see the size of datasets
df_cred.shape, df_mov.shape

((4803, 4), (4803, 20))

In [15]:
#check if movie id columns differ or not
(df_cred.movie_id != df_mov.id).any().sum()


0

In [16]:
#rename column name
df_cred.rename(columns = {'movie_id':'id'}, inplace = True)

In [17]:
#merge both dataframes and store in a separate dataframe
movie_db = df_cred.merge(df_mov, on='id')


In [18]:
#relevant information about our merged dataframe
movie_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    4803 non-null   int64  
 1   title_x               4803 non-null   object 
 2   cast                  4803 non-null   object 
 3   crew                  4803 non-null   object 
 4   budget                4803 non-null   int64  
 5   genres                4803 non-null   object 
 6   homepage              1712 non-null   object 
 7   keywords              4803 non-null   object 
 8   original_language     4803 non-null   object 
 9   original_title        4803 non-null   object 
 10  overview              4800 non-null   object 
 11  popularity            4803 non-null   float64
 12  production_companies  4803 non-null   object 
 13  production_countries  4803 non-null   object 
 14  release_date          4802 non-null   object 
 15  revenue              

In [19]:
#dataset preprocessing

#drop null values
movie_db.dropna(subset = ['overview'], inplace= True )

#filter out target columns
movie_db = movie_db[['id','title_x','genres','overview','cast','crew']]

#check information
movie_db.info()


<class 'pandas.core.frame.DataFrame'>
Index: 4800 entries, 0 to 4802
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        4800 non-null   int64 
 1   title_x   4800 non-null   object
 2   genres    4800 non-null   object
 3   overview  4800 non-null   object
 4   cast      4800 non-null   object
 5   crew      4800 non-null   object
dtypes: int64(1), object(5)
memory usage: 262.5+ KB


In [20]:
#Generation of corpus
movie_db.genres[0]
' '.join([i['name'] for i in eval(movie_db.genres[0])])

'Action Adventure Fantasy Science Fiction'

In [21]:
# taking top 3 cast
' '.join([i['name'] for i in eval(movie_db.cast[0])[:3]])

'Sam Worthington Zoe Saldana Sigourney Weaver'

In [22]:
# taking crew (director & producer)
' '.join(list(set([i['name'] for i in eval(movie_db.crew[0]) if i['job']=='Director' or i['job']=='Producer'])))

'Jon Landau James Cameron'

In [23]:
# define function to generate corpus
def generate_corpus(overview, genre, cast, crew):
    
    corpus = ""
    
    genre = ' '.join([i['name'] for i in eval(genre)])
    
    cast = ' '.join([i['name'] for i in eval(cast)[:3]])
       
    crew = ' '.join(list(set([i['name'] for i in eval(crew) if i['job']=='Director' or i['job']=='Producer'])))
    
    corpus+= overview + " " + genre + " " + cast + " " + crew
    
    return corpus

corpus = []
for i in range(len(movie_db)):
    corpus.append(generate_corpus(movie_db.iloc[i].overview, movie_db.iloc[i].genres, movie_db.iloc[i].cast, movie_db.iloc[i].crew))



In [24]:
len(corpus)

4800

In [25]:
# check the corpus
corpus[0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy Science Fiction Sam Worthington Zoe Saldana Sigourney Weaver Jon Landau James Cameron'

In [26]:
# rename the column
movie_db.rename(columns = {'title_x':'title'}, inplace = True)

# drop old columns
movie_db.drop(columns=['genres', 'overview', 'cast', 'crew'], inplace=True)

# add corpus
movie_db['corpus'] = corpus

In [27]:
# import deps
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the Object and remove stopwords
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_db['corpus'])


In [28]:
movie_db.shape

(4800, 3)

In [29]:
tfidf_matrix.shape

(4800, 29102)

In [30]:
# import deps
from sklearn.metrics.pairwise import linear_kernel

# compute the similarity matirx
cos_mat = linear_kernel(tfidf_matrix, tfidf_matrix)

cos_mat.shape

(4800, 4800)

In [31]:
#define function for movie recommendation fetching

def get_recommendations(movie, n):
    
    # get index from dataframe
    index = movie_db[movie_db['title']== movie].index[0]
    
    # sort top n similar movies     
    similar_movies = sorted(list(enumerate(cos_mat[index])), reverse=True, key=lambda x: x[1]) 
    
    # extract names from dataframe and return movie names
    recomm = []
    for i in similar_movies[1:n+1]:
        recomm.append(movie_db.iloc[i[0]].title)
        
    return recomm

In [32]:
#test the function
get_recommendations("The Dark Knight", 3)

['The Dark Knight Rises', 'Batman Begins', 'Batman Returns']

In [33]:
get_recommendations("Mission: Impossible", 3)

['Mission: Impossible III', 'Mission: Impossible II', 'Vanilla Sky']

In [34]:
#Define function for recommendations based on keywords
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
def get_keywords_recommendations(keywords, n):    
    
    keywords = keywords.split()
    keywords = " ".join(keywords)
    
    # transform the string to vector representation
    key_tfidf = tfidf.transform([keywords]) 
    
    # compute cosine similarity    
    result = sklearn.metrics.pairwise.cosine_similarity(key_tfidf, tfidf_matrix)
    
    # sort top n similar movies   
    similar_key_movies = sorted(list(enumerate(result[0])), reverse=True, key=lambda x: x[1])
    
    # extract names from dataframe and return movie names
    recomm = []
    for i in similar_key_movies[1:n+1]:
        recomm.append(movie_db.iloc[i[0]].title)
        
    return recomm

In [35]:
#test function
get_keywords_recommendations("Christopher Nolan", 4)

['Insomnia', 'Man of Steel', 'Batman Begins', 'Interstellar']

In [36]:
import joblib

joblib.dump(movie_db, 'models/movie_db.df')
joblib.dump(cos_mat, 'models/cos_mat.mt')
joblib.dump(tfidf, 'models/vectorizer.tf')
joblib.dump(tfidf_matrix, 'models/tfidf_mat.tf')

['models/tfidf_mat.tf']

In [37]:
get_keywords_recommendations("Christian Bale", 4)

['American Psycho', 'Saved!', 'The Prestige', 'American Hustle']