In [39]:
import numpy as np
import pandas as pd
import ast

In [40]:
# Load the movies and credits CSV files into two DataFrames.
movies = pd.read_csv(filepath_or_buffer="./dataset/tmdb_5000_movies.csv")
credits = pd.read_csv(filepath_or_buffer="./dataset/tmdb_5000_credits.csv")

In [41]:
# Combine the movies and credits tables, matching rows on their shared "title" column.
# NOTE(review): if titles are not unique, joining on title duplicates rows —
# consider joining on an id column instead; confirm against the data.
df = pd.merge(movies, credits, on="title")

## EDA ##

### Remove unnecessary columns ###

In [43]:
# Keep only the columns that feed the recommender's tags.
# .copy() makes this an independent frame, so the in-place modifications and
# column assignments in later cells do not operate on a slice of the merged
# frame (which is what triggers pandas' SettingWithCopyWarning).
df = df[["movie_id","title","overview","genres","keywords","cast","crew"]].copy()

In [44]:
# Drop rows containing any missing value, then renumber the rows 0..n-1.
# Without the reset, dropped rows leave gaps in the index, so index labels no
# longer equal row positions — and the similarity matrix built later is
# addressed by position.
df = df.dropna().reset_index(drop=True)

In [45]:
# Per-column count of missing values (displayed as the cell output).
df.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [46]:
# Inspect the raw "genres" value of the row labelled 0.
df.loc[0, "genres"]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [47]:
def convert(obj):
    """Parse `obj`, a string holding a Python-literal list of dicts, and
    return the "name" value of each dict, in order."""
    return [entry["name"] for entry in ast.literal_eval(obj)]

In [48]:
def fetch_director(obj):
    """Parse `obj`, a string holding a Python-literal list of dicts, and
    return a one-element list with the "name" of the first entry whose
    'job' is 'Director', or an empty list when there is none."""
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            # Only the first director is kept.
            return [member["name"]]
    return []

In [49]:
# Replace each raw genres string with the list of genre names it contains.
df["genres"] = df["genres"].map(convert)

In [50]:
# Replace each raw keywords string with the list of keyword names it contains.
df["keywords"] = df["keywords"].map(convert)

In [51]:
# Keep only the first three names listed in each cast entry.
df["cast"] = df["cast"].map(lambda raw: convert(raw)[:3])

In [52]:
# Reduce each crew entry to a list holding the first director's name (or an empty list).
df["crew"] = df["crew"].map(fetch_director)

In [53]:
# Split each overview on whitespace so it becomes a list of words,
# matching the list form of the other columns.
df["overview"] = df["overview"].map(str.split)

Remove the spaces inside multi-word names so that each name becomes a single token for the recommender system.

For example

Sam Mendes -> SamMendes
Sam Worthington -> SamWorthington

Otherwise the system cannot tell which "Sam" a tag refers to, because "Sam" would be counted as a separate token shared by both names.

In [54]:
def remove_spaces(obj):
    """Return a new Series in which every string inside every element of
    `obj` has its space characters removed.

    `obj` is a pandas Series whose elements are iterables of strings.
    """
    def _squash(items):
        return [item.replace(" ", "") for item in items]

    return obj.apply(_squash)

In [55]:
# Collapse multi-word entries into single tokens in each list-valued column.
for column in ("genres", "keywords", "cast", "crew"):
    df[column] = remove_spaces(df[column])

In [56]:
# Concatenate the five per-row lists into one combined "tags" list.
df["tags"] = (
    df["overview"]
    + df["genres"]
    + df["keywords"]
    + df["cast"]
    + df["crew"]
)

In [57]:
# Keep only the identifier, the title and the combined tags.
# .copy() makes the result an independent frame, so the assignments to
# df["tags"] in the following cells do not write into a slice of the wider
# frame (which is what triggers pandas' SettingWithCopyWarning).
df = df[["movie_id","title","tags"]].copy()

In [58]:
# Turn each list of tokens into a single space-separated string.
df["tags"] = df["tags"].map(" ".join)

In [59]:
# Lower-case every tags string.
df["tags"] = df["tags"].map(str.lower)

In [60]:
# Inspect the finished tags string of the row labelled 0.
df.loc[0, "tags"]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

The tags have now been built.
To find the similarity between movies, we measure the similarity between their tags.

Convert the tags to vectors using TF-IDF (a weighted bag-of-words representation), excluding English stop words.



## Vectorization ##

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams with English stop words removed;
# terms must occur in at least 2 documents and at most 5000 features are kept.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    min_df=2,
    max_features=5000,
)

# One row of TF-IDF weights per tags string.
vectors = tfidf.fit_transform(df["tags"])



## Stemming ##

We apply stemming so that variants of the same word, such as (action, actions) or (love, loved, loving), are not counted as different words.

In [65]:
from nltk.stem.porter import PorterStemmer

# Single stemmer instance, reused for every word that gets stemmed.
ps = PorterStemmer()

In [66]:
def stem(txt):
    """Stem every whitespace-separated word of `txt` with the module-level
    stemmer `ps` and return the stems joined by single spaces."""
    return " ".join(ps.stem(word) for word in txt.split())

In [67]:
# Stem the tags, then re-fit the vectorizer on the stemmed text.
# `vectors` was first computed before this cell, so without the re-fit the
# stemming would have no effect on the similarity scores computed from it.
df["tags"] = df["tags"].apply(stem)
vectors = tfidf.fit_transform(df["tags"])

In [68]:
from sklearn.metrics.pairwise import cosine_similarity

In [69]:
# Pairwise cosine similarity between every pair of rows of `vectors`.
similarity = cosine_similarity(X=vectors)

In [70]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``df["title"]``; the first matching row is used.
    top_n : int, default 5
        Maximum number of titles to print.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    """
    # Locate the row *position* of the movie. `similarity` is addressed by
    # position, which differs from the index label whenever df's index is not
    # 0..n-1 (for example after rows have been dropped).
    positions = np.flatnonzero((df["title"] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f"movie {movie!r} not found in df['title']")
    movie_pos = int(positions[0])

    scores = np.asarray(similarity[movie_pos])
    # Stable sort on the negated scores: highest similarity first, ties kept
    # in row order.
    ranked = np.argsort(-scores, kind="stable")

    # Exclude the query movie by position instead of assuming it sorts first.
    neighbours = [int(pos) for pos in ranked if pos != movie_pos][:max(top_n, 0)]

    for pos in neighbours:
        print(df["title"].iloc[pos])

In [73]:
import pickle

In [74]:
# Persist the processed frame. The context manager closes the file handle
# (flushing the data to disk) as soon as the dump completes.
with open('./dataset/movies.pkl', 'wb') as fh:
    pickle.dump(df, fh)

In [75]:
# Persist the similarity matrix. The context manager closes the file handle
# (flushing the data to disk) as soon as the dump completes.
with open('./dataset/similarity.pkl', 'wb') as fh:
    pickle.dump(similarity, fh)