In [29]:
import numpy as np
import pandas as pd
import ast

In [3]:
movies=pd.read_csv("./dataset/tmdb_5000_movies.csv")
credits=pd.read_csv("./dataset/tmdb_5000_credits.csv")

In [5]:
# merging both dataset

df=movies.merge(credits,on="title")

## EDA ##

In [10]:
df.isna().sum()

budget                     0
genres                     0
homepage                3096
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
movie_id                   0
cast                       0
crew                       0
dtype: int64

### Remove unnecessary columns ###

In [15]:
df=df[["movie_id","title","overview","genres","keywords","cast","crew"]]

In [20]:
df.dropna(inplace=True)

In [21]:
df.isna().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [22]:
df["genres"][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [41]:
def convert(obj):
    L_genre=[]
    for i in ast.literal_eval(obj):
        L_genre.append(i["name"])
    return L_genre

In [76]:
def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            L.append(i["name"])
            break
    return L

In [43]:
df["genres"]=df["genres"].apply(convert)

In [44]:
df["keywords"]=df["keywords"].apply(convert)

In [51]:
df["cast"]=df["cast"].apply(convert).apply(lambda x:x[:3])

In [83]:
df["crew"]=df["crew"].apply(fetch_director)

In [86]:
df["overview"]=df["overview"].apply(lambda x:x.split())

Removing spaces between the names to avoid confusion for recommender systems

For example

Sam Mendes -> SamMendis
Sam Worthington ->SamWorthington

otherwise system gets confused which sam are we looking for

In [93]:
def remove_spaces(obj):
    return obj.apply(lambda x:[i.replace(" ","") for i in x])

In [94]:
df["genres"]=remove_spaces(df["genres"])
df["keywords"]=remove_spaces(df["keywords"])
df["cast"]=remove_spaces(df["cast"])
df["crew"]=remove_spaces(df["crew"])

In [96]:
df["tags"]=df["overview"]+df["genres"]+df["keywords"]+df["cast"]+df["crew"]

In [99]:
df=df[["movie_id","title","tags"]]

In [102]:
df["tags"]=df["tags"].apply(lambda x:" ".join(x))

In [104]:
df["tags"]=df["tags"].apply(lambda x:x.lower())

In [105]:
df["tags"][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

we have converted tags now. 
to find similiarity between movies, we find similiarty between tags

convert tags to vector using bag of words and perform vectorization without stop words



In [112]:
df.info()

<class 'pandas.DataFrame'>
Index: 4806 entries, 0 to 4808
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   movie_id  4806 non-null   int64
 1   title     4806 non-null   str  
 2   tags      4806 non-null   str  
dtypes: int64(1), str(2)
memory usage: 279.2 KB


## Vectorization ##

In [108]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')
vectors=cv.fit_transform(df["tags"]).toarray()

We do stemming now to avoid words like (action actions), (love loved loving) to be counted as different

In [116]:
cv.get_feature_names_out()[51]

'able'