**Import Libraries**

In [2]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore")

**Load Dataset**

In [4]:
#load dataset
movies=pd.read_csv("tmdb_5000_movies.csv")
credits=pd.read_csv("tmdb_5000_credits.csv")

In [5]:
#understanding dataset
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')

**Data Preprocessing**

In [7]:
#merge both the datasets to make processing easy
movies=movies.merge(credits,on="title")

In [8]:
movies.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [9]:
#extracted relevant features
movies=movies[["id","genres","keywords","overview","title","cast","crew"]]

In [10]:
movies.columns 

Index(['id', 'genres', 'keywords', 'overview', 'title', 'cast', 'crew'], dtype='object')

In [11]:
#understand the required features and check for null values
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        4809 non-null   int64 
 1   genres    4809 non-null   object
 2   keywords  4809 non-null   object
 3   overview  4806 non-null   object
 4   title     4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 263.1+ KB


In [12]:
#checking for duplicate values
movies.duplicated().sum()

0

In [13]:
#dropped rows with null values
movies.dropna(inplace=True)

In [14]:
movies.isnull().sum().sum()

0

In [15]:
#Converting genres,keywords,cast and crew into standard form
import ast
def convert(obj):
    List=[]
    for i in ast.literal_eval(obj):
        List.append(i["name"])
    return List

In [16]:
#convert genres which is a list of dictionaries of meta data inside a string to a list of genres
movies.genres = movies.genres.apply(convert)

In [17]:
movies.genres

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4806, dtype: object

In [18]:
#current state of keywords column
movies.iloc[0].keywords

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [19]:
#convert String of List of dictionaries to list
movies.keywords=movies.keywords.apply(convert)

In [20]:
#current state of cast column
movies.cast.iloc[0]

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 

In [21]:
#convert cast function 
def convertCast(obj):
    List=[]
    counter=0
    for i in ast.literal_eval(obj):
        List.append(i["name"])
        counter+=1
        if(counter==3):
            return List

In [22]:
#convert cast
movies.cast=movies.cast.apply(convertCast)

In [23]:
#current state of crew column
movies.iloc[0].crew

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [24]:
#function to fetch director from meta data
def fetchDirector(obj):
    List=[]
    for i in ast.literal_eval(obj):
        if(i["job"]=="Director"):
            List.append(i["name"])
    return List

In [25]:
#fetch director
movies.crew=movies.crew.apply(fetchDirector)

In [26]:
movies.head()

Unnamed: 0,id,genres,keywords,overview,title,cast,crew
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...",Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",A cryptic message from Bond’s past sends him o...,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",Following the death of District Attorney Harve...,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","John Carter is a war-weary, former military ca...",John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [27]:
movies[movies.title=="Iron Man"]

Unnamed: 0,id,genres,keywords,overview,title,cast,crew
68,1726,"[Action, Science Fiction, Adventure]","[middle east, arms dealer, malibu, marvel comi...","After being held captive in an Afghan cave, bi...",Iron Man,"[Robert Downey Jr., Terrence Howard, Jeff Brid...",[Jon Favreau]


In [28]:
#analyzing dataframe
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4806 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        4806 non-null   int64 
 1   genres    4806 non-null   object
 2   keywords  4806 non-null   object
 3   overview  4806 non-null   object
 4   title     4806 non-null   object
 5   cast      4745 non-null   object
 6   crew      4806 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.4+ KB


In [29]:
movies.isnull().sum().sum()

61

In [30]:
type(movies.iloc[0].overview)

str

In [31]:
#Removing space in order to avoid any confusion
movies.overview=movies.overview.apply(lambda x:x.split())

In [32]:
#remove space function for list
def removeSpace(obj):
    if obj is None:
        return None
    L=[]
    for i in obj:
        L.append(i.replace(" ",""))
    return L

In [33]:
#remove space from columns
movies.genres=movies["genres"].apply(removeSpace)
movies.keywords=movies["keywords"].apply(removeSpace)
movies.crew=movies["crew"].apply(removeSpace)
movies.cast=movies['cast'].apply(removeSpace)

In [34]:
movies.head()

Unnamed: 0,id,genres,keywords,overview,title,cast,crew
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...",Pirates of the Caribbean: At World's End,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...",Spectre,"[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...",The Dark Knight Rises,"[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...",John Carter,"[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [35]:
#Combining entire description of a movie into single column "tags"
movies["tags"]=movies.overview+movies.genres+movies.keywords+movies.cast+movies.crew

In [36]:
movies.head()

Unnamed: 0,id,genres,keywords,overview,title,cast,crew,tags
0,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...",Avatar,"[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...",Pirates of the Caribbean: At World's End,"[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...",Spectre,"[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...",The Dark Knight Rises,"[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...",John Carter,"[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [37]:
movies.iloc[0].tags

['In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.',
 'Action',
 'Adventure',
 'Fantasy',
 'ScienceFiction',
 'cultureclash',
 'future',
 'spacewar',
 'spacecolony',
 'society',
 'spacetravel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alienplanet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'loveaffair',
 'antiwar',
 'powerrelations',
 'mindandsoul',
 '3d',
 'SamWorthington',
 'ZoeSaldana',
 'SigourneyWeaver',
 'JamesCameron']

In [38]:
#combined the necessary features into a new dataframe so to avoid losing data
df=movies[["id","title","tags"]]

In [39]:
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [40]:
df=df.dropna()

In [41]:
df.isnull().sum().sum()

0

In [42]:
#joining tags together
df["tags"]=df["tags"].apply(lambda x:" ".join(x)) 

In [43]:
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [44]:
#converting tags to lower case
df["tags"]=df["tags"].apply(lambda x: x.lower())

In [45]:
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


**Modelling**

**Vectorization**

In [48]:
#Vectorization is done to convert the movies from textual data to vectors so that in the future it will be easy to determine the similarity

In [49]:
#import library
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')

In [50]:
#fit tags column to the vectorization object
vectors=cv.fit_transform(df["tags"]).toarray()

In [51]:
#top 5000 features and removed stop words 
f=cv.get_feature_names_out()

In [52]:
print(f[:100])

['000' '007' '10' '100' '11' '12' '13' '14' '15' '16' '17' '17th' '18'
 '18th' '18thcentury' '19' '1930s' '1940s' '1944' '1950' '1950s' '1960s'
 '1970s' '1974' '1976' '1980' '1980s' '1985' '1990s' '1999' '19th'
 '19thcentury' '20' '200' '2003' '2009' '20th' '23' '24' '25' '30' '300'
 '3d' '40' '50' '500' '60' '60s' '70' '70s' 'aaron' 'aaroneckhart'
 'abandoned' 'abducted' 'abigailbreslin' 'abilities' 'ability' 'able'
 'aboard' 'abuse' 'abusive' 'academic' 'academy' 'accept' 'accepted'
 'accepts' 'access' 'accident' 'accidental' 'accidentally' 'accompanied'
 'accomplish' 'account' 'accountant' 'accused' 'ace' 'achieve' 'act'
 'acting' 'action' 'actionhero' 'actions' 'activist' 'activities'
 'activity' 'actor' 'actors' 'actress' 'acts' 'actual' 'actually' 'adam'
 'adams' 'adamsandler' 'adamshankman' 'adaptation' 'adapted' 'addict'
 'addicted' 'addiction']


**Analysis :**
***- The model treats "accept," "accepted," and "accepts" as distinct words, which could potentially lead to issues in the future.***

***- This analysis points out that the model's differentiation among "accept," "accepted," and "accepts" as separate entities might introduce challenges or inaccuracies in tasks where these forms should ideally be considered as variations of the same***
word.

**Stemming**

In [55]:
#Using nltk.stem.porter to remove similar words like action,act,acting

In [56]:
#import library
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [57]:
#defining stem function
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [58]:
#applied above defined stem function
df["tags"]=df["tags"].apply(stem)

In [59]:
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [60]:
#fit tag with the updated changes
vectors=cv.fit_transform(df["tags"]).toarray()
f=cv.get_feature_names_out()
print(f[:100])

['000' '007' '10' '100' '11' '12' '13' '14' '15' '16' '17' '17th' '18'
 '18th' '18thcenturi' '19' '1910' '1920' '1930' '1940' '1944' '1950'
 '1950s' '1960' '1960s' '1970' '1970s' '1974' '1976' '1980' '1985' '1990'
 '1995' '1999' '19th' '19thcenturi' '20' '200' '2003' '2009' '20th' '23'
 '24' '25' '30' '300' '3d' '40' '50' '500' '60' '70' '80' 'aaron'
 'aaroneckhart' 'abandon' 'abduct' 'abigailbreslin' 'abil' 'abl' 'aboard'
 'abov' 'abus' 'academ' 'academi' 'accept' 'access' 'accid' 'accident'
 'acclaim' 'accompani' 'accomplish' 'account' 'accus' 'ace' 'achiev'
 'acquaint' 'act' 'action' 'actionhero' 'activ' 'activist' 'activities'
 'actor' 'actress' 'actual' 'adam' 'adamsandl' 'adamshankman' 'adapt'
 'add' 'addict' 'adjust' 'admir' 'admit' 'adolesc' 'adopt' 'ador'
 'adrienbrodi' 'adult']


In [61]:
len(vectors)

4745

In [62]:
#Determining similarity between all the movies based on cosine distance (which is basically the angle between the vectors)

**Cosine Similarity**

In [64]:
#import library
from sklearn.metrics.pairwise import cosine_similarity

In [65]:
#determine similarity between movies
similarity=cosine_similarity(vectors)

In [66]:
#similarity matrix
similarity[1]

array([0.08238526, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
       0.02615329])

**Building Recommendation Function**

In [68]:
''' Building recommendation function 
Logic : (Fetch index of the movie->get simialrity array of that index using similarity[index]
-> Sort the array in dec order -> Return first 5 movies)
'''

' Building recommendation function \nLogic : (Fetch index of the movie->get simialrity array of that index using similarity[index]\n-> Sort the array in dec order -> Return first 5 movies)\n'

In [69]:
#recommendation function
def recommend(movie):
    movie_index=df[df['title']==movie].index[0]
    distances=similarity[movie_index]
    l=list(enumerate(distances))
    movies_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]
    for i in movies_list:
        idx=i[0]
        print(df["title"][idx])

In [145]:
recommend("Iron Man")

Iron Man 3
Iron Man 2
Avengers: Age of Ultron
The Avengers
Captain America: Civil War
