# Import libraries

In [1]:
# For loading data and data manipulation
import numpy as np
import pandas as pd

# For converting the string representation of a list into an actual list
import ast

# For cleaning the text
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# For converting them into vectors
from sklearn.feature_extraction.text import TfidfVectorizer

# To find the similarity distance between movies
from sklearn.metrics.pairwise import cosine_similarity

# Exploratory Data Analysis

### Load the data

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [5]:
movies.shape, credits.shape

((4803, 20), (4803, 4))

### Rename the `movie_id` column in credits to `id`

In [6]:
# Give the credits key the same name as the movies key (`id`).
credits = credits.rename(columns={'movie_id': 'id'})

In [7]:
credits.head(1)

Unnamed: 0,id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Merge the two datasets

In [8]:
# Inner-join on the two columns the frames have in common: `id` and `title`.
movies = movies.merge(credits, on=['id', 'title'])

In [9]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [10]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

### Required columns

In [11]:
req_columns = ['id', 'title', 'keywords', 'overview', 'cast', 'crew']

In [12]:
# Keep only the required columns. Take an explicit copy so that the later
# row drops and column assignments act on an independent frame rather than
# on a slice of the merged one (avoids SettingWithCopyWarning).
movies = movies[req_columns].copy()

In [13]:
movies.head(1)

Unnamed: 0,id,title,keywords,overview,cast,crew
0,19995,Avatar,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Drop rows that contain null values

In [14]:
# Boolean frame marking missing cells, reduced to one flag per row.
rows_null = movies.isna()
rows_has_null = rows_null.any(axis='columns')
# Display every row that has at least one missing value.
movies.loc[rows_has_null]

Unnamed: 0,id,title,keywords,overview,cast,crew
2656,370980,Chiamatemi Francesco - Il Papa della gente,"[{""id"": 717, ""name"": ""pope""}, {""id"": 5565, ""na...",,"[{""cast_id"": 5, ""character"": ""Jorge Mario Berg...","[{""credit_id"": ""5660019ac3a36875f100252b"", ""de..."
4140,459488,"To Be Frank, Sinatra at 100","[{""id"": 6027, ""name"": ""music""}, {""id"": 225822,...",,"[{""cast_id"": 0, ""character"": ""Narrator"", ""cred...","[{""credit_id"": ""592b25e4c3a368783e065a2f"", ""de..."
4431,292539,Food Chains,[],,[],"[{""credit_id"": ""5470c3b1c3a368085e000abd"", ""de..."


In [15]:
movies.shape

(4803, 6)

In [16]:
# Drop rows with missing values and renumber the index 0..n-1. Without the
# reset, the dropped rows leave gaps in the index, so row labels stop
# matching row positions; the vectors and similarity matrix built later are
# addressed by position.
movies = movies.dropna().reset_index(drop=True)

In [17]:
movies.shape

(4800, 6)

### Helper functions

In [18]:
def extract(text):
    """Return the 'name' value of every entry in a stringified list of dicts.

    `text` is parsed with `ast.literal_eval`, so it must be the literal
    representation of an iterable whose items each have a 'name' key.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

In [19]:
# Turn the stringified lists in each column into plain lists of names.
for column in ('keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(extract)

In [20]:
movies.head(1)

Unnamed: 0,id,title,keywords,overview,cast,crew
0,19995,Avatar,"[culture clash, future, space war, space colon...","In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[Stephen E. Rivkin, Rick Carter, Christopher B..."


In [21]:
def combine_words(text):
    """Collapse each phrase into a single token and join the tokens with spaces.

    Every item of `text` has all of its whitespace removed (for example
    'Sam Worthington' becomes 'SamWorthington'); the resulting tokens are
    then joined into one space-separated string.
    """
    return ' '.join(''.join(phrase.split()) for phrase in text)

In [22]:
# Flatten each list of names into one space-separated string of tokens.
for column in ('keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(combine_words)

In [23]:
movies.head(1)

Unnamed: 0,id,title,keywords,overview,cast,crew
0,19995,Avatar,cultureclash future spacewar spacecolony socie...,"In the 22nd century, a paraplegic Marine is di...",SamWorthington ZoeSaldana SigourneyWeaver Step...,StephenE.Rivkin RickCarter ChristopherBoyes Ch...


### Create a new `tags` column that will be used for recommendations

In [24]:
# Concatenate title, keywords, overview, cast and crew into one
# space-separated text field per movie.
movies['tags'] = movies['title'].str.cat(
    movies[['keywords', 'overview', 'cast', 'crew']], sep=' '
)

In [25]:
movies.head(1)

Unnamed: 0,id,title,keywords,overview,cast,crew,tags
0,19995,Avatar,cultureclash future spacewar spacecolony socie...,"In the 22nd century, a paraplegic Marine is di...",SamWorthington ZoeSaldana SigourneyWeaver Step...,StephenE.Rivkin RickCarter ChristopherBoyes Ch...,Avatar cultureclash future spacewar spacecolon...


In [26]:
movies['tags'][0]

"Avatar cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. SamWorthington ZoeSaldana SigourneyWeaver StephenLang MichelleRodriguez GiovanniRibisi JoelDavidMoore CCHPounder WesStudi LazAlonso DileepRao MattGerald SeanAnthonyMoran JasonWhyte ScottLawrence KellyKilgour JamesPatrickPitt SeanPatrickMurphy PeterDillon KevinDorman KelsonHenderson DavidVanHorn JacobTomuri MichaelBlain-Rozgay JonCurry LukeHawker WoodySchultz PeterMensah SoniaYee JahnelCurfman IlramChoi KylaWarren LisaRoumain DebraWilson ChrisMala TaylorKibby JodieLandau JulieLamm CullenB.Madden JosephBradyMadden FrankieTorres AustinWilson SaraWilson TamicaWashington-Miller LucyBriant NathanMeister GerryBlair MatthewChamberl

### Clean the text

In [27]:
lemmatizer = WordNetLemmatizer()

In [28]:
def clean_text(text):
    """Lower-case `text`, drop English stop words, lemmatize, and strip '.' and ','.

    The text is split on whitespace; tokens found in NLTK's English
    stop-word list are discarded and the rest are passed through the
    module-level `lemmatizer`. Periods and commas are removed only after
    lemmatizing, so a token such as 'orders.' reaches the lemmatizer with
    its punctuation still attached.

    Returns the cleaned tokens joined by single spaces.
    """
    # Look up the stop-word list once per call (as a set for fast membership
    # tests) instead of re-evaluating it for every token.
    stop_words = set(stopwords.words('english'))
    tokens = [
        lemmatizer.lemmatize(token)
        for token in text.lower().split()
        if token not in stop_words
    ]
    return ' '.join(tokens).replace('.', '').replace(',', '')

In [29]:
movies['tags'] = movies['tags'].apply(clean_text)

In [30]:
movies['tags'][0]

"avatar cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d 22nd century paraplegic marine dispatched moon pandora unique mission becomes torn following order protecting alien civilization samworthington zoesaldana sigourneyweaver stephenlang michellerodriguez giovanniribisi joeldavidmoore cchpounder wesstudi lazalonso dileeprao mattgerald seananthonymoran jasonwhyte scottlawrence kellykilgour jamespatrickpitt seanpatrickmurphy peterdillon kevindorman kelsonhenderson davidvanhorn jacobtomuri michaelblain-rozgay joncurry lukehawker woodyschultz petermensah soniayee jahnelcurfman ilramchoi kylawarren lisaroumain debrawilson chrismala taylorkibby jodielandau julielamm cullenbmadden josephbradymadden frankietorres austinwilson sarawilson tamicawashington-miller lucybriant nathanmeister gerryblair matthewchamberlain paulyates wraywilson jamesgaylyn melvinlenoc

### Create a new dataframe with only the columns needed for recommendation

In [31]:
df = movies[['id', 'title', 'tags']]

In [32]:
df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,avatar cultureclash future spacewar spacecolon...
1,285,Pirates of the Caribbean: At World's End,pirate caribbean: world's end ocean drugabuse ...
2,206647,Spectre,spectre spy basedonnovel secretagent sequel mi...
3,49026,The Dark Knight Rises,dark knight rise dccomics crimefighter terrori...
4,49529,John Carter,john carter basedonnovel mar medallion spacetr...


### Convert the tags column into TF-IDF vectors

In [33]:
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

In [34]:
vectors = tfidf.fit_transform(df['tags']).toarray()

In [35]:
vectors

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [36]:
vectors.shape

(4800, 5000)

### Calculate the cosine similarity between the vectors; the vectors with the highest similarity represent the most similar movies

In [37]:
similarity = cosine_similarity(vectors)

In [38]:
similarity

array([[1.        , 0.016239  , 0.00605736, ..., 0.01076369, 0.00980049,
        0.        ],
       [0.016239  , 1.        , 0.        , ..., 0.01942639, 0.00965121,
        0.        ],
       [0.00605736, 0.        , 1.        , ..., 0.01158238, 0.        ,
        0.        ],
       ...,
       [0.01076369, 0.01942639, 0.01158238, ..., 1.        , 0.02325945,
        0.07014689],
       [0.00980049, 0.00965121, 0.        , ..., 0.02325945, 1.        ,
        0.01606836],
       [0.        , 0.        , 0.        , ..., 0.07014689, 0.01606836,
        1.        ]])

In [39]:
similarity.shape

(4800, 4800)

# Recommender function that will give the recommended movies closest to the given one

In [40]:
def recommender(movie, top_n=5):
    """Return the titles of the movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in `df['title']`. If several rows share the
        title, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of str
        Titles ordered from most to least similar, excluding the queried
        movie itself.

    Raises
    ------
    IndexError
        If no row of `df` has the given title.
    """
    # Rows of `similarity` are addressed by position, so locate the movie by
    # its position in `df`, not by its index label (the two differ whenever
    # the index is not a clean 0..n-1 range).
    matches = np.flatnonzero((df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError('Movie {!r} not found in the dataset'.format(movie))
    movie_pos = int(matches[0])

    scores = similarity[movie_pos]
    # Stable descending sort keeps the original row order among ties.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried movie explicitly instead of assuming it sorts first.
    ranked = ranked[ranked != movie_pos][:top_n]

    return df['title'].iloc[ranked].tolist()

In [41]:
recommender("Superman")

['Superman II',
 'Superman IV: The Quest for Peace',
 'Superman III',
 'Superman Returns',
 'Batman v Superman: Dawn of Justice']

In [42]:
recommender("Batman")

['Batman Returns',
 'Batman: The Dark Knight Returns, Part 2',
 'Batman Forever',
 'Batman & Robin',
 'Batman Begins']

In [43]:
recommender("Harry Potter and the Philosopher's Stone")

['Harry Potter and the Chamber of Secrets',
 'Harry Potter and the Prisoner of Azkaban',
 'Harry Potter and the Goblet of Fire',
 'Harry Potter and the Order of the Phoenix',
 'Harry Potter and the Half-Blood Prince']