# Step 1: Import dependencies

In [120]:
import numpy as np
import pandas as pd
import ast
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Step 2: Load data


In [3]:
!pip install -U -q PyDrive

In [4]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
 
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [16]:
link1 = "https://drive.google.com/file/d/1MOYgqBQZ_dnmydOA9bQF65X4h64k4Mhi/view?usp=share_link"
link2 = "https://drive.google.com/file/d/1HWi2agYVFsjrUITMzPTYWDMo66Llte5F/view?usp=share_link"

id1 = link1.split("/")[-2]
id2 = link2.split("/")[-2]

downloaded1 = drive.CreateFile({'id':id1})
downloaded1.GetContentFile('tmdb_5000_credits.csv') 

downloaded2 = drive.CreateFile({'id':id2})
downloaded2.GetContentFile('tmdb_5000_movies.csv')

credits_data = pd.read_csv("tmdb_5000_credits.csv")
movies_data = pd.read_csv("tmdb_5000_movies.csv")

# Step 3: Merging data

In [17]:
print(f"Shape: {credits_data.shape}\n***********************")
credits_data.head(2)

Shape: (4803, 4)
***********************


Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [18]:
print(f"Shape: {movies_data.shape}\n***********************")
movies_data.head(2)

Shape: (4803, 20)
***********************


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [19]:
movies = movies_data.merge(credits_data, on="title")

In [21]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [20]:
movies.shape

(4809, 23)

# Step 4: Feature selection

Since this recommendation system is a content-based-recommendation-system hence important tags are required on the basis of which movies will be matched
- Features required
  - genres
  - id
  - keywords
  - overview
  - title
  - cast
  - crew

In [22]:
movies = movies[["title", "id", "genres", "keywords", "cast", "crew", "overview"]]
movies.head(3)

Unnamed: 0,title,id,genres,keywords,cast,crew,overview
0,Avatar,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,285,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","Captain Barbossa, long believed to be dead, ha..."
2,Spectre,206647,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",A cryptic message from Bond’s past sends him o...


# Step 5: Data cleaning and preprocessing

## Data cleaning

In [28]:
# checking number of duplicated rows
movies.duplicated().sum()

0

In [23]:
# checking null values
movies.isnull().sum()

title       0
id          0
genres      0
keywords    0
cast        0
crew        0
overview    3
dtype: int64

In [29]:
# dropping rows containing null values
movies = movies.dropna()
# resetting the index
movies = movies.reset_index(drop=True)

## Data preprocessing

In [31]:
movies.head(1)

Unnamed: 0,title,id,genres,keywords,cast,crew,overview
0,Avatar,19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","In the 22nd century, a paraplegic Marine is di..."


### Extracting genres

In [33]:
movies.iloc[0]["genres"]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [35]:
# converting a string representation of list into list
ast.literal_eval(movies.iloc[0]["genres"])

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [38]:
# creating a function to extract genres
def get_genres(genres):
  genres = ast.literal_eval(genres)
  genre_list = []
  for dictionary in genres:
    genre_list.append(dictionary["name"])
  return genre_list

In [37]:
get_genres(movies.iloc[0]["genres"])

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [39]:
## apply the get_genres function on the whole column
movies["genres"] = movies["genres"].apply(lambda x: get_genres(x))

### Extracting keywords

In [41]:
movies.iloc[0]["keywords"]

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [42]:
# converting a string representation of list into list
ast.literal_eval(movies.iloc[0]["keywords"])

[{'id': 1463, 'name': 'culture clash'},
 {'id': 2964, 'name': 'future'},
 {'id': 3386, 'name': 'space war'},
 {'id': 3388, 'name': 'space colony'},
 {'id': 3679, 'name': 'society'},
 {'id': 3801, 'name': 'space travel'},
 {'id': 9685, 'name': 'futuristic'},
 {'id': 9840, 'name': 'romance'},
 {'id': 9882, 'name': 'space'},
 {'id': 9951, 'name': 'alien'},
 {'id': 10148, 'name': 'tribe'},
 {'id': 10158, 'name': 'alien planet'},
 {'id': 10987, 'name': 'cgi'},
 {'id': 11399, 'name': 'marine'},
 {'id': 13065, 'name': 'soldier'},
 {'id': 14643, 'name': 'battle'},
 {'id': 14720, 'name': 'love affair'},
 {'id': 165431, 'name': 'anti war'},
 {'id': 193554, 'name': 'power relations'},
 {'id': 206690, 'name': 'mind and soul'},
 {'id': 209714, 'name': '3d'}]

In [43]:
# creating a function to extract keywords
def get_keywords(keywords):
  keywords = ast.literal_eval(keywords)
  keywords_list = []
  for dictionary in keywords:
    keywords_list.append(dictionary["name"])
  return keywords_list

In [44]:
get_keywords(movies.iloc[0]["keywords"])

['culture clash',
 'future',
 'space war',
 'space colony',
 'society',
 'space travel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alien planet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'love affair',
 'anti war',
 'power relations',
 'mind and soul',
 '3d']

In [45]:
## apply the get_keywords function on the whole column
movies["keywords"] = movies["keywords"].apply(lambda x: get_keywords(x))

### Extracting cast

In [46]:
movies.head(1)

Unnamed: 0,title,id,genres,keywords,cast,crew,overview
0,Avatar,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","In the 22nd century, a paraplegic Marine is di..."


In [47]:
movies.iloc[0]["cast"]

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 

In [49]:
# creating a function to extract top 5 cast cast members
def get_cast(cast):
  cast = ast.literal_eval(cast)
  cast_list = []
  counter = 0
  for dictionary in cast:
    if counter<5:
      cast_list.append(dictionary["name"])
      counter += 1
    else:
      break
  return cast_list

In [50]:
get_cast(movies.iloc[0]["cast"])

['Sam Worthington',
 'Zoe Saldana',
 'Sigourney Weaver',
 'Stephen Lang',
 'Michelle Rodriguez']

In [51]:
## apply the get_cast function on the whole column
movies["cast"] = movies["cast"].apply(lambda x: get_cast(x))

### Extracting crew (only director name)

In [52]:
movies.head(1)

Unnamed: 0,title,id,genres,keywords,cast,crew,overview
0,Avatar,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","In the 22nd century, a paraplegic Marine is di..."


In [55]:
movies.iloc[0]["crew"]

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [58]:
# creating a function to extract the name of director
def get_crew(crew):
  crew = ast.literal_eval(crew)
  crew_list = []
  for dictionary in crew:
    if dictionary["job"] == "Director":
      crew_list.append(dictionary["name"])
      break
  return crew_list

In [59]:
get_crew(movies.iloc[0]["crew"])

['James Cameron']

In [60]:
## apply the get_crew function on the whole column
movies["crew"] = movies["crew"].apply(lambda x: get_crew(x))

### Remove spaces from tags(genres, keywords, cast, crew) to uniquely identify tags

In [61]:
movies.head(1)

Unnamed: 0,title,id,genres,keywords,cast,crew,overview
0,Avatar,19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],"In the 22nd century, a paraplegic Marine is di..."


In [72]:
movies["genres"] = movies["genres"].apply(lambda x: [tag.replace(" ", "") for tag in x])
movies["keywords"] = movies["keywords"].apply(lambda x: [tag.replace(" ", "") for tag in x])
movies["cast"] = movies["cast"].apply(lambda x: [tag.replace(" ", "") for tag in x])
movies["crew"] = movies["crew"].apply(lambda x: [tag.replace(" ", "") for tag in x])

In [73]:
movies.head(1)

Unnamed: 0,title,id,genres,keywords,cast,crew,overview
0,Avatar,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],"In the 22nd century, a paraplegic Marine is di..."


### Combining multiple columns to form a single column

In [75]:
movies["overview"] = movies["overview"].apply(lambda x: x.split(" "))

In [76]:
movies["tags"] = movies["genres"] + movies["keywords"] + movies["cast"] + movies["cast"] + movies["crew"] + movies["overview"]
movies.head(1)

Unnamed: 0,title,id,genres,keywords,cast,crew,overview,tags
0,Avatar,19995,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction, c..."


In [77]:
movies = movies[["title", "id", "tags"]]
movies.head(1)

Unnamed: 0,title,id,tags
0,Avatar,19995,"[Action, Adventure, Fantasy, ScienceFiction, c..."


In [79]:
movies["tags"] = movies["tags"].apply(lambda x: " ".join(x))

In [81]:
movies.iloc[0]["tags"]

'Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver StephenLang MichelleRodriguez SamWorthington ZoeSaldana SigourneyWeaver StephenLang MichelleRodriguez JamesCameron In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

# Step 6: Count Vectorization

In [85]:
movies.head(1)

Unnamed: 0,title,id,tags
0,Avatar,19995,action adventure fantasy sciencefiction cultur...


In [84]:
# lowercase all the text in tags
movies["tags"] = movies["tags"].apply(lambda x: x.lower())

In [106]:
def preprocess_tags(text):
  ps = PorterStemmer()

  # removing punctuation
  text = re.sub(r'[^\w\s]', '', text)

  # converting words in a list
  text = word_tokenize(text)

  # removing stopwords and stemming each word
  text = [ps.stem(word) for word in text if word not in stopwords.words("english")]

  # combing all words to form a string
  text = " ".join(text)
  
  return text

In [107]:
movies["preprocessed_tags"] = movies["tags"].apply(lambda x: preprocess_tags(x))

In [108]:
movies.iloc[0]["preprocessed_tags"]

'action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav stephenlang michellerodriguez samworthington zoesaldana sigourneyweav stephenlang michellerodriguez jamescameron 22nd centuri parapleg marin dispatch moon pandora uniqu mission becom torn follow order protect alien civil'

In [116]:
cv = CountVectorizer(max_features=5000, stop_words="english")
vectors = cv.fit_transform(movies["preprocessed_tags"]).toarray()

In [117]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0])

# Step 7: Cosine similarity

In [121]:
cos_sim = cosine_similarity(vectors)

In [122]:
cos_sim.shape

(4806, 4806)

In [123]:
cos_sim[0]

array([1.        , 0.05892557, 0.06085806, ..., 0.05555556, 0.02179068,
       0.        ])

# Step 8: Recommendation System

In [126]:
def recommend(movie):
  movie_index = movies[movies["title"]==movie].index[0]
  distances = cos_sim[movie_index]
  movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

  for movie in movies_list:
    print(movies.iloc[movie[0]].title)

In [132]:
recommend("The Wolverine")

X2
X-Men
X-Men: The Last Stand
X-Men: Days of Future Past
X-Men Origins: Wolverine


# Step 9: Pickling

In [133]:
import pickle

In [134]:
pickle.dump(cos_sim, open("similarity.pkl", "wb"))

In [135]:
pickle.dump(movies.to_dict(), open("movies_dict.pkl", "wb"))