# **Import and install required libraries**

In [None]:
pip install gensim
pip install tqdm
pip install sentence-transformers

In [None]:
import numpy as np
import pandas as pd
import ast
import nltk
from nltk.stem.porter import PorterStemmer
import spacy
import gensim
from gensim.utils import simple_preprocess
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer,util
import pickle

# **Formatting the Database**

In [None]:
movies = pd.read_csv('tmdb_movie_dataset.csv')
credits = pd.read_csv('tmdb_movie_credits.csv')

In [None]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,tmdbId,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,ratingId
0,4000000,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 35, ""name...",,5,"[{""id"": 612, ""name"": ""hotel""}, {""id"": 613, ""na...",en,Four Rooms,It's Ted the Bellhop's first night on the job....,22.87623,"[{""name"": ""Miramax Films"", ""id"": 14}, {""name"":...",...,1995-12-09,4300000,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,6.5,530,18


In [None]:
credits.head(1)

Unnamed: 0,tmdbId,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [None]:
# Join the movie metadata with its cast/crew on the shared 'title' column.
# Both frames also carry a 'tmdbId' column, so pandas suffixes them
# ('tmdbId_x' is the one coming from `movies`).
# NOTE(review): joining on 'title' pairs every combination of rows that share
# a title; 'tmdbId' may be the safer key — confirm against the data.
movies = movies.merge(credits,on = 'title')

In [None]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4608 entries, 0 to 4607
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4608 non-null   int64  
 1   genres                4608 non-null   object 
 2   homepage              1659 non-null   object 
 3   tmdbId_x              4608 non-null   int64  
 4   keywords              4608 non-null   object 
 5   original_language     4608 non-null   object 
 6   original_title        4608 non-null   object 
 7   overview              4607 non-null   object 
 8   popularity            4608 non-null   float64
 9   production_companies  4608 non-null   object 
 10  production_countries  4608 non-null   object 
 11  release_date          4608 non-null   object 
 12  revenue               4608 non-null   int64  
 13  runtime               4608 non-null   float64
 14  spoken_languages      4608 non-null   object 
 15  status               

In [None]:
# Columns kept for building the tags: title, tmdbId_x, genres, keywords, overview, cast, crew
movies = movies[['title','tmdbId_x','genres','keywords','overview','cast','crew']]

In [None]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4608 entries, 0 to 4607
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     4608 non-null   object
 1   tmdbId_x  4608 non-null   int64 
 2   genres    4608 non-null   object
 3   keywords  4608 non-null   object
 4   overview  4607 non-null   object
 5   cast      4608 non-null   object
 6   crew      4608 non-null   object
dtypes: int64(1), object(6)
memory usage: 252.1+ KB


In [None]:
movies.isnull().sum()

title       0
tmdbId_x    0
genres      0
keywords    0
overview    1
cast        0
crew        0
dtype: int64

In [None]:
# Drop rows with missing values (one movie has no overview) and renumber the
# index 0..n-1. Without the reset, index labels after the dropped row no longer
# equal row positions, yet recomend() later uses an index label as a position
# into the similarity matrix. Rebinding instead of inplace=True also avoids
# mutating the column-subset frame in place.
movies = movies.dropna().reset_index(drop=True)

In [None]:
movies.isnull().sum()

title       0
tmdbId_x    0
genres      0
keywords    0
overview    0
cast        0
crew        0
dtype: int64

In [None]:
movies.iloc[0].genres

'[{"id": 80, "name": "Crime"}, {"id": 35, "name": "Comedy"}]'

In [None]:
def convert(obj):
  """Parse a stringified list of dicts and return every entry's 'name', in order."""
  return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
movies['genres'] = movies['genres'].apply(convert)

In [None]:
movies.head(1)

Unnamed: 0,title,tmdbId_x,genres,keywords,overview,cast,crew
0,Four Rooms,5,"[Crime, Comedy]","[{""id"": 612, ""name"": ""hotel""}, {""id"": 613, ""na...",It's Ted the Bellhop's first night on the job....,"[{""cast_id"": 42, ""character"": ""Ted the Bellhop...","[{""credit_id"": ""52fe420dc3a36847f800012d"", ""de..."


In [None]:
movies.iloc[0].keywords

'[{"id": 612, "name": "hotel"}, {"id": 613, "name": "new year\'s eve"}, {"id": 616, "name": "witch"}, {"id": 622, "name": "bet"}, {"id": 922, "name": "hotel room"}, {"id": 2700, "name": "sperm"}, {"id": 12670, "name": "los angeles"}, {"id": 160488, "name": "hoodlum"}, {"id": 187056, "name": "woman director"}, {"id": 198129, "name": "episode film"}]'

In [None]:
movies['keywords'] = movies['keywords'].apply(convert)

In [None]:
movies.head(1)

Unnamed: 0,title,tmdbId_x,genres,keywords,overview,cast,crew
0,Four Rooms,5,"[Crime, Comedy]","[hotel, new year's eve, witch, bet, hotel room...",It's Ted the Bellhop's first night on the job....,"[{""cast_id"": 42, ""character"": ""Ted the Bellhop...","[{""credit_id"": ""52fe420dc3a36847f800012d"", ""de..."


In [None]:
def convert5(obj, limit=5):
  """Return the 'name' of the first `limit` entries of a stringified list of dicts.

  Args:
    obj: string holding a list literal of dicts, each with a 'name' key.
    limit: maximum number of names to return (default 5, the original
      behaviour). Pass None to return every name.

  Returns:
    List of names in their original order, at most `limit` long.

  Raises:
    ValueError: if `limit` is negative.
  """
  if limit is not None and limit < 0:
    raise ValueError("limit must be non-negative or None")
  entries = ast.literal_eval(obj)
  if limit is not None:
    # Slice first so entries beyond the limit are never inspected.
    entries = entries[:limit]
  return [entry['name'] for entry in entries]

In [None]:
movies['cast'] = movies['cast'].apply(convert5)

In [None]:
movies.iloc[1].crew

'[{"credit_id": "52fe420dc3a36847f8000437", "department": "Directing", "gender": 2, "id": 1, "job": "Director", "name": "George Lucas"}, {"credit_id": "52fe420dc3a36847f800045b", "department": "Production", "gender": 2, "id": 1, "job": "Executive Producer", "name": "George Lucas"}, {"credit_id": "562e75309251414006009955", "department": "Writing", "gender": 2, "id": 1, "job": "Writer", "name": "George Lucas"}, {"credit_id": "52fe420dc3a36847f8000461", "department": "Production", "gender": 2, "id": 12401, "job": "Producer", "name": "Gary Kurtz"}, {"credit_id": "52fe420dc3a36847f8000467", "department": "Production", "gender": 0, "id": 19801, "job": "Producer", "name": "Rick McCallum"}, {"credit_id": "52fe420dc3a36847f800046d", "department": "Sound", "gender": 2, "id": 491, "job": "Original Music Composer", "name": "John Williams"}, {"credit_id": "52fe420dc3a36847f8000473", "department": "Camera", "gender": 2, "id": 7753, "job": "Director of Photography", "name": "Gilbert Taylor"}, {"cred

In [None]:
def fetch(obj):
  """Return a one-element list with the name of the first crew entry whose
  'job' is 'Director', or an empty list when there is none."""
  crew_members = ast.literal_eval(obj)
  first_director = ([member['name']] for member in crew_members if member['job'] == 'Director')
  return next(first_director, [])

In [None]:
movies['crew'] = movies['crew'].apply(fetch)

In [None]:
movies.head(1)

Unnamed: 0,title,tmdbId_x,genres,keywords,overview,cast,crew
0,Four Rooms,5,"[Crime, Comedy]","[hotel, new year's eve, witch, bet, hotel room...",It's Ted the Bellhop's first night on the job....,"[Tim Roth, Antonio Banderas, Jennifer Beals, M...",[Allison Anders]


In [None]:
# Turn each overview string into a list of whitespace-separated words so it
# can be concatenated with the other list-valued columns.
movies['overview'] = movies['overview'].map(str.split)

In [None]:
movies.head(1)

Unnamed: 0,title,tmdbId_x,genres,keywords,overview,cast,crew
0,Four Rooms,5,"[Crime, Comedy]","[hotel, new year's eve, witch, bet, hotel room...","[It's, Ted, the, Bellhop's, first, night, on, ...","[Tim Roth, Antonio Banderas, Jennifer Beals, M...",[Allison Anders]


In [None]:
# Remove the spaces inside every multi-word entry (e.g. "Tim Roth" -> "TimRoth")
# so each genre / keyword / person becomes a single token.
for column in ('genres', 'keywords', 'cast', 'crew'):
  movies[column] = movies[column].apply(lambda names: [name.replace(" ", "") for name in names])

In [None]:
movies.head(1)

Unnamed: 0,title,tmdbId_x,genres,keywords,overview,cast,crew
0,Four Rooms,5,"[Crime, Comedy]","[hotel, newyear'seve, witch, bet, hotelroom, s...","[It's, Ted, the, Bellhop's, first, night, on, ...","[TimRoth, AntonioBanderas, JenniferBeals, Mado...",[AllisonAnders]


In [None]:
# Tag list per movie: a leading sentence naming the movie, followed by the
# overview words, keywords, director and genres.
# NOTE(review): the 'cast' column prepared earlier is not part of the tags — confirm this is intended.
title_sentence = pd.Series(
  [[f" The name of the movie is: {title} "] for title in movies['title']],
  index=movies.index,
)
movies['tags'] = title_sentence + movies['overview'] + movies['keywords'] + movies['crew'] + movies['genres']

In [None]:
movies.head(1)

Unnamed: 0,title,tmdbId_x,genres,keywords,overview,cast,crew,tags
0,Four Rooms,5,"[Crime, Comedy]","[hotel, newyear'seve, witch, bet, hotelroom, s...","[It's, Ted, the, Bellhop's, first, night, on, ...","[TimRoth, AntonioBanderas, JenniferBeals, Mado...",[AllisonAnders],"[ The name of the movie is: Four Rooms , It's,..."


In [None]:
# Keep only what the recommender needs. reset_index(drop=True) returns a new
# frame (so later column assignments do not raise SettingWithCopyWarning) whose
# index labels 0..n-1 equal the row positions; recomend() takes an index label
# from this frame and uses it as a position into the similarity matrix.
new_df = movies[['title','tmdbId_x','tags']].reset_index(drop=True)

In [None]:
# Join each tag list into one space-separated string. assign() builds a new
# frame instead of writing into a column-subset of `movies`, which is what
# triggered the SettingWithCopyWarning.
new_df = new_df.assign(tags=new_df['tags'].apply(" ".join))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [None]:
new_df.iloc[1].tags

' The name of the movie is: Star Wars  Princess Leia is captured and held hostage by the evil Imperial forces in their effort to take over the galactic Empire. Venturesome Luke Skywalker and dashing captain Han Solo team together with the loveable robot duo R2-D2 and C-3PO to rescue the beautiful princess and restore peace and justice in the Empire. android galaxy hermit deathstar lightsaber jedi rescuemission empire rebellion planet smuggler theforce spaceopera galacticwar stormtrooper totalitarianism GeorgeLucas Adventure Action ScienceFiction'

In [None]:
new_df.shape

(4607, 3)

# **Preprocessing the Data**

In [None]:
ps = PorterStemmer()

In [None]:
def stem(text):
  """Stem every whitespace-separated word of `text` with `ps` and join the
  results back together with single spaces."""
  return " ".join(ps.stem(word) for word in text.split())

In [None]:
nlp = spacy.load("en_core_web_sm")



In [None]:
# English stop words shipped with spaCy.
stop_words = spacy.lang.en.stop_words.STOP_WORDS
# Filled as a side effect of lemmatize(): one processed text per call, in call order.
corpus = []
def lemmatize(text):
  """Lemmatize `text` with spaCy, dropping stop words and punctuation.

  The processed text is also appended to the module-level `corpus` list, so
  `corpus` ends up holding one entry per call, in call order.

  Args:
    text: string to run through the `nlp` pipeline.

  Returns:
    The kept lemmas joined by single spaces.
  """
  # `stop_words` holds plain strings while `token` is a spaCy Token object, so
  # the former `token not in stop_words` test never matched and stop words
  # were kept. Compare the token's lower-cased text instead.
  lemmas = [
    token.lemma_
    for token in nlp(text)
    if token.lower_ not in stop_words and not token.is_punct
  ]
  y = " ".join(lemmas).strip()
  corpus.append(y)
  return y

In [None]:
# lemmatize() appends to `corpus` on every call; start from an empty list so
# re-running this cell cannot leave extra entries that break the one-entry-per-row
# alignment between `corpus` and `new_df`.
corpus.clear()
# NOTE(review): stemming runs before lemmatization, so spaCy receives stems
# such as "movi" rather than real words — confirm this order is intended.
# assign() rebinds new_df to a fresh frame, which avoids SettingWithCopyWarning.
new_df = new_df.assign(tags=new_df['tags'].apply(stem).apply(lemmatize))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lemmatize)


In [None]:
corpus[2]

'the name of the movi be find nemo nemo an adventur young clownfish be unexpectedli take from hi great barrier reef home to a dentist offic aquarium it up to hi worrisom father marlin and a friendli but forget fish dori to bring nemo home meet vegetarian sharks surfer dude turtle hypnot jellyfish hungri seagull and more along the way fathersonrelationship harbor underwat fishtank greatbarrierreef missingchild aftercreditssting duringcreditsste shorttermmemoryloss clownfish fathersonreunion protectivefath andrewstanton anim famili '

# **Base model using Word2Vec and TfidfVectorizer**

In [None]:
# Tokenize every processed text in `corpus` with gensim's simple_preprocess.
words = [simple_preprocess(sent) for sent in corpus]

In [None]:
words[2]

In [None]:
# Train Word2Vec on the tokenized corpus (200-dimensional vectors, 300 epochs).
# NOTE: the name `model` is rebound to a SentenceTransformer further down, so
# avg_word2vec() has to be used before that cell runs.
model = gensim.models.Word2Vec(words, epochs=300,vector_size=200)

In [None]:
model.corpus_count

4607

In [None]:
def avg_word2vec(doc):
  """Return the mean Word2Vec vector of the in-vocabulary words of `doc`.

  Reads the module-level Word2Vec `model`.

  Args:
    doc: iterable of word tokens.

  Returns:
    1-D array of length `model.wv.vector_size`; all zeros when `doc` is empty
    or none of its words are in the vocabulary.
  """
  keyed_vectors = model.wv
  # key_to_index is a dict, so membership is a hash lookup instead of a scan
  # of the index_to_key list for every word.
  known = keyed_vectors.key_to_index
  vectors = [keyed_vectors[word] for word in doc if word in known]
  if not vectors:
    # np.mean([]) would return a scalar NaN (with a RuntimeWarning), which
    # breaks stacking into a 2-D matrix and the cosine similarity afterwards.
    return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
  return np.mean(vectors, axis=0)

In [None]:
# One averaged word vector per tokenized movie text, with a progress bar.
X = [avg_word2vec(doc) for doc in tqdm(words)]


100%|██████████| 4607/4607 [00:21<00:00, 215.30it/s]


In [None]:
len(X)

4607

In [None]:
X_new = np.array(X)

In [None]:
X_new.shape

(4607, 200)

In [None]:
X_new[0].shape

(200,)

In [None]:
cv = TfidfVectorizer(stop_words = "english")

In [None]:
# Dense TF-IDF matrix of the tag strings.
# NOTE(review): `vectors` is not referenced again in the visible cells — the
# similarity that follows is computed from the Word2Vec averages (X_new).
# Confirm whether this matrix is still needed.
vectors = cv.fit_transform(new_df['tags']).toarray()

In [None]:
similarity = cosine_similarity(X_new)

In [None]:
len(similarity[0])

4607

In [None]:
def recomend(movie):
  """Print the titles of the 10 movies most similar to `movie`.

  Reads the module-level `new_df` (for titles) and `similarity` (square
  matrix whose row/column i belongs to the i-th row of `new_df`).

  Args:
    movie: exact title to look up in new_df['title'].

  Raises:
    IndexError: if no row of `new_df` has that title.
  """
  # Use the row *position*, not the index label: rows were dropped from the
  # frame earlier, so labels can be larger than positions, while `similarity`
  # is indexed by position.
  positions = np.flatnonzero((new_df['title'] == movie).to_numpy())
  if positions.size == 0:
    raise IndexError(f"movie {movie!r} not found in new_df['title']")
  movie_pos = int(positions[0])

  distances = similarity[movie_pos]
  ranked = sorted(enumerate(distances), reverse=True, key=lambda pair: pair[1])
  # Skip the query movie by position instead of assuming it always sorts first.
  top_positions = [pos for pos, _ in ranked if pos != movie_pos][:10]

  for pos in top_positions:
    print(new_df['title'].iloc[pos])

In [None]:
recomend("Harry Potter and the Philosopher's Stone")

ParaNorman
The Last Time I Committed Suicide
Just Visiting
Casper
Harry Potter and the Order of the Phoenix
The Chambermaid on the Titanic
The Adventures of Elmo in Grouchland
Wish I Was Here
The Smurfs 2
The Promise


# **Better model using SentenceTransformer : all-MiniLM-L6-v2**

In [None]:
# Load the pretrained sentence-embedding model.
# NOTE: this rebinds `model`, which until now held the Word2Vec model that
# avg_word2vec() reads; avg_word2vec() no longer sees Word2Vec after this cell.
model = SentenceTransformer('all-MiniLM-L6-v2')

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
embeddings = model.encode(corpus)

In [None]:
similarity = util.cos_sim(embeddings, embeddings)

In [None]:
type(similarity)

torch.Tensor

In [None]:
recomend("Harry Potter and the Philosopher's Stone")

Harry Potter and the Goblet of Fire
Harry Potter and the Chamber of Secrets
Harry Potter and the Half-Blood Prince
Harry Potter and the Order of the Phoenix
Harry Potter and the Prisoner of Azkaban
Stuart Little
Hocus Pocus
Diary of a Wimpy Kid
The Wizard of Oz
The Adventurer: The Curse of the Midas Box


# **Saving the Similarity data for later use**

In [None]:
similarity = similarity.tolist()

In [None]:
# Save the movie table as a plain dict. The context manager closes (and
# flushes) the file even if pickling fails; the bare open() never closed it.
with open('movies_transformer.pkl', 'wb') as movies_file:
  pickle.dump(new_df.to_dict(), movies_file)

In [None]:
# Save the similarity matrix. The context manager closes (and flushes) the
# file even if pickling fails; the bare open() never closed it.
with open('similarity_trans.pkl', 'wb') as similarity_file:
  pickle.dump(similarity, similarity_file)