# Content Based Movie Recommender - Semantic BERT

In [2]:
import pandas as pd
import pickle
import torch
import transformers

Load dataset

In [2]:
df = pd.read_csv('./movie_lens_dataset/movies_metadata_processed.csv', low_memory=False)
df.head()

Unnamed: 0.1,Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,1,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,3,3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
df.shape

(41368, 26)

BERT is a stack of Transformer encoder layers. It captures the context of a sentence by looking at the words on both the left and the right of every token, i.e. bidirectionally. It is a pre-trained language model whose pre-training primarily uses the following two tasks:

Masked Language Modelling (MLM)

Next Sentence Prediction (NSP)

### Creating pretrained tokenizer and model

In [4]:
# Model class, tokenizer class and checkpoint name used by the loading cell below.
bert_model=transformers.DistilBertModel
berttokenizer=transformers.DistilBertTokenizer
weights_type="distilbert-base-uncased"

In [5]:
# Instantiate the tokenizer and the encoder from the pretrained checkpoint named in `weights_type`.
tokenizer=berttokenizer.from_pretrained(weights_type)
# output_hidden_states=True asks the model to also return the hidden states of every layer.
# NOTE(review): only `last_hidden_state` is read later in this notebook, so this flag looks
# unnecessary — confirm nothing else depends on it before removing.
model=bert_model.from_pretrained(weights_type ,output_hidden_states=True)

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Encoding all the movie overviews 

In [6]:
def _encode_overview(plot):
    """Convert one overview string into a list of token ids.

    Special tokens are added and the sequence is truncated to at most 100 ids.
    """
    return tokenizer.encode(
        plot,
        add_special_tokens=True,
        max_length=100,
        truncation=True,
        padding='longest',
    )


# One list of token ids per movie overview, indexed like `df`.
inputs = df["overview"].apply(_encode_overview)

### Adding padding to max length of 100

In [7]:
import numpy as np
def padding(list_of_sent, max_len=100, pad_id=0):
    """Right-pad token-id sequences to a common length and stack them into a 2-D array.

    Parameters
    ----------
    list_of_sent : pandas.Series or iterable of list[int]
        One token-id sequence per element. A pandas object is read through
        its ``.values`` attribute; any other iterable is consumed directly.
    max_len : int, default 100
        Length every row is padded to.
    pad_id : int, default 0
        Token id appended to short sequences.

    Returns
    -------
    numpy.ndarray
        Array with one row of ``max_len`` ids per input sequence.

    Raises
    ------
    ValueError
        If a sequence is longer than ``max_len``. Padding cannot shorten it,
        and stacking rows of different lengths would not give a rectangular array.
    """
    values = getattr(list_of_sent, "values", None)
    # dict-like objects expose `.values` as a method; only use it when it is data.
    sentences = list_of_sent if values is None or callable(values) else values

    output = []
    for position, sent in enumerate(sentences):
        sent = list(sent)
        missing = max_len - len(sent)
        if missing < 0:
            raise ValueError(
                "sequence at position {} has {} tokens, which exceeds max_len={}".format(
                    position, len(sent), max_len
                )
            )
        output.append(sent + [pad_id] * missing)
    return np.array(output)

In [8]:
inputs=padding(inputs)

In [9]:
inputs[1]

array([  101, 22941, 12120,  2848,  7523, 22454,  2604,  2208,  2330,
        2341,  8687,  2088,  4895,  9148, 13027,  2135, 13260,  5070,
        4639,  2040,  7567,  2503,  2208,  2656,  2095,  2542,  2282,
        5070,  2015,  3246,  4071,  3926,  2208, 11268, 19188,  2093,
        2424,  2770,  5016, 24091, 17119,  2891,  4763, 10608, 17082,
        6492,   102,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0])

### Creating attention mask

In [10]:
# 1 where there is a real token, 0 where the position holds the padding id 0.
mask = (inputs != 0).astype(int)

Converting inputs and attention mask into tensors

In [11]:
embedded_inputs=torch.tensor(inputs)
attention_mask=torch.tensor(mask)

In [12]:
embedded_inputs.shape

torch.Size([41368, 100])

In [13]:
attention_mask.shape

torch.Size([41368, 100])

# Running pretrained model

Setting batch size to prevent CPU memory allocation error

In [14]:
BATCH_SIZE = 32

In [15]:
from torch.utils.data import DataLoader
# Two loaders, one over the token-id tensor and one over the attention-mask tensor.
# Both use the same batch size and neither requests shuffling, so iterating them in
# lockstep yields the ids and the mask of the same rows.
embedded_inputs_dataloader = DataLoader(embedded_inputs, batch_size=BATCH_SIZE)
attention_mask_dataloader = DataLoader(attention_mask, batch_size=BATCH_SIZE)

Adding all the last hidden states of every batch into a list

In [16]:
# Run the encoder on a GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.eval()  # inference only: make sure dropout layers are disabled

# One last-hidden-state tensor per batch, always kept on the CPU.
# NOTE(review): every batch holds the hidden state of all 100 positions although only
# position 0 is used later; keeping just `[:, 0, :]` would cut memory ~100x — that would
# also require changing the later feature-extraction cell.
output = []

with torch.no_grad():
    for emb, attn in zip(embedded_inputs_dataloader, attention_mask_dataloader):
        # The batches are created on the CPU; they must be on the same device as the
        # model, otherwise the forward pass fails with a device mismatch on CUDA.
        states = model(emb.to(device), attention_mask=attn.to(device))
        # Move the result back to the CPU so GPU memory is released per batch and the
        # later pickle / `.numpy()` steps work whichever device was used here.
        output.append(states.last_hidden_state.cpu())

In [18]:
with open('./pickle/output.pkl', 'wb') as f:
    pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL)
    print("output dumped")

output dumped


Concatenating the hidden states from all batches into one large tensor and extracting the features (the hidden state at position 0 of every sequence, i.e. the `[CLS]` token) into a NumPy array

In [3]:
with open('./pickle/output.pkl', 'rb') as f:
    output = pickle.load(f)
    print("output loaded")

output loaded


In [4]:
output_tensor = torch.cat(output, 0)

In [5]:
extracted_features = output_tensor[:,0,:].numpy()

In [6]:
extracted_features.shape

(41368, 768)

### Saving extracted features into pickle

In [7]:
with open("./pickle/bert_extracted_features.pickle", "wb") as handle:
    pickle.dump(extracted_features, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Extracted features saved!")

Extracted features saved!


# Getting recommendations

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
with open("./pickle/bert_extracted_features.pickle", "rb") as handle:
    extracted_features = pickle.load(handle)

In [10]:
cos_sim=cosine_similarity(extracted_features,extracted_features)

In [None]:
cos_sim.shape

### Saving cosine similarity

In [12]:
with open("./pickle/bert_cosine_similarity.pickle", "wb") as handle:
    pickle.dump(cos_sim, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Consine similarity saved!")

Consine similarity saved!


In [20]:
with open('./pickle/bert_cosine_similarity.pickle', 'rb') as f:
    cos_sim = pickle.load(f)
    print("Cosine similarity loaded")

Cosine similarity loaded


Loading movie indices

In [13]:
with open('./pickle/movie_indices.pickle', 'rb') as handle:
    movie_indices = pickle.load(handle)

In [15]:
df = pd.read_csv('./movie_lens_dataset/movies_metadata_processed.csv', low_memory=False)
df.head()

Unnamed: 0.1,Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,1,1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,3,3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


Inspect the loaded movie indices (title → row position in the processed dataset)

In [14]:
movie_indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Caged Heat 3000                41363
Subdue                         41364
Century of Birthing            41365
Satan Triumphant               41366
Queerama                       41367
Length: 41368, dtype: int64

In [None]:
test_movie = "Toy Story"
movie_index = movie_indices[test_movie]
movie_index

In [None]:
similarity_scores = pd.Series(cos_sim[movie_index]).sort_values(ascending = False)[1:11]

In [None]:
similarity_scores

In [None]:
df[['title','imdb_id']].iloc[similarity_scores.index]

Loading cosine similarity

In [None]:
# Load the saved similarity matrix into `cos_sim`, the name the rest of the notebook reads.
# Binding it to `cosine_similarity` would shadow the scikit-learn function imported earlier
# and break any later call to it.
# NOTE: pickle.load executes arbitrary code from the file — only load pickles you created.
with open("./pickle/bert_cosine_similarity.pickle", "rb") as handle:
    cos_sim = pickle.load(handle)

In [17]:
def getBertRecommendations(name: str, cosine_sim, movie_indices, df, top_n: int = 10):
    """Return the movies most similar to ``name`` and save them as a CSV file.

    Parameters
    ----------
    name : str
        Title of the query movie; must be a key of ``movie_indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie.
    movie_indices : mapping or pandas.Series
        Maps a title to its row position in ``cosine_sim`` / ``df``.
    df : pandas.DataFrame
        Movie metadata with ``title``, ``release_date`` and ``poster_path``
        columns, in the same row order as ``cosine_sim``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        ``title``, ``release_date``, ``poster_path`` and ``score`` of the
        ``top_n`` most similar movies, best match first.

    Raises
    ------
    KeyError
        If ``name`` is not present in ``movie_indices``.

    Side effects
    ------------
    Prints the query title and writes the result to ``./output/bert/<name>.csv``,
    creating the directory when it is missing.
    """
    import os

    print("Getting recommendations for:", name)

    # Get index of query movie
    movie_index = movie_indices[name]
    # A title that occurs more than once makes a Series lookup return several
    # positions; use the first one so the row selection below stays 1-D.
    if isinstance(movie_index, pd.Series):
        movie_index = movie_index.iloc[0]
    movie_index = int(movie_index)

    similarity_scores = pd.Series(cosine_sim[movie_index], name='score')

    # Drop the query movie explicitly instead of assuming it is ranked first:
    # another movie with an identical score could otherwise take its place.
    ranked = similarity_scores.drop(labels=movie_index).sort_values(
        ascending=False, kind='stable'
    )
    top_scores = ranked.head(top_n)

    # Get movie title, release date and poster path. `.copy()` gives an independent
    # frame so adding a column does not write into a slice of `df`.
    output = df[['title', 'release_date', 'poster_path']].iloc[top_scores.index].copy()
    # Assign by position: the rows were selected in exactly this order.
    output['score'] = top_scores.to_numpy()

    os.makedirs('./output/bert', exist_ok=True)
    output.to_csv('./output/bert/{}.csv'.format(name))

    return output

# Evaluation Set                              

In [25]:
test_movies = ["Harry Potter and the Philosopher's Stone",
"The Matrix",
"The Dark Knight",
"Toy Story",
"The Avengers",
"The Bourne Identity",
"The Devil Wears Prada",
"Mean Girls",
"Sex and the City",
"Mission: Impossible - Ghost Protocol"]

In [26]:
# Run the BERT recommender for every title in the evaluation set.
# The recommender defined in this notebook is `getBertRecommendations`; the previous
# call to `getTfIdfRecommendations` referred to a name that is never defined here.
for movie in test_movies:
    print(getBertRecommendations(movie, cos_sim, movie_indices, df))
    print("\n")

Getting recommendations for: Harry Potter and the Philosopher's Stone
                                     title release_date  \
1217                    Young Frankenstein   1974-12-15   
14885        Nanny McPhee and the Big Bang   2010-03-26   
25611                        Halloweentown   1998-11-10   
24376                         The Shortcut   2009-02-07   
27573  Barbie in the 12 Dancing Princesses   2006-09-19   
2978                 The Cider House Rules   1999-12-17   
36149                           The Portal   2014-03-22   
7786               Dr. Jekyll and Ms. Hyde   1995-08-25   
20211                         The Smurfs 2   2013-07-30   
3483                         Puppet Master   1989-10-12   

                            poster_path     score  
1217   /tQJAWbIjvvqVKLLbIZHtwGw2HTf.jpg  0.943052  
14885   /yBNytVM6U3godvMcwcOQWOvL0f.jpg  0.940344  
25611  /ih2qNTuutdoG5Ll7CHxNgFpCBX3.jpg  0.940237  
24376  /tB4LeBIuKA6e8uLmvjKaEeDwfl1.jpg  0.939665  
27573  /8PBjz1xVq7oR