<h1 align='center'>Embedding Extraction</h1>

In [1]:
import pandas as pd
import numpy as np
import pickle
import faiss
import joblib

In [2]:
# Load the cleaned movie metadata and preview its first three rows.
csv_path = 'cleaned_data_v2.csv'
df = pd.read_csv(csv_path)
df.iloc[:3]

Unnamed: 0,title,overview,keywords,genres,original_language,spoken_languages,vote_average,vote_count,popularity,weighted_avg_vote,weighted_avg_vote_log,popularity_log
0,Inception,"Cobb, a skilled thief who commits corporate es...","['rescue', 'mission', 'dream', 'airplane', 'pa...","['Action', 'Science Fiction', 'Adventure']",['en'],"['English', 'French', 'Japanese', 'Swahili']",8.364,34495,83.952,288516.18,12.57251,4.442086
1,The Dark Knight,Batman raises the stakes in his war on crime. ...,"['joker', 'sadism', 'chaos', 'secret identity'...","['Drama', 'Action', 'Crime', 'Thriller']",['en'],"['English', 'Mandarin']",8.512,30619,130.643,260628.928,12.470857,4.880094
2,Avatar,"In the 22nd century, a paraplegic Marine is di...","['future', 'society', 'culture clash', 'space ...","['Action', 'Adventure', 'Fantasy', 'Science Fi...",['en'],"['English', 'Spanish']",7.573,29815,79.932,225788.995,12.327361,4.393609


In [3]:
# Build one space-separated text document per movie from its descriptive fields.
# Plain `+` concatenation turns the whole row into NaN as soon as a single
# field is missing, so every part is first filled with '' and cast to str.
text_columns = ['title', 'keywords', 'genres', 'original_language', 'spoken_languages', 'overview']
text_parts = [df[column].fillna('').astype(str) for column in text_columns]
df['text'] = text_parts[0].str.cat(text_parts[1:], sep=' ')

In [4]:
from sentence_transformers import SentenceTransformer

# Load the sentence encoder and embed the combined `text` column.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

documents = df['text'].tolist()
text_embeddings = model.encode(documents, show_progress_bar=True)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 3446/3446 [30:12<00:00,  1.90it/s]


In [5]:
# Normalize vectors to unit L2 length so that inner product equals cosine similarity.
# The return value is not used: the array is modified in place.
faiss.normalize_L2(text_embeddings)

In [6]:
# Display the embedding matrix (numpy shows a truncated repr for large arrays).
text_embeddings

array([[-0.0357994 , -0.02774584, -0.01180475, ...,  0.03280392,
        -0.0139987 , -0.02733093],
       [-0.0146964 , -0.05817799, -0.07252546, ..., -0.01600985,
         0.02771219,  0.0389316 ],
       [-0.00307021, -0.02521594,  0.06083349, ...,  0.01398388,
        -0.02339387,  0.02808953],
       ...,
       [-0.02598427, -0.06881675, -0.01877942, ...,  0.05031431,
        -0.02529542,  0.04412768],
       [-0.02374344,  0.02367551, -0.04123709, ...,  0.03113052,
         0.00662624,  0.00464557],
       [-0.05398469,  0.01938047, -0.00567989, ...,  0.04980075,
        -0.05352924,  0.04732918]], dtype=float32)

In [7]:
# Cache the embeddings on disk so the slow encoding step need not be repeated.
with open('embeddings_v02.pkl', mode='wb') as out_file:
    pickle.dump(text_embeddings, out_file)

In [4]:
# Reload the cached embeddings from disk.
# pickle.load must only be used on trusted files; this one is produced by this notebook.
with open('embeddings_v02.pkl', mode='rb') as in_file:
    embeddings = pickle.load(in_file)

In [5]:
# Sanity-check the reloaded array: one row per movie, one column per embedding dimension.
for label, value in (('shape: ', embeddings.shape), ('dtype: ', embeddings.dtype)):
    print(label, value)

shape:  (110265, 384)
dtype:  float32


In [6]:
# Build quantized index (IVF coarse partitioning + product quantization).
#
# IndexIVFPQ defaults to the L2 metric regardless of which quantizer it is
# given, so the inner-product metric has to be requested explicitly; otherwise
# the IP coarse quantizer and the L2 index disagree and the returned
# "distances" are squared L2 values rather than similarity scores.
# NOTE(review): index.nprobe is left at its default, so only a small part of
# the 1024 cells is scanned per query -- raise it if recall matters.
dim = embeddings.shape[1]           # Embedding dimension (must be divisible by m)

index = faiss.IndexIVFPQ(
    faiss.IndexFlatIP(dim),         # Coarse quantizer using inner product
    dim,                            # Dimension
    1024,                           # Number of Voronoi cells (nlist)
    64,                             # Subquantizers (m)
    8,                              # Bits per subquantizer
    faiss.METRIC_INNER_PRODUCT      # Score candidates by inner product, not L2
)

In [7]:
# Train the index on the embeddings first, then add the same vectors to it.
# Order matters: the vectors are added only after training.
index.train(embeddings)
index.add(embeddings)

In [8]:
# Retrieve the k nearest neighbours of a single movie, identified by row position.
query_index, k = 284, 10
# A one-row slice (rather than a scalar index) keeps the query two-dimensional.
query_vector = embeddings[query_index:query_index + 1]
distances, indices = index.search(query_vector, k)

In [9]:
# Print title and genres for every neighbour returned by the search.
# FAISS ids are the positions at which vectors were added, so rows are looked
# up positionally with `iloc`. When fewer than k neighbours are found, the
# result is padded with -1; those slots are skipped instead of raising KeyError.
movie_info = df[['title', 'genres']]
for i in indices[0]:
    if i < 0:
        continue
    print(movie_info.iloc[i])

title                               Hotel Transylvania
genres    ['Animation', 'Comedy', 'Family', 'Fantasy']
Name: 284, dtype: object
title                             Hotel Transylvania 2
genres    ['Animation', 'Comedy', 'Family', 'Fantasy']
Name: 576, dtype: object
title            Hotel Transylvania 3: Summer Vacation
genres    ['Animation', 'Comedy', 'Family', 'Fantasy']
Name: 809, dtype: object
title     The Halloween That Almost Wasn't
genres      ['Horror', 'Comedy', 'Family']
Name: 66879, dtype: object
title     The Lively Vampires of Vögel
genres            ['Comedy', 'Horror']
Name: 87593, dtype: object
title            Dracula Sucks
genres    ['Comedy', 'Horror']
Name: 32408, dtype: object
title                     Hotel Transylvania: Transformania
genres    ['Animation', 'Comedy', 'Family', 'Adventure',...
Name: 2079, dtype: object
title                Subspecies
genres    ['Horror', 'Fantasy']
Name: 14470, dtype: object
title                 Dracula's Daughter
genres    

In [10]:
# Bundle the metadata frame, the embeddings and the search index into a single
# artifact. LZ4 compression (level 3) keeps the file, and therefore the docker
# image that ships it, lighter.
recommender_bundle = {'df': df, 'embeddings': embeddings, 'index': index}
joblib.dump(recommender_bundle, 'movie_recommender_v02.joblib', compress=('lz4', 3))

['movie_recommender_v02.joblib']

In [13]:
!pip install lz4

Collecting lz4
  Downloading lz4-4.4.4-cp39-cp39-win_amd64.whl.metadata (3.9 kB)
Downloading lz4-4.4.4-cp39-cp39-win_amd64.whl (99 kB)
Installing collected packages: lz4
Successfully installed lz4-4.4.4


# `overview` Embeddings

In [16]:
from sentence_transformers import SentenceTransformer

# Embed only the plot overview of each movie with the same sentence encoder.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

overview_texts = df['overview'].tolist()
overview_embeddings = model.encode(overview_texts, show_progress_bar=True)

Batches: 100%|██████████| 3446/3446 [18:49<00:00,  3.05it/s]


In [17]:
# Normalize the overview vectors to unit L2 length (the array is modified in
# place; the return value is not used), then display the resulting matrix.
faiss.normalize_L2(overview_embeddings)
overview_embeddings

array([[-0.10088539,  0.07117508, -0.0289718 , ..., -0.01160109,
         0.0589892 , -0.06030006],
       [-0.02430348,  0.00925271, -0.06100011, ..., -0.02184225,
         0.05695461,  0.01922566],
       [-0.00636554,  0.06156255,  0.03046027, ...,  0.00987918,
         0.03299569,  0.00213429],
       ...,
       [-0.04632293, -0.05000658, -0.02211937, ...,  0.01589813,
        -0.01267236,  0.05453489],
       [-0.0231142 ,  0.07574666, -0.02869395, ...,  0.05317041,
         0.02263007, -0.0564732 ],
       [-0.01679474,  0.10613248, -0.01274135, ..., -0.02384875,
        -0.03770639,  0.08151328]], dtype=float32)

In [19]:
# Cache the overview embeddings on disk so the encoding step need not be repeated.
with open('overview_embeddings.pkl', mode='wb') as out_file:
    pickle.dump(overview_embeddings, out_file)

In [20]:
# Reload the cached overview embeddings from disk.
# pickle.load must only be used on trusted files; this one is produced by this notebook.
with open('overview_embeddings.pkl', mode='rb') as in_file:
    overview_embd = pickle.load(in_file)

In [21]:
# Build quantized index (IVF coarse partitioning + product quantization) for
# the overview embeddings.
#
# IndexIVFPQ defaults to the L2 metric regardless of which quantizer it is
# given, so the inner-product metric has to be requested explicitly; otherwise
# the IP coarse quantizer and the L2 index disagree and the returned
# "distances" are squared L2 values rather than similarity scores.
# NOTE(review): index.nprobe is left at its default, so only a small part of
# the 1024 cells is scanned per query -- raise it if recall matters.
dim = overview_embd.shape[1]        # Embedding dimension (must be divisible by m)

index = faiss.IndexIVFPQ(
    faiss.IndexFlatIP(dim),         # Coarse quantizer using inner product
    dim,                            # Dimension
    1024,                           # Number of Voronoi cells (nlist)
    64,                             # Subquantizers (m)
    8,                              # Bits per subquantizer
    faiss.METRIC_INNER_PRODUCT      # Score candidates by inner product, not L2
)

In [22]:
# Train the index on the overview embeddings first, then add the same vectors to it.
# Order matters: the vectors are added only after training.
index.train(overview_embd)
index.add(overview_embd)

In [23]:
# Retrieve the k nearest neighbours of a single movie, identified by row position.
query_index, k = 284, 10
# A one-row slice (rather than a scalar index) keeps the query two-dimensional.
query_vector = overview_embd[query_index:query_index + 1]
distances, indices = index.search(query_vector, k)

In [24]:
# Print title and genres for every neighbour returned by the search.
# FAISS ids are the positions at which vectors were added, so rows are looked
# up positionally with `iloc`. When fewer than k neighbours are found, the
# result is padded with -1; those slots are skipped instead of raising KeyError.
movie_info = df[['title', 'genres']]
for i in indices[0]:
    if i < 0:
        continue
    print(movie_info.iloc[i])

title                               Hotel Transylvania
genres    ['Animation', 'Comedy', 'Family', 'Fantasy']
Name: 284, dtype: object
title                             Hotel Transylvania 2
genres    ['Animation', 'Comedy', 'Family', 'Fantasy']
Name: 576, dtype: object
title     The Halloween That Almost Wasn't
genres      ['Horror', 'Comedy', 'Family']
Name: 66879, dtype: object
title            Hotel Transylvania 3: Summer Vacation
genres    ['Animation', 'Comedy', 'Family', 'Fantasy']
Name: 809, dtype: object
title                     Dracula and Son
genres    ['Fantasy', 'Comedy', 'Horror']
Name: 38780, dtype: object
title     The Dracula Saga
genres          ['Horror']
Name: 50312, dtype: object
title     Ghost Bride of Dracula
genres                ['Horror']
Name: 81467, dtype: object
title     The Lively Vampires of Vögel
genres            ['Comedy', 'Horror']
Name: 87593, dtype: object
title        Story of My Death
genres    ['History', 'Drama']
Name: 31246, dtype: object
tit

In [25]:
# Persist the search index on its own ...
with open('faiss_index.pkl', mode='wb') as index_file:
    pickle.dump(index, index_file)

# ... and together with the metadata frame and the overview embeddings as one bundle.
recommender_bundle = {'df': df, 'embeddings': overview_embd, 'index': index}
with open('movie_recommender_v03.pkl', mode='wb') as bundle_file:
    pickle.dump(recommender_bundle, bundle_file)