In [1]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import re, string
import torch
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity
from scipy.spatial.distance import euclidean, cityblock

In [2]:
# 1. Load Data and Basic Cleaning
# Read the catalogue and discard, in one expression, the metadata columns
# that none of the visible cells below make use of.
data = (
    pd.read_csv('movies_clean.csv', sep=',')
    .drop(columns=['Studios', 'Year', 'Genre', 'Director', 'Producers', 'Cast', 'AvgRating', 'Duration'])
)

def clean(text):
    """Normalize a raw description into lowercase text without markup or noise.

    Removes, in order: HTML tags, URLs, @mentions, #hashtags, digit runs,
    ASCII punctuation, and a few leftover typographic symbols. Most removed
    tokens are replaced with a space, so the result may contain repeated
    whitespace.

    Args:
        text: Value to clean. Non-string values are converted with ``str``;
            ``None`` and float NaN (pandas' missing-value marker) yield "".

    Returns:
        The cleaned, lowercased string.
    """
    # Missing values would otherwise be stringified into the words "none"/"nan"
    # and embedded as if they were real descriptions.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Tags must go BEFORE punctuation stripping: once "<" and ">" are deleted
    # the tag pattern can never match and tag names leak into the text. They
    # also go before the URL rule so "<a href="http://...">label</a>" keeps
    # "label". Requiring a letter after "<" avoids eating "a < b and c > d".
    text = re.sub(r"</?[a-z][^>]*>", " ", text)
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"#\w+", " ", text)
    text = re.sub(r"\d+", " ", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Symbols that string.punctuation (ASCII only) does not cover.
    text = re.sub(r"[️«»—]", " ", text)
    return text

# Run every raw description through clean() and keep the result both as a
# plain list and as a new DataFrame column.
clean_text = list(map(clean, data['Description']))
data['cleaned_text'] = clean_text

In [3]:
# 2. Compute and Save Embeddings for Multiple Models
# Short label -> sentence-transformers model identifier. The label is reused
# in the DataFrame column name and in the .npy file name.
models = {
    "distilbert": "distilbert-base-nli-stsb-mean-tokens",
    "minilm": "all-MiniLM-L6-v2",
    "roberta": "stsb-roberta-base"
}

# The cleaned descriptions are the same for every model, so build the list once
# instead of once per loop iteration.
descriptions = data['cleaned_text'].tolist()

for label, model_name in models.items():
    print(f"Computing embeddings for {model_name} ...")
    model = SentenceTransformer(model_name)
    embeddings = model.encode(descriptions, convert_to_tensor=True)
    # Copy to host memory a single time; the column and the file share this array.
    embeddings_np = embeddings.cpu().numpy()
    # Save embeddings to a new DataFrame column (one numpy vector per row)
    data[f"embeddings_{label}"] = list(embeddings_np)
    # Also save as .npy file for later use:
    np.save(f'film_embeddings_{label}.npy', embeddings_np)
    print(f"Saved embeddings in column 'embeddings_{label}' and file film_embeddings_{label}.npy")

Computing embeddings for distilbert-base-nli-stsb-mean-tokens ...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Saved embeddings in column 'embeddings_distilbert' and file film_embeddings_distilbert.npy
Computing embeddings for all-MiniLM-L6-v2 ...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Saved embeddings in column 'embeddings_minilm' and file film_embeddings_minilm.npy
Computing embeddings for stsb-roberta-base ...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/672 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Saved embeddings in column 'embeddings_roberta' and file film_embeddings_roberta.npy


In [4]:
# 3. Define Similarity Functions (Different Metrics)
def compute_cosine_similarity(query_embedding, film_embeddings):
    """Cosine similarity between one query vector and every film vector.

    Args:
        query_embedding: torch tensor, either 1-D or 2-D; for a 2-D input only
            the first row's similarities are returned.
        film_embeddings: torch tensor of film vectors, one per row.

    Returns:
        1-D numpy array of similarities (higher means more similar).
    """
    if query_embedding.dim() == 1:
        query_embedding = query_embedding.unsqueeze(0)
    # The query comes from the encoder and may live on an accelerator, while
    # film vectors rebuilt from numpy live on the CPU; mixing devices makes the
    # underlying matrix product fail, so align them first (no-op if equal).
    film_embeddings = film_embeddings.to(query_embedding.device)
    similarities = util.pytorch_cos_sim(query_embedding, film_embeddings)[0]
    return similarities.cpu().numpy()

def compute_euclidean_similarity(query_embedding, film_embeddings):
    """Negated Euclidean (L2) distance between the query and every film vector.

    Args:
        query_embedding: torch tensor, 1-D or 2-D; for a 2-D input the first
            row is used, matching compute_cosine_similarity.
        film_embeddings: torch tensor of film vectors, one per row.

    Returns:
        1-D numpy array of negated distances, so that a larger value means a
        closer film and results can be ranked like the other metrics.
    """
    query_np = np.atleast_2d(query_embedding.cpu().numpy())[0]
    film_np = np.atleast_2d(film_embeddings.cpu().numpy())
    # One vectorized norm over all rows instead of a Python-level loop of
    # per-row distance calls.
    distances = np.linalg.norm(film_np - query_np, axis=1)
    return -distances  # Lower distance => higher similarity

def compute_manhattan_similarity(query_embedding, film_embeddings):
    """Negated Manhattan (L1) distance between the query and every film vector.

    Args:
        query_embedding: torch tensor, 1-D or 2-D; for a 2-D input the first
            row is used, matching compute_cosine_similarity.
        film_embeddings: torch tensor of film vectors, one per row.

    Returns:
        1-D numpy array of negated distances (larger means closer).
    """
    query_np = np.atleast_2d(query_embedding.cpu().numpy())[0]
    film_np = np.atleast_2d(film_embeddings.cpu().numpy())
    # Sum of absolute coordinate differences per row, computed for all films at once.
    distances = np.abs(film_np - query_np).sum(axis=1)
    return -distances

def compute_dot_product_similarity(query_embedding, film_embeddings):
    """Dot product between the query and every film vector.

    Args:
        query_embedding: torch tensor, 1-D or 2-D; for a 2-D input the first
            row is used, matching compute_cosine_similarity.
        film_embeddings: torch tensor of film vectors, one per row.

    Returns:
        1-D numpy array with one dot product per film (higher means more similar).
    """
    # Reduce the query to a flat vector so a (1, d) batch does not trigger a
    # shape mismatch in the matrix-vector product.
    query_np = np.atleast_2d(query_embedding.cpu().numpy())[0]
    film_np = np.atleast_2d(film_embeddings.cpu().numpy())
    dot_products = film_np @ query_np
    return dot_products

In [5]:
# 4. Recommendation Function Using Chosen Embeddings
# Encoders already loaded in this session, keyed by model identifier, so that
# repeated queries do not re-instantiate the same model.
_QUERY_MODEL_CACHE = {}


def _get_query_model(model_name):
    """Return the SentenceTransformer for model_name, loading it at most once."""
    if model_name not in _QUERY_MODEL_CACHE:
        _QUERY_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _QUERY_MODEL_CACHE[model_name]


def get_top_recommendations(query, embedding_col, similarity_metric='cosine', top_k=10):
    """Rank the films in ``data`` against a free-text query.

    Args:
        query: Text to search for; it is encoded with the model that produced
            ``embedding_col``.
        embedding_col: Name of the ``data`` column holding per-film embedding
            vectors. It must contain one of the labels in ``models`` so the
            matching encoder can be selected.
        similarity_metric: One of 'cosine', 'euclidean', 'manhattan', 'dot'.
        top_k: Number of rows to return.

    Returns:
        A copy of the ``top_k`` best-scoring rows of ``data``, best first, with
        an added 'similarity_score' column.

    Raises:
        ValueError: If no known model label occurs in ``embedding_col`` or the
            metric name is not supported.
    """
    # Pick the encoder whose label appears in the column name; iterating over
    # `models` keeps this in step with whatever models were embedded.
    model_name = next(
        (name for label, name in models.items() if label in embedding_col),
        None,
    )
    if model_name is None:
        raise ValueError("Unrecognized embedding column/model")

    # Validate the metric before the expensive model load and encoding.
    metric_functions = {
        'cosine': compute_cosine_similarity,
        'euclidean': compute_euclidean_similarity,
        'manhattan': compute_manhattan_similarity,
        'dot': compute_dot_product_similarity,
    }
    if similarity_metric not in metric_functions:
        raise ValueError("Unsupported similarity metric provided.")

    model = _get_query_model(model_name)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Stack the per-row vectors into one matrix and place it on the same device
    # as the query; a tensor built from numpy is on the CPU, whereas the encoded
    # query may be on an accelerator.
    film_matrix = np.vstack(data[embedding_col].values)
    film_embeddings = torch.tensor(film_matrix).to(query_embedding.device)

    sims = metric_functions[similarity_metric](query_embedding, film_embeddings)

    # Retrieve top_k indices (higher scores indicate higher similarity)
    top_indices = np.argsort(sims)[::-1][:top_k]
    results = data.iloc[top_indices].copy()
    results['similarity_score'] = sims[top_indices]
    return results

In [6]:
# 5. Example Usage: Compare Top-K Results per Model & Metric
query = "corruption in city"

# Run the same query against each model's embedding column. Every heading
# after the first is preceded by a blank line to separate the tables.
for position, label in enumerate(("distilbert", "minilm", "roberta")):
    separator = "\n" if position else ""
    print(f"{separator}Recommendations using {label} embeddings (cosine similarity):")
    print(get_top_recommendations(query, embedding_col=f"embeddings_{label}", similarity_metric='cosine', top_k=5))

Recommendations using distilbert embeddings (cosine similarity):
                        Title  \
6849  Where the Sidewalk Ends   
9628           The Conference   
660            The Stronghold   
4039             Urban Legend   
7106                Happiness   

                                            Description  \
6849  A police detective’s violent nature keeps him ...   
9628  A team-building conference for municipal emplo...   
660   A police brigade works in the dangerous northe...   
4039  A college campus is plagued by a vicious seria...   
7106  A new type of deadly virus spread throughout t...   

                                             Poster URL  \
6849  https://a.ltrbxd.com/resized/film-poster/4/1/1...   
9628  https://a.ltrbxd.com/resized/film-poster/1/0/4...   
660   https://a.ltrbxd.com/resized/film-poster/5/6/3...   
4039  https://a.ltrbxd.com/resized/film-poster/4/6/7...   
7106  https://a.ltrbxd.com/resized/film-poster/8/2/3...   

                          

In [7]:
# 6. Save the Updated DataFrame with Embeddings for Local Use
# NOTE(review): the embedding columns hold numpy arrays, which a CSV can only
# store as text; the film_embeddings_<label>.npy files are presumably the
# better source for reloading vectors -- confirm what downstream code reads.
data.to_csv(path_or_buf='films_with_embeddings.csv', index=False)

In [8]:
# Offer the exported CSV as a browser download when running on Google Colab.
# The import is guarded so the notebook still runs to completion elsewhere.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'films_with_embeddings.csv' remains in the working directory.")
else:
    files.download('films_with_embeddings.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>