In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer

In [None]:
# Root of the project's data directory, given as a relative path.
data_folder = Path("..") / "data"
# Processed parquet dataset that the rest of the notebook reads from.
data_path = data_folder.joinpath("processed", "sg_sanctions_on_russia.parquet")

In [None]:
# Load the processed dataset and keep only the rows whose `source` is "Online News".
df = pd.read_parquet(data_path).loc[
    lambda frame: frame["source"] == "Online News"
]
# Texts to embed: one list of titles and one list of bodies, in row order.
titles = df["title"].tolist()
content = df["content"].tolist()

In [None]:
def prepare_sentence_embeddings(data_folder, content, titles, checkpoint):
    """Return content and title embeddings, computing and caching them on demand.

    Each set of embeddings is cached as a ``.npy`` file under
    ``data_folder / "embeddings"``. A cache file that already exists is loaded
    with ``np.load``; otherwise the texts are encoded with a
    ``SentenceTransformer`` built from ``checkpoint`` and the result is written
    with ``np.save``.

    The cache file names depend only on ``checkpoint``, not on the texts, so an
    existing file is returned as-is even if ``content`` / ``titles`` have
    changed since it was written.

    Args:
        data_folder: ``pathlib.Path``-like directory containing (or to contain)
            the ``embeddings`` sub-directory.
        content: Texts passed to ``model.encode`` for the content embeddings.
        titles: Texts passed to ``model.encode`` for the title embeddings.
        checkpoint: Model identifier passed to ``SentenceTransformer`` and used
            as the prefix of the cache file names.

    Returns:
        Tuple ``(content_embeddings, title_embeddings)``, each being whatever
        ``np.load`` or ``model.encode`` produced for that set of texts.
    """
    embeddings_dir = data_folder / "embeddings"
    # (cache file, texts) pairs, in the order the results are returned.
    targets = (
        (embeddings_dir / f"{checkpoint}_content_embeddings.npy", content),
        (embeddings_dir / f"{checkpoint}_title_embeddings.npy", titles),
    )

    # Built lazily: loading the model is only needed when at least one cache
    # file is missing, so fully cached runs skip it entirely.
    model = None
    results = []
    for cache_path, texts in targets:
        if cache_path.exists():
            results.append(np.load(cache_path))
            continue

        if model is None:
            model = SentenceTransformer(checkpoint)
        embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)
        # np.save does not create missing directories; make sure the target
        # directory exists so a first run does not fail after encoding.
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        np.save(cache_path, embeddings)
        results.append(embeddings)

    content_embeddings, title_embeddings = results
    return content_embeddings, title_embeddings

In [None]:
# Model checkpoint handed to prepare_sentence_embeddings; it also prefixes the cache file names.
checkpoint = "multi-qa-MiniLM-L6-cos-v1"
# Load the content and title embeddings from cache, or compute and save them.
content_embeddings, title_embeddings = prepare_sentence_embeddings(
    data_folder=data_folder,
    content=content,
    titles=titles,
    checkpoint=checkpoint,
)