In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

In [None]:
# Evaluation driver: load the labelled sentence pairs, binarize the labels,
# then report classification metrics for the retrieval and reranker baselines.
#
# NOTE: benchmark_retrieval and benchmark_reranker are defined in later cells
# of this notebook, so those cells must already have been executed when this
# cell runs.
csv_file = "../data/evaluation/OLD_EVAL_F1.csv"
data = pd.read_csv(csv_file)

# Fail fast when any of the required columns is absent from the CSV.
expected_cols = {'sentence1', 'sentence2', 'label'}
if expected_cols - set(data.columns):
    raise ValueError("CSV file must contain columns: {}".format(expected_cols))

# Binary relabelling: "irrelevant" (compared case-insensitively) becomes 0,
# every other label value becomes 1.
data['label'] = data['label'].apply(
    lambda raw: int(str(raw).lower() != "irrelevant")
)

# --- Retrieval baseline: cosine similarity swept over several cut-offs ---
print("Starting Retrieval Benchmark...")
similarities = benchmark_retrieval(data, model_name="bowphs/SPhilBERTa")

for threshold in (0.5, 0.6, 0.7, 0.8, 0.9):
    # A pair is predicted positive when its similarity reaches the cut-off.
    retrieval_preds = np.where(similarities >= threshold, 1, 0)
    print("Retrieval Baseline Evaluation (threshold = {}):".format(threshold))
    report = classification_report(data['label'], retrieval_preds)
    print(report)
    print(" ")

# --- Reranker baseline: binary predictions come straight from the model ---
print("Starting Reranker Benchmark...")
reranker_preds = benchmark_reranker(data, model_name="bowphs/LaBerta")

print("Reranker Baseline Evaluation:")
report = classification_report(data['label'], reranker_preds)
print(report)

In [None]:
def benchmark_retrieval(data, model_name):
    """Score each (sentence1, sentence2) row by embedding cosine similarity.

    Both sentence columns are encoded with the same sentence-embedding model,
    and the cosine similarity is computed row by row between the two sets of
    embeddings.

    Args:
        data: DataFrame with 'sentence1' and 'sentence2' columns.
        model_name: Model identifier passed to ``SentenceTransformer``.

    Returns:
        A 1-D NumPy array holding one cosine similarity per row of ``data``,
        in row order. The array is empty when ``data`` has no rows.
    """
    print("Loading retrieval model:", model_name)
    retrieval_model = SentenceTransformer(model_name)

    # Encode sentences from both columns
    print("Computing embeddings for sentence1...")
    embeddings1 = np.asarray(
        retrieval_model.encode(data['sentence1'].tolist(), show_progress_bar=True)
    )
    print("Computing embeddings for sentence2...")
    embeddings2 = np.asarray(
        retrieval_model.encode(data['sentence2'].tolist(), show_progress_bar=True)
    )

    # Nothing to compare: return an empty result instead of indexing axis 1
    # of an array that may be one-dimensional.
    if len(embeddings1) == 0:
        return np.array([])

    # Row-wise cosine similarity computed in one vectorized pass rather than
    # one library call per pair.
    norms1 = np.linalg.norm(embeddings1, axis=1)
    norms2 = np.linalg.norm(embeddings2, axis=1)
    # A zero-length embedding would divide by zero; using 1 as its norm makes
    # the similarity 0 instead of NaN.
    norms1[norms1 == 0] = 1.0
    norms2[norms2 == 0] = 1.0
    dot_products = np.sum(embeddings1 * embeddings2, axis=1)
    similarities = dot_products / (norms1 * norms2)

    return similarities

In [None]:
def benchmark_reranker(data, model_name, threshold=0.5, batch_size=32):
    """Predict a binary label for each sentence pair with a cross-encoder.

    All pairs are scored in a single ``predict`` call so the model can batch
    them, instead of running one forward pass per pair.

    Args:
        data: DataFrame with 'sentence1' and 'sentence2' columns.
        model_name: Model identifier passed to ``CrossEncoder``.
        threshold: A pair is labelled 1 when its score is strictly greater
            than this value, otherwise 0. Defaults to 0.5.
        batch_size: Number of pairs scored per batch by ``predict``.

    Returns:
        A list of ints (0 or 1), one per row of ``data``, in row order.
        The list is empty when ``data`` has no rows.
    """
    print("Loading reranker model:", model_name)
    cross_encoder = CrossEncoder(model_name)

    # CrossEncoder expects a list of sentence pairs
    sentence_pairs = list(zip(data['sentence1'], data['sentence2']))
    if not sentence_pairs:
        return []

    # The model returns one continuous score per pair.
    # NOTE(review): a fixed cut-off presumes a single probability-like score
    # per pair — confirm for the model in use.
    scores = cross_encoder.predict(
        sentence_pairs,
        batch_size=batch_size,
        show_progress_bar=True,
    )

    # Threshold each score to obtain the binary label.
    reranker_preds = [int(score > threshold) for score in scores]

    return reranker_preds