In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import product
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Load Data

In [None]:
# Available evaluation sets (paths are relative to the notebook's working directory).
OLD_EVAL_F1 = "../data/evaluation/OLD_EVAL_F1.csv"
MANUAL_FINDINGS_COMBINED_1000 = "../data/samples/sample_manual_findings_combined_1000.csv"

# Select the data set to evaluate by swapping the constant passed to read_csv.
data = pd.read_csv(MANUAL_FINDINGS_COMBINED_1000)

# Fail early when one of the required columns is absent.
expected_cols = {'sentence1', 'sentence2', 'label'}
if expected_cols - set(data.columns):
    raise ValueError("CSV file must contain columns: {}".format(expected_cols))

# Binarise the labels: "irrelevant" (case-insensitive) becomes 0, every other value becomes 1.
data['label'] = data['label'].map(lambda raw: int(str(raw).lower() != "irrelevant"))

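## Dashboard

> **Note:** The cells in this section call helper functions defined under **Functions** at the end of the notebook. Run those definition cells first; on a fresh kernel, executing the notebook strictly top to bottom raises `NameError`.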

**Evaluation TODOs** 
- create datasets for the other tests
- implement predefined benchmark tests
- configure pipeline for all the tests
- log the results to a file (with visualization)
- add confusion matrices 

### Top-k Retrieval

In [None]:
# Model name handed to SentenceTransformer (via compute_embeddings) for the retrieval stage.
RETRIEVER = "bowphs/SPhilBERTa" 

In [None]:
# Encode every candidate (sentence2) and every unique query (sentence1) once,
# so the k/threshold sweeps can reuse the same embeddings.
candidate_embeddings, query_embeddings = compute_embeddings(data, RETRIEVER)

In [None]:
# Coarse grid search: every k is paired with every similarity threshold.
for k in (2, 5, 8):
    for threshold in (0.5, 0.6, 0.7, 0.8):
        benchmark_top_k_retrieval_dual(candidate_embeddings, query_embeddings, k=k, threshold=threshold)

In [None]:
# Fine-grained threshold sweep at a fixed k of 5.
fine_thresholds = [0.6, 0.62, 0.64, 0.66, 0.68, 0.7, 0.72, 0.74, 0.76, 0.78, 0.8]
for threshold in fine_thresholds:
    benchmark_top_k_retrieval_dual(candidate_embeddings, query_embeddings, threshold=threshold, k=5)

### Reranking

In [None]:
# Model name handed to CrossEncoder inside benchmark_reranker().
RERANKER = "bowphs/LaBERTa" 
# Pairs whose score is strictly greater than this value are predicted as positive (label 1).
THRESHOLD = 0.5

In [None]:
# Score every sentence pair with the cross-encoder, then compare the binarised
# predictions against the gold labels.
reranker_preds = benchmark_reranker(data, RERANKER, THRESHOLD)
report = classification_report(y_true=data['label'], y_pred=reranker_preds)

print(f"Reranker Baseline Evaluation (threshold = {THRESHOLD}):", report, sep="\n")

## Functions 

### Top-k Retrieval

In [None]:
def build_positive_mapping(data):
    """
    Collect, for every query, the set of candidates it is positively paired with.

    Args:
        data: DataFrame with the columns 'sentence1' (query), 'sentence2'
            (candidate) and 'label'. Only rows whose label equals 1 contribute.

    Returns:
        A defaultdict(set) keyed by query text. Queries that have no row with
        label 1 do not appear as keys.
    """
    positives_by_query = defaultdict(set)
    # Walk the three columns in lockstep instead of materialising a Series per row.
    rows = zip(data['sentence1'], data['sentence2'], data['label'])
    for query_text, candidate_text, row_label in rows:
        if row_label == 1:
            positives_by_query[query_text].add(candidate_text)
    return positives_by_query

In [None]:
def compute_embeddings(data, model_name):
    """
    Encode the candidate pool and the evaluation queries with a SentenceTransformer.

    Args:
        data: DataFrame providing 'sentence1' (queries), 'sentence2' (candidates)
            and the columns build_positive_mapping() reads.
        model_name: Name passed to SentenceTransformer to load the encoder.

    Returns:
        (candidate_embeddings, query_embeddings):
        - candidate_embeddings: one embedding per row of 'sentence2'
          (duplicate sentences are kept, so positions match the row order).
        - query_embeddings: one embedding per unique 'sentence1', ordered with
          all queries that have at least one positive candidate first, followed
          by the queries that have none.
    """
    print("Loading retrieval model:", model_name)
    encoder = SentenceTransformer(model_name)

    # Queries are split by whether they own at least one positive candidate.
    positives_by_query = build_positive_mapping(data)
    with_positives, without_positives = [], []
    for query_text in data['sentence1'].unique():
        # .get() avoids inserting empty sets into the defaultdict.
        if positives_by_query.get(query_text):
            with_positives.append(query_text)
        else:
            without_positives.append(query_text)

    # The candidate pool is every sentence2 entry, in row order.
    print("Computing candidate embeddings for the full pool...")
    candidate_embeddings = encoder.encode(data['sentence2'].tolist(), show_progress_bar=True)

    # Positive queries come first so the embedding order is deterministic.
    print("Computing query embeddings for evaluation queries...")
    query_embeddings = encoder.encode(with_positives + without_positives, show_progress_bar=True)

    return candidate_embeddings, query_embeddings

In [None]:
def benchmark_top_k_retrieval_dual(candidate_embeddings, query_embeddings, k, threshold):
    """
    Evaluate thresholded top-k retrieval on positive and negative queries.

    NOTE: this function reads the module-level ``data`` DataFrame to rebuild the
    query list and candidate pool. The embeddings must have been computed from
    the same ``data`` (positive queries first, then negative queries; candidates
    in 'sentence2' row order), otherwise the indices would not line up.

    Args:
        candidate_embeddings: One embedding per row of data['sentence2'].
        query_embeddings: One embedding per unique data['sentence1'], ordered
            positives first, then negatives.
        k: Maximum number of candidates retrieved per query.
        threshold: Minimum cosine similarity a candidate needs to be retrieved.

    Returns:
        dict with the keys "recall_at_k", "false_positive_rate" (each None when
        the corresponding query group is empty), "positive_ranks" (best hit rank
        per positive query, np.inf when there is no hit), "negative_counts"
        (number of candidates retrieved per negative query),
        "candidate_embeddings", "query_embeddings" and "evaluation_queries".

    Raises:
        ValueError: If the number of embeddings does not match the number of
            queries or candidates derived from ``data``.
    """
    # Build mapping of positive candidates per query.
    positive_map = build_positive_mapping(data)

    # Split the unique queries; .get() avoids inserting empty sets into the defaultdict.
    all_queries = data['sentence1'].unique()
    positive_queries = [q for q in all_queries if positive_map.get(q)]
    negative_queries = [q for q in all_queries if not positive_map.get(q)]

    # Build candidate pool: use all sentence2 entries.
    candidate_sentences = data['sentence2'].tolist()

    # Combine queries for evaluation (same order as the query embeddings).
    evaluation_queries = positive_queries + negative_queries

    # Misaligned inputs would silently score the wrong sentences, so fail loudly.
    if len(query_embeddings) != len(evaluation_queries):
        raise ValueError(
            f"Expected {len(evaluation_queries)} query embeddings, got {len(query_embeddings)}."
        )
    if len(candidate_embeddings) != len(candidate_sentences):
        raise ValueError(
            f"Expected {len(candidate_sentences)} candidate embeddings, got {len(candidate_embeddings)}."
        )

    # Initialize metrics counters.
    positive_hits = 0      # Count of positive queries with at least one hit.
    total_positive = len(positive_queries)

    negative_false_positives = 0  # Count of negative queries with any retrieval.
    total_negative = len(negative_queries)

    all_positive_ranks = []       # Best rank among positive queries (np.inf if no hit).
    negative_retrieval_counts = []  # Count of retrieved candidates for negative queries.

    for q_emb, query in zip(query_embeddings, evaluation_queries):
        sims = cosine_similarity([q_emb], candidate_embeddings)[0]

        # Keep candidates at or above the threshold, best first, at most k of them.
        # An empty selection stays an (empty) integer index array.
        valid_indices = np.where(sims >= threshold)[0]
        retrieved_indices = valid_indices[np.argsort(-sims[valid_indices])][:k]

        positives = positive_map.get(query)
        if positives:
            # Rank (1-based) of the first retrieved candidate that is a known positive.
            best_rank = next(
                (
                    rank
                    for rank, idx in enumerate(retrieved_indices, start=1)
                    if candidate_sentences[idx] in positives
                ),
                np.inf,
            )
            if best_rank != np.inf:
                positive_hits += 1
            all_positive_ranks.append(best_rank)
        else:
            # Any retrieval for a query without positives counts as a false positive.
            count_retrieved = len(retrieved_indices)
            negative_retrieval_counts.append(count_retrieved)
            if count_retrieved > 0:
                negative_false_positives += 1

    recall_at_k = positive_hits / total_positive if total_positive > 0 else None
    false_positive_rate = negative_false_positives / total_negative if total_negative > 0 else None

    # The rates are None when a query group is empty; format them without crashing.
    recall_text = f"{recall_at_k:.4f}" if recall_at_k is not None else "n/a"
    fpr_text = f"{false_positive_rate:.4f}" if false_positive_rate is not None else "n/a"

    print(f"Top-k Retrieval Baseline Evaluation (k = {k}, threshold = {threshold}):\n")
    print(f"Positive Queries: {total_positive}")
    print(f"Hits: {positive_hits}")
    print(f"Recall@{k}: {recall_text}\n")
    print(f"Negative Queries: {total_negative}")
    print(f"False Positive Rate: {fpr_text}")
    print()
    print()

    # Return results along with the embeddings for further experimentation.
    return {
        "recall_at_k": recall_at_k,
        "false_positive_rate": false_positive_rate,
        "positive_ranks": all_positive_ranks,
        "negative_counts": negative_retrieval_counts,
        "candidate_embeddings": candidate_embeddings,
        "query_embeddings": query_embeddings,
        "evaluation_queries": evaluation_queries
    }


### Reranking

In [None]:
def benchmark_reranker(data, model_name, threshold):
    """
    Predict a binary label for every (sentence1, sentence2) pair with a CrossEncoder.

    Args:
        data: DataFrame with the columns 'sentence1' and 'sentence2'.
        model_name: Name passed to CrossEncoder to load the reranking model.
        threshold: A pair is labelled 1 when its score is strictly greater than
            this value, otherwise 0.

    Returns:
        List of ints (0 or 1), one per row of ``data`` in row order. An empty
        list is returned for empty input without loading the model.
    """
    sentence_pairs = list(zip(data['sentence1'], data['sentence2']))
    if not sentence_pairs:
        # Nothing to score; skip the (expensive) model load.
        return []

    cross_encoder = CrossEncoder(model_name)

    # Score all pairs in one call so the model can batch them internally,
    # instead of running one forward pass per pair.
    scores = cross_encoder.predict(sentence_pairs, show_progress_bar=True)

    # The model returns a continuous score per pair; binarise it with `threshold`.
    return [int(score > threshold) for score in scores]