In [1]:
from torch import nn
import torch
import os
from IRutils.models import TripletRankerModel
from ir_measures import calc_aggregate
from tqdm.notebook import tqdm

def load_model(path, device):
    """Instantiate a TripletRankerModel and restore the weights stored at ``path``.

    The backbone name is taken from the notebook-level ``model_name`` variable,
    so that variable has to exist by the time this function is called.

    Args:
        path: Location of a state dict previously written with ``torch.save``.
        device: Torch device the weights are mapped to and the model is moved to.

    Returns:
        The model, placed on ``device`` and switched to evaluation mode.
    """
    ranker = TripletRankerModel(model_name).to(device=device)
    # map_location lets a checkpoint saved on another device load onto `device`.
    weights = torch.load(path, map_location=device)
    ranker.load_state_dict(weights)
    ranker.to(device)
    ranker.eval()
    return ranker


In [2]:
def evaluate(models, test_loader, device, qrels):
    """Score a triplet test set with an ensemble of rankers and aggregate IR metrics.

    For every batch, each model embeds the anchor, positive and negative inputs.
    The L2 distance between the anchor embedding and each document embedding is
    averaged over the models, and the negated mean distance becomes the
    document's score (a smaller distance gives a higher score).

    Args:
        models: Iterable of models called as ``model(input_ids, attention_mask)``
            and returning a 2-D tensor with one embedding row per batch item.
        test_loader: Iterable of batch dicts providing ``qid``, ``pos_did``,
            ``neg_did`` and, for each of ``anchor``/``positive``/``negative``,
            ``<role>_input_ids`` and ``<role>_attention_mask`` tensors.
        device: Torch device the models and batch tensors are moved to.
        qrels: Relevance judgements, passed unchanged to ``calc_aggregate``.

    Returns:
        The result of ``calc_aggregate`` for nDCG@10/100, AP@10/100, P@10/100,
        R@10/100 and RR over the run built from the scored triplets.
    """
    # Imported locally so the function does not depend on a later notebook cell
    # having put the metric objects into the global namespace.
    from ir_measures import nDCG, AP, P, R, RR

    # Materialize once so a one-shot iterable is not exhausted after the first
    # batch, and prepare every model a single time instead of once per batch.
    models = list(models)
    for model in models:
        model.eval()
        model.to(device)

    run = {}  # Format: {qid: {doc_id: score}}

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            qids = batch["qid"]
            pos_dids = batch["pos_did"]
            neg_dids = batch["neg_did"]

            # Move the tokenized inputs to the evaluation device
            anchor_inputs = batch["anchor_input_ids"].to(device)
            anchor_masks = batch["anchor_attention_mask"].to(device)
            positive_inputs = batch["positive_input_ids"].to(device)
            positive_masks = batch["positive_attention_mask"].to(device)
            negative_inputs = batch["negative_input_ids"].to(device)
            negative_masks = batch["negative_attention_mask"].to(device)

            per_model_pos = []
            per_model_neg = []

            for model in models:
                anchor_embeddings = model(anchor_inputs, anchor_masks)
                positive_embeddings = model(positive_inputs, positive_masks)
                negative_embeddings = model(negative_inputs, negative_masks)

                # Euclidean distance per batch item (reduced over dim=1)
                per_model_pos.append(
                    torch.norm(anchor_embeddings - positive_embeddings, p=2, dim=1)
                )
                per_model_neg.append(
                    torch.norm(anchor_embeddings - negative_embeddings, p=2, dim=1)
                )

            # Average the distances over the ensemble and negate them so that
            # closer documents rank higher. One tolist() per batch replaces a
            # separate .item() call per element.
            pos_scores = (-torch.stack(per_model_pos).mean(dim=0)).tolist()
            neg_scores = (-torch.stack(per_model_neg).mean(dim=0)).tolist()

            # Build the run dictionary; a document scored again for the same
            # query simply overwrites its previous entry.
            for qid, pos_did, neg_did, pos_score, neg_score in zip(
                qids, pos_dids, neg_dids, pos_scores, neg_scores
            ):
                doc_scores = run.setdefault(qid, {})
                doc_scores[pos_did] = pos_score
                doc_scores[neg_did] = neg_score

    # Calculate metrics
    metrics = [
        nDCG @ 10, nDCG @ 100,
        AP @ 10, AP @ 100,
        P @ 10, R @ 10,
        P @ 100, R @ 100,
        RR
    ]

    return calc_aggregate(metrics, qrels, run)

In [3]:
model_name = 'distilbert-base-uncased'
dataset_name = 'fiqa'

# Build the checkpoint directory with os.path.join so the separator matches the
# host OS; hard-coded backslashes only form a directory path on Windows.
model_dir = os.path.join('models', model_name, dataset_name)

# os.listdir returns entries in arbitrary order, so sort them to keep the
# ensemble order stable between runs. Entries whose file name contains "full"
# are left out of the ensemble.
model_paths = [
    os.path.join(model_dir, name)
    for name in sorted(os.listdir(model_dir))
    if "full" not in name
]
models = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
for path in model_paths:
    model = load_model(path, device)
    models.append(model)

# Number of checkpoints loaded into the ensemble
print(len(models))

3


In [4]:
from beir.datasets.data_loader import GenericDataLoader
from beir import util

# Splits listed per dataset name; consulted below to decide whether a
# train split can be loaded for `dataset_name`.
datasets = {
    'msmarco': ['train', 'dev'],
    'hotpotqa': ['train', 'dev', 'test'],
    'arguana': ['test'],
    'quora': ['dev', 'test'],
    'scidocs': ['test'],  # small
    'fever': ['train', 'dev', 'test'],  # large
    'climate-fever': ['test'],
    'scifact': ['train', 'test'],
    'fiqa': ['train', 'dev', 'test'],
    'nfcorpus': ['train', 'dev', 'test'],
}

# Download and extract the archive for `dataset_name`
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip"
data_path = util.download_and_unzip(url, "datasets")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `train_available` tells the later cells which set of test variables exists:
# `docs_test`/`queries_test`/`qrels_test` are only defined when it is True.
train_available = False
if 'train' not in datasets[dataset_name]:
    # Only the test split is loaded; it is stored in docs/queries/qrels
    docs, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test")
    print('Only test set available!')
else:
    # Each split is read through its own GenericDataLoader instance
    docs, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="train")
    docs_test, queries_test, qrels_test = GenericDataLoader(data_folder=data_path).load(split="test")
    train_available = True
    print('Train and test set available!')

  0%|          | 0/57638 [00:00<?, ?it/s]

  0%|          | 0/57638 [00:00<?, ?it/s]

Train and test set available!


In [5]:
from transformers import AutoTokenizer

# Load the tokenizer for `model_name`, the same name that `load_model`
# passes to TripletRankerModel when building each ensemble member.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
import numpy as np

def calculate_percentiles(query_lengths, lower=33, upper=67):
    """Return two integer percentile thresholds of ``query_lengths``.

    Args:
        query_lengths: Non-empty array-like of numeric lengths.
        lower: Percentile (0-100) for the first threshold. Defaults to 33.
        upper: Percentile (0-100) for the second threshold. Defaults to 67.

    Returns:
        A ``(t1, t2)`` tuple of Python ints: the ``lower`` and ``upper``
        percentiles, each truncated toward zero by ``int()``.

    Raises:
        ValueError: If ``query_lengths`` is empty.
    """
    lengths = np.asarray(query_lengths)
    # Fail early with a clear message instead of letting the percentile of an
    # empty array surface as an unrelated NumPy error or an int(nan) failure.
    if lengths.size == 0:
        raise ValueError("query_lengths must contain at least one value")

    t1 = np.percentile(lengths, lower)
    t2 = np.percentile(lengths, upper)
    return int(t1), int(t2)

In [7]:
from IRutils import dataprocessor

dp = dataprocessor.DataProcessor(queries, docs, qrels)

print(f'Dataset size: {len(queries)}')

# When a train split was loaded, the separately loaded test queries are used
# as they are. Otherwise a test set (covering queries of all lengths) is
# separated from the loaded data first.
if train_available:
    print(f'test size: {len(queries_test)}')
else:
    query_test, qrel_test = dp.get_testset(test_ratio=0.2, random_state=42)
    print(f'test size: {len(query_test)}')

Dataset size: 5500
test size: 648


In [8]:
from torch.utils.data import DataLoader
from IRutils.dataset import TripletRankingDataset

qrel_scores = list(qrels.values())
relevance_scores = [list(item.values()) for item in qrel_scores]
# NOTE(review): this counts zero-relevance judgements for the first query in
# `qrels` only, and `qrels` is the train split when one was loaded -- confirm
# this is the intended source for `num_negatives`.
num_negatives = relevance_scores[0].count(0)
print(f'Number of negatives in qrels: {num_negatives}')

print('Creating testing dataset...')
# Pick the test inputs that exist for this dataset: the dedicated test split
# when a train split was loaded, otherwise the held-out portion of the data.
if train_available:
    test_queries, test_docs, test_qrels = queries_test, docs_test, qrels_test
else:
    test_queries, test_docs, test_qrels = query_test, docs, qrel_test

test_dataset = TripletRankingDataset(
    test_queries, test_docs, test_qrels, tokenizer, num_negatives, max_length=512
)
# Evaluation does not benefit from shuffling; a fixed batch order keeps
# repeated runs of the notebook comparable.
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

Number of negatives in qrels: 0
Creating testing dataset...


100%|██████████| 648/648 [00:01<00:00, 328.90it/s]


In [9]:
from IRutils import inference
from ir_measures import nDCG, AP, P, R, RR

# Metrics to report; each one is used as a key into the aggregated scores below.
metrics = [
    nDCG@10, nDCG@100,
    AP@10, AP@100,
    P@10, R@10,
    P@100, R@100,
    RR,
]

# Evaluate the ensemble against the qrels that match the test loader. The
# conditional expression only evaluates the name that exists for this dataset.
eval_qrels = qrels_test if train_available else qrel_test
metric_scores = evaluate(models, test_loader, device, eval_qrels)

for metric in metrics:
    print(f'Metric {metric} score: {metric_scores[metric]:.4f}')

Evaluating:   0%|          | 0/4265 [00:00<?, ?it/s]

Metric nDCG@10 score: 0.7402
Metric nDCG@100 score: 0.7802
Metric AP@10 score: 0.6417
Metric AP@100 score: 0.6644
Metric P@10 score: 0.2106
Metric R@10 score: 0.8857
Metric P@100 score: 0.0261
Metric R@100 score: 0.9977
Metric RR score: 0.7430


# Ensemble test 2

In [9]:
def evaluate(models, test_loader, device, qrels):
    """Score a triplet test set with an ensemble of rankers and aggregate IR metrics.

    NOTE(review): a function with this name and the same logic is already
    defined in an earlier cell of this notebook; running this cell rebinds
    ``evaluate``. Consider keeping a single definition.

    For every batch, each model embeds the anchor, positive and negative inputs.
    The L2 distance between the anchor embedding and each document embedding is
    averaged over the models, and the negated mean distance becomes the
    document's score (a smaller distance gives a higher score).

    Args:
        models: Iterable of models called as ``model(input_ids, attention_mask)``
            and returning a 2-D tensor with one embedding row per batch item.
        test_loader: Iterable of batch dicts providing ``qid``, ``pos_did``,
            ``neg_did`` and, for each of ``anchor``/``positive``/``negative``,
            ``<role>_input_ids`` and ``<role>_attention_mask`` tensors.
        device: Torch device the models and batch tensors are moved to.
        qrels: Relevance judgements, passed unchanged to ``calc_aggregate``.

    Returns:
        The result of ``calc_aggregate`` for nDCG@10/100, AP@10/100, P@10/100,
        R@10/100 and RR over the run built from the scored triplets.
    """
    # Imported locally so the function does not depend on another notebook cell
    # having put the metric objects into the global namespace.
    from ir_measures import nDCG, AP, P, R, RR

    # Materialize once so a one-shot iterable is not exhausted after the first
    # batch, and prepare every model a single time instead of once per batch.
    models = list(models)
    for model in models:
        model.eval()
        model.to(device)

    run = {}  # Format: {qid: {doc_id: score}}

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            qids = batch["qid"]
            pos_dids = batch["pos_did"]
            neg_dids = batch["neg_did"]

            # Move the tokenized inputs to the evaluation device
            anchor_inputs = batch["anchor_input_ids"].to(device)
            anchor_masks = batch["anchor_attention_mask"].to(device)
            positive_inputs = batch["positive_input_ids"].to(device)
            positive_masks = batch["positive_attention_mask"].to(device)
            negative_inputs = batch["negative_input_ids"].to(device)
            negative_masks = batch["negative_attention_mask"].to(device)

            per_model_pos = []
            per_model_neg = []

            for model in models:
                anchor_embeddings = model(anchor_inputs, anchor_masks)
                positive_embeddings = model(positive_inputs, positive_masks)
                negative_embeddings = model(negative_inputs, negative_masks)

                # Euclidean distance per batch item (reduced over dim=1)
                per_model_pos.append(
                    torch.norm(anchor_embeddings - positive_embeddings, p=2, dim=1)
                )
                per_model_neg.append(
                    torch.norm(anchor_embeddings - negative_embeddings, p=2, dim=1)
                )

            # Average the distances over the ensemble and negate them so that
            # closer documents rank higher. One tolist() per batch replaces a
            # separate .item() call per element.
            pos_scores = (-torch.stack(per_model_pos).mean(dim=0)).tolist()
            neg_scores = (-torch.stack(per_model_neg).mean(dim=0)).tolist()

            # Build the run dictionary; a document scored again for the same
            # query simply overwrites its previous entry.
            for qid, pos_did, neg_did, pos_score, neg_score in zip(
                qids, pos_dids, neg_dids, pos_scores, neg_scores
            ):
                doc_scores = run.setdefault(qid, {})
                doc_scores[pos_did] = pos_score
                doc_scores[neg_did] = neg_score

    # Calculate metrics
    metrics = [
        nDCG @ 10, nDCG @ 100,
        AP @ 10, AP @ 100,
        P @ 10, R @ 10,
        P @ 100, R @ 100,
        RR
    ]

    return calc_aggregate(metrics, qrels, run)