In [1]:
import os
import torch
from IRutils.inference import evaluate, evaluate_average_ensemble, evaluate_conditional_ensemble, evaluate_weighted_average_ensemble
from IRutils.load_data import load, preprocess
from ir_measures import calc_aggregate
from tqdm.notebook import tqdm
from IRutils.models import load_model, load_models
from ir_measures import nDCG, AP, P, R, RR

  from tqdm.autonotebook import tqdm


In [2]:
# Experiment configuration: the pretrained backbone, the dataset, the
# query-length subset to preprocess, and the ir_measures metrics to report.
model_name = 'huawei-noah/TinyBERT_General_4L_312D'
dataset_name = 'fiqa'
length_setting = 'full'
metrics = [nDCG@10, nDCG@100, AP@10, AP@100, P@10, R@10, P@100, R@100, RR]
max_len_doc = 512   # forwarded to preprocess() as max_len_doc
random_state = 42   # forwarded to preprocess() as random_state

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the checkpoint directory with os.path.join instead of hard-coded
# backslashes: on Windows this yields the same string as before, and on
# POSIX systems it yields a path that actually resolves.
model_dir = os.path.join('models', model_name, dataset_name)


# Load the checkpoints stored under model_dir onto the selected device.
models = load_models(model_dir, model_name, device)

Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\long_queries.pth...
Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\medium_queries.pth...
Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\short_queries.pth...
Ensembling models from models\huawei-noah/TinyBERT_General_4L_312D\fiqa!


In [3]:
# Fetch the dataset named in the config cell. load() returns seven values:
# a flag plus the train-side and test-side documents, queries and qrels.
(
    train_available,
    docs, queries, qrels,
    docs_test, queries_test, qrels_test,
) = load(dataset_name)
print('Loading complete!')

  0%|          | 0/57638 [00:00<?, ?it/s]

  0%|          | 0/57638 [00:00<?, ?it/s]

Train and test set available!
Loading complete!


In [4]:
import sys
from IRutils.load_data import calculate_percentiles

# Whitespace-token count of every query text in the loaded query set.
query_lengths = [len(text.split()) for text in queries.values()]

# Two length thresholds derived from the query-length distribution
# (presumably percentile cut-offs — confirm against calculate_percentiles).
t1, t2 = calculate_percentiles(query_lengths)

# (lower, upper) length bounds for each query-length bucket; 'full' spans all.
ranges = dict(
    short=(1, t1),
    medium=(t1, t2),
    long=(t2, sys.maxsize),
    full=(1, sys.maxsize),
)

In [5]:
# The test-side inputs are only forwarded to preprocess() when load()
# reported that a train/test split is available; otherwise preprocess() is
# called without them.
test_inputs = {}
if train_available:
    test_inputs = dict(queries_test=queries_test, docs_test=docs_test, qrels_test=qrels_test)

(
    train_loader,
    val_loader,
    test_loader,
    split_queries_test,
    split_qrels_test,
) = preprocess(
    queries, docs, qrels, model_name, length_setting, train_available,
    **test_inputs,
    max_len_doc=max_len_doc, random_state=random_state,
)

print('Preprocessing complete!')

Dataset size: 5500
test size: 648
Example query from full subset:
('11104', 'Selling a stock for gain to offset other stock loss')
Length of subset of full validation queries: 1100
Length of subset of full training queries: 4399
Length of subset of full queries: 5499
Number of negatives in qrels: 0
Creating training dataset...


100%|██████████| 4399/4399 [00:13<00:00, 335.74it/s]


Creating validation dataset...


100%|██████████| 1100/1100 [00:03<00:00, 330.52it/s]


Creating testing dataset...


100%|██████████| 648/648 [00:01<00:00, 353.15it/s]

Preprocessing complete!





## Evaluate on baseline model (trained on all query lengths)

In [6]:
# model_dir = os.path.join(os.getcwd(), f'models/{model_name}/{dataset_name}')
# model_path = os.path.join(model_dir, f'{length_setting}_queries.pth')
# model = load_model(model_path, model_name, device)
# 
# if train_available:
#     metric_scores = evaluate(model, test_loader, device, qrels_test)
# else:
#     metric_scores = evaluate(model, test_loader, device, split_qrels_test)
#     
# for metric in metrics:
#     print(f'Metric {metric} score: {metric_scores[metric]:.4f}')

In [7]:
# save_dir = f"results/{model_name}/{dataset_name}"
# os.makedirs(save_dir, exist_ok=True)
# 
# # Save results to a file
# with open(f"{os.path.join(save_dir, 'full_queries.txt')}", "w") as f:
#     f.write(f"Evaluation of results for {model_name} model trained on {dataset_name} dataset:\n")
#     f.write(f"normalized Discounted Cumulative Gain@10: {metric_scores[nDCG@10]:.4f}\n")
#     f.write(f"normalized Discounted Cumulative Gain@100: {metric_scores[nDCG@100]:.4f}\n")
#     f.write(f"\n")
#     f.write(f"[Mean] Average Precision@10: {metric_scores[AP@10]:.4f}\n")
#     f.write(f"[Mean] Average Precision@100: {metric_scores[AP@100]:.4f}\n")
#     f.write(f"\n")
#     f.write(f"Precision@10: {metric_scores[P@10]:.4f}\n")
#     f.write(f"Recall@10: {metric_scores[R@10]:.4f}\n")
#     f.write(f"\n")
#     f.write(f"Precision@100: {metric_scores[P@100]:.4f}\n")
#     f.write(f"Recall@100: {metric_scores[R@100]:.4f}\n")
#     f.write(f"\n")
#     f.write(f"[Mean] Reciprocal Rank: {metric_scores[RR]:.4f}\n")
#     f.write(f"\n")
#     f.write(f"----------------------------------------------------\n")
#     f.write(f"\n")
#     f.write(f"Explanation of metrics:\n")
#     f.write(f"NDCG@k (Normalized Discounted Cumulative Gain: Ranking Quality | Prioritizes highly relevant documents appearing earlier in the ranking.\n")
#     f.write(f"MAP (Mean Average Precision): Overall Relevance | Measures ranking precision across all relevant documents. Best for small-scale retrieval tasks.\n")
#     f.write(f"Precision@k: Relevance | Measures how many of the top-k documents are relevant. Works well in precision-sensitive applications.\n")
#     f.write(f"Recall@k: Coverage | Measures how many relevant documents appear in the top-k results. Important in recall-sensitive tasks.\n")
#     f.write(f"MRR (Mean Reciprocal Rank): Single Relevant Result | Focuses on ranking the first relevant document. Good for QA tasks.\n")

## Evaluate on ensemble of models (Averaging method)

In [8]:
# NOTE(review): every statement in this cell is commented out, yet the
# notebook still carries saved output for it (model-loading log, a model
# repr and metric scores) — presumably from an earlier, uncommented run.
# Confirm, then either restore the code or clear the stale output.
# models = load_models(model_dir, model_name, device)
# 
# if train_available:
#     metric_scores = evaluate_average_ensemble(models, test_loader, device, qrels_test)
# else:
#     metric_scores = evaluate_average_ensemble(models, test_loader, device, split_qrels_test)
#     
# for metric in metrics:
#     print(f'Metric {metric} score: {metric_scores[metric]:.4f}')

Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\long_queries.pth...
Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\medium_queries.pth...
Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\short_queries.pth...
Ensembling models from models\huawei-noah/TinyBERT_General_4L_312D\fiqa!
{'long': TripletRankerModel(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312,

Evaluating: 100%|██████████| 4265/4265 [20:38<00:00,  3.44it/s]

Metric nDCG@10 score: 0.6886
Metric nDCG@100 score: 0.7378
Metric AP@10 score: 0.5825
Metric AP@100 score: 0.6080
Metric P@10 score: 0.1985
Metric R@10 score: 0.8554
Metric P@100 score: 0.0260
Metric R@100 score: 0.9961
Metric RR score: 0.6841





In [12]:
# save_dir = f"results/{model_name}/{dataset_name}"
# os.makedirs(save_dir, exist_ok=True)
# 
# # Save results to a file
# with open(f"{os.path.join(save_dir, 'ensemble-avg.txt')}", "w") as f:
#     f.write(f"Evaluation of results for {model_name} model trained on {dataset_name} dataset:\n")
#     f.write(f"normalized Discounted Cumulative Gain@10: {metric_scores[nDCG@10]:.4f}\n")
#     f.write(f"normalized Discounted Cumulative Gain@100: {metric_scores[nDCG@100]:.4f}\n")
#     f.write(f"\n")
#     f.write(f"[Mean] Average Precision@10: {metric_scores[AP@10]:.4f}\n")
#     f.write(f"[Mean] Average Precision@100: {metric_scores[AP@100]:.4f}\n")
#     f.write(f"\n")
#     f.write(f"Precision@10: {metric_scores[P@10]:.4f}\n")
#     f.write(f"Recall@10: {metric_scores[R@10]:.4f}\n")
#     f.write(f"\n")
#     f.write(f"Precision@100: {metric_scores[P@100]:.4f}\n")
#     f.write(f"Recall@100: {metric_scores[R@100]:.4f}\n")
#     f.write(f"\n")
#     f.write(f"[Mean] Reciprocal Rank: {metric_scores[RR]:.4f}\n")
#     f.write(f"\n")
#     f.write(f"----------------------------------------------------\n")
#     f.write(f"\n")
#     f.write(f"Explanation of metrics:\n")
#     f.write(f"NDCG@k (Normalized Discounted Cumulative Gain: Ranking Quality | Prioritizes highly relevant documents appearing earlier in the ranking.\n")
#     f.write(f"MAP (Mean Average Precision): Overall Relevance | Measures ranking precision across all relevant documents. Best for small-scale retrieval tasks.\n")
#     f.write(f"Precision@k: Relevance | Measures how many of the top-k documents are relevant. Works well in precision-sensitive applications.\n")
#     f.write(f"Recall@k: Coverage | Measures how many relevant documents appear in the top-k results. Important in recall-sensitive tasks.\n")
#     f.write(f"MRR (Mean Reciprocal Rank): Single Relevant Result | Focuses on ranking the first relevant document. Good for QA tasks.\n")

## Evaluate on ensemble of models (Selective method)

In [14]:
# Checkpoint directory, built with os.path.join instead of hard-coded
# backslashes so the path also resolves on non-Windows systems (on Windows
# the resulting string is unchanged).
model_dir = os.path.join('models', model_name, dataset_name)

# NOTE(review): the saved output of this cell shows a CUDA "illegal memory
# access" error raised while loading the first checkpoint. The same models
# were already loaded into `models` by the config cell; consider reusing them
# instead of loading a second copy — confirm whether that avoids the error.
models = load_models(model_dir, model_name, device)

# Evaluate against the test qrels/queries returned by load() when a train/test
# split is available, otherwise against the split returned by preprocess().
if train_available:
    eval_qrels, eval_queries = qrels_test, queries_test
else:
    eval_qrels, eval_queries = split_qrels_test, split_queries_test

# t1 and t2 are the query-length thresholds computed earlier in the notebook.
metric_scores = evaluate_conditional_ensemble(models, t1, t2, test_loader, device, eval_qrels, eval_queries)

for metric in metrics:
    print(f'Metric {metric} score: {metric_scores[metric]:.4f}')

Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\long_queries.pth...


RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Write the scores currently held in `metric_scores` to a plain-text report
# under results/<model_name>/<dataset_name>/.
save_dir = f"results/{model_name}/{dataset_name}"
os.makedirs(save_dir, exist_ok=True)

# (label, metric) pairs in report order. Each inner list is one paragraph of
# the report and is followed by a blank line.
report_sections = [
    [("normalized Discounted Cumulative Gain@10", nDCG@10),
     ("normalized Discounted Cumulative Gain@100", nDCG@100)],
    [("[Mean] Average Precision@10", AP@10),
     ("[Mean] Average Precision@100", AP@100)],
    [("Precision@10", P@10),
     ("Recall@10", R@10)],
    [("Precision@100", P@100),
     ("Recall@100", R@100)],
    [("[Mean] Reciprocal Rank", RR)],
]

# Static glossary appended to every report. The first entry now closes the
# parenthesis that was left open in the previous wording.
metric_notes = [
    "NDCG@k (Normalized Discounted Cumulative Gain): Ranking Quality | Prioritizes highly relevant documents appearing earlier in the ranking.",
    "MAP (Mean Average Precision): Overall Relevance | Measures ranking precision across all relevant documents. Best for small-scale retrieval tasks.",
    "Precision@k: Relevance | Measures how many of the top-k documents are relevant. Works well in precision-sensitive applications.",
    "Recall@k: Coverage | Measures how many relevant documents appear in the top-k results. Important in recall-sensitive tasks.",
    "MRR (Mean Reciprocal Rank): Single Relevant Result | Focuses on ranking the first relevant document. Good for QA tasks.",
]

# Save results to a file. The encoding is pinned so the report does not depend
# on the platform's default text encoding.
with open(os.path.join(save_dir, 'ensemble-selective.txt'), "w", encoding="utf-8") as f:
    f.write(f"Evaluation of results for {model_name} model trained on {dataset_name} dataset:\n")
    for section in report_sections:
        for label, metric in section:
            f.write(f"{label}: {metric_scores[metric]:.4f}\n")
        f.write("\n")
    f.write("----------------------------------------------------\n")
    f.write("\n")
    f.write("Explanation of metrics:\n")
    for note in metric_notes:
        f.write(f"{note}\n")

## Evaluate on ensemble of models (Weighted average method)

In [None]:
# Checkpoint directory, built with os.path.join instead of hard-coded
# backslashes so the path also resolves on non-Windows systems (on Windows
# the resulting string is unchanged).
model_dir = os.path.join('models', model_name, dataset_name)
models = load_models(model_dir, model_name, device)

# NOTE(review): the weights below are listed in [short, medium, long] order,
# while the load log earlier in the notebook shows the checkpoints being
# loaded in long, medium, short order. Confirm that
# evaluate_weighted_average_ensemble matches weights to models by name and
# not by the iteration order of `models`.
weights_config = {'short': [0.6, 0.2, 0.2], # Weights for [short, medium, long] models when query is short
                  'medium': [0.2, 0.6, 0.2],# Weights when query is medium
                  'long': [0.2, 0.2, 0.6]  # Weights when query is long
                 }

# Evaluate against the test qrels/queries returned by load() when a train/test
# split is available, otherwise against the split returned by preprocess().
if train_available:
    eval_qrels, eval_queries = qrels_test, queries_test
else:
    eval_qrels, eval_queries = split_qrels_test, split_queries_test

# t1 and t2 are the query-length thresholds computed earlier in the notebook.
metric_scores = evaluate_weighted_average_ensemble(models, weights_config, t1, t2, test_loader, device, eval_qrels, eval_queries)

for metric in metrics:
    print(f'Metric {metric} score: {metric_scores[metric]:.4f}')

In [None]:
# Write the scores currently held in `metric_scores` to a plain-text report
# under results/<model_name>/<dataset_name>/.
save_dir = f"results/{model_name}/{dataset_name}"
os.makedirs(save_dir, exist_ok=True)

# (label, metric) pairs in report order. Each inner list is one paragraph of
# the report and is followed by a blank line.
report_sections = [
    [("normalized Discounted Cumulative Gain@10", nDCG@10),
     ("normalized Discounted Cumulative Gain@100", nDCG@100)],
    [("[Mean] Average Precision@10", AP@10),
     ("[Mean] Average Precision@100", AP@100)],
    [("Precision@10", P@10),
     ("Recall@10", R@10)],
    [("Precision@100", P@100),
     ("Recall@100", R@100)],
    [("[Mean] Reciprocal Rank", RR)],
]

# Static glossary appended to every report. The first entry now closes the
# parenthesis that was left open in the previous wording.
metric_notes = [
    "NDCG@k (Normalized Discounted Cumulative Gain): Ranking Quality | Prioritizes highly relevant documents appearing earlier in the ranking.",
    "MAP (Mean Average Precision): Overall Relevance | Measures ranking precision across all relevant documents. Best for small-scale retrieval tasks.",
    "Precision@k: Relevance | Measures how many of the top-k documents are relevant. Works well in precision-sensitive applications.",
    "Recall@k: Coverage | Measures how many relevant documents appear in the top-k results. Important in recall-sensitive tasks.",
    "MRR (Mean Reciprocal Rank): Single Relevant Result | Focuses on ranking the first relevant document. Good for QA tasks.",
]

# Save results to a file. The encoding is pinned so the report does not depend
# on the platform's default text encoding.
with open(os.path.join(save_dir, 'ensemble-weighted.txt'), "w", encoding="utf-8") as f:
    f.write(f"Evaluation of results for {model_name} model trained on {dataset_name} dataset:\n")
    for section in report_sections:
        for label, metric in section:
            f.write(f"{label}: {metric_scores[metric]:.4f}\n")
        f.write("\n")
    f.write("----------------------------------------------------\n")
    f.write("\n")
    f.write("Explanation of metrics:\n")
    for note in metric_notes:
        f.write(f"{note}\n")