In [19]:
import os
import torch
import sys
from IRutils.load_data import calculate_percentiles
from IRutils.inference import evaluate, evaluate_average_ensemble, evaluate_conditional_ensemble, evaluate_weighted_average_ensemble, write_results
from IRutils.load_data import load, preprocess
from IRutils.models import load_model, load_models
from IRutils.plotting_utils import *
from IRutils.weight_optimizer import precompute_validation_scores, find_optimal_weights_config
from ir_measures import nDCG, AP, P, R, RR

In [20]:
# Metric scores per evaluation strategy; each entry is overwritten by the
# corresponding evaluation cell further down.
results = {
    'baseline': {},
    'ens-avg': {},
    'ens-select': {},
    'ens-weighted': {},
    'ens-learned-weighted': {},
}
metric_to_optimize_weights = nDCG @ 10  # Metric the ensemble weights are optimized for
weight_opt_trials = 50  # Number of Optuna trials per category (adjust as needed)

model_name = 'huawei-noah/TinyBERT_General_4L_312D'
dataset_name = 'fiqa'
length_setting = 'full'
metrics = [nDCG @ 3, nDCG @ 5, nDCG @ 10, RR, P @ 1, P @ 3, P @ 5, R @ 1, R @ 3, R @ 5, R @ 10]
max_len_doc = 512
random_state = 42

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# os.path.join picks the platform's separator, so the checkpoint directory
# also resolves on non-Windows systems (a hard-coded backslash only works on
# Windows, where this produces the same string as before).
model_dir = os.path.join('models', model_name, dataset_name)

models = load_models(model_dir, model_name, device)

Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\long_queries.pth...
Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\medium_queries.pth...
Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\short_queries.pth...
Ensembling models from models\huawei-noah/TinyBERT_General_4L_312D\fiqa!


In [21]:
# Load the dataset selected in the config cell; `load` returns seven values
# that are unpacked here in the order it provides them.
(
    train_available,
    docs,
    queries,
    qrels,
    docs_test,
    queries_test,
    qrels_test,
) = load(dataset_name)
print('Loading complete!')

  0%|          | 0/57638 [00:00<?, ?it/s]

  0%|          | 0/57638 [00:00<?, ?it/s]

Train and test set available!
Loading complete!


In [22]:
# Length of every query, measured in whitespace-separated tokens.
query_lengths = [len(text.split()) for text in queries.values()]

# Two cut-off lengths derived from the length distribution; they separate the
# short / medium / long buckets below.
t1, t2 = calculate_percentiles(query_lengths)

ranges = {
    'short': (1, t1),
    'medium': (t1, t2),
    'long': (t2, sys.maxsize),
    'full': (1, sys.maxsize),
}

In [23]:
# The dedicated test split is only forwarded when the dataset ships one;
# otherwise preprocess is called without the *_test arguments.
test_split_kwargs = (
    {'queries_test': queries_test, 'docs_test': docs_test, 'qrels_test': qrels_test}
    if train_available
    else {}
)

(
    train_loader,
    val_loader,
    test_loader,
    split_queries_test,
    split_qrels_test,
    query_val,
    qrels_val,
) = preprocess(
    queries, docs, qrels, model_name, length_setting, train_available,
    **test_split_kwargs,
    max_len_doc=max_len_doc,
    random_state=random_state,
)

print('Preprocessing complete!')

Dataset size: 5500
test size: 648
Example query from full subset:
('11104', 'Selling a stock for gain to offset other stock loss')
Length of subset of full validation queries: 1100
Length of subset of full training queries: 4399
Length of subset of full queries: 5499
Number of negatives in qrels: 0
Creating training dataset...


100%|██████████| 4399/4399 [00:11<00:00, 375.65it/s]


Creating validation dataset...


100%|██████████| 1100/1100 [00:03<00:00, 331.64it/s]


Creating testing dataset...


100%|██████████| 648/648 [00:01<00:00, 348.52it/s]


Preprocessing complete!


## Evaluate on baseline model (trained on all query lengths)

In [24]:
# Resolve the checkpoint for the configured length setting relative to the
# current working directory and load it.
model_dir = os.path.join(os.getcwd(), f'models/{model_name}/{dataset_name}')
model_path = os.path.join(model_dir, f'{length_setting}_queries.pth')
model = load_model(model_path, model_name, device)

# Score against the dataset's own test qrels when available, otherwise
# against the qrels returned by preprocess.
baseline_qrels = qrels_test if train_available else split_qrels_test
metric_scores = evaluate(model, test_loader, device, baseline_qrels)

for metric in metrics:
    score = metric_scores[metric]
    print(f'Metric {metric} score: {score:.4f}')

results['baseline'] = metric_scores

Evaluating:   0%|          | 0/1067 [00:00<?, ?it/s]

Metric nDCG@3 score: 0.5044
Metric nDCG@5 score: 0.5743
Metric nDCG@10 score: 0.6474
Metric RR score: 0.6047
Metric P@1 score: 0.4228
Metric P@3 score: 0.3349
Metric P@5 score: 0.2858
Metric R@1 score: 0.2613
Metric R@3 score: 0.5171
Metric R@5 score: 0.6825
Metric R@10 score: 0.8599


In [25]:
# Persist the baseline scores. The file name follows `length_setting`, just
# like the checkpoint loaded for the baseline and the label written into the
# file, so a run with a different length setting cannot overwrite the
# results of the 'full' model.
save_dir = f"results/{model_name}/{dataset_name}"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, f'{length_setting}_queries.txt')

write_results(metric_scores, save_path, model_name, dataset_name, length_setting)


Successfully written results to results/huawei-noah/TinyBERT_General_4L_312D/fiqa\full_queries.txt.


## Evaluate on ensemble of models (Averaging method)

In [26]:
# Reload the ensemble members from the current model_dir.
models = load_models(model_dir, model_name, device)

# Pick the relevance judgements matching the test data in use, then score the
# ensemble with the averaging strategy.
if train_available:
    avg_qrels = qrels_test
else:
    avg_qrels = split_qrels_test
metric_scores = evaluate_average_ensemble(models, test_loader, device, avg_qrels)

for metric in metrics:
    score = metric_scores[metric]
    print(f'Metric {metric} score: {score:.4f}')

results['ens-avg'] = metric_scores

Loading model C:\Users\chena\PycharmProjects\IR-rankingmodels\models/huawei-noah/TinyBERT_General_4L_312D/fiqa\long_queries.pth...
Loading model C:\Users\chena\PycharmProjects\IR-rankingmodels\models/huawei-noah/TinyBERT_General_4L_312D/fiqa\medium_queries.pth...
Loading model C:\Users\chena\PycharmProjects\IR-rankingmodels\models/huawei-noah/TinyBERT_General_4L_312D/fiqa\short_queries.pth...
Ensembling models from C:\Users\chena\PycharmProjects\IR-rankingmodels\models/huawei-noah/TinyBERT_General_4L_312D/fiqa!


Evaluating:   0%|          | 0/1067 [00:00<?, ?it/s]

Metric nDCG@3 score: 0.6120
Metric nDCG@5 score: 0.6602
Metric nDCG@10 score: 0.7138
Metric RR score: 0.7148
Metric P@1 score: 0.5617
Metric P@3 score: 0.4048
Metric P@5 score: 0.3151
Metric R@1 score: 0.3275
Metric R@3 score: 0.6086
Metric R@5 score: 0.7446
Metric R@10 score: 0.8811


In [27]:
# Write the averaging-ensemble scores next to the other results of this
# model/dataset pair, creating the directory if it is missing.
save_dir = f"results/{model_name}/{dataset_name}"
os.makedirs(save_dir, exist_ok=True)

results_file = 'ensemble-avg.txt'
save_path = os.path.join(save_dir, results_file)
write_results(metric_scores, save_path, model_name, dataset_name, length_setting)

Successfully written results to results/huawei-noah/TinyBERT_General_4L_312D/fiqa\ensemble-avg.txt.


## Evaluate on ensemble of models (Selective method)

In [28]:
# os.path.join picks the platform's separator, so the checkpoint directory
# also resolves on non-Windows systems (a hard-coded backslash only works on
# Windows, where this produces the same string as before).
model_dir = os.path.join('models', model_name, dataset_name)

models = load_models(model_dir, model_name, device)

# The selective ensemble receives the length thresholds t1/t2 together with
# the query texts; use the queries and qrels that match the test data in use.
if train_available:
    metric_scores = evaluate_conditional_ensemble(models, t1, t2, test_loader, device, qrels_test, queries_test)
else:
    metric_scores = evaluate_conditional_ensemble(models, t1, t2, test_loader, device, split_qrels_test, split_queries_test)

for metric in metrics:
    print(f'Metric {metric} score: {metric_scores[metric]:.4f}')

results['ens-select'] = metric_scores

Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\long_queries.pth...
Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\medium_queries.pth...
Loading model models\huawei-noah/TinyBERT_General_4L_312D\fiqa\short_queries.pth...
Ensembling models from models\huawei-noah/TinyBERT_General_4L_312D\fiqa!


Evaluating Conditional Ensemble:   0%|          | 0/1067 [00:00<?, ?it/s]


Calculating aggregate metrics...
Metrics calculation complete.
Metric nDCG@3 score: 0.4469
Metric nDCG@5 score: 0.5044
Metric nDCG@10 score: 0.5763
Metric RR score: 0.5500
Metric P@1 score: 0.3580
Metric P@3 score: 0.2963
Metric P@5 score: 0.2432
Metric R@1 score: 0.2169
Metric R@3 score: 0.4762
Metric R@5 score: 0.6170
Metric R@10 score: 0.7958


In [29]:
# Write the selective-ensemble scores next to the other results of this
# model/dataset pair, creating the directory if it is missing.
save_dir = f"results/{model_name}/{dataset_name}"
os.makedirs(save_dir, exist_ok=True)

results_file = 'ensemble-selective.txt'
save_path = os.path.join(save_dir, results_file)
write_results(metric_scores, save_path, model_name, dataset_name, length_setting)

Successfully written results to results/huawei-noah/TinyBERT_General_4L_312D/fiqa\ensemble-selective.txt.


## Evaluate on ensemble of models (Weighted average method)

In [None]:
# os.path.join picks the platform's separator, so the checkpoint directory
# also resolves on non-Windows systems (a hard-coded backslash only works on
# Windows, where this produces the same string as before).
model_dir = os.path.join('models', model_name, dataset_name)
models = load_models(model_dir, model_name, device)

# Hand-picked weights, one row per query-length bucket. Each row lists the
# weights for the [short, medium, long] models.
weights_config = {
    'short': [0.4, 0.3, 0.3],   # Weights when query is short
    'medium': [0.3, 0.4, 0.3],  # Weights when query is medium
    'long': [0.3, 0.3, 0.4],    # Weights when query is long
}

# Use the queries and qrels that match the test data in use.
if train_available:
    metric_scores = evaluate_weighted_average_ensemble(models, weights_config, t1, t2, test_loader, device, qrels_test, queries_test)
else:
    metric_scores = evaluate_weighted_average_ensemble(models, weights_config, t1, t2, test_loader, device, split_qrels_test, split_queries_test)

for metric in metrics:
    print(f'Metric {metric} score: {metric_scores[metric]:.4f}')

results['ens-weighted'] = metric_scores

In [None]:
# Write the fixed-weight ensemble scores next to the other results of this
# model/dataset pair, creating the directory if it is missing.
save_dir = f"results/{model_name}/{dataset_name}"
os.makedirs(save_dir, exist_ok=True)

results_file = 'ensemble-weighted.txt'
save_path = os.path.join(save_dir, results_file)
write_results(metric_scores, save_path, model_name, dataset_name, length_setting)

## Evaluate on ensemble of models (Weighted average method with weights optimized on the validation set)

In [None]:
# Score the validation set once with every ensemble member so the weight
# search in the next cell can reuse these scores.
print("\n--- Optimizing Ensemble Weights using Validation Set ---")

# Load the models again so this cell does not depend on `models` from above.
models_all = load_models(model_dir, model_name, device)
precomputed_val_scores = precompute_validation_scores(
    models_all,
    val_loader,
    device,
)

In [None]:
# Search for one weights configuration on the validation set.
# NOTE: this reassigns weight_opt_trials, replacing the value from the
# configuration cell for the search below.
weight_opt_trials = 300

learned_weights_config = find_optimal_weights_config(
    precomputed_val_scores,
    query_val,
    qrels_val,
    t1, t2,
    metric_to_optimize=metric_to_optimize_weights,
    n_trials=weight_opt_trials,
    random_state=random_state,
)

print("\nLearned Weights Config:")
print(learned_weights_config)

# Apply the learned configuration to the TEST set, reusing the models that
# were loaded into models_all for the validation pass.
print("\n--- Evaluating on TEST set using LEARNED weights configuration ---")
if train_available:
    eval_qrels, eval_queries = qrels_test, queries_test
else:
    eval_qrels, eval_queries = split_qrels_test, split_queries_test
metric_scores_learned_w = evaluate_weighted_average_ensemble(
    models_all, learned_weights_config, t1, t2, test_loader, device, eval_qrels, eval_queries
)

print("\nFinal Test Set Performance with Learned Weights:")
for metric in metrics:
    score = metric_scores_learned_w[metric]
    print(f'Metric {metric} score: {score:.4f}')

results['ens-learned-weighted'] = metric_scores_learned_w

# Persist the scores; the label passed to write_results replaces the length
# setting used by the other save cells.
save_dir = f"results/{model_name}/{dataset_name}"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, 'ensemble-weighted-reg.txt')
write_results(metric_scores_learned_w, save_path, model_name, dataset_name, "learned-weighted-config")

## Plot results

In [None]:
# Show the metric scores collected for every evaluation strategy.
print(results)

In [None]:
# Compare all strategies across the configured metrics. save_dir is whatever
# the most recent save step assigned (presumably where the figure is written;
# confirm in IRutils.plotting_utils).
create_comparison_plot(
    results,
    metrics,
    model_name,
    dataset_name,
    save_dir,
)