In [33]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run, ir_datasets
import pyterrier as pt
import os

In [34]:
# Make sure PyTerrier is ready before any pt.* call in the cells below.
# ensure_pyterrier_is_loaded is the helper imported from tira.third_party_integrations above;
# NOTE(review): presumably it starts PyTerrier if it is not running yet — confirm against the TIRA docs.
ensure_pyterrier_is_loaded()

In [35]:
# Dataset: the union of the IR Anthology and the ACL Anthology.
# The 'irds:' prefix makes PyTerrier resolve the name through its ir_datasets integration.
dataset = 'ir-lab-sose-2024/ir-acl-anthology-20240504-training'
pt_dataset = pt.get_dataset('irds:' + dataset)

# Directory of the Terrier index; data.properties inside it is used as the
# marker that the index has already been built.
pt_index_path = './terrier-index'
index_properties_path = pt_index_path + "/data.properties"

if os.path.exists(index_properties_path):
    # An index is already on disk: reuse it instead of re-indexing the corpus.
    index_ref = pt.IndexRef.of(index_properties_path)
else:
    # No index yet: build one with IterDictIndexer, feeding it the corpus
    # iterator directly and recording 'docno' as document metadata.
    # NOTE(review): blocks=True is presumably required by the SDM rewrite used later — confirm.
    indexer = pt.index.IterDictIndexer(pt_index_path, blocks=True)
    index_ref = indexer.index(pt_dataset.get_corpus_iter(), meta=('docno',))

index = pt.IndexFactory.of(index_ref)

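# Running a Grid Search to find the best configuration for BM25
We run BM25 once for every combination of `b` and `k_1` on the grid, evaluate all runs, and afterwards reuse the configuration with the highest nDCG@10 for the BM25 retriever in the learning-to-rank pipeline.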

In [36]:
def run_bm25_grid_search_run(index, output_dir, queries):
    """
    Run BM25 for every (b, k_1) grid point and persist each run to disk.

    For each combination the directory ``<output_dir>/bm25-b=<b>-k_1=<k_1>``
    is removed (if present), recreated empty, and the retrieval run is written
    into it with ``persist_and_normalize_run``.

    Terrier defaults (http://terrier.org/docs/current/javadoc/org/terrier/matching/models/BM25.html):
        k_1 = 1.2d, k_3 = 8d, b = 0.75d
    We do not tune parameter k_3, as this parameter only impacts queries with redundant terms.

    Args:
        index: index handed to ``pt.BatchRetrieve``.
        output_dir: parent directory that receives one sub-directory per configuration.
        queries: topics passed to each BM25 retriever.
    """
    # Local import so the function is self-contained; the directory handling is
    # done in Python instead of `!rm` / `!mkdir` shell escapes, which only work
    # inside IPython, require POSIX tools, and paste the path into a shell command.
    import shutil

    for b in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
        for k_1 in [0.5, 1.0, 1.5, 2.0, 2.5]:
            # The system name doubles as the directory name and is parsed again
            # later to recover the best parameters, so its format must stay stable.
            system = f'bm25-b={b}-k_1={k_1}'
            configuration = {"bm25.b" : b, "bm25.k_1": k_1}
            run_output_dir = output_dir + '/' + system

            # Start from an empty directory so results of an earlier run cannot linger.
            shutil.rmtree(run_output_dir, ignore_errors=True)
            os.makedirs(run_output_dir, exist_ok=True)

            print(f'Run {system}')
            BM25 = pt.BatchRetrieve(index, wmodel="BM25", controls=configuration, verbose=True)
            run = BM25(queries)
            persist_and_normalize_run(run, system, run_output_dir)

In [37]:
# Load the dataset through TIRA's ir_datasets wrapper.
# NOTE(review): dataset_tira is not used by any visible cell; the load is kept
# in case it is what makes the topics file available locally — confirm.
dataset_tira = ir_datasets.load(dataset)

# Read the topics (TREC XML) and run the complete BM25 grid on them.
topics_file = ir_datasets.topics_file(dataset)
queries = pt.io.read_topics(topics_file, format='trecxml')
run_bm25_grid_search_run(index, output_dir='grid-search/training', queries=queries)

Run bm25-b=0.1-k_1=0.5


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 23.67q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.1-k_1=0.5".
Done. run file is stored under "grid-search/training/bm25-b=0.1-k_1=0.5/run.txt".
Run bm25-b=0.1-k_1=1.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 44.85q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.1-k_1=1.0".
Done. run file is stored under "grid-search/training/bm25-b=0.1-k_1=1.0/run.txt".
Run bm25-b=0.1-k_1=1.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 36.88q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.1-k_1=1.5".
Done. run file is stored under "grid-search/training/bm25-b=0.1-k_1=1.5/run.txt".
Run bm25-b=0.1-k_1=2.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 46.83q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.1-k_1=2.0".
Done. run file is stored under "grid-search/training/bm25-b=0.1-k_1=2.0/run.txt".
Run bm25-b=0.1-k_1=2.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 39.70q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.1-k_1=2.5".
Done. run file is stored under "grid-search/training/bm25-b=0.1-k_1=2.5/run.txt".
Run bm25-b=0.2-k_1=0.5


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 30.70q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.2-k_1=0.5".
Done. run file is stored under "grid-search/training/bm25-b=0.2-k_1=0.5/run.txt".
Run bm25-b=0.2-k_1=1.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 34.72q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.2-k_1=1.0".
Done. run file is stored under "grid-search/training/bm25-b=0.2-k_1=1.0/run.txt".
Run bm25-b=0.2-k_1=1.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 37.89q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.2-k_1=1.5".
Done. run file is stored under "grid-search/training/bm25-b=0.2-k_1=1.5/run.txt".
Run bm25-b=0.2-k_1=2.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 47.01q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.2-k_1=2.0".
Done. run file is stored under "grid-search/training/bm25-b=0.2-k_1=2.0/run.txt".
Run bm25-b=0.2-k_1=2.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 43.04q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.2-k_1=2.5".
Done. run file is stored under "grid-search/training/bm25-b=0.2-k_1=2.5/run.txt".
Run bm25-b=0.3-k_1=0.5


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 30.86q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.3-k_1=0.5".
Done. run file is stored under "grid-search/training/bm25-b=0.3-k_1=0.5/run.txt".
Run bm25-b=0.3-k_1=1.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 42.39q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.3-k_1=1.0".
Done. run file is stored under "grid-search/training/bm25-b=0.3-k_1=1.0/run.txt".
Run bm25-b=0.3-k_1=1.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 42.45q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.3-k_1=1.5".
Done. run file is stored under "grid-search/training/bm25-b=0.3-k_1=1.5/run.txt".
Run bm25-b=0.3-k_1=2.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 35.57q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.3-k_1=2.0".
Done. run file is stored under "grid-search/training/bm25-b=0.3-k_1=2.0/run.txt".
Run bm25-b=0.3-k_1=2.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 47.54q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.3-k_1=2.5".
Done. run file is stored under "grid-search/training/bm25-b=0.3-k_1=2.5/run.txt".
Run bm25-b=0.4-k_1=0.5


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 24.22q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.4-k_1=0.5".
Done. run file is stored under "grid-search/training/bm25-b=0.4-k_1=0.5/run.txt".
Run bm25-b=0.4-k_1=1.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 41.19q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.4-k_1=1.0".
Done. run file is stored under "grid-search/training/bm25-b=0.4-k_1=1.0/run.txt".
Run bm25-b=0.4-k_1=1.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 43.82q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.4-k_1=1.5".
Done. run file is stored under "grid-search/training/bm25-b=0.4-k_1=1.5/run.txt".
Run bm25-b=0.4-k_1=2.0


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 32.32q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.4-k_1=2.0".
Done. run file is stored under "grid-search/training/bm25-b=0.4-k_1=2.0/run.txt".
Run bm25-b=0.4-k_1=2.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 45.90q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.4-k_1=2.5".
Done. run file is stored under "grid-search/training/bm25-b=0.4-k_1=2.5/run.txt".
Run bm25-b=0.5-k_1=0.5


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 31.81q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.5-k_1=0.5".
Done. run file is stored under "grid-search/training/bm25-b=0.5-k_1=0.5/run.txt".
Run bm25-b=0.5-k_1=1.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 39.86q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.5-k_1=1.0".
Done. run file is stored under "grid-search/training/bm25-b=0.5-k_1=1.0/run.txt".
Run bm25-b=0.5-k_1=1.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 44.43q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.5-k_1=1.5".
Done. run file is stored under "grid-search/training/bm25-b=0.5-k_1=1.5/run.txt".
Run bm25-b=0.5-k_1=2.0


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 31.36q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.5-k_1=2.0".
Done. run file is stored under "grid-search/training/bm25-b=0.5-k_1=2.0/run.txt".
Run bm25-b=0.5-k_1=2.5


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 33.45q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.5-k_1=2.5".
Done. run file is stored under "grid-search/training/bm25-b=0.5-k_1=2.5/run.txt".
Run bm25-b=0.6-k_1=0.5


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 33.42q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.6-k_1=0.5".
Done. run file is stored under "grid-search/training/bm25-b=0.6-k_1=0.5/run.txt".
Run bm25-b=0.6-k_1=1.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 44.58q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.6-k_1=1.0".
Done. run file is stored under "grid-search/training/bm25-b=0.6-k_1=1.0/run.txt".
Run bm25-b=0.6-k_1=1.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 40.71q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.6-k_1=1.5".
Done. run file is stored under "grid-search/training/bm25-b=0.6-k_1=1.5/run.txt".
Run bm25-b=0.6-k_1=2.0


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 31.90q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.6-k_1=2.0".
Done. run file is stored under "grid-search/training/bm25-b=0.6-k_1=2.0/run.txt".
Run bm25-b=0.6-k_1=2.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 41.58q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.6-k_1=2.5".
Done. run file is stored under "grid-search/training/bm25-b=0.6-k_1=2.5/run.txt".
Run bm25-b=0.7-k_1=0.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 40.07q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.7-k_1=0.5".
Done. run file is stored under "grid-search/training/bm25-b=0.7-k_1=0.5/run.txt".
Run bm25-b=0.7-k_1=1.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 41.53q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.7-k_1=1.0".
Done. run file is stored under "grid-search/training/bm25-b=0.7-k_1=1.0/run.txt".
Run bm25-b=0.7-k_1=1.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 37.06q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.7-k_1=1.5".
Done. run file is stored under "grid-search/training/bm25-b=0.7-k_1=1.5/run.txt".
Run bm25-b=0.7-k_1=2.0


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 25.22q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.7-k_1=2.0".
Done. run file is stored under "grid-search/training/bm25-b=0.7-k_1=2.0/run.txt".
Run bm25-b=0.7-k_1=2.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 42.72q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.7-k_1=2.5".
Done. run file is stored under "grid-search/training/bm25-b=0.7-k_1=2.5/run.txt".
Run bm25-b=0.8-k_1=0.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 40.75q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.8-k_1=0.5".
Done. run file is stored under "grid-search/training/bm25-b=0.8-k_1=0.5/run.txt".
Run bm25-b=0.8-k_1=1.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 39.58q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.8-k_1=1.0".
Done. run file is stored under "grid-search/training/bm25-b=0.8-k_1=1.0/run.txt".
Run bm25-b=0.8-k_1=1.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 40.97q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.8-k_1=1.5".
Done. run file is stored under "grid-search/training/bm25-b=0.8-k_1=1.5/run.txt".
Run bm25-b=0.8-k_1=2.0


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 32.43q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.8-k_1=2.0".
Done. run file is stored under "grid-search/training/bm25-b=0.8-k_1=2.0/run.txt".
Run bm25-b=0.8-k_1=2.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 38.97q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.8-k_1=2.5".
Done. run file is stored under "grid-search/training/bm25-b=0.8-k_1=2.5/run.txt".
Run bm25-b=0.9-k_1=0.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 44.54q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.9-k_1=0.5".
Done. run file is stored under "grid-search/training/bm25-b=0.9-k_1=0.5/run.txt".
Run bm25-b=0.9-k_1=1.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 38.46q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.9-k_1=1.0".
Done. run file is stored under "grid-search/training/bm25-b=0.9-k_1=1.0/run.txt".
Run bm25-b=0.9-k_1=1.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 35.34q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.9-k_1=1.5".
Done. run file is stored under "grid-search/training/bm25-b=0.9-k_1=1.5/run.txt".
Run bm25-b=0.9-k_1=2.0


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 31.63q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.9-k_1=2.0".
Done. run file is stored under "grid-search/training/bm25-b=0.9-k_1=2.0/run.txt".
Run bm25-b=0.9-k_1=2.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 43.94q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.9-k_1=2.5".
Done. run file is stored under "grid-search/training/bm25-b=0.9-k_1=2.5/run.txt".
Run bm25-b=1.0-k_1=0.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 37.00q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=1.0-k_1=0.5".
Done. run file is stored under "grid-search/training/bm25-b=1.0-k_1=0.5/run.txt".
Run bm25-b=1.0-k_1=1.0


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 38.29q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=1.0-k_1=1.0".
Done. run file is stored under "grid-search/training/bm25-b=1.0-k_1=1.0/run.txt".
Run bm25-b=1.0-k_1=1.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 39.16q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=1.0-k_1=1.5".
Done. run file is stored under "grid-search/training/bm25-b=1.0-k_1=1.5/run.txt".
Run bm25-b=1.0-k_1=2.0


BR(BM25): 100%|██████████| 68/68 [00:02<00:00, 31.01q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=1.0-k_1=2.0".
Done. run file is stored under "grid-search/training/bm25-b=1.0-k_1=2.0/run.txt".
Run bm25-b=1.0-k_1=2.5


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 44.87q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=1.0-k_1=2.5".
Done. run file is stored under "grid-search/training/bm25-b=1.0-k_1=2.5/run.txt".


# Evaluate all Configurations from Grid Search

In [39]:
!pip3 install trectools

from trectools import TrecRun, TrecQrel, TrecEval
from tira.rest_api_client import Client
from glob import glob
import pandas as pd
# TIRA REST client; used by load_qrels to download the truth data.
tira = Client()

def load_qrels(dataset):
    """Download the truth data of `dataset` (task 'ir-lab-sose-2024') and load its qrels.txt.

    Args:
        dataset: TIRA dataset id, e.g. 'ir-acl-anthology-20240504-training'.

    Returns:
        A TrecQrel built from '<download location>/qrels.txt'.
    """
    truth_dir = tira.download_dataset('ir-lab-sose-2024', dataset, truth_dataset=True)
    return TrecQrel(truth_dir + '/qrels.txt')

# Relevance judgments used to score every grid-search run.
training_qrels = load_qrels('ir-acl-anthology-20240504-training')


def evaluate_run(run_dir, qrels):
    """Score the run stored in `run_dir` against `qrels`.

    Args:
        run_dir: directory that contains the run as 'run.txt'.
        qrels: TrecQrel with the relevance judgments.

    Returns:
        dict with the run id under 'run' plus the measures 'nDCG@10',
        'nDCG@10 (unjudgedRemoved)', 'MAP', 'MRR' and 'P@10'.
        Note that 'MAP' is computed with depth=10.
    """
    trec_run = TrecRun(f'{run_dir}/run.txt')
    evaluator = TrecEval(trec_run, qrels)

    # Filled one entry at a time, in the order the columns should appear.
    measures = {'run': trec_run.get_runid()}
    measures['nDCG@10'] = evaluator.get_ndcg(depth=10)
    measures['nDCG@10 (unjudgedRemoved)'] = evaluator.get_ndcg(depth=10, removeUnjudged=True)
    measures['MAP'] = evaluator.get_map(depth=10)
    measures['MRR'] = evaluator.get_reciprocal_rank()
    measures['P@10'] = evaluator.get_precision(depth=10)
    return measures


# Evaluate every persisted grid-search run and pick the configuration with the
# highest nDCG@10.
# glob() returns paths in arbitrary order, so the directories are sorted first;
# together with a stable sort this makes the winner reproducible when several
# configurations tie on nDCG@10.
run_dirs = sorted(glob('grid-search/training/bm25*'))
if not run_dirs:
    # Fail with a clear message instead of a KeyError on an empty DataFrame.
    raise FileNotFoundError(
        'No grid-search runs found under "grid-search/training/bm25*"; '
        'run the grid search cell first.'
    )

# Build the frame once from all result records (one row per configuration).
df = pd.DataFrame([evaluate_run(run_dir, training_qrels) for run_dir in run_dirs])
df_sorted = df.sort_values('nDCG@10', ascending=False, kind='mergesort')
best_configuration = df_sorted['run'].values[0]

# Run ids are expected to look like 'bm25-b=<b>-k_1=<k_1>' (the system name used
# by the grid search); the parameter values are kept as strings.
best_b = best_configuration.split("-")[1].split("=")[1]
best_k_1 = best_configuration.split("-")[2].split("=")[1]

[0m

  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first()

In [40]:
# BM25 retriever configured with the winning grid-search parameters.
best_bm25_controls = {"bm25.b": best_b, "bm25.k_1": best_k_1}
bm25 = pt.BatchRetrieve(index, wmodel="BM25", controls=best_bm25_controls)

# SDM query rewriter; combined with bm25 in the feature pipeline as the
# "sequential dependence" feature.
sdm = pt.rewrite.SDM()

In [52]:
# Start PyTerrier only if no earlier cell has done so; the boot package requests terrier-prf.
# NOTE(review): ensure_pyterrier_is_loaded() runs in an earlier cell. If that already
# started PyTerrier, this branch is skipped and terrier-prf is never requested here —
# confirm the package is available on that path.
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])
    
# Topics of the 'title' variant; they are split into train/validation/test below.
topics = pt_dataset.get_topics(variant='title')

# Seed shared by both splits here and by the random forest in a later cell.
SEED=42

!pip3 install scikit-learn

from sklearn.model_selection import train_test_split

# An integer test_size is an absolute number of samples: 15 topics are held out
# as test set, then 5 of the remaining topics as validation set.
# NOTE(review): test_topics and valid_topics are not used by any visible cell.
tr_va_topics, test_topics = train_test_split(topics, test_size=15, random_state=SEED)
train_topics, valid_topics =  train_test_split(tr_va_topics, test_size=5, random_state=SEED)

# Feature pipeline: retrieve candidates with the tuned bm25, fetch the "text" and
# "doc_id" fields via pt.text.get_text, then combine one feature per `**` operand.
# The operand order matches `fnames` below, so keep both in sync.
# NOTE(review): the two query-expansion features build fresh BM25 retrievers without
# the tuned bm25.b / bm25.k_1 controls — confirm that this is intended.
ltr_feats1 = bm25 >> pt.text.get_text(pt_dataset, ["text", "doc_id"]) >> (
    # first feature ("BM25" in fnames): the input passed through unchanged
    pt.transformer.IdentityTransformer()
    ** # sequential dependence
    (sdm >> bm25)
    ** # score of text (not originally indexed)
    (pt.text.scorer(body_attr="text", wmodel="TF_IDF", background_index=index)) 
    ** # abstract coordinate match
    pt.BatchRetrieve(index, wmodel="CoordinateMatch")
    ** # Js_Kls
    pt.BatchRetrieve(index, wmodel="Js_KLs")
    ** # BO1 Expansion
    pt.BatchRetrieve(index, wmodel="BM25", controls={"qe":"on", "qemodel" : "Bo1"})
    ** # KL Expansion
    pt.BatchRetrieve(index, wmodel="BM25", controls={"qe":"on", "qemodel" : "KL"})
)

# for reference, lets record the feature names here too
# (one name per feature, in the same order as the `**` operands above)
fnames=["BM25", "SDM", 'text', "CoordinateMatch", "Js_KLs", "BO1", "KL"]

[0m

In [59]:
from sklearn.ensemble import RandomForestRegressor
!pip3 install joblib
import joblib

# Location of the cached random forest.
# NOTE(review): absolute, environment-specific path — confirm it exists wherever this notebook runs.
model_path = "/app/baseline-retrieval-system/model2.joblib"
model_is_cached = os.path.exists(model_path)

if model_is_cached:
    # Reuse the stored model instead of retraining.
    # NOTE(review): joblib.load unpickles the file; only load model files from a trusted source.
    model = joblib.load(model_path)
    print("Model loaded from file.")
else:
    model = RandomForestRegressor(n_estimators=400, verbose=1, random_state=SEED, n_jobs=2)

# The pipeline is the same in both cases: feature extraction followed by the regressor.
rf_pipe = ltr_feats1 >> pt.ltr.apply_learned_model(model)

if not model_is_cached:
    # Train on the training topics, then cache the fitted model for later runs.
    rf_pipe.fit(train_topics, pt_dataset.get_qrels())
    joblib.dump(model, model_path)
    print("Model trained and saved to file.")

print(rf_pipe)

[0mModel loaded from file.
Compose(Compose(Compose(BR(BM25), pt.apply.generic()), FUnion(Transformer(), FUnion(Compose(SDM(), BR(BM25)), FUnion(<pyterrier.batchretrieve.TextScorer object at 0x76b3799884c0>, FUnion(BR(CoordinateMatch), FUnion(BR(Js_KLs), FUnion(BR(BM25), BR(BM25)))))))), <pyterrier.ltr.RegressionTransformer object at 0x76b3793f9180>)


In [61]:
def save_results(system, name, output_dir='../runs'):
    """Run `system` on the 'text' topics and store the run as '<output_dir>/<name>.txt'.

    The run is first written by ``persist_and_normalize_run`` (as 'run.txt'
    inside `output_dir`) and then renamed so that several systems can be saved
    next to each other.

    Args:
        system: retrieval pipeline; called with the topics to produce the run.
        name: system name stored in the run and used as the file name.
        output_dir: directory for the run file; defaults to '../runs', the
            location that was previously hard-coded.
    """
    run = system(pt_dataset.get_topics('text'))
    persist_and_normalize_run(run, system_name=name, default_output=output_dir)
    # os.replace overwrites an existing target on every platform, so saving the
    # same system twice does not fail where os.rename would refuse to overwrite.
    os.replace(os.path.join(output_dir, 'run.txt'), os.path.join(output_dir, name + '.txt'))

# Apply the learned pipeline to the 'text' topics and persist the run under ../runs.
# NOTE(review): the model is fitted on topics of the 'title' variant, while the
# 'text' variant is used here — confirm both carry the same query text.
# NOTE(review): the output saved with this cell ends in a TypeError about item
# assignment on a BatchRetrieve, which suggests `run` was not a DataFrame when the
# cell was last executed — re-run from a fresh kernel and confirm.
text_topics = pt_dataset.get_topics('text')
run = rf_pipe(text_topics)
persist_and_normalize_run(run, system_name="RandomForestRegressor", default_output='../runs')

The run file is normalized outside the TIRA sandbox, I will store it at "../runs".


TypeError: 'BatchRetrieve' object does not support item assignment