In [9]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run, ir_datasets
from tira.rest_api_client import Client
ensure_pyterrier_is_loaded()
import pandas as pd
import pyterrier as pt
from tqdm import tqdm

In [10]:
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
# NOTE(review): ensure_pyterrier_is_loaded() is also called in the import cell;
# this second call looks redundant — confirm it is safe to call repeatedly.
ensure_pyterrier_is_loaded()
tira = Client()

In [11]:
# The dataset: the union of the IR Anthology and the ACL Anthology
# This line creates an IRDSDataset object and registers it under the name provided as an argument.
dataset = 'ir-lab-sose-2024/ir-acl-anthology-20240504-training'
pt_dataset = pt.get_dataset('irds:' + dataset)

# Fetch the index for this dataset through the TIRA client instead of indexing locally.
index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)

# Count the documents by streaming over the corpus iterator; materialising the
# whole corpus in a list just to take its length would hold every document in memory.
corpus_size = sum(1 for _ in pt_dataset.get_corpus_iter())
print("Files in IR-ACL corpus: %s " % corpus_size)


# Alternative (currently disabled): build a local PISA index instead of using the index fetched from TIRA
#index = PisaIndex('./index', overwrite=True)
#index.index(pt_dataset.get_corpus_iter())

# get all topics of training dataset
#topics = pt_dataset.get_topics()
#topics

ir-lab-sose-2024/ir-acl-anthology-20240504-training documents: 100%|██████████| 126958/126958 [00:02<00:00, 57155.22it/s]

Files in IR-ACL corpus: 126958 





# We now do Query expansion in order to improve retrieval effectiveness 
Query expansion generally improves recall: by adding more terms to the query, it broadens the search scope and can retrieve more relevant documents. It can also have a slight negative effect on precision, because the added terms may introduce irrelevant results. Therefore, we use both ndcg_cut_5 and recall_1000 as evaluation metrics.

In [19]:
# Baseline BM25 retriever over the index; it is also used as the first and
# last stage of the query-expansion pipelines in the following cells.
bm25 = pt.BatchRetrieve(index, wmodel="BM25")

In [20]:
# NOTE(review): PyTerrier is presumably already started by
# ensure_pyterrier_is_loaded() in an earlier cell, in which case this guard is a
# no-op and the terrier-prf boot package is not requested here — confirm the
# package is available in the running environment.
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

# BM25 -> Bo1 query expansion -> BM25 with the expanded query.
bo1_pipe = bm25 >> pt.rewrite.Bo1QueryExpansion(index) >> bm25

# Sanity check on a single query. bo1_pipe already starts with bm25, so it is
# used directly instead of prepending another (redundant) bm25 stage, and the
# result is displayed explicitly because it is not the last expression of the cell.
pipelineDisplay = bo1_pipe
display(pipelineDisplay.search("retrieval").head())

# Compare the baseline against the Bo1 pipeline on the training topics.
pt.Experiment(
    [bm25, bo1_pipe],
    pt_dataset.get_topics(),
    pt_dataset.get_qrels(),
    eval_metrics=['ndcg_cut_5', 'recall_1000'],
    names=['BM25', 'BM25 >> Bo1 >> BM25'],
)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


Unnamed: 0,name,ndcg_cut_5,recall_1000
0,BM25,0.39365,0.825376
1,BM25 >> Bo1 >> BM25,0.381675,0.833643


In [21]:
# BM25 -> KL query expansion -> BM25 with the expanded query.
kl_pipe = bm25 >> pt.rewrite.KLQueryExpansion(index) >> bm25

# Compare the baseline against the KL pipeline on the training topics.
pt.Experiment(
    [bm25, kl_pipe],
    pt_dataset.get_topics(),
    pt_dataset.get_qrels(),
    eval_metrics=['ndcg_cut_5', 'recall_1000'],
    names=['BM25', 'BM25 >> KL >> BM25'],
)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


Unnamed: 0,name,ndcg_cut_5,recall_1000
0,BM25,0.39365,0.825376
1,BM25 >> KL >> BM25,0.383947,0.831915


In [22]:
# Request the terrier-prf package only when PyTerrier has not been started yet.
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

# BM25 -> RM3 query expansion -> BM25 with the expanded query.
rm3_pipe = bm25 >> pt.rewrite.RM3(index) >> bm25

# Compare the baseline against the RM3 pipeline on the training topics.
pt.Experiment(
    [bm25, rm3_pipe],
    pt_dataset.get_topics(),
    pt_dataset.get_qrels(),
    eval_metrics=['ndcg_cut_5', 'recall_1000'],
    names=['BM25', 'BM25 >> RM3 >> BM25'],
)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.
11:49:14.698 [main] WARN org.terrier.querying.RM1 - Did not identify any usable candidate expansion terms from docid 125137 among 5 possibilities
11:49:14.934 [main] WARN org.terrier.querying.RM1 - Did not identify any usable candidate expansion terms from docid 116910 among 4 possibilities


Unnamed: 0,name,ndcg_cut_5,recall_1000
0,BM25,0.39365,0.825376
1,BM25 >> RM3 >> BM25,0.341725,0.825062


# Running a grid search to find the best BM25 configuration

In [14]:
def run_bm25_grid_search_run(index, output_dir, queries, b_values=(0.7, 0.75, 0.8), k_1_values=(1.1, 1.2, 1.3)):
    """
        Run BM25 once per (b, k_1) combination and persist each run in its own directory.

        defaults: http://terrier.org/docs/current/javadoc/org/terrier/matching/models/BM25.html
        k_1 = 1.2d, k_3 = 8d, b = 0.75d
        We do not tune parameter k_3, as this parameter only impacts queries with redundant terms.

        Parameters:
            index: index handed to pt.BatchRetrieve.
            output_dir: parent directory; each configuration is written to
                output_dir + '/' + 'bm25-b=<b>-k_1=<k_1>'. An existing directory of
                that name is deleted first.
            queries: queries passed to the BM25 retriever.
            b_values: values of the BM25 parameter b to try.
            k_1_values: values of the BM25 parameter k_1 to try.

        Returns:
            None. Each run is written via persist_and_normalize_run.
    """
    # Local imports keep this cell runnable on its own.
    import os
    import shutil

    # k_1_values is iterated once per b value, so materialise one-shot iterables.
    k_1_values = list(k_1_values)

    for b in b_values:
        for k_1 in k_1_values:
            system = f'bm25-b={b}-k_1={k_1}'
            configuration = {"bm25.b" : b, "bm25.k_1": k_1}
            run_output_dir = output_dir + '/' + system
            # Start from an empty directory. Pure-Python calls avoid interpolating
            # the path into a shell command (breaks on spaces/metacharacters).
            shutil.rmtree(run_output_dir, ignore_errors=True)
            os.makedirs(run_output_dir, exist_ok=True)
            print(f'Run {system}')
            BM25 = pt.BatchRetrieve(index, wmodel="BM25", controls=configuration, verbose=True)
            run = BM25(queries)
            persist_and_normalize_run(run, system, run_output_dir)

In [12]:
# Load the dataset through ir_datasets.
# NOTE(review): dataset_tira is not used in the visible cells — confirm whether
# the load is needed for its side effects.
dataset_tira = ir_datasets.load(dataset)

# Read the topics file (TREC XML) into the `queries` frame.
topics_path = ir_datasets.topics_file(dataset)
queries = pt.io.read_topics(topics_path, format='trecxml')

queries.head(3)

Unnamed: 0,qid,query
0,1,retrieval system improving effectiveness
1,2,machine learning language identification
2,3,social media detect self harm


In [15]:
# Run the BM25 grid search on the training queries; one sub-directory per configuration.
run_bm25_grid_search_run(
    index=index,
    output_dir='grid-search/training',
    queries=queries,
)

Run bm25-b=0.7-k_1=1.1


BR(BM25): 100%|██████████| 68/68 [00:03<00:00, 19.56q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.7-k_1=1.1".
Done. run file is stored under "grid-search/training/bm25-b=0.7-k_1=1.1/run.txt".
Run bm25-b=0.7-k_1=1.2


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 50.12q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.7-k_1=1.2".
Done. run file is stored under "grid-search/training/bm25-b=0.7-k_1=1.2/run.txt".
Run bm25-b=0.7-k_1=1.3


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 52.09q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.7-k_1=1.3".
Done. run file is stored under "grid-search/training/bm25-b=0.7-k_1=1.3/run.txt".
Run bm25-b=0.75-k_1=1.1


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 50.30q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.75-k_1=1.1".
Done. run file is stored under "grid-search/training/bm25-b=0.75-k_1=1.1/run.txt".
Run bm25-b=0.75-k_1=1.2


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 48.70q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.75-k_1=1.2".
Done. run file is stored under "grid-search/training/bm25-b=0.75-k_1=1.2/run.txt".
Run bm25-b=0.75-k_1=1.3


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 54.40q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.75-k_1=1.3".
Done. run file is stored under "grid-search/training/bm25-b=0.75-k_1=1.3/run.txt".
Run bm25-b=0.8-k_1=1.1


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 49.58q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.8-k_1=1.1".
Done. run file is stored under "grid-search/training/bm25-b=0.8-k_1=1.1/run.txt".
Run bm25-b=0.8-k_1=1.2


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 49.68q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.8-k_1=1.2".
Done. run file is stored under "grid-search/training/bm25-b=0.8-k_1=1.2/run.txt".
Run bm25-b=0.8-k_1=1.3


BR(BM25): 100%|██████████| 68/68 [00:01<00:00, 49.17q/s]


The run file is normalized outside the TIRA sandbox, I will store it at "grid-search/training/bm25-b=0.8-k_1=1.3".
Done. run file is stored under "grid-search/training/bm25-b=0.8-k_1=1.3/run.txt".


# Part 2: Evaluate all Configurations of the Grid Search

In [17]:
from trectools import TrecRun, TrecQrel, TrecEval
from tira.rest_api_client import Client
from glob import glob
import pandas as pd
# Create the TIRA REST client used by load_qrels below.
# NOTE(review): a client is also created earlier in the notebook; this rebinds `tira`.
tira = Client()

def load_qrels(dataset):
    """Download the truth data of `dataset` via TIRA and load its qrels file.

    Parameters:
        dataset: dataset name within the 'ir-lab-sose-2024' task.

    Returns:
        A TrecQrel built from '<download directory>/qrels.txt'.
    """
    truth_dir = tira.download_dataset('ir-lab-sose-2024', dataset, truth_dataset=True)
    qrels_path = truth_dir + '/qrels.txt'
    return TrecQrel(qrels_path)

# Relevance judgments of the training dataset, used to evaluate the grid-search runs.
training_qrels = load_qrels('ir-acl-anthology-20240504-training')

In [18]:
def evaluate_run(run_dir, qrels):
    """Evaluate the run file stored in `run_dir` against `qrels`.

    Parameters:
        run_dir: directory that contains a 'run.txt' run file.
        qrels: relevance judgments passed to TrecEval.

    Returns:
        A dict with the run id and the measures 'nDCG@10',
        'nDCG@10 (unjudgedRemoved)', 'MAP' and 'MRR'.
    """
    run_file = run_dir + '/run.txt'
    run = TrecRun(run_file)
    evaluator = TrecEval(run, qrels)

    scores = {'run': run.get_runid()}
    scores['nDCG@10'] = evaluator.get_ndcg(depth=10)
    scores['nDCG@10 (unjudgedRemoved)'] = evaluator.get_ndcg(depth=10, removeUnjudged=True)
    # NOTE(review): reported as 'MAP' but computed with depth=10 — confirm this cut-off is intended.
    scores['MAP'] = evaluator.get_map(depth=10)
    scores['MRR'] = evaluator.get_reciprocal_rank()
    return scores

In [19]:
# Evaluate every persisted grid-search run. glob returns paths in arbitrary
# order, so sort them to get a reproducible row index across executions.
run_dirs = sorted(glob('grid-search/training/bm25*'))

# Build the frame in one step from a list of records, so `df` is only ever a DataFrame.
df = pd.DataFrame([evaluate_run(run_dir, training_qrels) for run_dir in run_dirs])

# Best configuration (by nDCG@10) first.
df.sort_values('nDCG@10', ascending=False)

  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()
  selection = selection[~selection["rel"].isnull()].groupby("query").first().copy()


Unnamed: 0,run,nDCG@10,nDCG@10 (unjudgedRemoved),MAP,MRR
6,bm25-b=0.7-k_1=1.1,0.395015,0.661986,0.153667,0.603516
0,bm25-b=0.7-k_1=1.2,0.390369,0.663237,0.151478,0.596932
1,bm25-b=0.7-k_1=1.3,0.389849,0.663309,0.150454,0.601123
3,bm25-b=0.75-k_1=1.3,0.376771,0.667465,0.142509,0.582299
7,bm25-b=0.75-k_1=1.1,0.376579,0.665047,0.143468,0.587166
8,bm25-b=0.75-k_1=1.2,0.37422,0.667753,0.14213,0.579877
2,bm25-b=0.8-k_1=1.1,0.359005,0.667781,0.135026,0.553124
4,bm25-b=0.8-k_1=1.3,0.35534,0.67167,0.133847,0.540789
5,bm25-b=0.8-k_1=1.2,0.355237,0.66832,0.132791,0.545914
