In [29]:
import pyterrier as pt
import pandas as pd
import os
from collections import defaultdict

from retrieval_utils import generate_document_pool_rrf_df, apply_cutoff
if not pt.started():
    pt.init()

  if not pt.started():


In [15]:
#init retrieval models
dataset = pt.get_dataset('irds:cord19/fulltext/trec-covid')

index_folder = "/workspaces/CORD19_Plus/retrieval/indices"
field_dict = {0 : 'docno', 1 : 'ir_id', 2 : 'content', 3 : 'header', 4 : 'caption', 5 : 'references'}
field_configs = [[0,1,2,3,4,5], [0,1,2], [0,1,3], [0,1,4], [0,1,5]]

#field_configs = [[1,2,3,4,5]]

index_paths = []

for config in field_configs:
    fields = [field_dict[c] for c in config]
    current_path = "_".join(fields)
    full_path = f"{index_folder}/{current_path}"
    index_paths.append(full_path)

index_paths = sorted(index_paths)
indices  = [pt.IndexFactory.of(path) for path in index_paths]

for index in indices:
    print(index.getCollectionStatistics().toString())

cut_off = 100
engines = {i : pt.BatchRetrieve(indices[i], wmodel="BM25")%cut_off for i in range(len(indices))}

Number of documents: 137
Number of terms: 618
Number of postings: 1837
Number of fields: 3
Number of tokens: 2055
Field names: [docno, ir_id, caption]
Positions:   false

Number of documents: 137
Number of terms: 4138
Number of postings: 10196
Number of fields: 3
Number of tokens: 21726
Field names: [docno, ir_id, content]
Positions:   false

Number of documents: 137
Number of terms: 5757
Number of postings: 18155
Number of fields: 6
Number of tokens: 39720
Field names: [docno, ir_id, content, header, caption, references]
Positions:   false

Number of documents: 137
Number of terms: 619
Number of postings: 1516
Number of fields: 3
Number of tokens: 1901
Field names: [docno, ir_id, header]
Positions:   false

Number of documents: 137
Number of terms: 2919
Number of postings: 8594
Number of fields: 3
Number of tokens: 15490
Field names: [docno, ir_id, references]
Positions:   false


  engines = {i : pt.BatchRetrieve(indices[i], wmodel="BM25")%cut_off for i in range(len(indices))}


In [16]:
topics_t = dataset.get_topics('title')
topics_d = dataset.get_topics('description')
topics_n = dataset.get_topics('narrative')

topics_all = dataset.get_topics('title')

topics_all['query'] = topics_all.apply(lambda row: f"{topics_t.loc[int(row['qid'])-1]['query']}. {topics_d.loc[int(row['qid'])-1]['query']}. {topics_n.loc[int(row['qid'])-1]['query']}", axis=1)

In [17]:
qrels_path = "/workspaces/CORD19_Plus/retrieval/table_qrels.json"
qrels = pd.read_json(qrels_path)

qrels['qid'] = qrels['qid'].astype(str)
qrels['docno'] = qrels['docno'].apply(lambda row: row.replace(".json", ""))

In [18]:
def save_ranking(engine, topics, name, output_path="/workspaces/CORD19_Plus/retrieval/rankings"):
    res = engine.transform(topics)
    pt.io.write_results(res, f'{output_path}/{name}.trec', format='trec', run_name=name)

In [19]:
for i, engine in engines.items():
    name = f"bm25_{index_paths[i].split('/')[-1].replace('docno_ir_id_', '')}"
    print(name)
    if name == "bm25_content_header_caption_references":
        name = "bm25_catchall"

    save_ranking(engines[0], topics_all, name)

bm25_caption
bm25_content
bm25_content_header_caption_references
bm25_header
bm25_references


In [21]:
pool = generate_document_pool_rrf_df("/workspaces/CORD19_Plus/retrieval/rankings")
pool.rename(columns={'rrf_score': 'score'}, inplace=True)
# Add a new column 'rank' for each 'qid' based on the descending order of 'score'
pool['rank'] = pool.groupby('qid')['score'].rank(ascending=False, method='first').astype(int)

In [26]:
pd.set_option('display.max_rows', None)

In [23]:
qrels_path = "/workspaces/CORD19_Plus/retrieval/table_qrels.json"
qrels = pd.read_json(qrels_path)

qrels['qid'] = qrels['qid'].astype(str)
qrels['docno'] = qrels['docno'].apply(lambda row: row.replace(".json", ""))
table_qrels = qrels

In [None]:
pool['rank'] -=1

In [24]:
pt.Experiment(
    [pool, engines[0], engines[1], engines[2], engines[3], engines[4]],
    topics_all,
    table_qrels,
    eval_metrics=['P_10', 'P_20', 'P_50','P_100', 'map', 'ndcg_cut_10','ndcg_cut_50','recall_100', 'mrt']
)

Unnamed: 0,name,P_10,P_20,P_50,P_100,map,ndcg_cut_10,ndcg_cut_50,recall_100,mrt
0,qid docno score rank\n0 ...,0.05,0.085,0.062,0.05,0.10284,0.043459,0.214006,0.9,0.0
1,"RankCutoff(TerrierRetr(BM25), 100)",0.04,0.085,0.052,0.027,0.094912,0.051936,0.201783,0.476429,5.250254
2,"RankCutoff(TerrierRetr(BM25), 100)",0.05,0.04,0.054,0.036,0.075289,0.087865,0.207452,0.542381,6.512762
3,"RankCutoff(TerrierRetr(BM25), 100)",0.11,0.095,0.062,0.048,0.107915,0.116123,0.273509,0.875,8.46454
4,"RankCutoff(TerrierRetr(BM25), 100)",0.07,0.035,0.022,0.011,0.064795,0.12845,0.149807,0.243333,4.152941
5,"RankCutoff(TerrierRetr(BM25), 100)",0.08,0.07,0.064,0.039,0.125051,0.125906,0.287337,0.752143,5.680817


In [30]:
pool = apply_cutoff(pool, 50)

In [31]:
pool

Unnamed: 0,qid,docno,score,rank
0,1,06o7pa3d_8_0,1.6918,1
1,1,066rysjh_24_0,0.0938,2
2,1,033q671f_12_0,0.0913,3
3,1,066rysjh_25_0,0.0913,4
4,1,05tszdt7_4_0,0.0912,5
5,1,02bwyi1w_20_0,0.0896,6
6,1,05rorg0t_6_0,0.089,7
7,1,01es0zv4_3_0,0.089,8
8,1,033q671f_14_0,0.0864,9
9,1,05fc3ne1_2_0,0.0862,10
