In [1]:
from ir_datasets_subsample import register_subsamples
import ir_datasets
import pyterrier as pt

# Must run before any dataset lookup below. Presumably this registers the
# "corpus-subsamples/..." IDs with ir_datasets -- confirm against ir_datasets_subsample.
register_subsamples()

# Access data with PyTerrier

### Step 1: Load the dataset

In [2]:
# Fetch the dataset through PyTerrier's dataset lookup. The ID is the same one
# loaded via ir_datasets later in this notebook, here written with the "irds:" prefix.
pt_dataset = pt.datasets.get_dataset("irds:corpus-subsamples/trec-rag-2024")

### Step 2: Index the documents

In [3]:
# Name the index location once so it cannot drift between the indexer and later readers.
INDEX_PATH = "/tmp/index-rag24"

# overwrite=True lets this cell be re-run on top of an existing index directory.
# `meta` lists the fields stored with each document and the length reserved for each.
indexer = pt.IterDictIndexer(INDEX_PATH, overwrite=True, meta={"docno": 100, "text": 20480})

# index() hands back a reference to the index it just wrote; open that reference
# directly instead of re-typing the path.
index_ref = indexer.index(pt_dataset.get_corpus_iter())

index = pt.IndexFactory.of(index_ref)

Java started (triggered by TerrierIndexer.__init__) and loaded: pyterrier.java, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
corpus-subsamples/trec-rag-2024 documents: 116694it [00:38, 3026.82it/s]


### Step 3: Configure retrieval approaches

In [4]:
# One Terrier retriever per weighting model, all reading from the same index.
bm25, pl2 = (
    pt.terrier.Retriever(index, wmodel=weighting_model)
    for weighting_model in ("BM25", "PL2")
)

### Step 4: Run retrieval experiments

In [5]:
# Pair each display name with its retriever so the two lists passed to the
# experiment are always derived from the same mapping, in the same order.
systems = {"BM25": bm25, "PL2": pl2}

# The call is the last expression in the cell, so its result is rendered as the cell output.
pt.Experiment(
    list(systems.values()),
    topics=pt_dataset.get_topics(),
    qrels=pt_dataset.get_qrels(),
    eval_metrics=["ndcg_cut.10", "recip_rank"],
    names=list(systems),
)

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.264266,0.550517
1,PL2,0.232176,0.501331


### Step 5: Inspect queries and qrels in detail

In [6]:
# Preview the first three topics; the rendered table has the columns qid and query.
pt_dataset.get_topics().head(3)

Unnamed: 0,qid,query
0,2024-145979,what is vicarious trauma and how can it be cop...
1,2024-216592,why disability insurance is a smart investment
2,2024-32912,how bad did the vietnam war devastate the econ...


In [7]:
# Preview the first three relevance judgments; the rendered table has the
# columns qid, docno, label and iteration.
pt_dataset.get_qrels().head(3)

Unnamed: 0,qid,docno,label,iteration
0,2024-105741,msmarco_v2.1_doc_00_125364462#6_229054655,0,0
1,2024-105741,msmarco_v2.1_doc_00_1534870566#4_2687181963,0,0
2,2024-105741,msmarco_v2.1_doc_00_1534870566#5_2687183111,1,0


# Alternative: Access data with ir_datasets

In [8]:
# Load the same subsample directly with ir_datasets; note the ID carries no "irds:" prefix here.
dataset = ir_datasets.load("corpus-subsamples/trec-rag-2024")


In [9]:
# Peek at a single query: take only the first item from the iterator and
# print nothing at all if the iterator turns out to be empty.
query = next(iter(dataset.queries_iter()), None)
if query is not None:
    print(query)

GenericQuery(query_id='2024-145979', text='what is vicarious trauma and how can it be coped with?')


In [10]:
# Peek at a single document: take only the first item from the iterator and
# print nothing at all if the iterator turns out to be empty.
doc = next(iter(dataset.docs_iter()), None)
if doc is not None:
    print(doc)

GenericDoc(doc_id='msmarco_v2.1_doc_52_1400719578#0_2842698387', text='Taylor Swift\'s "Invisible String" Lyrics Meaning - Song Meanings and Facts Taylor Swift’s “Invisible String” Lyrics Meaning\nTaylor Swift’s “Invisible String” Lyrics Meaning\n“Invisible String” is about Taylor Swift’s relationship with Joe Alwyn?\nPast Relationship References\nIn Conclusion…\n“Invincible String” Details\nWas “Invincible String” released as a single?\n Taylor Swift\'s "Invisible String" Lyrics Meaning - Song Meanings and Facts\nTaylor Swift’s “Invisible String” Lyrics Meaning\nby Amanda London ·  Published August 18, 2020 · Updated August 18, 2020\nIn “Invisible String”, Taylor borrows the East Asian legend which describes the “ Red String of Fate ” as an invisible string which ties two people who are destined to meet together. Taylor explores the possibility of this legend being a reality in her new relationship. She uses different colors in telling the story of the state of her relationship. For i

In [11]:
# Peek at a single relevance judgment: take only the first item from the
# iterator and print nothing at all if the iterator turns out to be empty.
qrel = next(iter(dataset.qrels_iter()), None)
if qrel is not None:
    print(qrel)

TrecQrel(query_id='2024-105741', doc_id='msmarco_v2.1_doc_00_125364462#6_229054655', relevance=0, iteration='0')
