In [1]:
from ir_datasets_subsample import register_subsamples
import ir_datasets
import pyterrier as pt
from pathlib import Path

# Run once before any "corpus-subsamples/..." dataset id is requested below.
# NOTE(review): presumably this registers those ids with ir_datasets — confirm
# against the ir_datasets_subsample package.
register_subsamples()

In [2]:
def get_index(dataset_id, index_root="/tmp/index"):
    """Return a Terrier index for a dataset, building it on first use.

    The index lives in ``<index_root>/<dataset_id with '/' replaced by '-'>``.
    The presence of ``data.properties`` inside that directory is used as the
    sign that a usable index already exists; if the file is missing (no
    directory yet, or leftovers of an aborted run) the corpus is indexed again
    with ``overwrite=True``.

    Args:
        dataset_id: Dataset identifier without the ``"irds:"`` prefix, e.g.
            ``"corpus-subsamples/clueweb12/trec-web-2013"``.
        index_root: Directory under which the per-dataset index directories
            are created. Defaults to ``"/tmp/index"``, the location that was
            previously hard-coded.

    Returns:
        Whatever ``pt.IndexFactory.of`` returns for the index directory.
    """
    index_dir = Path(index_root) / dataset_id.replace('/', '-')

    # A missing directory implies a missing data.properties, so this single
    # check covers both the "never indexed" and the "incomplete index" case.
    if not (index_dir / "data.properties").exists():
        # Resolve the dataset only when indexing is actually required.
        pt_dataset = pt.datasets.get_dataset("irds:" + dataset_id)
        # "meta" lists the fields kept in the meta index with a per-field size
        # limit (see the PyTerrier IterDictIndexer documentation).
        indexer = pt.IterDictIndexer(str(index_dir), overwrite=True, meta={"docno": 100, "text": 20480})
        indexer.index(pt_dataset.get_corpus_iter())

    return pt.IndexFactory.of(str(index_dir))

def run_pt_experiment(dataset_id):
    """Compare BM25, PL2 and Tf retrieval on a single dataset.

    Loads the topics and qrels of ``"irds:" + dataset_id``, obtains the
    matching index through ``get_index``, pre-tokenises every query with
    Terrier's tokeniser and evaluates the three weighting models with
    ``pt.Experiment``.

    Args:
        dataset_id: Dataset identifier without the ``"irds:"`` prefix.

    Returns:
        The result of ``pt.Experiment`` with the measures ``ndcg_cut.10`` and
        ``recip_rank`` for the systems named "BM25", "PL2" and "Tf".
    """
    irds_dataset = pt.datasets.get_dataset("irds:" + dataset_id)

    # Datasets with "misinfo" in their id are read via the "title" topic
    # field; every other dataset uses "query".
    if "misinfo" in dataset_id:
        topic_variant = "title"
    else:
        topic_variant = "query"
    topics = irds_dataset.get_topics(topic_variant)
    index = get_index(dataset_id)

    # PyTerrier needs to use pre-tokenized queries
    tokeniser_cls = pt.java.autoclass("org.terrier.indexing.tokenisation.Tokeniser")
    tokeniser = tokeniser_cls.getTokeniser()

    def pretokenise(raw_query):
        # Join the tokens emitted by the Terrier tokeniser with single spaces.
        return " ".join(tokeniser.getTokens(raw_query))

    topics["query"] = topics["query"].apply(pretokenise)

    # The weighting-model identifiers double as the display names.
    system_names = ["BM25", "PL2", "Tf"]
    systems = [pt.terrier.Retriever(index, wmodel=model) for model in system_names]

    return pt.Experiment(
        systems,
        topics=topics,
        qrels=irds_dataset.get_qrels(),
        eval_metrics=["ndcg_cut.10", "recip_rank"],
        names=system_names,
    )

In [4]:
# BM25 / PL2 / Tf on the ClueWeb09 (en) subsample for trec-web-2009; the
# returned table is shown as the cell output.
run_pt_experiment("corpus-subsamples/clueweb09/en/trec-web-2009")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.088972,0.180594
1,PL2,0.065918,0.167344
2,Tf,0.000663,0.021575


In [5]:
# BM25 / PL2 / Tf on the ClueWeb09 (en) subsample for trec-web-2010.
run_pt_experiment("corpus-subsamples/clueweb09/en/trec-web-2010")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.074353,0.215784
1,PL2,0.073132,0.175654
2,Tf,0.026095,0.061632


In [6]:
# BM25 / PL2 / Tf on the ClueWeb09 (en) subsample for trec-web-2011.
run_pt_experiment("corpus-subsamples/clueweb09/en/trec-web-2011")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.222346,0.415908
1,PL2,0.110293,0.310176
2,Tf,0.015912,0.048795


In [7]:
# BM25 / PL2 / Tf on the ClueWeb09 (en) subsample for trec-web-2012.
run_pt_experiment("corpus-subsamples/clueweb09/en/trec-web-2012")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.053855,0.193225
1,PL2,0.044213,0.183628
2,Tf,0.027149,0.08791


In [8]:
# BM25 / PL2 / Tf on the ClueWeb12 subsample for trec-web-2013.
run_pt_experiment("corpus-subsamples/clueweb12/trec-web-2013")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.232599,0.454886
1,PL2,0.205969,0.44531
2,Tf,0.021597,0.067767


In [9]:
# BM25 / PL2 / Tf on the ClueWeb12 subsample for trec-web-2014.
run_pt_experiment("corpus-subsamples/clueweb12/trec-web-2014")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.305758,0.537395
1,PL2,0.266955,0.53413
2,Tf,0.064728,0.163052


In [10]:
# BM25 / PL2 / Tf on the ClueWeb12-B13 subsample for trec-misinfo-2019.
# Because the id contains "misinfo", run_pt_experiment reads the topics from
# the "title" field rather than "query".
run_pt_experiment("corpus-subsamples/clueweb12/b13/trec-misinfo-2019")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.482746,0.768167
1,PL2,0.343369,0.603578
2,Tf,0.134279,0.318837
