In [1]:
from ir_datasets_subsample import register_subsamples
import ir_datasets
import pyterrier as pt
from pathlib import Path

# NOTE(review): presumably this registers the "corpus-subsamples/..." dataset ids
# with ir_datasets so the "irds:" lookups further down resolve -- confirm against
# the ir_datasets_subsample package. It has to run before any experiment cell.
register_subsamples()

In [8]:
def get_index(dataset_id, field, docs_iter, index_root="/tmp/index"):
    """Return the Terrier index for one text view of a dataset, building it if needed.

    The index lives in ``<index_root>/<dataset_id with '/' replaced by '-'>-<field>``.
    It is (re)built whenever that directory has no ``data.properties`` file, which
    this function uses as the marker for "an index has already been written here".

    Args:
        dataset_id: Dataset identifier; slashes are replaced so it is a single
            directory name.
        field: Label of the text view being indexed; only used in the directory name.
        docs_iter: Zero-argument callable returning an iterable of dicts with
            ``"docno"`` and ``"text"`` keys. It is only called when the index has
            to be built, so an existing index never triggers a corpus pass.
        index_root: Directory under which all indexes are stored. Defaults to
            ``/tmp/index``, the location that was previously hard-coded.

    Returns:
        The object returned by ``pt.IndexFactory.of`` for the index directory.
    """
    index_dir = Path(index_root) / (dataset_id.replace('/', '-') + '-' + field)

    # data.properties cannot exist without its parent directory, so this single
    # check also covers the "directory missing" case.
    if not (index_dir / "data.properties").exists():
        # overwrite=True clears any partial index left in the directory.
        # The meta dict keeps docno and text retrievable from the index, reserving
        # 100 and 20480 characters for them respectively.
        indexer = pt.IterDictIndexer(str(index_dir), overwrite=True, meta={"docno": 100, "text": 20480})
        indexer.index(docs_iter())

    return pt.IndexFactory.of(str(index_dir))

def run_pt_experiment(dataset_id):
    """Compare BM25, PL2 and Tf retrieval over three text views of one dataset.

    Three indexes are obtained via ``get_index`` -- title plus text, title only,
    and main content only -- and each is queried with the three weighting models.
    The nine systems are evaluated with ``pt.Experiment`` on nDCG@10 and
    reciprocal rank.

    Args:
        dataset_id: Dataset id without the ``irds:`` prefix. Ids containing
            ``"misinfo"`` use the ``"title"`` topic field; all others use ``"query"``.

    Returns:
        The result of ``pt.Experiment`` (rendered in this notebook as a table with
        one row per system).
    """
    pt_dataset = pt.datasets.get_dataset("irds:" + dataset_id)
    if "misinfo" in dataset_id:
        query_field = "title"
    else:
        query_field = "query"
    topics = pt_dataset.get_topics(query_field)

    def corpus_docs(make_text):
        """Build a zero-argument factory yielding {"docno", "text"} dicts for the corpus."""
        def docs_iter():
            return (
                {"docno": doc["docno"], "text": make_text(doc)}
                for doc in pt_dataset.get_corpus_iter()
            )
        return docs_iter

    # (index field label, suffix appended to the system name, document -> indexed text)
    views = [
        ("default_text", "", lambda doc: doc["title"] + " " + doc["text"]),
        ("title", " (Title)", lambda doc: doc["title"]),
        ("main_content", " (Main Content)", lambda doc: doc["main_content"]),
    ]
    indexes = [
        get_index(dataset_id, field, corpus_docs(make_text))
        for field, _suffix, make_text in views
    ]

    # Terrier retrievers expect queries that are already tokenised, so run each
    # query through Terrier's own tokeniser and re-join the tokens with spaces.
    tokeniser = pt.java.autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
    topics["query"] = topics["query"].apply(lambda raw: " ".join(tokeniser.getTokens(raw)))

    # One retriever per (index, weighting model) pair; names are built in the same
    # order so each label lines up with its system.
    systems = []
    labels = []
    for index, (_field, suffix, _make_text) in zip(indexes, views):
        for wmodel in ("BM25", "PL2", "Tf"):
            systems.append(pt.terrier.Retriever(index, wmodel=wmodel))
            labels.append(wmodel + suffix)

    return pt.Experiment(
        systems,
        topics=topics,
        qrels=pt_dataset.get_qrels(),
        eval_metrics=["ndcg_cut.10", "recip_rank"],
        names=labels,
    )

In [9]:
# Baseline comparison on the ClueWeb09 (en) subsample for TREC Web 2009, going by the dataset id.
run_pt_experiment("corpus-subsamples/clueweb09/en/trec-web-2009")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.088636,0.180563
1,PL2,0.065742,0.165369
2,Tf,0.000663,0.021405
3,BM25 (Title),0.113699,0.318163
4,PL2 (Title),0.113182,0.313593
5,Tf (Title),0.052808,0.128113
6,BM25 (Main Content),0.089901,0.211468
7,PL2 (Main Content),0.071189,0.180358
8,Tf (Main Content),0.000663,0.022189


In [11]:
# Baseline comparison on the ClueWeb09 (en) subsample for TREC Web 2010, going by the dataset id.
run_pt_experiment("corpus-subsamples/clueweb09/en/trec-web-2010")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.076105,0.21616
1,PL2,0.074575,0.175028
2,Tf,0.026095,0.06166
3,BM25 (Title),0.090311,0.318363
4,PL2 (Title),0.087581,0.323665
5,Tf (Title),0.022159,0.127526
6,BM25 (Main Content),0.066621,0.226012
7,PL2 (Main Content),0.075494,0.179385
8,Tf (Main Content),0.017902,0.060458


In [13]:
# Baseline comparison on the ClueWeb09 (en) subsample for TREC Web 2011, going by the dataset id.
run_pt_experiment("corpus-subsamples/clueweb09/en/trec-web-2011")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.22939,0.411087
1,PL2,0.13409,0.318168
2,Tf,0.015912,0.048894
3,BM25 (Title),0.208043,0.41433
4,PL2 (Title),0.198369,0.404208
5,Tf (Title),0.039784,0.142689
6,BM25 (Main Content),0.215319,0.366182
7,PL2 (Main Content),0.105938,0.25932
8,Tf (Main Content),0.013258,0.048294


In [15]:
# Baseline comparison on the ClueWeb09 (en) subsample for TREC Web 2012, going by the dataset id.
run_pt_experiment("corpus-subsamples/clueweb09/en/trec-web-2012")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.052761,0.187392
1,PL2,0.045864,0.184914
2,Tf,0.027366,0.089698
3,BM25 (Title),0.081439,0.30621
4,PL2 (Title),0.084407,0.317359
5,Tf (Title),0.02819,0.177155
6,BM25 (Main Content),0.046453,0.187185
7,PL2 (Main Content),0.037245,0.163543
8,Tf (Main Content),0.019015,0.084827


In [21]:
# Baseline comparison on the ClueWeb12 subsample for TREC Web 2013, going by the dataset id.
run_pt_experiment("corpus-subsamples/clueweb12/trec-web-2013")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.231938,0.455253
1,PL2,0.206711,0.446981
2,Tf,0.021597,0.067873
3,BM25 (Title),0.230365,0.502573
4,PL2 (Title),0.226249,0.497002
5,Tf (Title),0.068093,0.199548
6,BM25 (Main Content),0.244841,0.513953
7,PL2 (Main Content),0.224924,0.472195
8,Tf (Main Content),0.029425,0.08743


In [18]:
# Baseline comparison on the ClueWeb12 subsample for TREC Web 2014, going by the dataset id.
run_pt_experiment("corpus-subsamples/clueweb12/trec-web-2014")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.305282,0.550403
1,PL2,0.261025,0.501289
2,Tf,0.064799,0.163041
3,BM25 (Title),0.254503,0.546971
4,PL2 (Title),0.24997,0.552354
5,Tf (Title),0.111687,0.261968
6,BM25 (Main Content),0.310884,0.553023
7,PL2 (Main Content),0.262941,0.526033
8,Tf (Main Content),0.066293,0.194448


In [20]:
# Baseline comparison on the ClueWeb12-B13 subsample for TREC Misinfo 2019, going by the dataset id.
# The id contains "misinfo", so run_pt_experiment reads the "title" topic field instead of "query".
run_pt_experiment("corpus-subsamples/clueweb12/b13/trec-misinfo-2019")

Unnamed: 0,name,ndcg_cut.10,recip_rank
0,BM25,0.483547,0.7675
1,PL2,0.34629,0.613273
2,Tf,0.135046,0.318828
3,BM25 (Title),0.315915,0.614693
4,PL2 (Title),0.303257,0.609551
5,Tf (Title),0.079646,0.225955
6,BM25 (Main Content),0.49755,0.786652
7,PL2 (Main Content),0.398274,0.671653
8,Tf (Main Content),0.135193,0.33336
