In [1]:
import pandas as pd
from tira.rest_api_client import Client
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run, ir_datasets
from tqdm import tqdm

# This method ensures that that PyTerrier is loaded so that it also works in the TIRA sandbox
ensure_pyterrier_is_loaded()
import pyterrier as pt


tira = Client()
dataset_id = 'longeval-2023-01-20240423-training'
pt_dataset = pt.get_dataset(f'irds:ir-benchmarks/{dataset_id}')

__normalize_queries = lambda q: q.lower().strip()

def load_oracle_index(file_name='../oracle-indexing/oracle-index.jsonl.gz', allowed_dataset_ids=('longeval-train-20230513-training', 'longeval-heldout-20230513-training', 'longeval-short-july-20230513-training', 'longeval-long-september-20230513-training', 'longeval-2023-01-20240423-training')):
    entries = pd.read_json(file_name, orient='records', lines=True)
    entries = [i.to_dict() for _, i in entries.iterrows() if i['relevance'] > 0]
    ds_id_to_matches = {}
    ret = {}

    for entry in entries:
        if entry['dataset'] not in allowed_dataset_ids:
            continue
        query = __normalize_queries(entry['query'])
        if query not in ret:
            ret[query] = []
        ret[query].append(entry)

    return ret

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [2]:
oracle_index = load_oracle_index()

In [3]:
overlapping_queries = {i.query_id: i.default_text() for i in pt_dataset.irds_ref().queries_iter()}
overlapping_queries = {k: v for k, v in overlapping_queries.items() if __normalize_queries(v) in oracle_index}

In [4]:
print('Select overlapping topics...')
topics = pt_dataset.get_topics('text')
topics = topics[topics['qid'].isin(overlapping_queries.keys())]

print(f'Done. Found {len(topics)} overlapping topics.')

Select overlapping topics...
Done. Found 599 overlapping topics.


In [5]:
bo1_keyquery_bm25 = pt.transformer.get_transformer(pt.io.read_results('results/lag1-run.txt'))

In [11]:
bo1_keyquery_bm25(topics)

Unnamed: 0,qid,query,docno,rank,score,name
0,q012318,case over the border,doc012311802151,1,64.095481,bo1-keyquery-bm25
1,q012318,case over the border,doc012311208165,2,63.872447,bo1-keyquery-bm25
2,q012318,case over the border,doc012302106800,3,62.355284,bo1-keyquery-bm25
3,q012318,case over the border,doc012312601719,4,60.145840,bo1-keyquery-bm25
4,q012318,case over the border,doc012311313365,5,59.913390,bo1-keyquery-bm25
...,...,...,...,...,...,...
598995,q0123137438954745,used electric car,doc012300216377,996,21.104490,bo1-keyquery-bm25
598996,q0123137438954745,used electric car,doc012300114005,997,21.104490,bo1-keyquery-bm25
598997,q0123137438954745,used electric car,doc012303714732,998,21.099496,bo1-keyquery-bm25
598998,q0123137438954745,used electric car,doc012302016900,999,21.097952,bo1-keyquery-bm25


In [13]:
topics

Unnamed: 0,qid,query
0,q012318,case over the border
1,q012396,water atlantic
2,q0123180,blanquette de veau recipe
3,q0123240,gift woman
4,q0123387,government
...,...,...
594,q0123137438954650,tajine veal
595,q0123137438954652,leek pie
596,q0123137438954702,veal head gribiche sauce
597,q0123137438954719,veal recipe


In [16]:
pt_dataset.get_qrels()

Unnamed: 0,qid,docno,label,iteration
0,q012318,doc012303114898,0,0
1,q012318,doc012307806130,1,0
2,q012318,doc012311314092,0,0
3,q012318,doc012301310209,0,0
4,q012318,doc012311608989,0,0
...,...,...,...,...
9780,q0123137438954745,doc012302414038,0,0
9781,q0123137438954745,doc012311608287,0,0
9782,q0123137438954745,doc012302414427,1,0
9783,q0123137438954745,doc012306601180,0,0


In [19]:
tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 (tira-ir-starter-pyterrier)', dataset_id)(topics)

{'status': 0, 'context': {'include_navigation': False, 'user_id': None, 'role': 'guest', 'organizer_teams': '[]', 'submission': {'docker_software_id': 244, 'display_name': 'BM25 (tira-ir-starter-pyterrier)', 'user_image_name': 'registry.webis.de/code-research/tira/tira-user-tira-ir-starter/pyterrier:0.0.1', 'command': '/workspace/pyterrier_cli.py --input $inputDataset --output $outputDir --index_directory $inputRun --params wmodel=BM25 --retrieval_pipeline default_pipelines.wmodel_batch_retrieve', 'tira_image_name': 'registry.webis.de/code-research/tira/tira-user-tira-ir-starter/pyterrier:0.0.1-tira-docker-software-id-sienna-bug', 'task_id': 'ir-benchmarks', 'vm_id': 'tira-ir-starter', 'description': '', 'paper_link': '', 'input_docker_software': 'Index (tira-ir-starter-pyterrier)', 'input_docker_software_id': 243, 'input_upload_id': None, 'ir_re_ranker': False, 'public_image_name': 'docker.io/webis/ir-benchmarks-submissions:tira-ir-starter-pyterrier-0-0-1-tira-docker-software-id-sienn

Unnamed: 0,qid,query,q0,rank,score,system,docno
0,q012318,case over the border,Q0,1.0,13.845236,pyterrier.default_pipelines.wmodel_batch_retrieve,doc012308410757
1,q012318,case over the border,Q0,2.0,13.844633,pyterrier.default_pipelines.wmodel_batch_retrieve,doc012301806861
2,q012318,case over the border,Q0,3.0,13.597514,pyterrier.default_pipelines.wmodel_batch_retrieve,doc012306804277
3,q012318,case over the border,Q0,4.0,13.511236,pyterrier.default_pipelines.wmodel_batch_retrieve,doc012302206125
4,q012318,case over the border,Q0,5.0,13.460093,pyterrier.default_pipelines.wmodel_batch_retrieve,doc012302014548
...,...,...,...,...,...,...,...
585436,q0123137438954745,used electric car,Q0,996.0,12.613259,pyterrier.default_pipelines.wmodel_batch_retrieve,doc012303705976
585437,q0123137438954745,used electric car,Q0,997.0,12.610808,pyterrier.default_pipelines.wmodel_batch_retrieve,doc012309202530
585438,q0123137438954745,used electric car,Q0,998.0,12.610628,pyterrier.default_pipelines.wmodel_batch_retrieve,doc012312602259
585439,q0123137438954745,used electric car,Q0,999.0,12.609772,pyterrier.default_pipelines.wmodel_batch_retrieve,doc012307814438


In [24]:
pt.Experiment(
    [
        #tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 (tira-ir-starter-pyterrier)', dataset_id),
        #tira.pt.from_submission('ir-benchmarks/fschlatt/sparse-cross-encoder-4-512', dataset_id),
        #tira.pt.from_submission('workshop-on-open-web-search/fschlatt/rank-zephyr', dataset_id),
        bo1_keyquery_bm25,
        pt.transformer.get_transformer(pt.io.read_results('../runs/ows_ltr_all/ows_ltr_all.train_2024')),
        pt.transformer.get_transformer(pt.io.read_results('../learning-to-rank/ows_bm25_reverted_index/ows_bm25_reverted_index.lag1')),
    ],
    topics,
    pt_dataset.get_qrels(),
    ["ndcg", "ndcg_cut.10", "recip_rank", "recall_100"],
    names=[
        #"BM25", #'Sparse Cross Encoder', 'RankZephyr',
        'BO1 Keyqueries >> BM25',
        'ows-ltr'
        ]
)

Unnamed: 0,name,ndcg,ndcg_cut.10,recip_rank,recall_100
0,BO1 Keyqueries >> BM25,0.55354,0.381203,0.699123,0.762923
1,ows-ltr,0.570855,0.556623,0.799832,0.558882


In [82]:
rm3_keyquery_bm25(topics)

Unnamed: 0,qid,docid,docno,rank,score,query_0,query
0,q0123103079215846,1340293,doc012311802304,0,28.662406,schengen space,applypipeline:off space^0.300000012 resid^0.02...
1,q0123103079215846,1120561,doc012301310377,1,28.521230,schengen space,applypipeline:off space^0.300000012 resid^0.02...
2,q0123103079215846,1586428,doc012312006255,2,28.507358,schengen space,applypipeline:off space^0.300000012 resid^0.02...
3,q0123103079215846,766422,doc012302813370,3,28.376519,schengen space,applypipeline:off space^0.300000012 resid^0.02...
4,q0123103079215846,1586721,doc012312006548,4,28.248832,schengen space,applypipeline:off space^0.300000012 resid^0.02...
...,...,...,...,...,...,...,...
194995,q012394489282336,609542,doc012304101836,995,17.658995,leek soup,applypipeline:off music^0.014745858 cookbook^0...
194996,q012394489282336,1423406,doc012311208667,996,17.655327,leek soup,applypipeline:off music^0.014745858 cookbook^0...
194997,q012394489282336,94503,doc012308213952,997,17.648631,leek soup,applypipeline:off music^0.014745858 cookbook^0...
194998,q012394489282336,1981915,doc012310507758,998,17.647480,leek soup,applypipeline:off music^0.014745858 cookbook^0...


In [55]:
bm25_foo = pt.BatchRetrieve(index, wmodel='BM25')

In [57]:
bm25_foo.search('sda')

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,29000,doc012306005842,0,22.90029,sda
1,1,23856,doc012305705148,1,22.513659,sda
2,1,87899,doc012300603031,2,21.086208,sda
3,1,61811,doc012307305274,3,16.749233,sda
4,1,63928,doc012310906127,4,15.593594,sda
5,1,74584,doc012303907912,5,15.575961,sda
6,1,41945,doc012311711774,6,14.950474,sda
7,1,14079,doc012305800975,7,14.80585,sda
8,1,40311,doc012304511683,8,14.648405,sda
9,1,20946,doc012304406557,9,14.433524,sda
