In [25]:
import pandas as pd
from tira.rest_api_client import Client
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run, ir_datasets
from tqdm import tqdm

# This method ensures that that PyTerrier is loaded so that it also works in the TIRA sandbox
ensure_pyterrier_is_loaded()
import pyterrier as pt


tira = Client()
dataset_id = 'longeval-2023-01-20240423-training'
pt_dataset = pt.get_dataset(f'irds:ir-benchmarks/{dataset_id}')

__normalize_queries = lambda q: q.lower().strip()

def load_oracle_index(file_name='../oracle-indexing/oracle-index.jsonl.gz', allowed_dataset_ids=('longeval-train-20230513-training', 'longeval-heldout-20230513-training', 'longeval-short-july-20230513-training', 'longeval-long-september-20230513-training')):
    entries = pd.read_json(file_name, orient='records', lines=True)
    entries = [i.to_dict() for _, i in entries.iterrows() if i['relevance'] > 0]
    ret = {}

    for entry in entries:
        if entry['dataset'] not in allowed_dataset_ids:
            continue
        query = __normalize_queries(entry['query'])
        if query not in ret:
            ret[query] = []
        ret[query].append(entry)

    return ret

In [2]:
oracle_index = load_oracle_index()

In [18]:
bm25_raw = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 (tira-ir-starter-pyterrier)', dataset_id) %900

{'status': 0, 'context': {'include_navigation': False, 'user_id': None, 'role': 'guest', 'organizer_teams': '[]', 'submission': {'docker_software_id': 244, 'display_name': 'BM25 (tira-ir-starter-pyterrier)', 'user_image_name': 'registry.webis.de/code-research/tira/tira-user-tira-ir-starter/pyterrier:0.0.1', 'command': '/workspace/pyterrier_cli.py --input $inputDataset --output $outputDir --index_directory $inputRun --params wmodel=BM25 --retrieval_pipeline default_pipelines.wmodel_batch_retrieve', 'tira_image_name': 'registry.webis.de/code-research/tira/tira-user-tira-ir-starter/pyterrier:0.0.1-tira-docker-software-id-sienna-bug', 'task_id': 'ir-benchmarks', 'vm_id': 'tira-ir-starter', 'description': '', 'paper_link': '', 'input_docker_software': 'Index (tira-ir-starter-pyterrier)', 'input_docker_software_id': 243, 'input_upload_id': None, 'ir_re_ranker': False, 'public_image_name': 'docker.io/webis/ir-benchmarks-submissions:tira-ir-starter-pyterrier-0-0-1-tira-docker-software-id-sienn

In [77]:
index = tira.pt.index('ir-benchmarks/tira-ir-starter/Index (tira-ir-starter-pyterrier)', dataset_id)

In [19]:
docs = bm25_raw(pt_dataset.get_topics('text'))

In [20]:
overlapping_queries = {i.query_id: i.default_text() for i in pt_dataset.irds_ref().queries_iter()}
overlapping_queries = {k: v for k, v in overlapping_queries.items() if __normalize_queries(v) in oracle_index}

In [22]:
doc_ids = []

for _, i in bm25_raw(pt_dataset.get_topics('text')).iterrows():
    if i['qid'] in overlapping_queries:
        doc_ids.append(i['docno'])

doc_ids = set(doc_ids)

In [23]:
len(doc_ids)

91458

In [26]:
docs_for_reformulation = []

for i in tqdm(pt_dataset.get_corpus_iter()):
    if i['docno'] not in doc_ids:
        continue
    docs_for_reformulation += [i]

ir-benchmarks/longeval-2023-01-20240423-training documents: 100%|██████████| 2049729/2049729 [02:13<00:00, 15370.74it/s]
100%|██████████| 2049729/2049729 [02:13<00:00, 15370.88it/s]


In [44]:
docs_for_reformulation[:2]

[{'text': '\n\nOriginal apple-themed grocery and restaurant in Paris!\nPomze France\n- Island of France\n- Paris\n- Paris\nPresentation\nDaniel and Emmanuel Dayan have made it their mission to put the humble apple back on the map.\nAnd their idea paid off, as Pomze was born in the middle of Paris!\nBoth a monomaniacal grocery store and a restaurant, their Locavore UFO promotes the artisans of this terroirist product and turns it down to envy.\nOn the Deco side, upstairs, we start in a frenzy in the orchards of France, the pictures of which decorate the walls of this former Haussmann apartment.\nOn the menu, the inventive plates of Nicolas Soliman: mackerel with cider, salad vinegar, shredded cabbage, sour apples and nuts;\nTonka bean lacquered fillet and roasted plums with pommel;\ntarte Tatin in thousand leaves of apples Reinette confites and milk ribot ice...\nNot to mention the monthly match "a big cheese, a big cider" (you know what to drink, by the way).\nServices Blue Card , Mast

In [46]:
additional_docs = {}

for i in oracle_index:
    for j in oracle_index[i]:
        additional_docs[j['doc_id']] = j['doc']

additional_docs = [{'docno': 'ADD_' + k, 'text': v} for k, v in additional_docs.items()]

In [47]:
additional_docs[:2]

[{'docno': 'ADD_doc062200105012',
  'text': '\n\nUpCadhoc:\nMulti-Sign Card and Gift Certificate Show/Hide\nnavigation\nMy user space My financial space I access my personal space I want to buy a gift certificate\nI was offered an UpCadhoc card I access my management space To place or track your UpCadhoc card orders To track your UpCadhoc card orders I access my management space To place or track your UpCadhoc card orders To track your UpCadhoc card orders My user space\nMy Financial Space My User Space I Access My Personal Space I Want to Buy Gift Certificate I Was Offered an UpCadhoc Card My Financial Space I Access My Management Space To Order or Track Your UpCadhoc Cheques To Track Your UpCadhoc Card Orders Find Us on Search on Cadhoc I Want to Access My Management and Order Space Cadhoc Connect You to Your Customer Space Cadhoc Paper Version Connect You to Your e\nThe ideal solution for motivating, rewarding teams and retaining customers\nA Cooperative and Solidarity Group New!\nD

In [48]:
print('Select overlapping topics...')
topics = pt_dataset.get_topics('text')
topics = topics[topics['qid'].isin(overlapping_queries.keys())]

print(f'Done. Found {len(topics)} overlapping topics.')

Select overlapping topics...
Done. Found 195 overlapping topics.


In [50]:
print('docs_for_reformulation:', len(docs_for_reformulation))
print('additional_docs:', len(additional_docs))

docs_for_reformulation: 91458
additional_docs: 8473


In [69]:
iter_indexer = pt.IterDictIndexer("/tmp/reformulation-index", meta={'docno': 50, 'text': 4096}, overwrite=True)
index_old = iter_indexer.index(tqdm(docs_for_reformulation + additional_docs, 'Index'))

Index: 100%|██████████| 99931/99931 [01:13<00:00, 1359.47it/s]


In [66]:
oracle_retrieval = []

for _, topic in topics.iterrows():
    r = 0
    for hit in sorted(oracle_index[__normalize_queries(overlapping_queries[topic['qid']])], key=lambda x: x['relevance'], reverse=True):
        #print(hit)
        r += 1
        oracle_retrieval += [{'qid': topic['qid'], 'query': topic['query'], 'docno': 'ADD_' + hit['doc_id'], 'rank': r, 'score': 100-r, 'run_id': 'oracle'}]

oracle_retrieval = pd.DataFrame(oracle_retrieval)
oracle_retrieval = pt.transformer.get_transformer(oracle_retrieval)

In [67]:
oracle_retrieval(topics)

Unnamed: 0,qid,query,docno,rank,score,run_id
0,q0123836,potato patty,ADD_doc062200201250,1,99,oracle
1,q0123836,potato patty,ADD_doc062200208494,2,98,oracle
2,q0123836,potato patty,ADD_doc072205600709,3,97,oracle
3,q0123836,potato patty,ADD_doc072215803774,4,96,oracle
4,q0123836,potato patty,ADD_doc062200106362,5,95,oracle
...,...,...,...,...,...,...
1739,q0123137438954745,used electric car,ADD_doc092202306639,17,83,oracle
1740,q0123137438954745,used electric car,ADD_doc092204905125,18,82,oracle
1741,q0123137438954745,used electric car,ADD_doc092204905125,19,81,oracle
1742,q0123137438954745,used electric car,ADD_doc092203008286,20,80,oracle


In [89]:
rm3_keyquery_bm25 = oracle_retrieval >> pt.rewrite.RM3(index_old, fb_docs=10, fb_terms=20) >> pt.BatchRetrieve(index, wmodel="BM25")
bo1_keyquery_bm25 = oracle_retrieval >> pt.rewrite.Bo1QueryExpansion(index_old, fb_docs=10, fb_terms=20) >> pt.BatchRetrieve(index, wmodel="BM25")
kl_keyquery_bm25 = oracle_retrieval >> pt.rewrite.KLQueryExpansion(index_old, fb_docs=10, fb_terms=20) >> pt.BatchRetrieve(index, wmodel="BM25")

In [93]:
pt.Experiment(
    [
        tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 (tira-ir-starter-pyterrier)', dataset_id),
        tira.pt.from_submission('ir-benchmarks/fschlatt/sparse-cross-encoder-4-512', dataset_id),
        tira.pt.from_submission('workshop-on-open-web-search/fschlatt/rank-zephyr', dataset_id),
        rm3_keyquery_bm25,
        bo1_keyquery_bm25,
        kl_keyquery_bm25,
    ],
    topics,
    pt_dataset.get_qrels(),
    ["ndcg", "ndcg_cut.10", "recip_rank", "recall_100"],
    names=["BM25", 'Sparse Cross Encoder', 'RankZephyr', 'RM3 Keyqueries >> BM25', 'BO1 Keyqueries >> BM25', 'KL Keyqueries >> BM25']
)

{'status': 0, 'context': {'include_navigation': False, 'user_id': None, 'role': 'guest', 'organizer_teams': '[]', 'submission': {'docker_software_id': 244, 'display_name': 'BM25 (tira-ir-starter-pyterrier)', 'user_image_name': 'registry.webis.de/code-research/tira/tira-user-tira-ir-starter/pyterrier:0.0.1', 'command': '/workspace/pyterrier_cli.py --input $inputDataset --output $outputDir --index_directory $inputRun --params wmodel=BM25 --retrieval_pipeline default_pipelines.wmodel_batch_retrieve', 'tira_image_name': 'registry.webis.de/code-research/tira/tira-user-tira-ir-starter/pyterrier:0.0.1-tira-docker-software-id-sienna-bug', 'task_id': 'ir-benchmarks', 'vm_id': 'tira-ir-starter', 'description': '', 'paper_link': '', 'input_docker_software': 'Index (tira-ir-starter-pyterrier)', 'input_docker_software_id': 243, 'input_upload_id': None, 'ir_re_ranker': False, 'public_image_name': 'docker.io/webis/ir-benchmarks-submissions:tira-ir-starter-pyterrier-0-0-1-tira-docker-software-id-sienn

Unnamed: 0,name,ndcg,ndcg_cut.10,recip_rank,recall_100
0,BM25,0.350489,0.171283,0.336867,0.547271
1,Sparse Cross Encoder,0.327323,0.202812,0.376275,0.547271
2,RankZephyr,0.342603,0.223018,0.435556,0.547271
3,RM3 Keyqueries >> BM25,0.422255,0.229326,0.461395,0.641683
4,BO1 Keyqueries >> BM25,0.443201,0.244972,0.505799,0.670757
5,KL Keyqueries >> BM25,0.440022,0.243748,0.492165,0.673453


In [82]:
rm3_keyquery_bm25(topics)

Unnamed: 0,qid,docid,docno,rank,score,query_0,query
0,q0123103079215846,1340293,doc012311802304,0,28.662406,schengen space,applypipeline:off space^0.300000012 resid^0.02...
1,q0123103079215846,1120561,doc012301310377,1,28.521230,schengen space,applypipeline:off space^0.300000012 resid^0.02...
2,q0123103079215846,1586428,doc012312006255,2,28.507358,schengen space,applypipeline:off space^0.300000012 resid^0.02...
3,q0123103079215846,766422,doc012302813370,3,28.376519,schengen space,applypipeline:off space^0.300000012 resid^0.02...
4,q0123103079215846,1586721,doc012312006548,4,28.248832,schengen space,applypipeline:off space^0.300000012 resid^0.02...
...,...,...,...,...,...,...,...
194995,q012394489282336,609542,doc012304101836,995,17.658995,leek soup,applypipeline:off music^0.014745858 cookbook^0...
194996,q012394489282336,1423406,doc012311208667,996,17.655327,leek soup,applypipeline:off music^0.014745858 cookbook^0...
194997,q012394489282336,94503,doc012308213952,997,17.648631,leek soup,applypipeline:off music^0.014745858 cookbook^0...
194998,q012394489282336,1981915,doc012310507758,998,17.647480,leek soup,applypipeline:off music^0.014745858 cookbook^0...


In [55]:
bm25_foo = pt.BatchRetrieve(index, wmodel='BM25')

In [57]:
bm25_foo.search('sda')

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,29000,doc012306005842,0,22.90029,sda
1,1,23856,doc012305705148,1,22.513659,sda
2,1,87899,doc012300603031,2,21.086208,sda
3,1,61811,doc012307305274,3,16.749233,sda
4,1,63928,doc012310906127,4,15.593594,sda
5,1,74584,doc012303907912,5,15.575961,sda
6,1,41945,doc012311711774,6,14.950474,sda
7,1,14079,doc012305800975,7,14.80585,sda
8,1,40311,doc012304511683,8,14.648405,sda
9,1,20946,doc012304406557,9,14.433524,sda
