# Create Static Run Files

In [31]:
import ir_datasets
import pandas as pd
import json
import ir_datasets

def _judged_doc_ids(dataset_id):
    """Collect every id that appears under any topic in the qrels of the given ir_datasets dataset."""
    qrels_by_topic = ir_datasets.load(dataset_id).qrels.asdict()
    return {doc_id for judged_docs in qrels_by_topic.values() for doc_id in judged_docs}


# Ids with a judgment in the WAPO (Common Core 2018) and NYT (Common Core 2017) qrels;
# prefix() uses these sets to recognise the corpus of a document id.
WAPO_JUDGED_IDS = _judged_doc_ids('wapo/v2/trec-core-2018')
NYT_JUDGED_IDS = _judged_doc_ids('nyt/trec-core-2017')

# Names of the training datasets (one directory each), built from size x variant combinations.
_SINGLE_SOURCE_SIZES = ('1k', '10k')
_SINGLE_SOURCE_VARIANTS = (
    '-random-orcas-no-overlap',
    '-random-ms-marco-no-overlap',
    '-explicit-robust04-train-leakage',
    '-queries-only-paraphrases',
)

# The '64k' size is currently disabled for the mixed datasets.
_MIXED_SIZES = ('2k', '4k', '8k', '16k', '32k')
_MIXED_BASE = '-random-orcas-and-ms-marco-no-overlap'
_MIXED_EXTRAS = (
    '',
    '-and-1k-paraphrases',
    '-and-1k-robust04',
    '-and-400-explicit-nyt-train-leakage',
    '-and-400-explicit-wapo-train-leakage',
)

LEAKAGE_TRAINING_DATASETS = [
    size + variant
    for size in _SINGLE_SOURCE_SIZES
    for variant in _SINGLE_SOURCE_VARIANTS
] + [
    size + _MIXED_BASE + extra
    for extra in _MIXED_EXTRAS
    for size in _MIXED_SIZES
]

def prefix(doc_id):
    """Return the corpus prefix for a document id.

    Args:
        doc_id: document id as a string.

    Returns:
        'robust04_' for ids starting with 'FBI', 'FT', 'FR' or 'LA';
        'ms_marco_' for ids of the form 'D<positive integer>';
        'wapo_' / 'nyt_' for ids contained in WAPO_JUDGED_IDS / NYT_JUDGED_IDS.

    Raises:
        ValueError: if the id matches none of the rules above (the id is the error argument).
    """
    if doc_id.startswith(('FBI', 'FT', 'FR', 'LA')):
        return 'robust04_'

    if doc_id.startswith('D'):
        try:
            if int(doc_id[1:]) > 0:
                return 'ms_marco_'
        except ValueError:
            # Starts with 'D' but is not 'D<number>': previously int() aborted here with an
            # unrelated error; now the id still gets a chance to match the judged-id sets.
            pass

    if doc_id in WAPO_JUDGED_IDS:
        return 'wapo_'
    if doc_id in NYT_JUDGED_IDS:
        return 'nyt_'

    raise ValueError(doc_id)

def to_qrels_ms_marco(file_name, base_dir='/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/training-data/'):
    """Load the judgments of one training dataset as a DataFrame.

    Reads '<base_dir>/<file_name>/training-data.txt' (one JSON object per line with the keys
    'query_id', 'docid' and 'rel') and prefixes both the query id and the document id with the
    corpus prefix that prefix() derives from the document id.

    Args:
        file_name: name of the training dataset directory.
        base_dir: directory that contains the training dataset directories.

    Returns:
        pandas.DataFrame with the columns 'query_id', 'doc_id' and 'rel' (also when the file is empty).
    """
    import os

    training_file = os.path.join(base_dir, file_name, 'training-data.txt')
    records = []

    with open(training_file, 'r') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
                continue

            entry = json.loads(line)
            corpus = prefix(entry['docid'])  # computed once, used for the query and the document

            records.append({
                'query_id': corpus + str(entry['query_id']),
                'doc_id': corpus + str(entry['docid']),
                'rel': entry['rel'],
            })

    return pd.DataFrame(records, columns=['query_id', 'doc_id', 'rel'])

def robust_04_qrels():
    """Return the 'trec-robust04' qrels with 'robust04_' prepended to every query id and document id."""
    judgments_by_topic = ir_datasets.load('trec-robust04').qrels.asdict()

    return {
        'robust04_' + topic_id: {'robust04_' + doc: label for doc, label in judgments.items()}
        for topic_id, judgments in judgments_by_topic.items()
    }

# Topics to skip from qrels and topics because they retrieve empty results
# ('ms_marco_8874431' has a navigational query pointing to a document not retrievable by the content).
# The file is opened in a `with` block so that the handle is closed right after loading.
with open('../../jupyter/capreolus-models/topic_skiplist.json') as _topic_skiplist_file:
    TOPIC_SKIPLIST = set(json.load(_topic_skiplist_file))

def remove_unwanted_topics(data):
    """Drop, in place, every key of `data` that is listed in TOPIC_SKIPLIST; returns nothing."""
    for unwanted_topic in TOPIC_SKIPLIST:
        # pop() with a default is a no-op for topics that are not present
        data.pop(unwanted_topic, None)

def qrels():
    """Merge the judgments of all leakage training datasets with the Robust04 qrels.

    Returns:
        dict mapping query id -> {document id -> int relevance}, after
        remove_unwanted_topics() has been applied to it.
    """
    merged = {}

    for dataset_name in LEAKAGE_TRAINING_DATASETS:
        for row in to_qrels_ms_marco(dataset_name).itertuples(index=False):
            merged.setdefault(row.query_id, {})[row.doc_id] = int(row.rel)

    # Robust04 topics are added last and replace any entry with the same query id.
    merged.update(robust_04_qrels())
    remove_unwanted_topics(merged)
    return merged

# Static run: for every topic, all judged documents in lexicographic order.
# Rank starts at 0 and the score is (number of documents - rank), so scores decrease with rank.
with open('ms-marco-and-robust-04-train-test-leakage.run', 'w') as run_file:
    for topic_id, judged in qrels().items():
        ranking = sorted(judged)
        depth = len(ranking)
        run_file.writelines(
            topic_id + ' Q0 ' + doc_id + ' ' + str(position) + ' ' + str(depth - position) + ' static-run\n'
            for position, doc_id in enumerate(ranking)
        )

# Create Static Run Files for NYT

In [16]:
# Largest number of judged documents of any single topic in the NYT qrels.
max(len(judged) for judged in ir_datasets.load('nyt/trec-core-2017').qrels.asdict().values())

965

In [8]:
# Static run for NYT: for every topic, all judged documents in lexicographic order.
# Rank starts at 0 and the score is (number of documents - rank).
with open('nyt-ac-common-core-2017.run', 'w') as run_file:
    for topic_id, judged in ir_datasets.load('nyt/trec-core-2017').qrels.asdict().items():
        ranking = sorted(judged)
        depth = len(ranking)
        run_file.writelines(
            topic_id + ' Q0 ' + doc_id + ' ' + str(position) + ' ' + str(depth - position) + ' static-run\n'
            for position, doc_id in enumerate(ranking)
        )

In [12]:
# Largest number of judged documents of any single topic in the WAPO qrels.
max(len(judged) for judged in ir_datasets.load('wapo/v2/trec-core-2018').qrels.asdict().values())

862

# Create Static Run Files for WAPO

In [13]:
# Static run for WAPO: for every topic, all judged documents in lexicographic order.
# Rank starts at 0 and the score is (number of documents - rank).
with open('wapo-v2-common-core-2018.run', 'w') as run_file:
    for topic_id, judged in ir_datasets.load('wapo/v2/trec-core-2018').qrels.asdict().items():
        ranking = sorted(judged)
        depth = len(ranking)
        run_file.writelines(
            topic_id + ' Q0 ' + doc_id + ' ' + str(position) + ' ' + str(depth - position) + ' static-run\n'
            for position, doc_id in enumerate(ranking)
        )

In [15]:
# Stage the WAPO run file written by the previous cell in git (shell command, needs a git working tree).
!git add wapo-v2-common-core-2018.run

# Create skiplist

In [32]:
from tqdm import tqdm

# skiplist:      prefixed query ids whose training document has no text or almost no text.
# skiplist_docs: prefixed ids of documents with fewer than 10 alphanumeric characters.
skiplist = set()
skiplist_docs = set()
for training_dataset in LEAKAGE_TRAINING_DATASETS:
    with open('/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/training-data/' + training_dataset + '/training-data.txt', 'r') as f:
        for line in tqdm(f):
            record = json.loads(line)

            if 'doc_text' not in record:
                skiplist.add(prefix(record['docid']) + str(record['query_id']))
                # Nothing to measure without text; without this `continue` the lookup of
                # record['doc_text'] below would raise a KeyError.
                continue

            # Count only letters and digits so that whitespace/punctuation-only documents are caught.
            alnum_chars = sum(1 for char in record['doc_text'] if char.isalnum())

            if alnum_chars < 10:
                corpus = prefix(record['docid'])
                skiplist.add(corpus + str(record['query_id']))
                skiplist_docs.add(corpus + record['docid'])

1002it [00:00, 15657.38it/s]
1002it [00:00, 16777.35it/s]
996it [00:00, 2358.49it/s]
1002it [00:00, 15515.42it/s]
10002it [00:00, 15131.61it/s]
10002it [00:00, 15608.18it/s]
13798it [00:02, 6366.43it/s] 
10002it [00:00, 15445.82it/s]
2004it [00:00, 15602.11it/s]
4000it [00:00, 15965.16it/s]
8000it [00:00, 15636.52it/s]
16000it [00:01, 15753.12it/s]
32000it [00:02, 15369.13it/s]
2002it [00:00, 16450.06it/s]
4002it [00:00, 15901.77it/s]
8002it [00:00, 16139.54it/s]
16002it [00:00, 16296.54it/s]
32002it [00:02, 15854.04it/s]
1996it [00:00, 4505.62it/s] 
3996it [00:00, 7203.16it/s] 
7996it [00:00, 9851.43it/s] 
15996it [00:01, 12333.53it/s]
31996it [00:02, 13576.95it/s]
2000it [00:00, 10147.40it/s]
4000it [00:00, 12460.86it/s]
8000it [00:00, 13963.79it/s]
16000it [00:01, 15026.17it/s]
32000it [00:02, 15232.00it/s]
2000it [00:00, 10689.94it/s]
4000it [00:00, 13007.25it/s]
8000it [00:00, 14218.07it/s]
16000it [00:01, 14532.26it/s]
32000it [00:02, 15498.87it/s]


In [29]:
import json
# JSON array with all skipped query ids (order follows the set's iteration order).
json.dumps(list(skiplist))

'["ms_marco_2046982", "ms_marco_9677239", "ms_marco_9448183", "ms_marco_6559293", "ms_marco_984754", "ms_marco_12669596", "ms_marco_3267825", "ms_marco_6099459", "ms_marco_3143354", "ms_marco_270029", "ms_marco_2400810", "ms_marco_11746476", "ms_marco_358550", "ms_marco_766765", "ms_marco_692682", "ms_marco_7987328", "ms_marco_12295107", "ms_marco_6328467", "ms_marco_631791", "ms_marco_8874431", "ms_marco_845372", "ms_marco_8688016", "ms_marco_4850264", "ms_marco_491327", "ms_marco_10722191", "ms_marco_3017470", "ms_marco_9435618", "ms_marco_10252667", "ms_marco_6271758", "ms_marco_11683643", "ms_marco_1041175", "ms_marco_500196", "ms_marco_851174", "ms_marco_302293", "ms_marco_3217583", "ms_marco_6392799", "ms_marco_7196565", "ms_marco_2889210", "ms_marco_2444190", "ms_marco_564961", "ms_marco_246220", "ms_marco_844305", "ms_marco_10929046", "ms_marco_2822468", "ms_marco_3768546", "ms_marco_868106", "ms_marco_163212", "ms_marco_11303249", "ms_marco_2095004", "ms_marco_757919", "ms_mar

In [33]:
import json
# JSON array with all skipped document ids (order follows the set's iteration order).
json.dumps(list(skiplist_docs))

'["ms_marco_D1219585", "ms_marco_D2914936", "ms_marco_D2318873", "ms_marco_D869789", "ms_marco_D1998357", "ms_marco_D1477260", "ms_marco_D601927", "ms_marco_D366065", "ms_marco_D1372233", "ms_marco_D47623", "ms_marco_D649872", "ms_marco_D1754256", "ms_marco_D2839220", "ms_marco_D252401", "ms_marco_D1008978", "ms_marco_D1334116", "ms_marco_D1156691", "ms_marco_D1569764", "ms_marco_D2956588", "ms_marco_D1590974", "ms_marco_D2917988", "ms_marco_D638693", "ms_marco_D788023", "ms_marco_D3529085", "ms_marco_D2069998", "ms_marco_D77794", "ms_marco_D1239897", "ms_marco_D2689279", "ms_marco_D2226905", "ms_marco_D31994", "ms_marco_D2882714", "ms_marco_D3313712", "ms_marco_D2691975", "ms_marco_D2831911", "ms_marco_D1450900", "ms_marco_D2340065", "ms_marco_D669634", "ms_marco_D2081734", "ms_marco_D3011945", "ms_marco_D2386107", "ms_marco_D213130", "ms_marco_D1261252", "ms_marco_D3002714", "ms_marco_D2732246", "ms_marco_D3044541", "ms_marco_D1608555", "ms_marco_D80869", "ms_marco_D2249436", "ms_mar