# Construct Rerank Document Collection

In [30]:
import json
from tqdm import tqdm

# Base path that the other paths in this notebook are built from (note the trailing slash).
DATA_DIR = '/mnt/ceph/storage/data-in-progress/data-research/web-search/private-web-search-with-keyqueries/'
# Sub-directory of DATA_DIR; not referenced by the cells shown in this notebook.
QUERY_DIR = DATA_DIR + 'scrambling-on-anserini/'

# The three lists below are combined as '<scrambling>-<retrieval>-<selection>'
# to form the name of an approach directory.
SCRAMBLING_APPROACHES = [
    'nounphrase',
    'tf-idf',
    'hbc',
    'arampatzis',
    'HbcTfIdf',
    'arampatzisHbc',
]
RETRIEVAL_MODELS = ['bm25', 'qld']
SELECTION_STRATEGIES = [
    'keyqueryNdcg',
    'keyqueryNdcgRelaxed',
    'ndcg',
    'pmi',
]

# Topic numbers; every approach directory is read as '<topic>.jsonl' for each of them.
# The order is kept exactly as given (209 and 214 intentionally stay at the end).
TOPICS = [
    2, 8, 11, 17, 18, 26, 30, 33, 38, 40,
    46, 47, 50, 57, 59, 61, 62, 66, 67, 78,
    82, 88, 89, 95, 98, 104, 105, 109, 111, 117,
    119, 121, 123, 128, 131, 136, 140, 142, 147, 152,
    156, 162, 168, 173, 175, 177, 182, 196, 199, 207,
    213, 222, 236, 253, 254, 262, 266, 273, 286, 287,
    209, 214,
]

### Already Crawled Documents

In [38]:
import os

# Directory with the already-crawled documents; every file in it is read
# line by line, each line being a JSON object with an 'id' field.
DIR='/mnt/ceph/storage/data-in-progress/data-research/web-search/private-web-search-with-keyqueries/reranking-index-anserini/documents'
# Pure-Python directory listing instead of the `!ls $DIR` shell escape, so the
# cell no longer depends on IPython or on shell interpolation of the path.
# Dot-files are skipped to mirror what plain `ls` would have listed.
parts = sorted(name for name in os.listdir(DIR) if not name.startswith('.'))
# Collect the ids straight into a set; duplicates across part files collapse.
existing_ids = set()

for part in tqdm(parts):
    with open(os.path.join(DIR, part), 'r') as f:
        for l in f:
            existing_ids.add(json.loads(l)['id'])

100%|██████████| 700/700 [01:08<00:00, 10.26it/s]


In [39]:
# Peek at ten ids (set iteration order, so which ten is arbitrary).
list(existing_ids)[:10]

['clueweb12-0911wb-48-00029',
 'clueweb09-en0055-59-21632',
 'clueweb09-en0105-08-19045',
 'clueweb09-en0107-56-06456',
 'clueweb09-en0018-49-11969',
 'clueweb09-enwp00-92-20155',
 'clueweb09-enwp03-29-13875',
 'clueweb09-en0130-65-09008',
 'clueweb09-en0023-03-26654',
 'clueweb09-en0013-16-10532']

In [40]:
from tqdm import tqdm

def load_docs_in_topic(approach, topic):
    """Return every document id ranked for any query of one topic.

    Reads ``DATA_DIR + 'submitted-scrambled-queries/<approach>/<topic>.jsonl'``.
    Despite the ``.jsonl`` extension, the file is parsed as one JSON object
    whose values each hold a ``'ranking'`` list.

    Args:
        approach: Name of the approach sub-directory.
        topic: Topic number; converted with ``str`` to build the file name.

    Returns:
        set: Union of the ``'ranking'`` entries of all queries in the file.

    Raises:
        OSError: If the topic file cannot be opened.
        KeyError: If a query entry has no ``'ranking'`` key.
    """
    import json

    path = DATA_DIR + 'submitted-scrambled-queries/' + approach + '/' + str(topic) + '.jsonl'
    # Use a context manager so the handle is closed right away; the previous
    # json.load(open(...)) left one open file per call.
    with open(path) as f:
        topic_data = json.load(f)

    ret = set()
    for v in topic_data.values():
        ret.update(v['ranking'])

    return ret

def everything():
    """List every (retrieval model, selection strategy, scrambling approach, topic) tuple.

    The tuples are produced in nested-loop order: RETRIEVAL_MODELS is the
    outermost dimension and TOPICS the innermost.
    """
    return [
        (rtr, stn, scr, topic)
        for rtr in RETRIEVAL_MODELS
        for stn in SELECTION_STRATEGIES
        for scr in SCRAMBLING_APPROACHES
        for topic in TOPICS
    ]

# Union the ranked document ids of every configuration and topic, then keep
# only those that are not among the already-crawled ids.
docs = set()
for (rtr, stn, scr, topic) in tqdm(everything()):
    docs.update(load_docs_in_topic(scr + '-' + rtr + '-' + stn, topic))
docs = {doc_id for doc_id in docs if doc_id not in existing_ids}

print('Missing Docs: ' + str(len(docs)))

100%|██████████| 2976/2976 [00:03<00:00, 767.61it/s] 


Missing Docs: 0


### Write docs to file system

Commented out: run this cell only if some documents are missing.
When you do, pay attention to how the new files interleave with the existing ones.

In [26]:
#import numpy as np
#splits = np.array_split([i for i in docs], 700)
#
#OUT_DIR='/mnt/ceph/storage/data-in-progress/data-research/web-search/private-web-search-with-keyqueries/reranking-index-anserini/doc-ids/'
#
#for i in tqdm(range(len(splits))):
#    with open(OUT_DIR + 'part-' + str(i), 'w') as f:
#        for doc_id in splits[i]:
#            f.write(doc_id + '\n')

100%|██████████| 700/700 [00:02<00:00, 278.69it/s]


# Create Allowlists for Indexing

In [42]:
def load_top_docs_in_topic(approach, topic, stop_at_position=1000, num_queries=1000):
    """Return the top-ranked document ids of the first queries of one topic.

    Reads ``DATA_DIR + 'submitted-scrambled-queries/<approach>/<topic>.jsonl'``.
    Despite the ``.jsonl`` extension, the file is parsed as one JSON object
    that maps a query key to an entry holding a ``'ranking'`` list.

    Args:
        approach: Name of the approach sub-directory.
        topic: Topic number; converted with ``str`` to build the file name.
        stop_at_position: Number of leading ranking entries kept per query.
        num_queries: Only queries whose key, parsed with ``int``, is at most
            this value are used (presumably the key is the query's position;
            verify against the files).

    Returns:
        set: Union of the truncated rankings of all selected queries.

    Raises:
        OSError: If the topic file cannot be opened.
        ValueError: If a query key is not an integer string.
        KeyError: If a selected query entry has no ``'ranking'`` key.
    """
    import json

    path = DATA_DIR + 'submitted-scrambled-queries/' + approach + '/' + str(topic) + '.jsonl'
    # Use a context manager so the handle is closed right away; the previous
    # json.load(open(...)) left one open file per call.
    with open(path) as f:
        topic_data = json.load(f)

    ret = set()
    for q, v in topic_data.items():
        if int(q) <= num_queries:
            ret.update(v['ranking'][:stop_at_position])

    return ret

# Write one allow-list file per combination of retrieval model, selection
# strategy, scrambling approach, ranking cut-off (top_docs) and number of
# queries. Each file holds one document id per line.
# NOTE: this rebinds the notebook-level name `docs` on every iteration.
for rtr in RETRIEVAL_MODELS:
    for stn in SELECTION_STRATEGIES:
        for scr in SCRAMBLING_APPROACHES:
            # Same '<scrambling>-<retrieval>-<selection>' name is used for the
            # input directory and for the output file prefix.
            approach = scr + '-' + rtr + '-' + stn
            for top_docs in [10, 100]:
                for num_queries in [5, 10, 25]:
                    docs = set()
                    for topic in TOPICS:
                        docs.update(load_top_docs_in_topic(approach, topic, top_docs, num_queries))

                    # DATA_DIR already ends with '/', so this path contains '//';
                    # the literal is kept unchanged so the target file is the same.
                    out_file = DATA_DIR+ '/reranking-index-anserini/allow-lists/' + approach + '-' + str(top_docs) + 'top_docs' + '-' + str(num_queries) + 'num_queries'
                    with open(out_file,'w') as f:
                        # Sorted so the file content is reproducible; plain set
                        # iteration order changes between kernel runs.
                        for doc in sorted(docs):
                            f.write(doc + '\n')
