In [1]:
import gzip
import json
import os

import pandas as pd
# Qrel files whose judgments are merged; names are resolved by load_qrels.
QRELS = [
    'qrels.trec-covid-additional-judgments.tsv',
    'qrels.trec-covid-v1.tsv',
    'qrels.trec-covid-v2.tsv',
]

# Corpus file(s); not referenced by the code visible in this part of the notebook.
CORPORA = [
    'trec-covid-beir/corpus-ORIG.jsonl',
]

def load_qrels(qrel_file, qrel_dir='../unprocessed/topics-and-qrels/'):
    """Load one qrel file into a DataFrame.

    Args:
        qrel_file: File name of the qrel file inside ``qrel_dir``.
        qrel_dir: Directory containing the qrel files. Defaults to the
            location this notebook has always read from; a trailing slash
            is optional.

    Returns:
        pandas.DataFrame parsed as tab-separated text, with the first line
        used as the header. Other cells in this notebook read its
        ``query_id`` and ``doc_id`` columns.
    """
    # os.path.join copes with a missing trailing separator on qrel_dir,
    # which plain string concatenation would not.
    return pd.read_csv(os.path.join(qrel_dir, qrel_file), sep='\t')

def load_all_docs():
    """Return the set of all ``doc_id`` values found in any file listed in QRELS."""
    doc_ids = set()
    for qrel_file in QRELS:
        # A set absorbs duplicates, so documents judged in several files count once.
        doc_ids.update(load_qrels(qrel_file)['doc_id'])
    return doc_ids

# Set of every document id that appears in at least one of the QRELS files;
# later cells use it to keep only judged documents from the collection.
judged_docs = load_all_docs()


In [31]:
# Show size and modification time of the compressed trec-covid-v2 corpus file.
!ls -lha trec-covid-v2/corpus.jsonl.gz

-rw-r--r-- 1 maik maik 16M Sep 19 16:19 trec-covid-v2/corpus.jsonl.gz


In [34]:
# Peek at the first two JSON lines of the compressed trec-covid corpus to see the record layout.
!zcat trec-covid/corpus.jsonl.gz|head -2

{"_id": "2b73a28n", "title": "Role of endothelin-1 in lung disease", "text": "Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator of smooth muscle tone, and inflammatory mediator that may play a key role in diseases of the airways, pulmonary circulation, and inflammatory lung diseases, both acute and chronic. This review will focus on the biology of ET-1 and its role in lung disease.", "metadata": {"url": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC59574/", "pubmed_id": "11686871"}}
{"_id": "zjufx4fo", "title": "Sequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis", "text": "Nidovirus subgenomic mRNAs contain a leader sequence derived from the 5\u2032 end of the genome fused to different sequences (\u2018bodies\u2019) derived from the 3\u2032 end. Their generation involves a unique mechanism of discontinuous subgenomic RNA synt

In [5]:
# Build a reduced collection containing only judged documents, re-numbered with
# consecutive integer ids starting at 0, and record new id -> original id.
dataset_dir = '/mnt/ceph/storage-rw/data-tmp/current/kibi9872/beir-ColBERT/colbert-datasets/trec-covid-beir'

translation = {}  # new integer id -> original document id
current_id = 0    # next integer id to hand out

with open(f'{dataset_dir}/collection-ORIG.tsv', 'r') as inp, open(f'{dataset_dir}/collection.tsv', 'w') as out:
    for line in inp:
        fields = line.split('\t')
        # Every input line must be exactly "<doc_id>\t<text>".
        assert len(fields) == 2, f'expected 2 tab-separated fields, got {len(fields)}'

        doc_id = fields[0].strip()
        doc_text = fields[1].strip()
        if doc_id in judged_docs:
            out.write(f'{current_id}\t{doc_text}\n')
            translation[current_id] = doc_id
            current_id += 1

# Use a context manager so the mapping file is flushed and closed deterministically.
# Note: JSON object keys are strings, so the integer ids are stored as strings.
with open(f'{dataset_dir}/document-ids.json', 'w') as mapping_file:
    json.dump(translation, mapping_file)
        

In [10]:
# Copy the lines of judged documents unchanged, i.e. keeping their original ids.
with open('/mnt/ceph/storage-rw/data-tmp/current/kibi9872/beir-ColBERT/colbert-datasets/trec-covid-beir/collection-ORIG.tsv', 'r') as source, open('/mnt/ceph/storage-rw/data-tmp/current/kibi9872/beir-ColBERT/colbert-datasets/trec-covid-beir/collection-original-ids.tsv', 'w') as target:
    target.writelines(
        line
        for line in source
        # The document id is everything before the first tab.
        if line.partition('\t')[0].strip() in judged_docs
    )


In [11]:
# Number of distinct document ids across all qrel files.
len(judged_docs)

36016

In [8]:
# Dump the id mapping as TSV, one line per document: "<original id>\t<new integer id>".
with open('/mnt/ceph/storage-rw/data-tmp/current/kibi9872/beir-ColBERT/colbert-datasets/trec-covid-beir/document-ids.tsv', 'w') as out:
    out.writelines(
        f'{original_id}\t{new_id}\n'
        for new_id, original_id in translation.items()
    )

In [12]:
# Number of distinct judged documents for topic 10.
# NOTE(review): qrels_for_topic is defined in a cell further down (this cell is
# In [12], the defining cell In [9]); on a fresh top-to-bottom run this line
# raises NameError unless the defining cell is moved above it.
len(qrels_for_topic(10))

1107

In [9]:
def qrels_for_topic(topic):
    """Return the set of ``doc_id`` values judged for ``topic`` in any QRELS file.

    ``topic`` may be anything ``int()`` accepts (e.g. 10 or '10'); the
    ``query_id`` column is cast to int before comparing.

    Note: every call re-reads all qrel files via load_qrels.
    """
    doc_ids = set()
    for qrel_file in QRELS:
        judgments = load_qrels(qrel_file)
        # Compare as integers so string and numeric query ids match alike.
        is_topic = judgments['query_id'].astype(int) == int(topic)
        doc_ids.update(judgments.loc[is_topic, 'doc_id'])
    return doc_ids



In [16]:
# List the files available in ../beir/ (result files, run files, corpora archives).
!ls ../beir/

a					   msmarco.psg.l2.zip
ance-results.json			   pl2-run.txt
bm25-run.txt				   sentence-bert-results.json
colbert.dnn.zip				   tf-idf-run.txt
colbert-results-normalized-pyterrier.txt   trec-covid
colbert-results-pyterrier.txt		   trec-covid-beir
colbert-results.tsv			   trec-covid-beir.zip
construct-corpora-to-re-rank.ipynb	   trec-covid-v2
dlm-run.txt				   trec-covid-v2.zip
hole-at-10-analysis-for-dense-models.xlsx  trec-covid.zip


In [18]:
def beir_json_results_to_run_file(f):
    """Turn the JSON results of one system into run rows, keeping judged documents only.

    Reads ``../beir/{f}-results.json``, which is iterated as a mapping of
    query id -> {document id -> score}. For each query the documents are
    ordered by descending score, documents without a judgment for that query
    (per qrels_for_topic) are dropped, and the remaining ones are ranked 1..n.

    Args:
        f: System name; selects the input file and is written to the
            ``system`` column.

    Returns:
        pandas.DataFrame with one row per retained document and the columns
        ``qid``, ``q0``, ``docid``, ``rank``, ``score``, ``system``
        (an empty DataFrame if nothing is retained).
    """
    # Context manager: the original json.load(open(...)) never closed the file.
    with open(f'../beir/{f}-results.json', 'r') as results_file:
        results = json.load(results_file)

    rows = []
    for qid, ranking in results.items():
        judged_docs_for_topic = qrels_for_topic(qid)
        # Highest score first; ties keep the order in which they appear in the JSON.
        ranked_docs = sorted(ranking, key=ranking.get, reverse=True)
        judged_ranked = [doc for doc in ranked_docs if doc in judged_docs_for_topic]

        # Ranks are assigned after filtering, so they stay consecutive.
        for position, doc in enumerate(judged_ranked, start=1):
            rows.append({'qid': qid, 'q0': 'Q0', 'docid': doc, 'rank': position, 'score': ranking[doc], 'system': f})

    return pd.DataFrame(rows)

def to_run(f):
    """Write the judged-only ranking of system ``f`` to ``../beir/{f}-run.txt``.

    The file is space-separated and has neither a header row nor an index column.
    """
    run_frame = beir_json_results_to_run_file(f)
    run_path = f'../beir/{f}-run.txt'
    run_frame.to_csv(run_path, sep=' ', header=False, index=False)

# Create the run files for the 'ance' and 'sentence-bert' result files.
for system_name in ('ance', 'sentence-bert'):
    to_run(system_name)

In [None]:
# NOTE(review): a function with this name is also defined in an earlier cell of
# this notebook; when this cell runs, this definition shadows the earlier one.
# Consider keeping only a single definition.
def beir_json_results_to_run_file(f):
    """Turn the JSON results of one system into run rows, keeping judged documents only.

    Reads ``../beir/{f}-results.json``, which is iterated as a mapping of
    query id -> {document id -> score}. For each query the documents are
    ordered by descending score, documents without a judgment for that query
    (per qrels_for_topic) are dropped, and the remaining ones are ranked 1..n.

    Args:
        f: System name; selects the input file and is written to the
            ``system`` column.

    Returns:
        pandas.DataFrame with one row per retained document and the columns
        ``qid``, ``q0``, ``docid``, ``rank``, ``score``, ``system``
        (an empty DataFrame if nothing is retained).
    """
    # Context manager: the original json.load(open(...)) never closed the file.
    with open(f'../beir/{f}-results.json', 'r') as results_file:
        results = json.load(results_file)

    rows = []
    for qid, ranking in results.items():
        judged_docs_for_topic = qrels_for_topic(qid)
        # Highest score first; ties keep the order in which they appear in the JSON.
        ranked_docs = sorted(ranking, key=ranking.get, reverse=True)
        judged_ranked = [doc for doc in ranked_docs if doc in judged_docs_for_topic]

        # Ranks are assigned after filtering, so they stay consecutive.
        for position, doc in enumerate(judged_ranked, start=1):
            rows.append({'qid': qid, 'q0': 'Q0', 'docid': doc, 'rank': position, 'score': ranking[doc], 'system': f})

    return pd.DataFrame(rows)

# NOTE(review): to_run is also defined in an earlier cell of this notebook;
# this later definition shadows it when the cell is run.
def to_run(f):
    """Write the judged-only ranking of system ``f`` to ``../beir/{f}-run.txt``.

    The file is space-separated and has neither a header row nor an index column.
    """
    run_frame = beir_json_results_to_run_file(f)
    run_path = f'../beir/{f}-run.txt'
    run_frame.to_csv(run_path, sep=' ', header=False, index=False)

# NOTE(review): the same two calls already appear in an earlier cell; running
# this cell as well rewrites the same run files.
for system_name in ('ance', 'sentence-bert'):
    to_run(system_name)