In [2]:
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoT5, MonoBERT
from pyserini.search import SimpleSearcher
from trectools import TrecRun
from tqdm import tqdm
# Maximum number of first-stage documents per topic that are handed to a
# re-ranker and written to the output run files.
DOCS_TO_RERANK = 100

# Root directory for the indexes and first-stage run files loaded below.
# NOTE(review): this value ends with '/', and the code below appends suffixes
# that start with '/', so the resulting paths contain '//'.
DATA_DIR = '/mnt/ceph/storage/data-in-progress/data-research/web-search/romcir22-keyquery/'

In [7]:
# Print the current GPU status before the re-ranking models are instantiated.
!nvidia-smi

Tue Mar  1 11:33:55 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.84       Driver Version: 460.84       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:41:00.0 Off |                    0 |
| N/A   21C    P0    58W / 400W |   4080MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  A100-SXM4-40GB      Off  | 00000000:81:00.0 Off |                    0 |
| N/A   17C    P0    51W / 400W |      3MiB / 40536MiB |      0%      Default |
|       

In [8]:
# Maps a model identifier to a MonoT5 re-ranker instance. The key (with '/'
# replaced by '-') is later used as output directory suffix and run tag.
# NOTE(review): MonoT5() is constructed with the library defaults — confirm
# that the default checkpoint really is the one named by the key.
MONO_T5_RERANKERS = {
    'castorini/monot5-base-msmarco': MonoT5(),
}

In [9]:
# Maps a model identifier to a MonoBERT re-ranker instance. The key (with '/'
# replaced by '-') is later used as output directory suffix and run tag.
# NOTE(review): MonoBERT() is constructed with the library defaults — confirm
# that the default checkpoint really is the one named by the key.
MONO_BERT_RERANKERS = {
    'castorini/monobert-large-msmarco': MonoBERT(),
}

In [10]:
def load_index(index_name):
    """Open the index named ``index_name`` below ``DATA_DIR``.

    The index directory is ``DATA_DIR + '/indexes-judged-only/' + index_name``;
    the result is the ``SimpleSearcher`` constructed for that directory.
    """
    index_dir = ''.join([DATA_DIR, '/indexes-judged-only/', index_name])
    return SimpleSearcher(index_dir)

def get_doc_text(index_name, doc_id):
    """Return the stored contents of document ``doc_id`` from ``INDEXES[index_name]``.

    This is a best-effort lookup: any ordinary error (unknown index name,
    failing document lookup, a lookup result without ``contents()``) yields
    the empty string instead of raising.

    Only ``Exception`` subclasses are suppressed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and a long re-ranking run can be aborted.
    """
    try:
        return INDEXES[index_name].doc(doc_id).contents()
    except Exception:
        # Deliberate fallback: the caller re-ranks an empty text instead of failing.
        return ''

# One searcher per dataset key, opened eagerly when this cell runs.
# get_doc_text() looks documents up through this mapping.
INDEXES = {
    'hmi-19': load_index('lucene-index.cw12-judged-only-raw'),
    'hmi-20': load_index('lucene-index.cc-news-20-judged-only-raw'),
    'hmi-21': load_index('lucene-index.c4noclean'),
}

# One first-stage run per dataset key, loaded with trectools' TrecRun.
# docs_for_topic() takes the candidate documents for re-ranking from these runs.
# NOTE(review): the file names suggest BM25 with k1=0.9, b=0.4 — confirm against
# the process that produced the run files.
RUNS = {
    'hmi-19': TrecRun(DATA_DIR + '/runs/hmi-19/run.hmi19.bm25_bm25(k1=0.9,b=0.4)_default'),
    'hmi-20': TrecRun(DATA_DIR + '/runs/hmi-20/run.hmi20.bm25_bm25(k1=0.9,b=0.4)_default'),
    'hmi-21': TrecRun(DATA_DIR + '/runs/hmi-21/run.hmi21.bm25_bm25(k1=0.9,b=0.4)_default'),
}


In [12]:
# Smoke tests: for one known document per dataset, the text returned by
# get_doc_text() must start with the expected prefix. A failed lookup returns
# '' from get_doc_text(), so it shows up here as an AssertionError.
assert get_doc_text('hmi-19', 'clueweb12-1712wb-84-02961').startswith('Urinary Tract Infection Alternative Treatment- are Cranberries the Only U.t.i. Natural Cure?')
assert get_doc_text('hmi-20', '547e72c3-17f1-476a-a3c4-e00b2a97d98c').startswith('Coronavirus threat to global Vitamin D Ingredients Market In-Depth Analysis Of Competitive Landscape')
assert get_doc_text('hmi-21', 'en.noclean.c4-train.04871-of-07168.129759').startswith('Best Ankle Brace for Achilles Tendonitis - ')

In [13]:
def docs_for_topic(index_name, topic_number):
    """Return the candidate document ids for one topic of a first-stage run.

    Rows of ``RUNS[index_name].run_data`` whose ``query`` column equals
    ``topic_number`` (both compared as integers) are selected, and at most the
    first ``DOCS_TO_RERANK`` of their ``docid`` values are returned as a list,
    in the order in which they appear in the run data.
    """
    run = RUNS[index_name].run_data
    is_topic = run['query'].astype(int) == int(topic_number)
    topic_rows = run[is_topic]
    return list(topic_rows.docid[:DOCS_TO_RERANK])

def rerank_with_mono_t5(index_name, topic, query, reranker):
    """Re-rank the first-stage candidates of ``topic`` against ``query``.

    The candidates come from ``docs_for_topic`` and their texts from
    ``get_doc_text``. The function only calls ``reranker.rerank(...)``, so any
    object offering that method can be passed, despite the function's name.

    Returns a list of dicts with the keys ``'score'``, ``'id'`` and ``'body'``,
    ordered by descending score.
    """
    import contextlib
    import io

    # Everything written to Python's stdout/stderr while re-ranking is discarded.
    with contextlib.redirect_stderr(io.StringIO()), contextlib.redirect_stdout(io.StringIO()):
        candidates = []
        for doc_id in docs_for_topic(index_name, topic):
            candidates.append(Text(get_doc_text(index_name, doc_id), {'docid': doc_id}, 0))

        ranked = list(reranker.rerank(Query(query), candidates))
        ranked.sort(key=lambda hit: hit.score, reverse=True)

        results = []
        for hit in ranked:
            results.append({'score': hit.score, 'id': hit.metadata['docid'], 'body': hit.text})

    return results

def queries(index_name):
    """Load the topics of a dataset as ``(topic_id, title)`` tuples.

    ``index_name`` must have the form ``'<prefix>-<number>'`` (e.g. ``'hmi-19'``);
    the number selects the file
    ``../../../../third-party/health-misinfo-<number>/topics.jsonl``, resolved
    relative to the current working directory.

    Each non-blank line is parsed as a JSON object with the keys ``'topic'``
    (converted to ``int``) and ``'title'``. Blank lines, such as a trailing
    newline at the end of the file, are skipped instead of raising a JSON
    decoding error.

    Returns the tuples as a list in file order.
    """
    import json

    year = str(int(index_name.split('-')[1]))
    topics_path = '../../../../third-party/health-misinfo-' + year + '/topics.jsonl'

    ret = []
    # Decode explicitly as UTF-8 so the result does not depend on the platform locale.
    with open(topics_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            topic = json.loads(line)
            ret.append((int(topic['topic']), topic['title']))
    return ret

In [14]:
for dataset in ['hmi-19', 'hmi-20', 'hmi-21']:
    for model_name, model in MONO_T5_RERANKERS.items():
        tag = model_name.replace('/', '-')
        year = int(dataset.split('-')[1])
        !mkdir -p {year}/{dataset}-{tag}
        with open(str(year) + '/' + dataset + '-' + tag + '/run.txt', 'w') as out_file:
            for query_id, query in tqdm(queries(dataset)):
                for i in zip(range(DOCS_TO_RERANK), rerank_with_mono_t5(dataset, query_id, query, model)):
                    out_file.write(str(query_id) + ' Q0 ' + i[1]['id'] + ' ' + str(i[0] + 1) + ' ' + str(i[1]['score']) + ' ' + tag + '\n')

100%|██████████| 51/51 [03:43<00:00,  4.39s/it]
100%|██████████| 50/50 [02:03<00:00,  2.48s/it]
100%|██████████| 50/50 [05:10<00:00,  6.21s/it]


In [15]:
for dataset in ['hmi-19', 'hmi-20', 'hmi-21']:
    for model_name, model in MONO_BERT_RERANKERS.items():
        tag = model_name.replace('/', '-')
        year = int(dataset.split('-')[1])
        !mkdir {year}/{dataset}-{tag}
        with open(str(year) + '/' + dataset + '-' + tag + '/run.txt', 'w') as out_file:
            for query_id, query in tqdm(queries(dataset)):
                for i in zip(range(DOCS_TO_RERANK), rerank_with_mono_t5(dataset, query_id, query, model)):
                    out_file.write(str(query_id) + ' Q0 ' + i[1]['id'] + ' ' + str(i[0] + 1) + ' ' + str(i[1]['score']) + ' ' + tag + '\n')

mkdir: cannot create directory ‘19/hmi-19-castorini-monobert-large-msmarco’: File exists


100%|██████████| 51/51 [05:48<00:00,  6.83s/it]


mkdir: cannot create directory ‘20/hmi-20-castorini-monobert-large-msmarco’: File exists


100%|██████████| 50/50 [04:29<00:00,  5.39s/it]


mkdir: cannot create directory ‘21/hmi-21-castorini-monobert-large-msmarco’: File exists


100%|██████████| 50/50 [09:13<00:00, 11.08s/it]
