In [2]:
#!nosetests ../tests

import sys
from trectools import TrecQrel
sys.path.insert(0, "../src")
import pyterrier as pt
import pandas as pd
from pyterrier_t5 import MonoT5ReRanker, DuoT5ReRanker
from retrieval_pipelines.FileSystemCache import FileSystemCache
from retrieval_pipelines.DuoT5ReRankerWithCache import DuoT5ReRankerWithCache
from sampling.samplers import PairwiseFullSampler, PairwiseRandomSampler, PairwiseGroupedSampler
from aggregation import BradleyTerryAggregator, KwikSortAggregator, AdditiveAggregator

# Root directory of the shared experiment data (note the trailing slash).
DIR = '/mnt/ceph/storage/data-in-progress/data-research/web-search/SIGIR-22/sigir22-pairwise-ranking/'

# Initialise PyTerrier only if it has not been started in this kernel yet.
if not pt.started():
    pt.init()

# Qrel files, keyed first by crawl and then by the topic range of the file.
QRELS = {
    'cw09': {
        topic_range: TrecQrel(f'../data/external/qrels/qrels.web.{topic_range}.txt')
        for topic_range in ['1-50', '51-100', '101-150', '151-200']
    },
    'cw12': {
        topic_range: TrecQrel(f'../data/external/qrels/qrels.web.{topic_range}.txt')
        for topic_range in ['201-250', '251-300']
    },
}

# References to the first-passage indexes of both crawls.
CW09_INDEX = pt.IndexRef.of(f'{DIR}/indexes/pt-cw09-first-passage/data.properties')
CW12_INDEX = pt.IndexRef.of(f'{DIR}/indexes/pt-cw12-first-passage/data.properties')

In [3]:
def qrels(crawl):
    """Return all relevance judgments of ``crawl`` as a single DataFrame.

    The ``qrels_data`` frames of every entry in ``QRELS[crawl]`` are copied and
    concatenated. The ``q0`` column is dropped and the remaining columns are
    renamed to ``qid``, ``docno`` and ``label``; ``qid`` is cast to ``str`` and
    ``label`` to ``int``.
    """
    frames = [judgments.qrels_data.copy() for judgments in QRELS[crawl].values()]

    merged = (
        pd.concat(frames)
        .drop(columns=['q0'])
        .rename(columns={'query': 'qid', 'docid': 'docno', 'rel': 'label'})
    )
    return merged.astype({'qid': str, 'label': int})

# Topics are loaded once per crawl and then served from this dict.
TOPICS_CACHE = {}

def topics(crawl):
    """Return the topics of ``crawl`` as one DataFrame, cached per crawl.

    Args:
        crawl: ``'cw09'`` or ``'cw12'``.

    Returns:
        The concatenated ``'query'`` topics of all datasets that belong to the
        crawl, with the ``qid`` and ``query`` columns cast to ``str``. Repeated
        calls return the same cached DataFrame object.

    Raises:
        ValueError: if ``crawl`` is not one of the known crawls.
    """
    if crawl in TOPICS_CACHE:
        return TOPICS_CACHE[crawl]

    dataset_ids = {
        'cw09': [
            'irds:clueweb09/en/trec-web-2009',
            'irds:clueweb09/en/trec-web-2010',
            'irds:clueweb09/en/trec-web-2011',
            'irds:clueweb09/en/trec-web-2012',
        ],
        'cw12': [
            'irds:clueweb12/trec-web-2013',
            'irds:clueweb12/trec-web-2014',
        ],
    }
    if crawl not in dataset_ids:
        # Without this guard an unknown crawl ends in pandas' opaque
        # "No objects to concatenate" error.
        raise ValueError(
            f'Unknown crawl {crawl!r}; expected one of {sorted(dataset_ids)}'
        )

    combined = pd.concat(
        [pt.get_dataset(dataset_id).get_topics('query').copy() for dataset_id in dataset_ids[crawl]]
    )
    combined['qid'] = combined['qid'].astype(str)
    combined['query'] = combined['query'].astype(str)

    # Cache only the fully normalised frame so a failure above never leaves a
    # half-processed entry behind.
    TOPICS_CACHE[crawl] = combined
    return combined

def load_run(crawl, name):
    """Read the run file ``<name>-run.txt`` of ``crawl`` and wrap it as a transformer.

    The file is looked up below ``DIR + '/run-files/' + crawl``.
    """
    from pyterrier.transformer import get_transformer

    run_path = ''.join((DIR, '/run-files/', crawl, '/', name, '-run.txt'))
    run = pt.io.read_results(run_path)
    return get_transformer(run)

def persist_run_file(topics, crawl, name, method):
    """Apply ``method`` to ``topics`` and write the result as a run file.

    Args:
        topics: the topics passed to ``method``.
        crawl: sub-directory of ``DIR + '/run-files/'`` that receives the file.
        name: run name; the file is called ``<name>-run.txt``.
        method: callable (e.g. a PyTerrier pipeline) that maps the topics to
            the results to be written.
    """
    import os

    out_dir = DIR + '/run-files/' + crawl + '/'
    # Create the target directory with the standard library instead of an
    # IPython `!mkdir` shell escape: this works outside notebooks, copes with
    # spaces in paths, and raises if the directory cannot be created. Doing it
    # first surfaces such a failure before the (possibly hours-long) run.
    os.makedirs(out_dir, exist_ok=True)

    ret = method(topics)
    pt.io.write_results(ret, out_dir + name + '-run.txt')
    
def mono_t5(model):
    """Create a MonoT5 re-ranker for the given model size (e.g. ``'base'`` or ``'3b'``).

    Tokenizer and checkpoint names are derived from ``model``; the batch size
    is fixed to 32.
    """
    settings = {
        'tok_model': 't5-' + model,
        'model': 'castorini/monot5-' + model + '-msmarco',
        'batch_size': 32,
    }
    return MonoT5ReRanker(**settings)

def duo_t5(model, sampler, crawl, aggregator, batch_size=32):
    """Create a cached DuoT5 re-ranker for the given model size.

    Args:
        model: model size used in the tokenizer and checkpoint names (e.g. ``'3b'``).
        sampler: sampler instance handed to the re-ranker.
        crawl: name of the cache sub-directory below ``DIR + 'pairwise-cache/'``.
        aggregator: aggregator instance handed to the re-ranker.
        batch_size: batch size handed to the re-ranker.
    """
    checkpoint = 'castorini/duot5-' + model + '-msmarco'
    cache_dir = DIR + 'pairwise-cache/' + crawl + '/castorini-duot5-' + model + '-msmarco'

    return DuoT5ReRankerWithCache(
        tok_model='t5-' + model,
        model=checkpoint,
        batch_size=batch_size,
        cache=FileSystemCache(cache_dir),
        sampler=sampler,
        aggregator=aggregator,
    )

def max_passage_transformer(file_name):
    """Build a transformer that ranks documents by their best-scoring passage.

    ``file_name`` is a JSON-lines file of passage scores. The part of each
    ``docno`` after the first ``'___'`` is stripped, and for every
    (qid, query, docno) group only the text and score of the highest-scoring
    row are kept. Ranks are added before the frame is wrapped as a transformer.
    """
    from pyterrier.model import add_ranks
    from pyterrier.transformer import get_transformer

    def best_passage(group):
        # Sanity checks: a group must not mix queries or documents.
        assert len(group.qid.unique()) == 1
        assert len(group.docno.unique()) == 1
        top = group.sort_values('score', ascending=False).iloc[0]

        return pd.Series({'text': top['text'], 'score': top['score']})

    passages = pd.read_json(file_name, lines=True)
    # Reduce the passage-level id to the id of the enclosing document.
    passages['docno'] = passages['docno'].apply(lambda passage_id: passage_id.split('___')[0])

    per_document = passages.groupby(['qid', 'query', 'docno']).apply(best_passage).reset_index()
    per_document['qid'] = per_document['qid'].astype(str)

    return get_transformer(add_ranks(per_document))

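# CW09 pipeline definitions. They are commented out here, but the CW09 cells below
# reference these names, so uncomment the ones you need before running those cells.
#inverse_relevancy = load_run('cw09_first_passage', 'all-judged-inverse')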
#mono_t5_3b = inverse_relevancy >> pt.text.get_text(CW09_INDEX, "text") >> mono_t5('3b')
#mono_t5_base = inverse_relevancy >> pt.text.get_text(CW09_INDEX, "text") >> mono_t5('base')
#duo_t5_3b = inverse_relevancy >> pt.text.get_text(CW09_INDEX, "text") >> duo_t5('3b', PairwiseFullSampler(), 'cw09_first_passage', AdditiveAggregator())
#duo_t5_base = inverse_relevancy >> pt.text.get_text(CW09_INDEX, "text") >> duo_t5('base', PairwiseFullSampler(), 'cw09_first_passage', AdditiveAggregator())
#duo_t5_top_10_3b = load_run('cw09_first_passage', 'mono_t5_3b') %10 >> pt.text.get_text(CW09_INDEX, "text") >> duo_t5('3b', PairwiseFullSampler(), 'cw09_first_passage', AdditiveAggregator())
#duo_t5_top_20_3b = load_run('cw09_first_passage', 'mono_t5_3b') %20 >> pt.text.get_text(CW09_INDEX, "text") >> duo_t5('3b', PairwiseFullSampler(), 'cw09_first_passage', AdditiveAggregator())
#duo_t5_top_50_3b = load_run('cw09_first_passage', 'mono_t5_3b') %50 >> pt.text.get_text(CW09_INDEX, "text") >> duo_t5('3b', PairwiseFullSampler(), 'cw09_first_passage', AdditiveAggregator())

In [3]:
%%time
# CW12: cut the persisted MonoT5-3b run at 40 per query, attach the passage
# text from the index and re-rank with DuoT5-3b (batch size 4).
duo_t5_top_40_3b = (
    load_run('cw12_first_passage', 'mono_t5_3b') % 40
    >> pt.text.get_text(CW12_INDEX, "text")
    >> duo_t5('3b', PairwiseFullSampler(), 'cw12_first_passage', AdditiveAggregator(), 4)
)

persist_run_file(
    topics('cw12'),
    'cw12_first_passage',
    'duo_t5_top_40_3b',
    duo_t5_top_40_3b,
)

  df.drop(df.columns.difference(['qid','query']), 1, inplace=True)
duoT5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:43<00:00,  1.03s/queries]
duoT5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:42<00:00,  1.03s/queries]


CPU times: user 3min 54s, sys: 12.1 s, total: 4min 6s
Wall time: 5min 8s


In [5]:
%%time
# CW12: cut the persisted MonoT5-3b run at 50 per query, attach the passage
# text from the index and re-rank with DuoT5-3b (batch size 4).
duo_t5_top_50_3b = (
    load_run('cw12_first_passage', 'mono_t5_3b') % 50
    >> pt.text.get_text(CW12_INDEX, "text")
    >> duo_t5('3b', PairwiseFullSampler(), 'cw12_first_passage', AdditiveAggregator(), 4)
)

persist_run_file(
    topics('cw12'),
    'cw12_first_passage',
    'duo_t5_top_50_3b',
    duo_t5_top_50_3b,
)

duoT5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:42<00:00,  1.62s/queries]
duoT5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:41<00:00,  1.62s/queries]


CPU times: user 5min 51s, sys: 10.9 s, total: 6min 2s
Wall time: 6min 3s


In [4]:
%%time
# CW09: cut the persisted MonoT5-3b run at 50 per query, attach the passage
# text from the index and re-rank with DuoT5-3b (batch size 4).
duo_t5_top_50_3b = (
    load_run('cw09_first_passage', 'mono_t5_3b') % 50
    >> pt.text.get_text(CW09_INDEX, "text")
    >> duo_t5('3b', PairwiseFullSampler(), 'cw09_first_passage', AdditiveAggregator(), 4)
)

persist_run_file(
    topics('cw09'),
    'cw09_first_passage',
    'duo_t5_top_50_3b',
    duo_t5_top_50_3b,
)

  df.drop(df.columns.difference(['qid','query']), 1, inplace=True)
duoT5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 198/198 [05:10<00:00,  1.57s/queries]
duoT5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 198/198 [05:07<00:00,  1.55s/queries]


CPU times: user 10min 56s, sys: 18.4 s, total: 11min 15s
Wall time: 11min 36s


In [None]:
%%time
persist_run_file(topics('cw09'), 'cw09_first_passage', 'duo_t5_top_10_3b', duo_t5_top_10_3b)

In [4]:
%%time
persist_run_file(topics('cw09'), 'cw09_first_passage', 'duo_t5_top_20_3b', duo_t5_top_20_3b)

duoT5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 198/198 [00:25<00:00,  7.91queries/s]
duoT5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 198/198 [00:25<00:00,  7.90queries/s]


CPU times: user 51.9 s, sys: 555 ms, total: 52.5 s
Wall time: 52.1 s


In [4]:
%%time
persist_run_file(topics('cw09'), 'cw09_first_passage', 'duo_t5_top_50_3b', duo_t5_top_50_3b)

duoT5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 198/198 [02:37<00:00,  1.26queries/s]
duoT5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 198/198 [02:37<00:00,  1.26queries/s]


CPU times: user 5min 23s, sys: 3.37 s, total: 5min 27s
Wall time: 5min 24s


In [None]:
%%time
persist_run_file(topics('cw09'), 'cw09_first_passage', 'duo_t5_base', duo_t5_base)

  df.drop(df.columns.difference(['qid','query']), 1, inplace=True)
duoT5:  37%|███████████████████████████████████▏                                                          | 74/198 [53:02<1:22:57, 40.14s/queries]Token indices sequence length is longer than the specified maximum sequence length for this model (842 > 512). Running this sequence through the model will result in indexing errors
duoT5:  78%|███████████████████████████████████████████████████████████████████▋                   | 154/198 [11:15:43<14:20:29, 1173.39s/queries]

In [29]:
%%time
persist_run_file(topics('cw09'), 'cw09_first_passage', 'mono_t5_3b', mono_t5_3b)

  df.drop(df.columns.difference(['qid','query']), 1, inplace=True)
monoT5:   0%|                                                                                                       | 0/2206 [00:00<?, ?batches/s]Token indices sequence length is longer than the specified maximum sequence length for this model (636 > 512). Running this sequence through the model will result in indexing errors
monoT5: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 2206/2206 [30:05<00:00,  1.22batches/s]


CPU times: user 30min 5s, sys: 5.68 s, total: 30min 11s
Wall time: 30min 11s


In [39]:
%%time
persist_run_file(topics('cw09'), 'cw09_first_passage', 'mono_t5_base', mono_t5_base)

monoT5:   0%|                                                                                                       | 0/2206 [00:00<?, ?batches/s]Token indices sequence length is longer than the specified maximum sequence length for this model (636 > 512). Running this sequence through the model will result in indexing errors
monoT5: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 2206/2206 [04:34<00:00,  8.03batches/s]


CPU times: user 4min 38s, sys: 2.78 s, total: 4min 41s
Wall time: 4min 40s


In [40]:
# Compare the inverse-relevancy run with the two persisted MonoT5 runs on CW09.
# Names and systems are kept side by side so they cannot get out of sync.
runs_by_name = {
    'InversedRelevancy': inverse_relevancy,
    'MonoT53b': load_run('cw09_first_passage', 'mono_t5_3b'),
    'MonoT5base': load_run('cw09_first_passage', 'mono_t5_base'),
}

pt.Experiment(
    list(runs_by_name.values()),
    topics('cw09'),
    qrels('cw09'),
    ['ndcg_cut_10', 'ndcg_cut_20', 'recip_rank'],
    list(runs_by_name),
)

Unnamed: 0,name,ndcg_cut_10,ndcg_cut_20,recip_rank
0,InversedRelevancy,0.0,0.0,0.004423
1,MonoT53b,0.33595,0.367517,0.569015
2,MonoT5base,0.321444,0.344194,0.569906


# Experiments on CW12

In [44]:
# Start from the run that contains all judged CW12 documents.
inverse_relevancy = load_run('cw12_first_passage', 'all-judged-inverse')

# Both MonoT5 variants re-rank that run after the passage text has been
# fetched from the CW12 index.
mono_t5_3b = (
    inverse_relevancy
    >> pt.text.get_text(CW12_INDEX, "text")
    >> mono_t5('3b')
)
mono_t5_base = (
    inverse_relevancy
    >> pt.text.get_text(CW12_INDEX, "text")
    >> mono_t5('base')
)

In [45]:
%%time
persist_run_file(topics('cw12'), 'cw12_first_passage', 'mono_t5_3b', mono_t5_3b)

JavaException: JVM exception occurred: Index -1 out of bounds for length 27966 java.lang.ArrayIndexOutOfBoundsException

In [46]:
%%time
persist_run_file(topics('cw12'), 'cw12_first_passage', 'mono_t5_base', mono_t5_base)

JavaException: JVM exception occurred: Index -1 out of bounds for length 27966 java.lang.ArrayIndexOutOfBoundsException

# Run Files with only judged documents

In [38]:
# Write, per crawl, a run file that contains every judged document, ranked by
# ascending relevance label within each query (lowest label gets rank 1).
#
# The working frame is deliberately NOT called `qrels`: that would rebind the
# global `qrels(crawl)` helper to a DataFrame and make every later call such as
# `qrels('cw09')` fail with "'DataFrame' object is not callable".
for crawl in ['cw09', 'cw12']:
    judged = pd.concat([QRELS[crawl][topic_range].qrels_data.copy() for topic_range in QRELS[crawl]])

    # Dense ranking: documents with the same label share a rank.
    judged['rank'] = judged.groupby('query')['rel'].rank("dense", ascending=True).astype(int)
    # Scores only need to decrease as the rank increases.
    judged['score'] = 1000 - judged['rank']
    judged['Q0'] = 'Q0'
    judged['system'] = 'all-judged-inverse'
    judged = judged[['query', 'Q0', 'docid', 'rank', 'score', 'system']]

    judged = judged.sort_values(['query', 'rank'], ascending=[True, True])

    judged.to_csv(DIR + 'run-files/' + crawl + '_first_passage/all-judged-inverse-run.txt', sep=' ', index=False, header=False)
    

# Max-Passages

In [3]:
import json

# Flatten the passage dump into one record per passage. The passage id is
# appended to the document id with a '___' separator.
passage_records = []
with open('/mnt/ceph/storage/data-in-progress/data-research/web-search/SIGIR-22/sigir22-pairwise-ranking/indexes/raw-cw09-and-cw12-all-passages.jsonl', 'r') as f:
    for line in f:
        entry = json.loads(line)

        for passage in entry['passages']:
            # Skip entries that are not an (id, text) pair.
            if len(passage) != 2:
                continue
            passage_id, passage_text = passage[0], passage[1]
            passage_records.append({
                'qid': str(entry['qid']),
                'docno': entry['docno'] + '___' + str(passage_id),
                'text': passage_text,
            })

df = pd.DataFrame(passage_records)

In [4]:
df

Unnamed: 0,qid,docno,text
0,1,clueweb09-en0003-55-31884___1,"Obama — Blogs, Pictures, and more on WordPress..."
1,1,clueweb09-en0003-55-31884___2,"Ronald W. Burris, the former Illinois Attorney..."
2,1,clueweb09-en0003-55-31884___3,"Barnicle pointed out … more → Tags: Opinion, M..."
3,1,clueweb09-en0003-55-31884___4,jamesbray wrote 1 hour ago: I will shortly be ...
4,1,clueweb09-en0003-55-31884___5,Read what it’s all about from Kanye’s blog: Ja...
...,...,...,...
1323101,300,clueweb12-1911wb-40-07107___2,"Well, someone has found it – quite a few peopl..."
1323102,300,clueweb12-1911wb-40-07107___3,"If you meditate wrong, you're just spinning yo..."
1323103,300,clueweb12-1911wb-40-07107___4,Relax them until they're gone. Relax them away...
1323104,300,clueweb12-1911wb-40-07107___5,Go to last comment Garrett Mickley Level 1 Com...


In [5]:
from pyterrier.transformer import get_transformer
all_passages = get_transformer(df) >> mono_t5('3b')

In [6]:
all_passages_cw12 = all_passages(topics('cw12'))

  df.drop(df.columns.difference(['qid','query']), 1, inplace=True)
monoT5:   0%|                                                                                            | 1/11602 [00:00<3:07:40,  1.03batches/s]Token indices sequence length is longer than the specified maximum sequence length for this model (598 > 512). Running this sequence through the model will result in indexing errors
monoT5: 100%|████████████████████████████████████████████████████████████████████████████████████████| 11602/11602 [2:11:12<00:00,  1.47batches/s]


In [13]:
all_passages_cw12.to_json(DIR + '/run-files/cw12_all_passages/raw-monot5-3b-scores.jsonl', lines=True, orient='records')

In [14]:
!ls -lh {DIR}/run-files/cw12_all_passages/

total 494M
-rw-rw-r-- 1 root root 494M Jan 12 14:26 raw-monot5-3b-scores.jsonl


In [5]:
cw12_max_passages = max_passage_transformer(DIR + '/run-files/cw12_all_passages/raw-monot5-3b-scores.jsonl')
#duot5_50_cw12_maxp = cw12_max_passages % 50 >> duo_t5('3b', PairwiseFullSampler(), 'cw12_max_passage', AdditiveAggregator(), 8)

In [6]:
%%time
persist_run_file(topics('cw12'), 'cw12_all_passages', 'monot5_3b_maxp', cw12_max_passages)

CPU times: user 79.8 ms, sys: 213 ms, total: 292 ms
Wall time: 787 ms


In [5]:
%%time
persist_run_file(topics('cw12'), 'cw12_all_passages', 'duot5_50_cw12_maxp', duot5_50_cw12_maxp)

duoT5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:37<00:00,  1.57s/queries]
duoT5: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:37<00:00,  1.58s/queries]


CPU times: user 5min 21s, sys: 5.07 s, total: 5min 27s
Wall time: 5min 23s


In [5]:
# CW09 max-passage run built from the raw MonoT5-3b passage scores.
cw09_max_passages = max_passage_transformer(
    DIR + '/run-files/cw09_all_passages/raw-monot5-3b-scores.jsonl'
)

# Cut at 30 per query and re-rank with DuoT5-3b (batch size 1).
duot5_30_cw09_maxp = (
    cw09_max_passages % 30
    >> duo_t5('3b', PairwiseFullSampler(), 'cw09_max_passage', AdditiveAggregator(), 1)
)

In [6]:
%%time
persist_run_file(topics('cw09'), 'cw09_all_passages', 'duot5_30_cw09_maxp', duot5_30_cw09_maxp)

  df.drop(df.columns.difference(['qid','query']), 1, inplace=True)
duoT5:   0%|                                                                                                         | 0/198 [00:00<?, ?queries/s]Token indices sequence length is longer than the specified maximum sequence length for this model (624 > 512). Running this sequence through the model will result in indexing errors
duoT5:  65%|█████████████████████████████████████████████████████████████▉                                 | 129/198 [56:15<30:05, 26.17s/queries]


RuntimeError: CUDA out of memory. Tried to allocate 10.01 GiB (GPU 0; 39.59 GiB total capacity; 31.13 GiB already allocated; 1.22 GiB free; 36.94 GiB reserved in total by PyTorch)