In [1]:
import os

# Root folder that every data file and run file below is resolved against
# (presumably a mounted Google Drive path in Colab — adjust when run elsewhere).
workspace = '/content/drive/MyDrive/Courses/CS410/MP4/'

# Last expression of the cell: displays whether the folder is reachable.
os.path.exists(workspace)

True

In [2]:
 %%capture
 !pip install transformers ir-measures torch 

In [3]:
import ir_measures
from ir_measures import *

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [5]:
import numpy as np
import json

In [19]:
# Input: first-stage candidate ranking read by load_trec and by the evaluation.
rank_file = f'{workspace}MP4.1-bi-encoder-candidate-ranking 100.trec'
# Output: run file written by the reranking cell.
rerank_file = f'{workspace}MP4.2-bi-encoder-candidate-reranking.trec'

In [6]:
def load_trec(filename):
    """Load a TREC-style run file into a nested dictionary.

    Each non-blank line must provide at least five fields in the order
    ``QUERY_ID, Q0, DOCUMENT_ID, RANK, SCORE`` (any further fields, such as a
    run id, are ignored). Lines containing a tab are split on tabs only;
    lines without a tab are split on arbitrary whitespace, which covers the
    space-delimited TREC variant. Blank lines are skipped.

    Args:
        filename: Path of the run file to read.

    Returns:
        dict: ``{qid: {rank: (doc_id, score)}}`` where ``qid``, ``rank`` and
        ``doc_id`` are ints and ``score`` is a float. If the same
        ``(qid, rank)`` pair occurs more than once, the last line wins.

    Raises:
        ValueError: If a query id, document id, rank or score is not numeric.
        IndexError: If a non-blank line has fewer than five fields.
    """
    qid2docid = {}
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Keep strict tab splitting for tab-delimited files; otherwise fall
            # back to whitespace splitting for space-delimited TREC runs.
            fields = line.split('\t') if '\t' in line else line.split()
            qid = int(fields[0])
            doc_id = int(fields[2])
            rank = int(fields[3])
            score = float(fields[4])
            qid2docid.setdefault(qid, {})[rank] = (doc_id, score)
    return qid2docid

In [8]:
# First-stage ranking to be reranked, as {qid: {rank: (doc_id, score)}}.
candidates = load_trec(rank_file)

In [9]:
# The sequence-classification model and its tokenizer are loaded from the same
# pretrained checkpoint identifier.
checkpoint = 'cross-encoder/ms-marco-electra-base'
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/730 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [10]:
# Collect the 'contents' field of every JSON line, in file order. The list
# position is what the reranking cell later uses to look a document up.
documents = []
with open(workspace + 'data/collection.jsonl', 'r') as f:
  documents.extend(json.loads(record)['contents'] for record in f)

In [11]:
# One query per line of queries.txt, with surrounding whitespace removed.
queries = []

with open(workspace + 'data/queries.txt','r') as f:
  queries.extend(map(str.strip, f))

In [12]:
# Rerank the first-stage candidates of each query with the cross-encoder and
# write the result as a TREC run file.
top_n = 10    # candidates with first-stage rank <= top_n are re-scored
Nq = 100      # query ids 1..Nq are processed
model.eval()  # turns off dropout

with open(rerank_file, 'w') as w:
  for qid in range(1, Nq + 1):
    Nrank = top_n
    # NOTE(review): `queries` is a 0-based list while qid starts at 1, so this
    # reads line qid+1 of queries.txt — confirm that this matches the query-id
    # convention used by the candidate file and the qrels.
    q = queries[qid]

    # A query may have fewer than Nrank candidates inside the cut-off, so every
    # later step is sized by the number of documents actually selected.
    doc_ids = [doc_id for rank, (doc_id, _) in candidates[qid].items() if rank <= Nrank]
    if not doc_ids:
      continue  # nothing to rerank for this query
    docs = [documents[doc_id] for doc_id in doc_ids]

    # Pair the query with each candidate document (one row per pair).
    features = tokenizer([q] * len(docs),
                         docs,
                         padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():  # inference only: no gradients are tracked or stored
      logits = model(**features).logits
    # Drop only the trailing label axis so that a single-candidate batch stays
    # one-dimensional (np.squeeze would collapse it to a 0-d array).
    scores = logits.squeeze(-1).cpu().numpy()

    order = np.argsort(-scores)   # descending
    new_doc_ids = np.array(doc_ids)[order]
    new_scores = scores[order]

    # QUERY_ID\t0\tDOCUMENT_ID\tRANK\tSCORE\trun_id, ranks starting at 1
    for new_rank, (doc_id, score) in enumerate(zip(new_doc_ids, new_scores), start=1):
      w.write("{}\t0\t{}\t{}\t{}\tcandidate-run\n".format(qid, doc_id, new_rank, score))

Evaluation of the initial candidate ranking

In [20]:
# Materialise the relevance judgments and the run as lists. The ir_measures
# TREC readers yield records lazily, so keeping the raw iterators would leave
# them exhausted after one evaluation and re-running the metric cell alone
# would then score empty inputs.
qrels = list(ir_measures.read_trec_qrels(workspace + 'data/qrels.trec'))
run = list(ir_measures.read_trec_run(rank_file))

In [21]:
# Measures to report: nDCG at four cut-offs followed by recall at four cut-offs.
ndcg_cutoffs = (3, 5, 10, 20)
recall_cutoffs = (5, 10, 100, 1000)
names = [NDCG@k for k in ndcg_cutoffs] + [R@k for k in recall_cutoffs]

# Aggregate value of each measure over the run, keyed by the measure object.
evals = ir_measures.calc_aggregate(names, qrels, run)

In [22]:
# One line per measure: left-aligned name padded to 10 characters, value to 3 decimals.
for measure in names:
  print(f"{str(measure):10s} = {evals[measure]:.3f}")

nDCG@3     = 0.016
nDCG@5     = 0.015
nDCG@10    = 0.015
nDCG@20    = 0.017
R@5        = 0.007
R@10       = 0.011
R@100      = 0.018
R@1000     = 0.018


Evaluation of the cross-encoder reranking

In [16]:
# Materialise the relevance judgments and the reranked run as lists. The
# ir_measures TREC readers yield records lazily, so keeping the raw iterators
# would leave them exhausted after one evaluation and re-running the metric
# cell alone would then score empty inputs.
qrels = list(ir_measures.read_trec_qrels(workspace + 'data/qrels.trec'))
run = list(ir_measures.read_trec_run(rerank_file))

In [17]:
# Measures to report: nDCG at four cut-offs followed by recall at four cut-offs.
ndcg_cutoffs = (3, 5, 10, 20)
recall_cutoffs = (5, 10, 100, 1000)
names = [NDCG@k for k in ndcg_cutoffs] + [R@k for k in recall_cutoffs]

# Aggregate value of each measure over the run, keyed by the measure object.
evals = ir_measures.calc_aggregate(names, qrels, run)

In [18]:
# One line per measure: left-aligned name padded to 10 characters, value to 3 decimals.
for measure in names:
  print(f"{str(measure):10s} = {evals[measure]:.3f}")

nDCG@3     = 0.013
nDCG@5     = 0.013
nDCG@10    = 0.014
nDCG@20    = 0.012
R@5        = 0.006
R@10       = 0.011
R@100      = 0.011
R@1000     = 0.011
