In [1]:
# import os
# import json
# import pickle
import random
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import spacy
# from spacy.lang.en.stop_words import STOP_WORDS
# from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from utils import Data
data_processor = Data(root_path="../")

from collections import namedtuple
from tqdm import tqdm

print("Data files: ")
for file in data_processor.data_dict.keys():
    print(file)

Data files: 
collection.sampled.tsv
train_sample_queries.tsv
train_sample_passv2_qrels.tsv
val_2021_53_queries.tsv
val_2021_passage_top100.txt
val_2021.qrels.pass.final.txt
test_2022_76_queries.tsv
test_2022_passage_top100.txt
test_2022.qrels.pass.withDupes.txt


In [2]:
data_processor.read_in_memory()

正在处理文件collection.sampled.tsv 读取文件的格式为('pid', 'passage')
正在处理文件train_sample_queries.tsv 读取文件的格式为('qid', 'query')
正在处理文件train_sample_passv2_qrels.tsv 读取文件的格式为('qid', 'mark', 'pid', 'rating')
正在处理文件val_2021_53_queries.tsv 读取文件的格式为('qid', 'query')
正在处理文件val_2021_passage_top100.txt 读取文件的格式为('qid', 'mark', 'pid', 'rank', 'score', 'sys_id')
正在处理文件val_2021.qrels.pass.final.txt 读取文件的格式为('qid', 'mark', 'pid', 'rating')
正在处理文件test_2022_76_queries.tsv 读取文件的格式为('qid', 'query')
正在处理文件test_2022_passage_top100.txt 读取文件的格式为('qid', 'mark', 'pid', 'rank', 'score', 'sys_id')
正在处理文件test_2022.qrels.pass.withDupes.txt 读取文件的格式为('qid', 'mark', 'pid', 'rating')


In [3]:
# read data from memory by calling data_processor.dataset with following keys
for f in data_processor.dataset.keys():
    print(f)

collection.sampled
train_sample_queries
train_sample_passv2_qrels
val_2021_53_queries
val_2021_passage_top100
val_2021.qrels.pass.final
test_2022_76_queries
test_2022_passage_top100
test_2022.qrels.pass.withDupes


In [4]:
set_passage_id = set(data_processor.dataset['collection.sampled'].keys())
print("Total number of passages: ", len(set_passage_id))

Total number of passages:  126799


# Tokenization

In [9]:
import torch

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
path = "../model/Reranker"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSequenceClassification.from_pretrained(path)

# Loader

In [11]:
from torch.utils.data import DataLoader, TensorDataset

In [12]:
# 超参数
negative_number = 4
answer_number = negative_number + 1
max_length = 512
pre_batch_size = 64
batch_size = 4

In [14]:
data_processor.dataset['train_sample_passv2_qrels']

{'1185869': {'msmarco_passage_08_840101254': 1},
 '645590': {'msmarco_passage_61_310962365': 1},
 '827277': {'msmarco_passage_36_735439741': 1},
 '14562': {'msmarco_passage_00_867054103': 1},
 '708236': {'msmarco_passage_03_576438477': 1},
 '306992': {'msmarco_passage_03_757919813': 1},
 '441269': {'msmarco_passage_03_158685688': 1},
 '772957': {'msmarco_passage_09_285432897': 1},
 '535936': {'msmarco_passage_11_182401921': 1},
 '1137044': {'msmarco_passage_66_620248041': 1,
  'msmarco_passage_53_618022360': 1},
 '510071': {'msmarco_passage_21_89762755': 1},
 '251661': {'msmarco_passage_59_49662548': 1},
 '559856': {'msmarco_passage_60_487300127': 1},
 '940456': {'msmarco_passage_20_523768356': 1},
 '325893': {'msmarco_passage_04_710049609': 1},
 '879099': {'msmarco_passage_00_849238043': 1},
 '1028746': {'msmarco_passage_07_664291139': 1},
 '66325': {'msmarco_passage_03_779837601': 1},
 '252138': {'msmarco_passage_01_705906629': 1},
 '716682': {'msmarco_passage_62_321356819': 1},
 '10

QA_pairs is a list of tuples with the first element being the query string and the second element being the list of answer strings, whose length is answer_number. The first answer is the correct answer/passage, and the rest are wrong answers/passages.

In [15]:
QA_pairs = []

for i, (qid, q_dict) in enumerate(data_processor.dataset['train_sample_passv2_qrels'].items()):
    if i==1000:
        break
    pid = list(q_dict.keys())[0] # positive passage id
    query = data_processor.dataset['train_sample_queries'][qid]['query'] # query string
    passage = data_processor.dataset['collection.sampled'][pid]['passage'] # passage string
    sentences = [passage] + [data_processor.dataset['collection.sampled'][i]['passage'] for i in random.sample(list(set_passage_id - set([pid])), negative_number)]
    pair = (
        query, sentences
    )
    QA_pairs.append(pair)

In [16]:
batch_input_ids = torch.empty((0, answer_number, max_length), dtype=torch.long)
batch_attention_mask = torch.empty((0, answer_number, max_length), dtype=torch.long)
batch_token_type_ids = torch.empty((0, answer_number, max_length), dtype=torch.long)

In [17]:
head = 0
while head + pre_batch_size < len(QA_pairs):
    batch = QA_pairs[head:head+pre_batch_size]
    queries = []
    answers = []
    for query, passages in batch:
        queries += [query]*answer_number
        answers += passages

    inputs = tokenizer(queries, answers, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")

    batch_input_ids = torch.cat((batch_input_ids, inputs["input_ids"].reshape(pre_batch_size, answer_number, max_length)), dim=0)
    batch_attention_mask = torch.cat((batch_attention_mask, inputs["attention_mask"].reshape(pre_batch_size, answer_number, max_length)), dim=0)
    batch_token_type_ids = torch.cat((batch_token_type_ids, inputs["token_type_ids"].reshape(pre_batch_size, answer_number, max_length)), dim=0)
    head += pre_batch_size

In [18]:
dataset = TensorDataset(batch_input_ids, batch_attention_mask, batch_token_type_ids)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [19]:
def LCELoss(outputs):
    """
    The last two dimensions of outputs are the number of passages for each query and the specific encoding length, respectively.
    Each query has a positive example and several negative examples, which means the first passage is the positive example and the rest are negative examples.
    Loss = -score_positive + log(sum(exp(score_negative)))
    """
    outputs = outputs.reshape(-1, answer_number)
    positive_score = outputs[:, 0]
    negative_score = outputs[:, 1:]
    loss = -positive_score + torch.logsumexp(negative_score, dim=1)
    return loss

In [20]:
import time
from torch.optim import Adam
optimizer = Adam(model.parameters(), lr=1e-5)

In [21]:
for input_ids, attention_mask, token_type_ids in train_loader:
    # print(input_ids.shape, attention_mask.shape, token_type_ids.shape)
    input_ids = input_ids.reshape(-1, max_length)
    attention_mask = attention_mask.reshape(-1, max_length)
    token_type_ids = token_type_ids.reshape(-1, max_length)
    # 计算输出
    start = time.time()
    outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids).logits # slow
    end = time.time()
    print(end - start)
    print(outputs.shape)
    # 计算损失
    loss = LCELoss(outputs)
    print(loss.shape)
    print(loss)
    # 反向传播
    start = time.time()
    loss.mean().backward() # mean or sum? also slow!
    end = time.time()
    print(end - start)
    # 更新参数
    start = time.time()
    optimizer.step()
    end = time.time()
    print(end - start)

    break

7.81747841835022
torch.Size([20, 1])
torch.Size([4])
tensor([ -6.0932,  -7.5623, -15.6571, -17.9215], grad_fn=<AddBackward0>)
9.43141508102417
0.50111985206604


# Test

- test_2022_76_queries
- test_2022_passage_top100
- test_2022.qrels.pass.withDupes

In [22]:
from utils import NDCG

In [55]:
example_query_id = random.choice(list(data_processor.dataset['test_2022_76_queries'].keys()))
example_query_id

'2012536'

In [56]:
example_query = data_processor.dataset['test_2022_76_queries'][example_query_id]['query']
example_query

'how to cook pork tenderloin steaks in oven'

In [57]:
top_100_pid = [
    data_processor.dataset['test_2022_passage_top100'][example_query_id][i][0]
    for i in range(1, 101)
]
# top_100_pid

In [58]:
top_100_passages = [
    data_processor.dataset['collection.sampled'][pid]['passage']
    for pid in top_100_pid
]
top_100_passages[:5]

["'More Articles. How to Cook Sirloin Filets in a Pan and Finish in the Oven. How to Cook Steak on a Baking Sheet. How to Cook 5 Lbs. of Beef Tenderloin. Easy Ways to Cook Pork Steak.'",
 "How to Cook Pork Tenderloin in the Oven. While we also love grilling pork tenderloin (or even cooking it all day in the slow cooker), today's technique is dedicated to how to roast pork tenderloin in the oven, because it's the most accessible cooking method year-round. Once you have marinated your pork tenderloin, it's time to get cooking!",
 "'More Articles. How to Cook Cubed Steak With Gravy and Onions. How to Make a Juicy Pork Tenderloin. How to Cook Tender Rolled Flank Steaks in the Oven. How to Cook a Bottom Round Roast & Vegetables. How to Cook Beef Topside in a Slow Cooker. Jupiterimages/Comstock/Getty Images.'",
 "'More Articles. How to Cook Cubed Steak With Gravy and Onions. How to Make a Juicy Pork Tenderloin. How to Cook Tender Rolled Flank Steaks in the Oven. How to Cook a Bottom Round Ro

In [62]:
standard_rates = [
    data_processor.dataset['test_2022.qrels.pass.withDupes'][example_query_id][pid]
    if data_processor.dataset['test_2022.qrels.pass.withDupes'][example_query_id].get(pid) is not None else 0
    for pid in top_100_pid
]
len(standard_rates)

100

In [63]:
res = NDCG()
res.update(standard_rates)
res.NDCG_at_k()

0.5317188772951082

In [64]:
inputs = tokenizer([example_query]*100, top_100_passages, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"], token_type_ids=inputs["token_type_ids"]).logits
outputs.shape

torch.Size([100, 1])

In [65]:
# outputs前100的索引
scores, indices = outputs.flatten().topk(
    k=100,
    dim=-1,
    largest=True,
    sorted=True
)

In [66]:
reranked_rates = [int(standard_rates[i]) for i in indices]
res.update(reranked_rates)
res.NDCG_at_k()

0.6675305124058125