In [1]:
import os
import random
import json
import wget
import requests
import tarfile
import argparse

import numpy as np
import pandas as pd

import torch
import transformers
from peft import LoraConfig
from transformers import (AutoTokenizer, 
                          AutoModelForCausalLM,
                          BitsAndBytesConfig)
import utils
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
def _str2bool(value):
    """Convert a command-line token such as 'true' / 'False' / '0' to a real bool.

    ``argparse`` with ``type=bool`` calls ``bool(token)``, which is True for every
    non-empty string (including ``'False'``), so flags could never be switched off.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


# Experiment configuration. Parsed from an empty argv so the notebook always starts
# from the defaults below; edit the defaults (or the list passed to parse_args) to tune.
parser = argparse.ArgumentParser(description='Rerank')

# Model and data location
parser.add_argument('--model_name', type=str, default='facebook/opt-125m')
parser.add_argument('--collection', type=str, default='msmarco-passage')
parser.add_argument('--collection_dir', type=str, default='./collections/msmarco-passage')

# Reproducibility and optimisation settings
parser.add_argument('--seed', type=int, default=2023)
parser.add_argument('--batch_size', type=int, default=256)
parser.add_argument('--max_len', type=int, default=40)
parser.add_argument('--lr', type=float, default=1e-5)
parser.add_argument('--max_epochs', type=int, default=10)
parser.add_argument('--use_cuda', type=_str2bool, default=True)

# Retrieval settings
parser.add_argument('--k', type=int, default=100, help='top k')
parser.add_argument('--k1', type=float, default=1.5, help='BM25 parameter')
parser.add_argument('--b', type=float, default=0.75, help='BM25 parameter')

config = parser.parse_args([])

In [None]:
# Resolve every MS MARCO file used by this notebook relative to the configured
# collection directory. Targets and file names are paired positionally.
(
    collection_path,
    queries_tr_path,
    qrels_tr_path,
    qrels_dev_path,
    queries_dev_path,
    queries_eval_path,
    top1000_tr_path,
    top1000_dev_path,
) = (
    os.path.join(config.collection_dir, file_name)
    for file_name in (
        'collection.tsv',
        'queries.train.tsv',
        'qrels.train.tsv',
        'qrels.dev.tsv',
        'queries.dev.tsv',
        'queries.eval.tsv',
        'top1000.train.txt',
        'top1000.dev',
    )
)

In [None]:
def set_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU + current CUDA device) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` so the value is visible to child processes.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Apply the configured seed (see the --seed argument) to all RNGs handled by set_seed.
set_seed(config.seed)

In [None]:
from torch.utils.data import DataLoader, Dataset, TensorDataset, IterableDataset

# Encode Dataset For Reranking
class MarcoEncodeDataset(Dataset):
    """Map-style dataset that tokenizes MS MARCO (query, passage) candidate pairs.

    Every item corresponds to one row of the ``top1000`` candidate table. Query and
    passage are tokenized separately, each padded/truncated to its own maximum
    length, and the pair is labelled 1 when it is listed in the qrels file, else 0.

    Args:
        collection_dir: Directory containing ``collection.tsv``,
            ``queries.{mode}.tsv``, ``qrels.{mode}.tsv`` and the candidate file.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        mode: Split name used to build the file names. ``'train'`` reads
            ``top1000.train.txt`` through ``utils.read_top1000``; any other value
            reads ``top1000.{mode}`` as a 4-column TSV.
        q_max_len: Padded/truncated token length of the query.
        p_max_len: Padded/truncated token length of the passage.
    """

    def __init__(self, collection_dir, tokenizer, mode='train', q_max_len=128, p_max_len=128):
        self.collection_dir = collection_dir
        self.tokenizer = tokenizer
        self.mode = mode
        self.q_max_len = q_max_len
        self.p_max_len = p_max_len

        # Load the raw tables.
        passages_path = os.path.join(collection_dir, 'collection.tsv')
        queries_path = os.path.join(collection_dir, f'queries.{mode}.tsv')
        qrels_path = os.path.join(collection_dir, f'qrels.{mode}.tsv')

        self.passages = pd.read_csv(passages_path, sep='\t', header=None, names=['pid', 'passage'], index_col='pid')
        self.queries = pd.read_csv(queries_path, sep='\t', header=None, names=['qid', 'query'], index_col='qid')
        self.relations = pd.read_csv(qrels_path, sep='\t', header=None, names=['qid', '0', 'pid', 'label'])

        # Pre-compute the judged (qid, pid) pairs once so that labelling an item is a
        # constant-time set lookup instead of a full scan of the qrels frame per item.
        self._positive_pairs = set(zip(self.relations['qid'], self.relations['pid']))

        if self.mode == 'train':
            top1000_path = os.path.join(collection_dir, f'top1000.{mode}.txt')
            top1000_dict = utils.read_top1000(top1000_path)
            # NOTE(review): this assumes each dict value is a single pid. If
            # utils.read_top1000 maps qid -> list of pids the frame needs to be
            # exploded to one row per pair - confirm against utils.
            self.top1000 = pd.DataFrame(list(top1000_dict.items()), columns=['qid', 'pid'])
        else:
            top1000_path = os.path.join(collection_dir, f'top1000.{mode}')
            self.top1000 = pd.read_csv(top1000_path, sep='\t', header=None, names=['qid', 'pid', 'query', 'passage'])

    def __len__(self):
        """Number of (query, passage) candidate rows."""
        return len(self.top1000)

    def _encode(self, text, max_len):
        """Tokenize ``text`` padded/truncated to ``max_len`` and return PyTorch tensors."""
        return self.tokenizer.encode_plus(
            text,
            max_length=max_len,
            truncation='only_first',
            return_token_type_ids=True,
            return_attention_mask=True,
            padding='max_length',
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return ids, token tensors and the relevance label of candidate row ``idx``.

        The token tensors are passed through exactly as the tokenizer returns them
        (Hugging Face tokenizers give shape ``(1, max_len)`` with
        ``return_tensors='pt'``); ``label`` is a ``LongTensor`` of shape ``(1,)``.
        """
        x = self.top1000.iloc[idx]
        qid, pid = x['qid'], x['pid']

        # Select the text column by label: attribute access (`.query`) collides with
        # pandas method names as soon as the lookup returns a DataFrame.
        query = self.queries.loc[qid, 'query']
        passage = self.passages.loc[pid, 'passage']
        label = 1 if (qid, pid) in self._positive_pairs else 0

        encode_query = self._encode(query, self.q_max_len)
        encoded_psg = self._encode(passage, self.p_max_len)

        encoded = {
            'qid': qid,
            'pid': pid,
            'q_input_ids': encode_query['input_ids'],  # query
            'p_input_ids': encoded_psg['input_ids'],  # passage
            'q_attn_msk': encode_query['attention_mask'],
            'p_attn_msk': encoded_psg['attention_mask'],
            'label': torch.LongTensor([label]),
        }

        return encoded

In [None]:
# Hugging Face Hub ids of the Llama 2 checkpoints this notebook can load.
# Every entry ends with a comma: a missing comma makes Python silently
# concatenate two adjacent string literals into one bogus id.
LLAMA_MODEL_LIST = [
    'meta-llama/Llama-2-7b',
    'meta-llama/Llama-2-7b-hf',
    'meta-llama/Llama-2-7b-chat',
    'meta-llama/Llama-2-7b-chat-hf',
    'meta-llama/Llama-2-13b',
    'meta-llama/Llama-2-13b-hf',
    'meta-llama/Llama-2-13b-chat',
    'meta-llama/Llama-2-13b-chat-hf',
    'meta-llama/Llama-2-70b',
    'meta-llama/Llama-2-70b-hf',
    'meta-llama/Llama-2-70b-chat',
    'meta-llama/Llama-2-70b-chat-hf',
]

# Number of visible CUDA devices (0 on CPU-only machines). It is multiplied by 4 to
# size DataLoader worker pools, so it must count GPUs - os.cpu_count() would spawn
# 4 workers per CPU core and may even be None.
n_gpu = torch.cuda.device_count()

class LlaMAReranker:
    """Causal-LM reranker that loads a model, its tokenizer and the MS MARCO encode dataset.

    Args:
        model_name: Hugging Face Hub id (or local path) of the causal LM to load.
        use_cuda: Place the model on CUDA when it is available.
        batch_size: Batch size of ``self.encode_dataloader``.
        n_gpu: Multiplied by 4 to obtain the DataLoader ``num_workers``.
        mode: Dataset split forwarded to ``MarcoEncodeDataset``.
    """

    def __init__(self, model_name, use_cuda, batch_size=config.batch_size, n_gpu=n_gpu, mode='train'):
        self.mode = mode
        self.batch_size = batch_size
        self.n_gpu = n_gpu

        # Honour the constructor arguments; the global config is only used for the
        # collection directory, which has no dedicated parameter.
        self.model = self.load_model(model_name, use_cuda)
        self.model.eval()
        self.tokenizer = self.load_tokenizer(model_name)
        self.encode_dataset = MarcoEncodeDataset(config.collection_dir, self.tokenizer, mode=self.mode)
        self.encode_dataloader = DataLoader(self.encode_dataset, batch_size=self.batch_size, num_workers=4*self.n_gpu)

    def load_model(self, model_name:str, use_cuda:bool):
        """Load ``model_name`` as a float32 causal LM on CUDA (if requested and available) or CPU."""
        device = torch.device('cuda' if torch.cuda.is_available() and use_cuda else 'cpu')
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32).to(device)
        model.config.use_cache=True

        # for param in model.parameters():
        #     param.requires_grad = False

        return model

    def load_tokenizer(self, model_name:str):
        """Load the tokenizer for ``model_name``, guaranteeing that a pad token exists."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # The dataset pads with padding='max_length', which fails for tokenizers that
        # ship without a pad token; reuse EOS in that case.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        return tokenizer

    # Plain rerank: query tokens followed by passage tokens in a single sequence.
    def rerank(self, encoded):
        """Run the model on concatenated query+passage tokens and return its logits.

        Args:
            encoded: Mapping with ``q_input_ids``, ``p_input_ids``, ``q_attn_msk`` and
                ``p_attn_msk`` tensors, as produced by ``MarcoEncodeDataset`` (either a
                single item or a DataLoader batch).

        Returns:
            The ``logits`` tensor of the model output. Turning logits into a
            relevance score is not implemented yet.
        """
        device = self.model.device

        def _as_batch_by_seq(tensor):
            # Collated dataset tensors carry an extra singleton axis (batch, 1, len);
            # collapse everything after the batch axis so that concatenating on dim=1
            # extends the sequence instead of stacking query and passage.
            return tensor.flatten(start_dim=1)

        # NOTE(review): the query is padded before the passage is appended, so pad
        # tokens sit in the middle of the sequence (masked by attention_mask) - confirm
        # this is acceptable for the scoring scheme.
        input_ids = torch.cat(
            [_as_batch_by_seq(encoded['q_input_ids']), _as_batch_by_seq(encoded['p_input_ids'])], dim=1
        ).to(device)
        attn_msk = torch.cat(
            [_as_batch_by_seq(encoded['q_attn_msk']), _as_batch_by_seq(encoded['p_attn_msk'])], dim=1
        ).to(device)
        inputs = {'input_ids': input_ids, 'attention_mask': attn_msk}
        with torch.no_grad():
            logits = self.model(**inputs).logits
        return logits

    # TODO: def score(self): convert logits into a per-pair relevance score.

In [None]:
# Build the reranker from the second entry of LLAMA_MODEL_LIST and expose its
# model and tokenizer as notebook-level names for the cells below.
llama_reranker = LlaMAReranker(
    model_name=LLAMA_MODEL_LIST[1],
    use_cuda=config.use_cuda,
)
model, tokenizer = llama_reranker.model, llama_reranker.tokenizer

In [None]:
# Training-split encode dataset (mode defaults to 'train').
marco_encoded_train = MarcoEncodeDataset(
    collection_dir=config.collection_dir,
    tokenizer=tokenizer,
)
# Dev split, currently disabled:
# marco_encoded_dev = MarcoEncodeDataset(config.collection_dir, tokenizer, mode='dev')

In [None]:
# Worker heuristic: 4 DataLoader workers per visible CUDA device (0 devices means the
# data is loaded in the main process). os.cpu_count() would request 4 workers per CPU
# core and can be None, which breaks the multiplication.
n_gpu = torch.cuda.device_count()
train_dataloader = DataLoader(marco_encoded_train, batch_size=config.batch_size, num_workers=4*n_gpu)

In [None]:
# Pull a single collated batch from the training loader; the next cell feeds it to the model.
a = next(iter(train_dataloader))

In [None]:
# Manual forward pass on the sample batch `a`.
# - Keys must match what MarcoEncodeDataset emits (q_* / p_*).
# - Collated tensors are (batch, 1, len); flatten everything after the batch axis so
#   that dim=1 concatenation yields one (batch, q_len + p_len) sequence per pair.
# - Inputs are moved to whatever device the model lives on.
device = model.device
input_ids = torch.cat(
    [a['q_input_ids'].flatten(start_dim=1), a['p_input_ids'].flatten(start_dim=1)], dim=1
).to(device)
attn_msk = torch.cat(
    [a['q_attn_msk'].flatten(start_dim=1), a['p_attn_msk'].flatten(start_dim=1)], dim=1
).to(device)
inputs = {'input_ids': input_ids, 'attention_mask': attn_msk}
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
# GPT-Neo checkpoints, see https://huggingface.co/models?search=gpt+neo
# Ids are assembled from the shared 'EleutherAI/gpt-neo-' prefix and the size tag.
GPT_PRETRAINED_MODEL_LIST = [
    f'EleutherAI/gpt-neo-{size_tag}'
    for size_tag in ('125m', '2.7B', '1.3B')
]

class GPTReranker:
    """Skeleton reranker around the causal LM named by ``config.model_name``.

    Only model and tokenizer loading are implemented; prompt construction and
    scoring are still stubs.
    """

    def __init__(self):
        self.model = self.load_model(config.model_name, config.use_cuda)
        self.tokenizer = self.load_tokenizer(config.model_name)
        # NOTE(review): no pad token is configured; enable the line below before
        # tokenizing batches with padding.
        # tokenizer.pad_token = tokenizer.eos_token
        self.model.eval()

    def load_model(self, model_name:str, use_cuda:bool):
        """Load ``model_name`` as a float32 causal LM on CUDA (if requested and available) or CPU."""
        device = torch.device('cuda' if torch.cuda.is_available() and use_cuda else 'cpu')
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32).to(device)
        model.config.use_cache=True
        return model

    def load_tokenizer(self, model_name:str):
        """Load the tokenizer that belongs to ``model_name``."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return tokenizer

    def _get_prompt(self, query):
        """Placeholder for prompt construction.

        The original definition had no colon and no body, which made the whole cell
        a SyntaxError. Raises:
            NotImplementedError: always, until a prompt template is defined.
        """
        raise NotImplementedError('GPTReranker._get_prompt is not implemented yet')

    def rerank(self, query, texts):
        """Build the query-generation prompt for ``texts``.

        TODO: the prompt is not yet sent to the model and nothing is returned.
        """
        prompt =  f"Please generate a query based on the following passage: {texts}"

# model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m')
# tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')