In [1]:
import os
import random
import json
import wget
import requests
import tarfile

import argparse

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

import transformers
from peft import LoraConfig
from transformers import (AutoTokenizer, 
                          AutoModelForCausalLM,
                          BitsAndBytesConfig)
import utils
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
parser = argparse.ArgumentParser(description='Rerank')

parser.add_argument('--model_name', type=str, default='facebook/opt-125m')
parser.add_argument('--collection', type=str, default='msmarco-passage')
parser.add_argument('--collection_dir', type=str, default='./collections/msmarco-passage')
parser.add_argument('--seed',type=int, default=2023)
parser.add_argument('--batch_size', type=int, default=256)
parser.add_argument('--max_len', type=int, default=40)
parser.add_argument('--lr', type=float, default=1e-5)
parser.add_argument('--max_epochs', type=int, default=10)
parser.add_argument('--use_cuda', type=bool, default=True)
parser.add_argument('--k', type=int, default=100, help='top k')
parser.add_argument('--k1', type=float, default=1.5, help='BM25 parameter')
parser.add_argument('--b', type=float, default=0.75, help='BM25 parameter')

parser.add_argument

config = parser.parse_args([])

In [3]:
collection_path = os.path.join(config.collection_dir, 'collection.tsv')
queries_tr_path = os.path.join(config.collection_dir, 'queries.train.tsv')
qrels_tr_path = os.path.join(config.collection_dir, 'qrels.train.tsv')
qrels_dev_path = os.path.join(config.collection_dir, 'qrels.dev.tsv')
queries_dev_path = os.path.join(config.collection_dir, 'queries.dev.tsv')
queries_eval_path = os.path.join(config.collection_dir, 'queries.eval.tsv')
top1000_tr_path = os.path.join(config.collection_dir, 'top1000.train.txt')
top1000_dev_path = os.path.join(config.collection_dir, 'top1000.dev')

In [4]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

set_seed(config.seed)

hf_token = open('./hf_token.txt', 'r', encoding='utf-8').read()
os.system(f'huggingface-cli login --token {hf_token}')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/work/.cache/huggingface/token
Login successful


0

In [5]:
from torch.utils.data import DataLoader, Dataset, TensorDataset, IterableDataset

# Encode Dataset For Reranking
class MarcoEncodeDataset(Dataset):
    def __init__(self, collection_dir, tokenizer, mode='train', q_max_len=128, p_max_len=128):
        self.collection_dir = collection_dir
        self.tokenizer = tokenizer
        self.mode = mode
        self.q_max_len = q_max_len
        self.p_max_len = p_max_len
        # load data
        passages_path = os.path.join(collection_dir, 'collection.tsv')
        queries_path = os.path.join(collection_dir, f'queries.{mode}.tsv')
        qrels_path = os.path.join(collection_dir, f'qrels.{mode}.tsv')
        
        self.passages = pd.read_csv(passages_path, sep='\t', header=None, names=['pid', 'passage'], index_col='pid')
        self.queries = pd.read_csv(queries_path, sep='\t', header=None, names=['qid', 'query'], index_col='qid')
        self.relations = pd.read_csv(qrels_path, sep='\t', header=None, names=['qid', '0', 'pid', 'label'])
        if self.mode == 'train':
            top1000_path = os.path.join(collection_dir, f'top1000.{mode}.txt')
            top1000_dict = utils.read_top1000(top1000_path)
            self.top1000 = pd.DataFrame(list(top1000_dict.items()), columns=['qid', 'pid'])
        else:
            top1000_path = os.path.join(collection_dir, f'top1000.{mode}')
            self.top1000 = pd.read_csv(top1000_path, sep='\t', header=None, names=['qid', 'pid', 'query', 'passage'])
    
    def __len__(self):
        return len(self.top1000)
        
    def __getitem__(self, idx):
        x = self.top1000.iloc[idx]
        query = self.queries.loc[x.qid].query
        passage = self.passages.loc[x.pid].passage 
        label = 0 if self.relations.loc[(self.relations['qid'] == x.qid) & (self.relations['pid'] == x.pid)].empty else 1
        
        encode_query = self.tokenizer.encode_plus(
            query,
            max_length=self.q_max_len,
            truncation='only_first',
            # return_token_type_ids=True,
            return_attention_mask=True,
            padding='max_length',
            return_tensors='pt'
        )

        encoded_psg = self.tokenizer.encode_plus(
            passage,
            max_length=self.p_max_len,
            truncation='only_first',
            # return_token_type_ids=True,
            return_attention_mask=True,
            padding='max_length',
            return_tensors='pt'
        )
        
        encoded = {
            'qid': x.qid,
            'pid': x.pid,
            'q_input_ids': encode_query['input_ids'], # query
            'p_input_ids': encoded_psg['input_ids'], # passage
            'q_attn_msk': encode_query['attention_mask'],
            'p_attn_msk': encoded_psg['attention_mask'], 
            'label': torch.LongTensor([label]),
        }
        
        return encoded

In [6]:
LLAMA_MODEL_LIST = [
    'meta-llama/Llama-2-7b',
    'meta-llama/Llama-2-7b-hf',
    'meta-llama/Llama-2-7b-chat',
    'meta-llama/Llama-2-7b-chat-hf',
    'meta-llama/Llama-2-13b',
    'meta-llama/Llama-2-13b-hf'
    'meta-llama/Llama-2-13b-chat',
    'meta-llama/Llama-2-13b-chat-hf',
    'meta-llama/Llama-2-70b',
    'meta-llama/Llama-2-70b-hf',
    'meta-llama/Llama-2-70b-chat',
    'meta-llama/Llama-2-70b-chat-hf',
]

# huggingface-cli login --token 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

class LlaMAReranker:
    def __init__(self, model_name, use_cuda, batch_size, n_gpu, device, mode='train'):
        self.use_cuda = use_cuda
        self.model_name = model_name
        self.mode = mode
        self.batch_size = batch_size
        self.n_gpu = n_gpu
        self.device = device
        # self.device = torch.device('cuda' if torch.cuda.is_available() & use_cuda else 'cpu')

        self.model = self.load_model(self.model_name, self.use_cuda)
        self.tokenizer = self.load_tokenizer(config.model_name)
        self.encode_dataset = MarcoEncodeDataset(config.collection_dir, self.tokenizer, mode=self.mode)
        self.encode_dataloader = DataLoader(self.encode_dataset, batch_size=self.batch_size, num_workers=4*self.n_gpu)
    
    def load_model(self, model_name:str, use_cuda:bool):
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32).to(self.device)
        # model.config.pad_token_id = model.config.bos_token_id
        model.config.use_cache=True
        model.resize_token_embeddings(len(self.tokenizer))

        return model
    
    def load_tokenizer(self, model_name:str):
        tokenizer =  AutoTokenizer.from_pretrained(model_name)
        tokenizer.pad_token = tokenizer.bos_token
        return tokenizer
    
    def rerank(self, encoded):
        q_input_ids = encoded['q_input_ids'].squeeze(dim=1)
        p_input_ids = encoded['p_input_ids'].squeeze(dim=1)
        q_attn_msk = encoded['q_attn_msk'].squeeze(dim=1)
        p_attn_msk = encoded['p_attn_msk'].squeeze(dim=1)
        input_ids = torch.cat([q_input_ids, p_input_ids], dim=1).to(self.device)
        attn_msk = torch.cat([q_attn_msk, p_attn_msk], dim=1).to(self.device)
        inputs = {'input_ids': input_ids, 'attention_mask': attn_msk}
        # q_input_ids = q_input_ids.to(device)
        # p_input_ids = p_input_ids.to(device)
        # p_len = len(p_input_ids)
        with torch.no_grad():
            logits = self.model(**inputs).logits
        
        return logits
            
    # 그냥 일반 rerank (이때 assert q_max_len == p_max_len)
    def score(self, encoded):
        input_ids = torch.cat([encoded['q_input_ids'], encoded['p_input_ids']], dim=1).to(self.device)
        attn_msk = torch.cat([encoded['q_attn_msk'], encoded['p_attn_msk']], dim=1).to(self.device)
        p_len = len(encoded['p_input_ids'])
        inputs = {'input_ids': input_ids, 'attention_mask': attn_msk}
        with torch.no_grad():
            logits = self.model(**inputs).logits
        
        score = logits.sum().item()
        return score

# torch.cuda.empty_cache()

In [7]:
torch.cuda.empty_cache()

In [8]:
N_GPU = os.cpu_count()
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() & config.use_cuda else 'cpu')
llama_reranker = LlaMAReranker(model_name=LLAMA_MODEL_LIST[1], use_cuda=config.use_cuda, batch_size=1, n_gpu=4*N_GPU, device=DEVICE, mode='dev')
model = llama_reranker.model
tokenizer = llama_reranker.tokenizer
# dev_dataset = llama_reranker.encode_dataset 
# dev_dataloader = llama_reranker.encode_dataloader

In [None]:
# marco_encoded_train = MarcoEncodeDataset(config.collection_dir, tokenizer)
marco_encoded_dev = MarcoEncodeDataset(config.collection_dir, tokenizer, mode='dev')

In [None]:
# train_dataloader = DataLoader(marco_encoded_train, batch_size=config.batch_size, num_workers=4*n_gpu)
dev_dataloader = DataLoader(marco_encoded_dev, batch_size=config.batch_size, num_workers=4*N_GPU)



In [None]:
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# a = next(iter(train_dataloader))
a = next(iter(dev_dataloader))

In [None]:
a['p_input_ids'].squeeze(dim=1).shape

torch.Size([256, 128])

Embedding(50265, 4096)

In [None]:
model(a['p_input_ids'].squeeze(dim=1).to(DEVICE))

: 

: 

In [None]:
input_ids = torch.cat([a['q_input_ids'].squeeze(dim=1), a['p_input_ids'].squeeze(dim=1)], dim=-1).to(DEVICE)
attn_msk = torch.cat([a['q_attn_msk'].squeeze(dim=1), a['p_attn_msk'].squeeze(dim=1)], dim=1).to(DEVICE)
inputs = {'input_ids': input_ids, 'attention_mask': attn_msk}

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# encode_query = tokenizer.encode_plus(
#             'what is query',
#             max_length=128,
#             truncation='only_first',
#             # return_token_type_ids=True,
#             return_attention_mask=True,
#             padding='max_length',
#             return_tensors='pt'
#         )
# encode_query


In [None]:
for idx, encodeds in enumerate(train_dataloader):
    logits, y = llama_reranker.rerank(encodeds)
    loss_fn = 
    loss = loss_fn(logits, y)
    model.zero_grad()
    loss.backward()

    if

In [None]:
# https://huggingface.co/models?search=gpt+neo
GPT_PRETRAINED_MODEL_LIST = [
    'EleutherAI/gpt-neo-125m',
    'EleutherAI/gpt-neo-2.7B',
    'EleutherAI/gpt-neo-1.3B'
]

class GPTReranker:
    def __init__(self):
        self.model = self.load_model(config.model_name, config.use_cuda)
        self.tokenizer = self.load_tokenizer(config.model_name)
        # tokenizer.pad_token = tokenizer.eos_token
        self.model.eval()
    
    def load_model(self, model_name:str, use_cuda:bool):
        device = torch.device('cuda' if torch.cuda.is_available() & use_cuda else 'cpu')
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32).to(device)
        model.config.use_cache=True
        return model
    
    def load_tokenizer(self, model_name:str):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return tokenizer
    
    def _get_prompt(self, query)
    
    def rerank(self, query, texts):
        prompt =  f"Please generate a query based on the following passage: {texts}"

# model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m')
# tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')