In [2]:
import os
import random
import json
import requests
import io
from zipfile import ZipFile
import argparse

import numpy as np
import pandas as pd

import torch

import pyserini
from pyserini.search import SimpleSearcher
from pyserini.dsearch import SimpleDenseSearcher

import transformers
# from transformers import set_seed
# set_seed(42)

from peft import LoraConfig
from transformers import (AutoTokenizer, 
                          AutoModelForCausalLM, 
                          BitsAndBytesConfig)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
def _str2bool(value):
    """Convert a command-line token to a real boolean.

    argparse's `type=bool` calls `bool()` on the raw string, so every non-empty
    value (including 'False') becomes True. This parser accepts the usual
    spellings and rejects anything else.

    Raises:
        argparse.ArgumentTypeError: if `value` is not a recognised boolean spelling.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


# Experiment configuration, expressed as an argparse parser so every option has a typed default.
parser = argparse.ArgumentParser(description='Reranking with LLaMA2')

parser.add_argument('--model_name', type=str, default='Llama-2-7b-hf')
parser.add_argument('--dataset', type=str, default='nfcorpus')
parser.add_argument('--data_path', type=str, default='./collections/')
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--batch_size', type=int, default=256)
parser.add_argument('--max_len', type=int, default=40)
parser.add_argument('--lr', type=float, default=1e-5)
parser.add_argument('--max_epochs', type=int, default=10)
parser.add_argument('--use_cuda', type=_str2bool, default=False)
parser.add_argument('--k', type=int, default=10, help='top k')
parser.add_argument('--k1', type=float, default=1.5, help='BM25 parameter')
parser.add_argument('--b', type=float, default=0.75, help='BM25 parameter')

# Parse an explicit empty argument list (not sys.argv), so every option takes its default.
config = parser.parse_args([])

In [10]:
# Remote archive and local filesystem layout used by the cells below.
nfcorpus_url = 'https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/nfcorpus.zip'
index_path = os.path.join('./indexes', 'lucene-index.nfcorpus')

# <data_path>/<dataset> is the extraction target; the three dataset files are addressed
# one directory deeper, under a second <dataset> component.
dataset_path = os.path.join(config.data_path, config.dataset)
tsv_path, queries_jsonl_path, corpus_jsonl_path = (
    os.path.join(dataset_path, config.dataset, filename)
    for filename in ('queries.tsv', 'queries.jsonl', 'corpus.jsonl')
)

# The pyserini-formatted copy of the corpus lives in its own sub-directory.
pyserini_jsonl_path = os.path.join(dataset_path, 'pyserini-corpus', 'corpus.jsonl')

In [5]:
# Download and unpack the dataset archive into dataset_path.
# makedirs(exist_ok=True) also creates missing parent directories, which os.mkdir cannot do.
os.makedirs(dataset_path, exist_ok=True)

# Skip the download when the extracted corpus file is already present, so re-running the cell is cheap.
if not os.path.exists(corpus_jsonl_path):
    response = requests.get(nfcorpus_url, timeout=60)
    # Fail on an HTTP error here instead of handing an error page to ZipFile.
    response.raise_for_status()
    # The context manager closes the archive once extraction has finished.
    with ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(path=dataset_path)

In [8]:
# Convert the queries file (one JSON object per line) into a two-column TSV: <_id>\t<text>.
# The input is opened first so a missing queries file does not leave an empty TSV behind,
# and the encoding is explicit so the result does not depend on the platform default.
_tsv_unsafe = str.maketrans('\t\r\n', '   ')
with open(queries_jsonl_path, 'r', encoding='utf-8') as src, \
        open(tsv_path, 'w', encoding='utf-8') as out:
    for line in src:
        if not line.strip():
            continue  # tolerate blank lines, which json.loads would reject
        record = json.loads(line)
        # A tab or line break inside the text would split the row, so map them to spaces.
        query_text = record['text'].translate(_tsv_unsafe)
        out.write(record['_id'] + '\t' + query_text + '\n')

In [11]:
# https://github.com/castorini/pyserini/blob/e371ed3661e90db6b797290493d973cb6c089c43/docs/conceptual-framework2.md
# Rewrite the corpus as JSON lines with the two keys 'id' and 'contents',
# where 'contents' is the title and the body text joined by a single space.

# open(..., 'w') does not create directories, so make sure the target folder exists first.
os.makedirs(os.path.dirname(pyserini_jsonl_path), exist_ok=True)

# Explicit UTF-8 keeps reading and writing independent of the platform default encoding.
with open(corpus_jsonl_path, 'r', encoding='utf-8') as src, \
        open(pyserini_jsonl_path, 'w', encoding='utf-8') as out:
    for line in src:
        if not line.strip():
            continue  # tolerate blank lines, which json.loads would reject
        doc = json.loads(line)
        contents = doc['title'] + ' ' + doc['text']
        out.write(json.dumps({'id': doc['_id'], 'contents': contents}) + '\n')

In [13]:
from typing import List
from pyserini.search.lucene import LuceneSearcher 

# TODO: create Indexer / Retriever / BaseRetriever abstractions, then BM25, ANCE, and Hybrid implementations
# build_sparse_index 
# build_dense_index
# ssearch, dsearch, hsearch

class BM25Retriever:
    """BM25 retriever on top of a pyserini `LuceneSearcher`.

    On construction the Lucene index at `index_path` is built from `jsonl_path`
    if the path does not exist yet; afterwards a searcher is opened on it and
    configured with the given BM25 parameters.
    """

    def __init__(self, jsonl_path, index_path, k1=1.5, b=0.75):
        """Open (building first if necessary) the index and set BM25 parameters.

        Args:
            jsonl_path: value passed as `--input` to the pyserini indexer.
            index_path: location of the Lucene index; built when it does not exist.
            k1: BM25 k1 parameter forwarded to `set_bm25`.
            b: BM25 b parameter forwarded to `set_bm25`.
        """
        self.jsonl_path = jsonl_path
        # NOTE(review): only existence is checked, so a partially written index
        # left by a failed build would be reused as-is.
        if not os.path.exists(index_path):
            self.build_sparse_index(jsonl_path, index_path)
        self.searcher = LuceneSearcher(index_path)  # alternative: SimpleSearcher.from_prebuilt_index('msmarco-passage')
        self.searcher.set_bm25(k1=k1, b=b)
        # self.searcher.set_language()

    def build_sparse_index(self, jsonl_path, index_path):
        """Run `pyserini.index.lucene` in a child process to build the index.

        TODO: for dense / hybrid retrieval, add a build_dense_index counterpart
        and a separate class.

        Raises:
            RuntimeError: if the indexer exits with a non-zero status.
        """
        # Local imports: these names are not part of the notebook's import cell.
        import subprocess
        import sys

        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact, and sys.executable runs the indexer with the
        # same interpreter as the current process rather than whatever
        # `python` resolves to on PATH.
        command = [
            sys.executable, '-m', 'pyserini.index.lucene',
            '--collection', 'JsonCollection',
            '--input', str(jsonl_path),
            '--index', str(index_path),
            '--generator', 'DefaultLuceneDocumentGenerator',
            '--threads', '1',
            '--storeRaw',
        ]
        completed = subprocess.run(command)
        if completed.returncode != 0:
            raise RuntimeError('Indexing Failed!')
        print('Indexing Success!')

    def _get_results(self, qid, hits: List):
        """Convert searcher hits into plain dictionaries.

        Each hit must expose `docid`, `score` and `raw`, where `raw` is a JSON
        string containing a 'contents' key.

        Returns:
            A list of dicts with keys 'rank' (zero-based position in `hits`),
            'qid', 'docid', 'bm25_score' and 'content', in hit order.
        """
        results = []
        for rank, hit in enumerate(hits):
            results.append({
                'rank': rank,
                'qid': qid,
                'docid': hit.docid,
                'bm25_score': hit.score,
                'content': json.loads(hit.raw)['contents'],
            })
        return results

    def search(self, qid, query: str, k: int = 10):
        """Retrieve up to `k` hits for one query and return them as result dicts."""
        hits = self.searcher.search(query, k=k)
        return self._get_results(qid, hits)

    def batch_search(self, queries: List[str], qids: List[str], k: int = 10):
        """Retrieve hits for several queries at once.

        Returns:
            A dict mapping each qid returned by the searcher to its list of
            result dicts (same shape as `search`).
        """
        batch_hits = self.searcher.batch_search(queries, qids, k=k)
        return {qid: self._get_results(qid, hits) for qid, hits in batch_hits.items()}

In [16]:
# Index the directory that holds the converted corpus and apply the BM25 parameters from the
# notebook configuration, rather than a hard-coded path and the class defaults.
bm25_retriever = BM25Retriever(os.path.dirname(pyserini_jsonl_path), index_path,
                               k1=config.k1, b=config.b)

Indexing Success!


In [20]:
# Run a single sample query through the retriever; the cell output is the list of result dicts.
bm25_retriever.search(
    qid="PLAIN-63",
    query='How to Get Enough Antioxidants Each Day',
)

[{'rank': 0,
  'qid': 'PLAIN-63',
  'docid': 'MED-1933',
  'bm25_score': 4.668799877166748,
  'content': 'From the Cover: Accelerated telomere shortening in response to life stress Numerous studies demonstrate links between chronic stress and indices of poor health, including risk factors for cardiovascular disease and poorer immune function. Nevertheless, the exact mechanisms of how stress gets “under the skin” remain elusive. We investigated the hypothesis that stress impacts health by modulating the rate of cellular aging. Here we provide evidence that psychological stress— both perceived stress and chronicity of stress—is significantly associated with higher oxidative stress, lower telomerase activity, and shorter telomere length, which are known determinants of cell senescence and longevity, in peripheral blood mononuclear cells from healthy premenopausal women. Women with the highest levels of perceived stress have telomeres shorter on average by the equivalent of at least one de

In [None]:
from typing import List 
from base import Reranker, Query, Text # pygaggle

# https://github.com/informagi/EMBERT/blob/f89efeeeef53d4dc9e2cc1f2b547aa34aa4f7945/Code/pygaggle/rerank/transformer.py
class LLaMAReranker(Reranker):
    """Scaffold for a reranker built on a LLaMA causal language model (work in progress).

    The constructor loads the model and tokenizer from the 'meta-llama/<model_name>'
    hub namespace. Scoring is not implemented yet; see `rerank`.
    """

    def __init__(self, model_name, max_len, use_cuda):
        """Load model and tokenizer and remember the maximum sequence length.

        Args:
            model_name: model identifier appended to 'meta-llama/'.
            max_len: stored on the instance; not used by any method yet.
            use_cuda: request GPU placement (honoured only if CUDA is available).
        """
        self.model = self.load_model(model_name, use_cuda)
        self.tokenizer = self.load_tokenizer(model_name)
        self.max_len = max_len

    def load_model(self, model_name: str, use_cuda: bool):
        """Load the causal LM in float16 and move it to the selected device."""
        # Logical `and` instead of bitwise `&`: it short-circuits, so CUDA is only
        # probed when requested, and it accepts any truthy/falsy `use_cuda`.
        device = torch.device('cuda' if use_cuda and torch.cuda.is_available() else 'cpu')
        # NOTE(review): float16 weights are kept even on CPU — confirm the ops
        # needed for scoring are supported in half precision there.
        model = AutoModelForCausalLM.from_pretrained(
            f'meta-llama/{model_name}', torch_dtype=torch.float16
        ).to(device)
        return model

    def load_tokenizer(self, model_name: str):
        """Load the tokenizer that matches `model_name`."""
        return AutoTokenizer.from_pretrained(f'meta-llama/{model_name}')

    def rerank(self, query: Query, texts: List[Text]) -> List[Text]:
        """Rerank `texts` for `query`.

        The per-text scoring has not been written. The previous body referenced
        undefined names and failed with NameError for any non-empty `texts`;
        that failure is now an explicit NotImplementedError. An empty `texts`
        is still delegated to the base class, as before.

        Raises:
            NotImplementedError: if `texts` is non-empty.
        """
        if texts:
            # TODO: tokenize (query, text) pairs into input_ids / attention mask,
            # score them with self.model, and order `texts` by that score.
            raise NotImplementedError(
                'LLaMAReranker.rerank: LLaMA-based scoring is not implemented yet'
            )
        return super().rerank(query, texts)

    # similarity score
    # def score(self, input_ids, item)

In [None]:
def get_inputs(item, device, tokenizer):
    """Turn one encoded example into batched tensors on `device`.

    The previous body built a tensor from the string literal 'input_ids'
    instead of `item['input_ids']`, called `tokenizer.decode()` without
    arguments, and referenced undefined names, so it could never run.

    Args:
        item: mapping with an 'input_ids' sequence of token ids and, optionally,
            'token_type_ids' and 'attention_mask' sequences of the same length
            (assumed schema — TODO confirm against the caller).
        device: torch device (or device string) for the returned tensors.
        tokenizer: accepted for interface compatibility; currently unused.

    Returns:
        Tuple `(input_ids, token_type, attn_mask)` of long tensors, each with a
        leading batch dimension of 1. A missing 'token_type_ids' defaults to all
        zeros and a missing 'attention_mask' to all ones.
    """
    input_ids = torch.tensor(item['input_ids'], dtype=torch.long, device=device).unsqueeze(0)

    if 'token_type_ids' in item:
        token_type = torch.tensor(item['token_type_ids'], dtype=torch.long, device=device).unsqueeze(0)
    else:
        token_type = torch.zeros_like(input_ids)

    if 'attention_mask' in item:
        attn_mask = torch.tensor(item['attention_mask'], dtype=torch.long, device=device).unsqueeze(0)
    else:
        attn_mask = torch.ones_like(input_ids)

    return input_ids, token_type, attn_mask
    

In [None]:
# top_results = torch.topk(scores, k=5).indices 
# reranked_corpus = [corpus[i] for i in top_results] 

# scored_articles = zip(articles, cosine_similarities)

# # Sort articles by cosine similarity
# sorted_articles = sorted(scored_articles, key=lambda x: x[1], reverse=True)

# scores = []
# https://github.com/amazon-science/datatuner/blob/f70369659e1c58e6ddb44d6db467978679dbdd3c/src/datatuner/lm/reranker.py#L5 


In [None]:
# Load the tokenizer and the causal LM with 4-bit quantization.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
)

# The parser defines `--model_name` (there is no `config.model` attribute); the hub id uses the
# same 'meta-llama/' namespace that LLaMAReranker uses when loading the model.
model_id = f'meta-llama/{config.model_name}'

tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint to run — confirm it is needed.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=quantization_config,
                                             trust_remote_code=True,)

model.config.use_cache = True

In [None]:
# LoRA adapter settings: rank-8 adapters on the q_proj and v_proj modules, for causal-LM training.
peft_config = LoraConfig(
    task_type='CAUSAL_LM',
    inference_mode=False,
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias='none',
)

model = get