In [4]:
import argparse
import json
import multiprocessing
import os
import random
import subprocess
import sys
import tarfile

from tqdm import tqdm
import requests

import numpy as np
import pandas as pd

import torch

import pyserini
from pyserini.search import SimpleSearcher
from pyserini.dsearch import SimpleDenseSearcher
from pyserini.search.lucene import LuceneSearcher

import transformers

from peft import LoraConfig
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig)

In [None]:
def _str2bool(value):
    """Parse a command-line boolean such as 'true', 'False', '1' or 'no'.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including 'False') would be parsed as True.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    if normalized in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'Expected a boolean value, got {value!r}')


parser = argparse.ArgumentParser(description='Rerank')

parser.add_argument('--model_name', type=str, default='facebook/opt-125m')
parser.add_argument('--collection', type=str, default='msmarco-passage')
parser.add_argument('--collections_path', type=str, default='./collections/')
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--batch_size', type=int, default=256)
parser.add_argument('--max_len', type=int, default=40)
parser.add_argument('--lr', type=float, default=1e-5)
parser.add_argument('--max_epochs', type=int, default=10)
parser.add_argument('--use_cuda', type=_str2bool, default=False)
parser.add_argument('--k', type=int, default=100, help='top k')
parser.add_argument('--k1', type=float, default=1.5, help='BM25 parameter')
parser.add_argument('--b', type=float, default=0.75, help='BM25 parameter')

# An explicit empty argv is parsed so that only the defaults above are used,
# regardless of what the hosting process has in sys.argv.
config = parser.parse_args([])

In [None]:
def get_msmarco_passage_jsonl(collections_path, ):
    """Download the MS MARCO passage archive and convert the collection to JSONL.

    The archive is extracted into ``<collections_path>/msmarco-passage``. When
    ``collection.tsv`` is present afterwards, the anserini-tools conversion
    script is run to write ``collection_jsonl`` next to it.

    Args:
        collections_path: Directory under which ``msmarco-passage`` is created.

    Raises:
        requests.HTTPError: If the download request returns an error status.
        subprocess.CalledProcessError: If the conversion script exits non-zero.
    """
    msmarco_passage_path = os.path.join(collections_path, 'msmarco-passage')
    # https://microsoft.github.io/msmarco/Datasets
    msmarco_url = 'https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz'

    # makedirs also creates a missing `collections_path` and tolerates reruns.
    os.makedirs(msmarco_passage_path, exist_ok=True)

    tsv_path = os.path.join(msmarco_passage_path, 'collection.tsv')
    jsonl_path = os.path.join(msmarco_passage_path, 'collection_jsonl')

    # Skip the download when the collection has already been extracted.
    if not os.path.exists(tsv_path):
        # The 'data' extraction filter rejects unsafe members (absolute paths,
        # '..' traversal); it is only passed on Python builds that support it.
        extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}
        with requests.get(msmarco_url, stream=True) as response:
            response.raise_for_status()
            with tarfile.open(fileobj=response.raw, mode='r|gz') as archive:
                archive.extractall(path=msmarco_passage_path, **extract_kwargs)

    if os.path.exists(tsv_path):
        # An argument list (no shell) keeps paths with spaces intact, and
        # sys.executable runs the script with the current interpreter.
        subprocess.run(
            [
                sys.executable,
                'anserini-tools/scripts/msmarco/convert_collection_to_jsonl.py',
                '--collection-path', tsv_path,
                '--output-folder', jsonl_path,
            ],
            check=True,
        )

In [None]:
def get_msmarco_passage_top1000(output_dir='./collections/msmarco-passage'):
    """Download and extract the MS MARCO top-1000 train and dev archives.

    Args:
        output_dir: Directory the archives are extracted into. Defaults to the
            location that was previously hard-coded.

    Raises:
        requests.HTTPError: If either download returns an error status.
    """
    archive_urls = (
        'https://msmarco.blob.core.windows.net/msmarcoranking/top1000.train.tar.gz',
        'https://msmarco.blob.core.windows.net/msmarcoranking/top1000.dev.tar.gz',
    )

    os.makedirs(output_dir, exist_ok=True)
    # The 'data' extraction filter rejects unsafe members; it is only passed on
    # Python builds that support it.
    extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

    for archive_url in archive_urls:
        with requests.get(archive_url, stream=True) as response:
            # Fail loudly instead of feeding an HTML error page to tarfile.
            response.raise_for_status()
            with tarfile.open(fileobj=response.raw, mode='r|gz') as archive:
                archive.extractall(path=output_dir, **extract_kwargs)

In [None]:
# Download and extract the top-1000 train/dev archives (requires network access).
get_msmarco_passage_top1000()

NameError: name 'requests' is not defined

In [4]:
from typing import List

class Indexer:
    """Builds search indexes for a JSONL collection by invoking Pyserini."""

    def __init__(self, jsonl_path, index_path):
        """Store the input and output locations.

        Args:
            jsonl_path: Path passed to Pyserini as ``--input``.
            index_path: Path passed to Pyserini as ``--index``,
                e.g. indexes/lucene-index-msmarco-passage.
        """
        self.jsonl_path = jsonl_path
        self.index_path = index_path # indexes/lucene-index-msmarco-passage

    def build_sparse_index(self, threads=1):
        """Run ``pyserini.index.lucene`` to build a Lucene index with raw documents stored.

        Args:
            threads: Value passed as ``--threads`` (defaults to the previous
                hard-coded value of 1).

        Raises:
            Exception: If the indexing process exits with a non-zero status.
        """
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact; sys.executable uses the current interpreter.
        command = [
            sys.executable, '-m', 'pyserini.index.lucene',
            '--collection', 'JsonCollection',
            '--input', str(self.jsonl_path),
            '--index', str(self.index_path),
            '--generator', 'DefaultLuceneDocumentGenerator',
            '--threads', str(threads),
            '--storeRaw',
        ]
        completed = subprocess.run(command)
        if completed.returncode != 0:
            raise Exception('Indexing Failed!')
        print('Indexing Success!')

    def build_dense_index(self):
        """Placeholder: dense indexing is not implemented yet."""
        pass

class BM25Retriever:
    """BM25 retrieval over a Lucene index via Pyserini's LuceneSearcher."""

    def __init__(self, index_path, k, k1=1.5, b=0.75, collection=None, output_dir='./retrieved/'):
        """Open the index and configure BM25.

        Args:
            index_path: Path handed to ``LuceneSearcher``.
            k: Number of hits requested per query.
            k1: BM25 k1 parameter.
            b: BM25 b parameter.
            collection: Collection name used in the saved file name. When None,
                the module-level ``config.collection`` is used at save time.
            output_dir: Directory that saved results are written to.
        """
        self.searcher = LuceneSearcher(index_path)
        self.searcher.set_bm25(k1=k1, b=b)
        self.k = k
        self.collection = collection
        self.output_dir = output_dir

    def _get_results(self, qid, hits:List):
        """Convert hits into a list of dicts with rank, qid, docid, score and content.

        Each hit's ``raw`` attribute is parsed as JSON and its 'contents' field
        becomes 'content'. Ranks start at 1 and follow the order of ``hits``.
        """
        return [
            {'rank': rank,
             'qid': qid,
             'docid': hit.docid,
             'score': hit.score,
             'content': json.loads(hit.raw)['contents']}
            for rank, hit in enumerate(hits, start=1)
        ]

    def _save_json(self, results:List[dict]):
        """Write ``results`` as JSON Lines to ``bm25-<collection>-top<k>.json`` in the output directory."""
        collection = self.collection if self.collection is not None else config.collection
        # Create the target directory on first use instead of failing in open().
        os.makedirs(self.output_dir, exist_ok=True)
        json_path = os.path.join(self.output_dir, f'bm25-{collection}-top{self.k}.json')
        # The context manager closes the file even if serialization raises.
        with open(json_path, 'w', encoding='utf-8', newline='\n') as json_file:
            for result in results:
                json_file.write(json.dumps(result) + '\n')

    def search(self, qid, query_text:str):
        """Search a single query and return ``{'query': ..., 'hits': [...]}``."""
        hits = self.searcher.search(query_text, k=self.k)
        return {'query': query_text, 'hits': self._get_results(qid, hits)}

    def batch_search(self, qids:List[str], query_texts: List[str], is_save:bool):
        """Search many queries at once.

        Args:
            qids: Query ids, aligned with ``query_texts``.
            query_texts: Query strings.
            is_save: When true, the results are also written via ``_save_json``.

        Returns:
            One ``{'query': ..., 'hits': [...]}`` dict per entry of the mapping
            returned by the searcher's ``batch_search``.
        """
        query_dict = dict(zip(qids, query_texts))
        batch_hits = self.searcher.batch_search(query_texts, qids, k=self.k, threads=multiprocessing.cpu_count())
        bsearch_results = [
            {'query': query_dict[qid], 'hits': self._get_results(qid, hits)}
            for qid, hits in batch_hits.items()
        ]

        if is_save:
            self._save_json(bsearch_results)

        return bsearch_results
    
# if not os.path.exists(index_path):
#             indexer = Indexer()
#             self.build_sparse_index(jsonl_path, index_path)     

In [5]:
def read_json(json_path:str):
    """Read a JSON Lines file and return the decoded records as a list.

    The file is read as UTF-8 and blank lines are skipped.

    Args:
        json_path: Path of the file to read.

    Returns:
        A list with one decoded object per non-blank line.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        # Iterate the handle directly so the raw lines are not all held in a list.
        return [json.loads(line) for line in f if line.strip()]

def save_json(results:List[dict], json_path=None):
    """Write ``results`` to a JSON Lines file, one record per line.

    Args:
        results: Records to serialize with ``json.dumps``.
        json_path: Destination file. When None, the path
            ``./retrieved/bm25-<config.collection>-top<config.k>.json`` is used;
            pass an explicit path to keep separate dumps (e.g. train and dev)
            from overwriting each other.
    """
    if json_path is None:
        json_path = os.path.join('./retrieved/', f'bm25-{config.collection}-top{config.k}.json')

    # Create the parent directory on first use instead of failing in open().
    parent_dir = os.path.dirname(json_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # The context manager closes the file even if serialization raises.
    with open(json_path, 'w', encoding='utf-8', newline='\n') as json_file:
        for result in results:
            json_file.write(json.dumps(result) + '\n')

In [6]:
# Directory holding the files of the configured collection.
collection_dir_path = os.path.join(config.collections_path, config.collection)

# Derive every per-file path from that directory in a single pass.
(
    collection_path,
    queries_train_path,
    qrels_train_path,
    queries_dev_path,
    queries_eval_path,
) = (
    os.path.join(collection_dir_path, file_name)
    for file_name in (
        'collection.tsv',
        'queries.train.tsv',
        'qrels.train.tsv',
        'queries.dev.tsv',
        'queries.eval.tsv',
    )
)

In [7]:
# Load the train and dev query tables (tab-separated, no header row) and store
# the query ids as strings.
queries_train = (
    pd.read_csv(queries_train_path, sep='\t', header=None, names=['qid', 'query'])
    .astype({'qid': str})
)
queries_dev = (
    pd.read_csv(queries_dev_path, sep='\t', header=None, names=['qid', 'query'])
    .astype({'qid': str})
)

In [8]:
# Take k / k1 / b from `config` rather than repeating literals, so the retriever
# agrees with the top-k that save_json() encodes in the output file name.
bm25 = BM25Retriever(index_path='indexes/lucene-index-msmarco-passage',
                     k=config.k, k1=config.k1, b=config.b)

In [9]:
# qids_tr = queries_train['qid'].tolist()
# query_texts_tr = queries_train['query'].tolist()

# results_tr = []

# total = len(qids_tr)
# batch_size = 40000
# for i in tqdm(range(0, total, batch_size)):
#     tmp = bm25.batch_search(qids=qids_tr[i:min(i + batch_size, total)], query_texts=query_texts_tr[i:min(i + batch_size, total)], is_save=False)
#     results_tr.extend(tmp)
    
# save_json(results_tr)

  0%|          | 0/21 [00:00<?, ?it/s]

In [None]:
# qids_dev = queries_dev['qid'].tolist()
# query_texts_dev = queries_dev['query'].tolist()

# results_dev = []

# for i in tqdm(range(len(qids_dev))):
#     tmp = bm25.search(qid=qids_dev[i], query_text=query_texts_dev[i])
#     results_dev.append(tmp)
    
# save_json(results_dev)

808731

In [None]:
def get_result(results, qids, dids, scores):
    """Record scores into ``results`` in place as ``results[qid][did] = float(score)``.

    The three sequences are consumed in parallel; a later entry for the same
    (qid, did) pair replaces the earlier one. Nothing is returned.
    """
    for query_id, doc_id, raw_score in zip(qids, dids, scores):
        per_query = results.setdefault(query_id, {})
        per_query[doc_id] = float(raw_score)

In [None]:
class MarcoDataset:
    """Index-addressable MS MARCO data that yields concatenated token ids.

    NOTE(review): ``__getitem__`` pairs the idx-th query row with the idx-th
    collection row, exactly as the original draft did. That pairing looks
    positional rather than relevance-based; confirm it is intended.
    """

    def __init__(self, collection_dir_path, tokenizer, mode='train'):
        """Load ``collection.tsv`` and ``queries.<mode>.tsv`` from the collection directory.

        Args:
            collection_dir_path: Directory containing the TSV files.
            tokenizer: Callable returning an object with ``input_ids`` when
                called with ``(text, max_length=..., truncation=True)``.
            mode: Selects the queries file ('train' gives queries.train.tsv).
        """
        self.collection_dir_path = collection_dir_path
        self.tokenizer = tokenizer
        self.mode = mode
        # These frames back __len__ and __getitem__; previously they were never set.
        self.collection = pd.read_csv(os.path.join(collection_dir_path, 'collection.tsv'),
                                      sep='\t', header=None, names=['did', 'corpus'])
        self.queries = pd.read_csv(os.path.join(collection_dir_path, f'queries.{mode}.tsv'),
                                   sep='\t', header=None, names=['qid', 'query'])

    def __len__(self):
        """Return the number of query rows."""
        return len(self.queries)

    def __getitem__(self, idx):
        """Return the encoding for the idx-th query row and idx-th collection row."""
        # Column lookup by key avoids clashing with attribute names on the row.
        query = self.queries.iloc[idx]['query']
        corpus = self.collection.iloc[idx]['corpus']

        return self.get_encoding(query, corpus, idx)

    def get_encoding(self, query, corpus, idx):
        """Tokenize both texts and return the passage ids followed by the query ids.

        The query is truncated to 128 tokens and the passage to 512. ``idx`` is
        accepted for interface compatibility and is not used.
        """
        qids = self.tokenizer(query, max_length=128, truncation=True).input_ids
        cids = self.tokenizer(corpus, max_length=512, truncation=True).input_ids
        # The draft called tokenizer.encode() with no arguments and discarded
        # this concatenation; the concatenated ids are the result.
        return cids + qids

In [None]:
from torch.utils.data import DataLoader, Dataset, TensorDataset, IterableDataset

class MarcoEncodedDataset(Dataset):
    """Map-style dataset with one item per row of the qrels file."""

    def __init__(self, collection_dir_path, tokenizer, mode='train', max_query_len=128, max_corpus_len=512):
        """Load the collection, queries and qrels TSV files for ``mode``.

        Args:
            collection_dir_path: Directory containing the TSV files.
            tokenizer: Callable returning an object with ``input_ids`` when
                called with ``(text, max_length=..., truncation=True)``.
            mode: Selects ``queries.<mode>.tsv`` and ``qrels.<mode>.tsv``.
            max_query_len: Truncation length for queries.
            max_corpus_len: Truncation length for passages.
        """
        self.collection_dir_path = collection_dir_path
        self.tokenizer = tokenizer
        self.mode = mode
        self.max_query_len = max_query_len
        self.max_corpus_len = max_corpus_len
        # load data
        collection_path = os.path.join(collection_dir_path, 'collection.tsv')
        queries_path = os.path.join(collection_dir_path, f'queries.{mode}.tsv')
        qrels_path = os.path.join(collection_dir_path, f'qrels.{mode}.tsv')
        self.collection = pd.read_csv(collection_path, sep='\t', header=None, names=['did', 'corpus'], index_col='did')
        self.queries = pd.read_csv(queries_path, sep='\t', header=None, names=['qid', 'query'], index_col='qid')
        self.relations = pd.read_csv(qrels_path, sep='\t', header=None, names=['qid', '0', 'did', 'label'])

    def __len__(self):
        """Return the number of qrels rows."""
        return len(self.relations)

    def __getitem__(self, index):
        """Return ids, label and token ids for the ``index``-th qrels row.

        The draft read an undefined ``x`` and a non-existent ``query_text``
        column; the row's qid/did are now looked up in the id-indexed frames.

        Returns:
            Dict with 'qid', 'did', 'label', 'query_ids' and 'corpus_ids'.
        """
        relation = self.relations.iloc[index]
        qid, did = relation['qid'], relation['did']
        query = self.queries.loc[qid, 'query']
        corpus = self.collection.loc[did, 'corpus']

        query_ids = self.tokenizer(query, max_length=self.max_query_len, truncation=True).input_ids
        corpus_ids = self.tokenizer(corpus, max_length=self.max_corpus_len, truncation=True).input_ids
        # Plain ints keep the item free of numpy scalar types.
        return {'qid': int(qid),
                'did': int(did),
                'label': int(relation['label']),
                'query_ids': query_ids,
                'corpus_ids': corpus_ids}
    
    

In [None]:
# NOTE(review): abandoned stub, kept only as a comment. The `def __init__(self)`
# line has no colon or body (a SyntaxError that stops a top-to-bottom run), and
# re-declaring the class here would replace the MarcoEncodedDataset defined in
# the cell above.
# class MarcoEncodedDataset(Dataset):
#     def __init__(self)

In [None]:
from torch.utils.data import DataLoader, Dataset, IterableDataset

# https://github.com/OpenMatch/OpenMatch/blob/ad1d6228bcf288ebe86037f93cd4ae20061ec4ea/src/openmatch/retriever/reranker.py
class RerankDataset(IterableDataset):
    """Iterable dataset holding a tokenizer plus query and corpus datasets.

    This was declared with ``def``, which made it a function that took a
    parameter named ``IterableDataset`` and returned None. Iteration is not
    implemented yet (see the commented-out ``__iter__`` sketch below).
    """

    def __init__(self, tokenizer, query_dataset, corpus_dataset):
        """Store the tokenizer and the two source datasets."""
        self.tokenizer = tokenizer
        self.query_dataset = query_dataset
        self.corpus_dataset = corpus_dataset
        
    # def __iter__(self):
    #     for qid, did in items():
    #         yield 
    #         {
    #                 "query_id": qid, 
    #                 "doc_id": did, 
    #                 **encode_pair(
    #                     self.tokenizer, 
    #                     self.query_dataset[qid]["input_ids"], 
    #                     self.corpus_dataset[did]["input_ids"], 
    #                     self.query_dataset.max_len, 
    #                     self.corpus_dataset.max_len,
    #                     encode_as_text_pair=self.encode_as_text_pair
    #                 ),
    #             }


        

In [None]:
# Number of passages mentioned in the user instruction below.
num = 100
# Later: generate several query candidates per passage..? More of them..?
# Using llama2 chat.
# https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ/discussions/5
llama2_prompt = [
    {
        "role": "system",
        # Typo fixed: "generatig" -> "generating".
        "content": "You are an intelligent assistant capable of generating queries for given passages.",
    },
    {
        "role": "user",
        # Agreement fixed: "passages ... its content" -> "their content".
        "content": f"Please generate a query for the following {num} passages based on their content. \nThe task is to generate a query that summarizes the main points of each passage. \nThe query should be relevant to the content of the passage."
    },
    {"role": "assistant", "content": "Okay, please provide the passages to generate a query."},
]

In [None]:
# NOTE(review): `model` is first assigned in a later cell (the
# AutoModelForCausalLM.from_pretrained call), so on a fresh top-to-bottom run
# this line raises NameError. generate() is also called here without inputs.
model.generate()

In [None]:
class GPTReranker:
    """Work-in-progress reranker built on a causal language model.

    The model and tokenizer named by the module-level ``config.model_name`` are
    loaded on construction. Prompt construction and scoring are unfinished.
    """

    def __init__(self):
        """Load the model and tokenizer from ``config`` and switch the model to eval mode."""
        self.model = self.load_model(config.model_name, config.use_cuda)
        self.tokenizer = self.load_tokenizer(config.model_name)
        self.model.eval()

    def load_model(self, model_name:str, use_cuda:bool):
        """Load a float32 causal LM and move it to the selected device.

        Args:
            model_name: Name or path passed to ``from_pretrained``.
            use_cuda: When true and CUDA is available, the model goes to 'cuda';
                otherwise 'cpu'.
        """
        # Logical `and` (not bitwise `&`): it short-circuits, so CUDA is not
        # probed when use_cuda is false, and it works for any truthy flag.
        device = torch.device('cuda' if use_cuda and torch.cuda.is_available() else 'cpu')
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32).to(device)
        model.config.use_cache=True
        return model

    def load_tokenizer(self, model_name:str):
        """Load the tokenizer that matches ``model_name``."""
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return tokenizer

    def _get_prompt(self, query):
        """Build the prompt for ``query``. Not implemented yet.

        The draft had only a signature with no colon or body, which was a
        SyntaxError for the whole cell.
        """
        raise NotImplementedError('GPTReranker._get_prompt is not implemented yet')

    def rerank(self, query, texts):
        """Unfinished: builds the generation prompt for ``texts`` and returns None.

        TODO: score each passage against ``query`` and return the reordered
        passages; ``query`` is not used yet.
        """
        prompt =  f"Please generate a query based on the following passage: {texts}"
        return None
        

In [None]:
# Load the 'EleutherAI/gpt-neo-125m' checkpoint and its tokenizer for the
# generation cells below. Note this is a different checkpoint from the
# `--model_name` default declared in the argument parser.
model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-125m')
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-125m')

In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Example passage text and the question-generation prompt built from it.
passages = "Since 2007, the RBA's outstanding reputation has been affected by the 'Securency' or NPA scandal. These RBA subsidiaries were involved in bribing overseas officials so that Australia might win lucrative note-printing contracts. The assets of the bank include the gold and foreign exchange reserves of Australia, which is estimated to have a net worth of A$101 billion. Nearly 94% of the RBA's employees work at its headquarters in Sydney, New South Wales and at the Business Resumption Site."
prompt =f"Please generate a question for the following passages: {passages}"

# Show the assembled prompt as the cell output.
prompt

In [None]:
# Tokenize the prompt into PyTorch tensors ('pt') for the generation call.
inputs = tokenizer(prompt, return_tensors='pt')

In [None]:
# Seed torch so the sampled continuation is reproducible across reruns.
torch.manual_seed(config.seed)
# `**inputs` forwards the attention mask together with the input ids, and an
# explicit pad_token_id is given so generate() does not have to guess one.
generate_ids = model.generate(**inputs,
                              pad_token_id=tokenizer.eos_token_id,
                              num_return_sequences=1,
                              do_sample=True,
                              num_beams=1,
                              max_new_tokens=32)

In [None]:
# Decode the first generated sequence back to text, dropping special tokens.
tokenizer.decode(generate_ids[0], skip_special_tokens=True,)

In [None]:
# Show the prompt again for comparison with the generated text.
prompt

In [None]:
# Tokenize, generate with the default decoding settings, and show the decoded text.
inputs = tokenizer(prompt, return_tensors="pt")
# `**inputs` forwards the attention mask together with the input ids, and an
# explicit pad_token_id is given so generate() does not have to guess one.
# NOTE(review): max_length counts the prompt tokens as well, so a long prompt
# leaves little room for new tokens; confirm 128 is enough.
generate_ids = model.generate(**inputs, pad_token_id=tokenizer.eos_token_id, max_length=128)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

In [None]:
def rerank(self, query, texts):
    """Order ``texts`` by a model score computed on the concatenated query and text.

    Each text is tokenized, appended to the tokenized query along the sequence
    dimension and passed through ``self.model``. The score is the sum of all
    output logits. Texts are returned from highest to lowest score, with ties
    keeping their input order.

    NOTE(review): summing raw logits over every position and vocabulary entry
    is kept from the original; it is not a likelihood of the query or text, so
    confirm this is the intended relevance signal.

    Args:
        self: Object providing ``tokenizer`` and ``model``, and optionally
            ``max_len`` (truncation length; the tokenizer default is used when
            the attribute is absent).
        query: Query string.
        texts: Iterable of candidate passages.

    Returns:
        The passages as a list, sorted by descending score.
    """
    # Not every holder of tokenizer/model defines max_len; fall back to the
    # tokenizer's own limit instead of raising AttributeError.
    max_len = getattr(self, 'max_len', None)
    # Inputs must be on the same device as the model or the forward pass fails.
    device = getattr(self.model, 'device', None)

    def _encode(text):
        """Tokenize ``text`` and return (input_ids, attention_mask) on the model's device."""
        encoded = self.tokenizer(text, return_tensors='pt', truncation=True,
                                 max_length=max_len, padding=True)
        input_ids, attention_mask = encoded['input_ids'], encoded['attention_mask']
        if device is not None:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
        return input_ids, attention_mask

    # The query encoding is identical for every candidate, so compute it once.
    query_ids, query_mask = _encode(query)

    scored_texts = []
    for text in texts:
        text_ids, text_mask = _encode(text)

        # Query tokens first, then passage tokens, joined along the sequence axis.
        rerank_input = {
            'input_ids': torch.cat([query_ids, text_ids], dim=1),
            'attention_mask': torch.cat([query_mask, text_mask], dim=1),
        }

        with torch.no_grad():
            logits = self.model(**rerank_input).logits

        scored_texts.append((logits.sum().item(), text))

    # A stable sort with reverse=True keeps equal-score texts in input order.
    scored_texts.sort(key=lambda pair: pair[0], reverse=True)
    return [text for _, text in scored_texts]