In [76]:
import argparse
import json
import os
import random
import subprocess
import sys
import tarfile

import numpy as np
import pandas as pd
import requests
import torch

import pyserini
from pyserini.search import SimpleSearcher
from pyserini.dsearch import SimpleDenseSearcher

import transformers
# from transformers import set_seed
# set_seed(42)

from peft import LoraConfig
from transformers import (AutoTokenizer, 
                          AutoModelForCausalLM, 
                          BitsAndBytesConfig)
from datasets import load_dataset

In [78]:
def _str2bool(value):
    """Convert a command-line string such as 'true' / 'False' / '0' to a bool.

    ``type=bool`` cannot be used for this: argparse would call ``bool("False")``,
    which is True because the string is non-empty.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean word.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


# Experiment configuration. The options are declared through argparse so the
# same definitions can be reused from a script, but the notebook itself only
# ever reads the defaults (see parse_args([]) below).
parser = argparse.ArgumentParser(description='Reranking with LLaMA2')

parser.add_argument('--model', type=str, default='meta-llama/Llama-2-7b-hf', help='model name')
parser.add_argument('--dataset', type=str, default='msmarco-passage',)
parser.add_argument('--data_path', type=str, default='./collection/')
parser.add_argument('--seed', type=int, default=42)
parser.add_argument('--batch_size', type=int, default=256)
parser.add_argument('--max_len', type=int, default=40)
parser.add_argument('--lr', type=float, default=1e-5)
parser.add_argument('--max_epochs', type=int, default=10)
parser.add_argument('--use_cuda', type=_str2bool, default=True)
parser.add_argument('--k', type=int, default=10, help='top k')
parser.add_argument('--k1', type=float, default=1.5, help='BM25 parameter')
parser.add_argument('--b', type=float, default=0.75, help='BM25 parameter')

# An explicit empty argv keeps argparse from reading the Jupyter kernel's own
# command-line arguments; every option therefore takes its default value.
config = parser.parse_args([])

In [80]:
# download data
# https://github.com/castorini/pyserini/blob/master/docs/experiments-msmarco-passage.md
dataset_path = os.path.join(config.data_path, config.dataset)
msmarco_url = 'https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz'

# makedirs also creates the parent data directory and is a no-op when the
# target already exists, so the cell can be re-run safely.
os.makedirs(dataset_path, exist_ok=True)

# Only fetch the archive when the collection has not been extracted yet, so
# that "Restart & Run All" does not re-download it every time. Delete the
# dataset directory to force a fresh download (e.g. after an interrupted run).
if not os.path.exists(os.path.join(dataset_path, 'collection.tsv')):
    with requests.get(msmarco_url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of feeding an error page to tarfile.
        response.raise_for_status()
        # 'r|gz' reads the gzip stream sequentially, so the archive is
        # extracted while it downloads and is never stored on disk itself.
        with tarfile.open(fileobj=response.raw, mode='r|gz') as archive:
            archive.extractall(path=dataset_path)

In [None]:
# tsv to jsonl
tsv_path = os.path.join(dataset_path, 'collection.tsv')
jsonl_path = os.path.join(dataset_path, 'collection.jsonl')

# Pass the command as an argument list: the pieces are kept separate without
# relying on hand-placed spaces, and paths containing spaces need no quoting.
# sys.executable runs the script with the same interpreter as the kernel, and
# check=True raises CalledProcessError if the conversion script fails.
subprocess.run(
    [
        sys.executable,
        'tools/scripts/msmarco/convert_collection_to_jsonl.py',
        '--collection-path', tsv_path,
        '--output-folder', jsonl_path,
    ],
    check=True,
)

In [None]:
class BM25Retriever:
    """Helpers for BM25 first-stage retrieval with a Pyserini searcher.

    Both helpers are static methods, so they can be called either on the class
    (``BM25Retriever.search_indexes(searcher, ...)``) or on an instance.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def search(index_path: str = None, k1: float = None, b: float = None):
        """Open the index at ``index_path`` and configure BM25 on it.

        Args:
            index_path: Directory of the index to open.
            k1: BM25 ``k1`` parameter; defaults to ``config.k1``.
            b: BM25 ``b`` parameter; defaults to ``config.b``.

        Returns:
            The configured ``SimpleSearcher``.

        Raises:
            ValueError: if ``index_path`` is empty or None.
        """
        if not index_path:
            raise ValueError('index_path must point to an index directory')
        searcher = SimpleSearcher(index_dir=index_path)
        searcher.set_bm25(k1=config.k1 if k1 is None else k1,
                          b=config.b if b is None else b)
        return searcher

    @staticmethod
    def search_indexes(searcher, query, id, answers, k: int = None):
        """Retrieve the top-k passages for ``query`` as a list of dicts.

        Args:
            searcher: Object exposing ``search(query, k=...)`` whose hits have
                a JSON ``raw`` string (with a ``contents`` key) and a ``score``.
            query: Query text.
            id: Identifier stored alongside the query.
            answers: Reference answers stored alongside the query.
            k: Number of hits to request; defaults to ``config.k``.

        Returns:
            One dict per hit with keys ``qas``, ``context`` and ``bm25_scores``,
            in the order returned by the searcher.
        """
        top_k = config.k if k is None else k
        hits = searcher.search(query, k=top_k)

        passages = []
        for hit in hits:
            passages.append({
                'qas': [{'id': id, 'query': query, 'answers': answers}],
                'context': json.loads(hit.raw)['contents'],
                'bm25_scores': hit.score,
            })
        return passages
    
# https://github.com/velocityCavalry/bm25-pyserini/blob/main/search_index.py#L64 


In [None]:
from typing import List 
from pygaggle.rerank.base import Reranker, Query, Text

class LLaMAReranker(Reranker):
    """Reranker skeleton meant to score passages with a LLaMA model.

    NOTE(review): the LLaMA-specific scoring is not implemented yet; ``rerank``
    currently just defers to the ``Reranker`` base class.

    Args:
        model: Model object to use for scoring; kept on ``self.model``.
    """

    def __init__(self,
                 model=None,
                 ):
        super().__init__()
        # Keep the model on the instance so rerank() can use it once implemented.
        self.model = model

    def rerank(self, query: Query, texts: List[Text]) -> List[Text]:
        """Rerank ``texts`` for ``query``.

        Args:
            query: The query to rank against.
            texts: Candidate texts to reorder.

        Returns:
            Whatever the base class ``rerank`` returns for these inputs.
        """
        # TODO: build the model input from `query` and score `texts` with
        # self.model. The unfinished `query =` assignment that used to sit here
        # was a syntax error and prevented the whole cell from running.
        return super().rerank(query, texts)

In [None]:
# Load the tokenizer and the causal LM named by config.model, with the model
# weights loaded in 4-bit through bitsandbytes.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

tokenizer = AutoTokenizer.from_pretrained(config.model)

model = AutoModelForCausalLM.from_pretrained(
    config.model,
    trust_remote_code=True,
    quantization_config=quantization_config,
)

# Turn the use_cache flag on in the loaded model's config.
model.config.use_cache = True

In [None]:
# LoRA settings for a causal-LM task: rank 8, alpha 16, dropout 0.05, with
# adapters targeting the "q_proj" and "v_proj" modules; inference mode is off.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type='CAUSAL_LM',
    inference_mode=False,
)

In [None]:
import pandas as pd
import time
import sys
import os
import numpy as np
from translate import Translator
# Translator configured with source language 'es' and target language 'en'.
translator = Translator(from_lang='es', to_lang='en')
import sklearn.metrics.pairwise
from tqdm import tnrange
from sklearn.metrics import jaccard_score
import scipy
import re


from sentence_transformers import SentenceTransformer

# Semantic-similarity demo: embed a corpus and a few queries, then print the
# closest corpus sentences for each query by cosine similarity.
# NOTE(review): `raw_data` is not defined in any visible cell; it must be a
# table with a 'text' column loaded before this cell runs — confirm.
embedder = SentenceTransformer('bert-base-nli-mean-tokens') #BERT BASE
#embedder = SentenceTransformer('bert-large-nli-stsb-mean-tokens') # LARGE BERT

# Keep the texts as a plain list: the embeddings are positional, so results
# must be looked up by position as well (label-based `raw_data['text'][idx]`
# picks the wrong row, or raises, when the index is not 0..n-1).
corpus_texts = raw_data['text'].to_list()
corpus_embeddings = embedder.encode(corpus_texts)

queries = ["VACHON CARROT CAKE",
           "DEMPSTERS S WW HAMBURGER BUNS 8S",
           "POM CINNAMON RAISIN BAGELS"]
query_embeddings = embedder.encode(queries)

# Find the closest sentences of the corpus for each query sentence based on cosine similarity
closest_n = 5
for query, query_embedding in zip(queries, query_embeddings):
    # Cosine distance from this query to every corpus embedding.
    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

    # (position, distance) pairs, nearest first.
    results = sorted(enumerate(distances), key=lambda pair: pair[1])

    print("\n\n======================\n\n")
    print("Query:", query)
    print(f"\nTop {closest_n} most similar sentences in corpus:")

    for idx, distance in results[:closest_n]:
        # Report cosine similarity, i.e. 1 - cosine distance.
        print(corpus_texts[idx].strip(), "(Score: %.4f)" % (1 - distance))