In [1]:
from datasets import Dataset, load_dataset
from qdrant_client import models, QdrantClient
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Qdrant
import time
from datetime import timedelta
import pandas as pd
from pandas import DataFrame
from matplotlib import pyplot as plt
from dotenv import load_dotenv
import numpy as np

import sys
import os
sys.path.append(os.path.abspath('../'))

from model.llm import LLM

In [2]:
llm = LLM(size=7, quantized=False)

Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
n = 10
rarit_dataset_buffer = []
k=10
options_enabled = False

# Overview of the dataset attributes
- Test

In [13]:
col_name = "retriever"

#tokenizer = LlamaTokenizer.from_pretrained("../models/llama7b", device_map='cuda')
embedding = HuggingFaceBgeEmbeddings(model_name="../models/retriever/bge-base-en-v1.5", model_kwargs={"device": "cuda:1"})

# Create the retriever
client = QdrantClient(url="http://localhost:6333")
db = Qdrant(client, 
            collection_name=col_name,
            embeddings=embedding,
            )

seed = 4048


def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x)
    return e_x / e_x.sum()

def search(query: str, k: int = 3):
    success = False
    while not success:
        try:
            results = db.similarity_search_with_score(query, k=k)
            if results:
                success = True 
        except:
            print(f"Error with example {query}, retrying in 0.2s")
            time.sleep(0.2)
    return results

# This function creates an example with the query and the prediction and the top k results
def make_example(query: str, ground_truth:str, dataset_name:str, context = None, example_id = None, k: int = 3, split = "llm", retrieval = True, task="", domain=""):
    contexts = []
    if retrieval:
        # Search for the query
        results = search(query, k=k)

        # Get the softmax of the scores
        retriever_softmax = softmax([result[1] for result in results])
        context_texts = [llm.format_prompt(query, result[0].page_content) for result in results]
        llm_scores = llm.to_tokens_and_logprobs(context_texts, [ground_truth] * k)[1]
        llm_softmax = softmax(llm_scores)

        # Get the text of the results
        contexts = [{
            "text": result[0].page_content, 
            "src": result[0].metadata["src"] if "src" in result[0].metadata.keys() else "unknown", 
            "id": result[0].metadata["id"] if "id" in result[0].metadata.keys() else result[0].metadata["title"],
            "retriever_score": result[1],
            "llm_score": lscore,
            "retriever_softmax": rsoft, 
            "llm_softmax": lsoft, 
            "llm_weighted_softmax": rsoft * lsoft, 
            "original_context": False } 
            for result, rsoft, lsoft, lscore in zip(results, retriever_softmax, llm_softmax, llm_scores)]
    
    # Add the original context
    if context:
        contexts.append({
            "text": context, 
            "src": dataset_name, 
            "id": str(example_id), 
            "original_context": True
            })

    return { 
        "split": split, 
        "query": query, 
        "ground_truth": ground_truth, 
        "contexts": contexts,
        "src": dataset_name, 
        "id": str(example_id),
        "task": task,
        "domain": domain, 
        }

itrf_dataset_buffer = []
itrf = DataFrame(columns=["split", "query", "prediction", "context", "src", "id", "context_src", "context_id", "original_context", "task", "domain"])

def make_examples(query: str, prediction:str, dataset_name:str, context = None, example_id = None, k: int = 3, retrieval = True, task="", domain=""):
    examples = []
    ex = make_example(query, prediction, dataset_name, context, example_id, k, retrieval=retrieval, task=task, domain=domain)
    for c in ex["contexts"]:
        examples.append({ 
            "split": ex["split"], 
            "query": ex["query"], 
            "ground_truth": ex["ground_truth"], 

            "retriever_score": c["retriever_score"],
            "llm_score": c["llm_score"],
            "retriever_softmax": c["retriever_softmax"], 
            "llm_softmax": c["llm_softmax"], 
            "llm_weighted_softmax": c["llm_weighted_softmax"], 

            "context": c["text"], 
            "src": ex["src"], 
            "id": str(ex["id"]), 
            "context_src": c["src"], 
            "context_id": str(c["id"]), 
            "original_context": c["original_context"],
            "task": task,
            "domain": domain,
            })
    return examples

def save_example(i, start, last_time, example, dname, force=False):
    save_examples(i, start, last_time, [example], dname, force=force)

def save_examples(i, start, last_time, examples, dname, force=False):
    global itrf_dataset_buffer
    global itrf
    # Save the dataset to a file
    itrf_dataset_buffer.extend(examples)
    
    if i % 100 == 0 or force:
        current_time = time.time()
        print(f"Processed {i} {dname} examples, time: {str(timedelta(seconds=(last_time - start)))}, last 100 in {str(timedelta(seconds=(current_time - last_time)))}")
        last_time = current_time
        if len(itrf_dataset_buffer) > 0:
            if itrf.empty:
                itrf = DataFrame(itrf_dataset_buffer)
            else:
                df = DataFrame(itrf_dataset_buffer)
                itrf = pd.concat([itrf, df])
            itrf_dataset_buffer.clear()
        itrf.to_parquet("../data/dataset/itrf_dataset_reranker.parquet")
        itrf.to_csv("../data/dataset/itrf_dataset_reranker.csv")

# Building the dataset

## Open domain questioning

In [11]:
datasets_openqa = ["tau/commonsense_qa", "math_qa", "web_questions", "wiki_qa", "yahoo_answers_qa", "freebase_qa", "ms_marco"]

#### tau/commonsense_qa

In [None]:
# def make_examples(query: str, prediction:str, dataset_name:str, context = None, example_id = None, k: int = 3, retrieval = True)
dname = datasets_openqa[0]
dataset = load_dataset(dname, split="train")
shuffled = iter(dataset.shuffle(seed=seed))
start = time.time()
last_time = start
for i in range(n):
    example = next(shuffled)

    options = ""
    if options_enabled:
        options = "\n Options: "
    answer = -1
    for idx, o in enumerate(example["choices"]["label"]):
        if options_enabled:
            options += f"{o}: {example['choices']['text'][idx]}, "
        if example["answerKey"] == o:
            answer = idx
    if options_enabled: 
        options += "\n"

    query = example["question"] + options
    if options_enabled:
        prediction = f"{example['answerKey']}) {example['choices']['text'][answer]}"
    else:
        prediction =  f"{example['choices']['text'][answer]}"
    example_id = example["id"] + "_" + example["question_concept"]
    examples = make_examples(query, prediction, dname, example_id=example_id, k=k, retrieval=True, task="mc", domain="openqa")
    
    save_examples(i, start, last_time, examples, dname)

itrf_dataset_buffer

In [9]:
len(itrf_dataset_buffer)

90

#### FreebaseQA

In [26]:
dname = datasets_openqa[5]
dataset = load_dataset(dname, split="train")

dataset[234]["Question-ID"]

'FreebaseQA-train-234'

In [30]:
dname = datasets_openqa[5]
dataset = load_dataset(dname, split="train")
shuffled = iter(dataset.shuffle(seed=2024))
start = time.time()
last_time = start
for i in range(n):
    example = next(shuffled)

    query = example["RawQuestion"]
    prediction =  example['Parses']["Answers"][0]["AnswersName"][0][0]
    example_id = example["Question-ID"]
    examples = make_examples(query, prediction, dname, example_id=example_id, k=k, retrieval=True, task="qa", domain="openqa")
    
    save_examples(i, start, last_time, examples, dname)

itrf_dataset_buffer

Error with example Who played Ray Doyle, John Deed and George Gently on TV?, retrying in 0.2s
<s>Background: Martin Shaw (born 21 January 1945) is an English actor. He is known for his roles in the television series The Professionals, The Chief, Judge John Deed and Inspector George Gently. He has also acted on stage and in film, and has narrated numerous audiobooks and presented various television series, including the 2006 series Martin Shaw: Aviators.

[INST]Who played Ray Doyle, John Deed and George Gently on TV?[/INST][ANS] martin shaw
<s>Background: Supt Pullman and Co. but in doing so I fell into the clutches of Judge John Deeds. This brilliant piece of television finds Martin Shaw in the title role portraying a judge called John Deeds. Then there’s “Inspector George Gently” in which Martin Shaw portrays a policeman called Inspector George Gently. This is set in the 1960s which gives chance after chance to look for continuity errors and stare longingly at old cars.I added to this

[{'split': 'llm',
  'query': 'Who played Thelma in The Likely Lads?',
  'ground_truth': 'brigit forsyth',
  'retriever_score': 0.7013439,
  'llm_score': 2.616742543426308e-16,
  'retriever_softmax': 0.10332175430351814,
  'llm_softmax': 0.098233154677034,
  'llm_weighted_softmax': 0.010149621872000001,
  'context': 'Brigit Forsyth (born 28 July 1940 in Edinburgh) is a Scottish actress, best known for her roles as Thelma Ferris in the BBC comedy Whatever Happened to the Likely Lads? and Helen Yeldham in the ITV drama Boon. Since December 2013, Forsyth has appeared in the BBC comedy Still Open All Hours.',
  'src': 'freebase_qa',
  'id': 'FreebaseQA-train-11754',
  'context_src': 'wiki',
  'context_id': 'Brigit Forsyth',
  'original_context': False,
  'task': 'qa',
  'domain': 'openqa'},
 {'split': 'llm',
  'query': 'Who played Thelma in The Likely Lads?',
  'ground_truth': 'brigit forsyth',
  'retriever_score': 0.69273615,
  'llm_score': 3.914847119897576e-09,
  'retriever_softmax': 0.1

#### MS Marco

In [36]:
dname = datasets_openqa[6]
dataset = load_dataset(dname, 'v2.1', split="train")
shuffled = iter(dataset.shuffle(seed=2024))
start = time.time()
last_time = start
for i in range(n):
    example = next(shuffled)

    query = example["query"]
    prediction =  example['answers'][0]
    example_id = dname + example["query_type"] + str(example["query_id"])

    context_id = example["passages"]["is_selected"].index(1)
    context = example["passages"]["passage_text"][context_id]

    examples = make_examples(query, prediction, dname, example_id=example_id, context=context, k=k, retrieval=True, task="qa", domain="openqa")
    
    save_examples(i, start, last_time, examples, dname)

itrf_dataset_buffer

<s>Background: The difference between a vegetable and a fruit is that vegetables are the edible portions of a plant, such as the leaves, stem, roots, tubers, bulbs and flowers, while a fruit is the mature ovary of a plant. Many plants that are considered fruits are botanically vegetables.
Some plants commonly known as vegetables, such as tomatoes, squash, pepper and eggplant, are botanically fruits. Fruits don't have to be sweet. Fruits provide protection for seeds in many plants and aid in the distribution of seeds.

[INST]what is a fruit and what is a vegetable[/INST][ANS] A fruit is the mature ovary of a seed plant, usually developed from a flower. A vegetable is a plant or that part of a plant which is edible, and does not necessarily have a role in the plant's reproductive cycle.
<s>Background: If by this you mean what fruits are customarily called vegetables, two of the most common are the tomato and the cucumber .
A vegetable is any edible part of a plant, and can include roots,

ValueError: 1 is not in list

## Deprecated

In [3]:
loftq_config = LoftQConfig(loftq_bits=4) # set 8bit quantization
lora_config = LoraConfig(
          r=16,
          lora_alpha=32,
          target_modules="all-linear",
          lora_dropout=0.05,
          bias="none",
        #   init_lora_weights="loftq", 
        #   loftq_config=loftq_config,
          task_type="CAUSAL_LM"
      )

load_dotenv("../.env")

True

In [4]:
model = PeftModel.from_pretrained(model, adapter_path, lora_config)

ValueError: Can't find 'adapter_config.json' at '../models/rarit/7b_4bit_init-causal'

In [9]:
adapter_path = "../models/rarit/7b_4bit_init"
model.load_adapter(adapter_path, adapter_name, is_trainable=False)

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '../models/rarit/7b_4bit_init'. Use `repo_type` argument if needed.

In [10]:
AutoModelForCausalLM.from_pretrained(adapter_path, )

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]


TypeError: PeftConfig.__init__() got an unexpected keyword argument 'attention_bias'