Version 5 of a reproduction of the paper "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks".

What is new compared with V4:
- Based on LLMWare, but applied to TriviaQA, i.e. the same approach applied to a different dataset so that the learning sticks.

IMPORTS

In [1]:
import os
import re
from llmware.library import Library
from llmware.retrieval import Query
from llmware.setup import Setup
from llmware.status import Status
from llmware.models import ModelCatalog
from llmware.configs import LLMWareConfig, MilvusConfig

import time
from llmware.prompts import Prompt, HumanInTheLoop
from llmware.models import ModelCatalog

from importlib import util

from datasets import Dataset, load_dataset


  from .autonotebook import tqdm as notebook_tqdm


LIBRARY : import documents

In [2]:
# IMPORT THE TRIVIAQA DATASET

# Load TriviaQA ("unfiltered" configuration, for simplicity)
dataset = load_dataset("trivia_qa", "unfiltered")

# Keep the first 50 training examples as a small set for quick testing
test_set = dataset["train"].select(range(50))

# Show the schema of the dataset (last expression of the cell is displayed)
features = dataset["train"].features
features

{'question': Value('string'),
 'question_id': Value('string'),
 'question_source': Value('string'),
 'entity_pages': {'doc_source': List(Value('string')),
  'filename': List(Value('string')),
  'title': List(Value('string')),
  'wiki_context': List(Value('string'))},
 'search_results': {'description': List(Value('string')),
  'filename': List(Value('string')),
  'rank': List(Value('int32')),
  'title': List(Value('string')),
  'url': List(Value('string')),
  'search_context': List(Value('string'))},
 'answer': {'aliases': List(Value('string')),
  'normalized_aliases': List(Value('string')),
  'matched_wiki_entity_name': Value('string'),
  'normalized_matched_wiki_entity_name': Value('string'),
  'normalized_value': Value('string'),
  'type': Value('string'),
  'value': Value('string')}}

In [3]:
# PRINT ONE QUESTION AND ITS ANSWER

# Index of the training example to inspect (also passed to generator() later)
digit = 77
example = dataset["train"][digit]

# Pull out the question text and the reference answer, then show both
question, gold_answer = example["question"], example["answer"]["value"]
print("Question:", question)
print("Answer:", gold_answer)

Question: 1998 was the Chinese year of which creature?
Answer: Tiger


In [4]:
#FUNCTION THAT WILL CREATE A LIBRARY

def create_library(library_name):
    """Create a new llmware library named ``library_name`` and return it.

    A short progress message is printed before the call to
    ``Library().create_new_library``.
    """
    banner = f"\n > Creating library '{library_name}'..."
    print(banner)
    return Library().create_new_library(library_name)

In [5]:
# CREATE LIBRARY

# Name under which the library is created
library_name = "RAG_V5_Lib"

# Module-level handle; make_embeddings() and generator() read this global
library = create_library(library_name)


 > Creating library 'RAG_V5_Lib'...


In [25]:
# FUNCTION TO PARSE THE TRIVIAQA DATASET AND ADD THEM TO THE LIBRARY

def dataset_to_file(dataset, output_dir="Data", file_name="TriviaQA.txt"):
    """Flatten the train split into a single plain-text question/answer file.

    Every example contributes two lines, ``Q: <question>`` and
    ``A: <answer>``; all examples are joined with newlines and written to
    ``<output_dir>/<file_name>``.

    Args:
        dataset: Mapping with a ``"train"`` entry that exposes ``"question"``
            and ``"answer"`` columns (e.g. the object returned by
            ``load_dataset("trivia_qa", ...)``).
        output_dir: Folder that receives the file; created when missing.
        file_name: Name of the text file written inside ``output_dir``.
    """
    print("\n > Transferring dataset to file ...")

    train = dataset["train"]
    # Fetch each column once; indexing ``train["question"][i]`` inside the
    # loop would look the whole column up again on every iteration.
    questions = train["question"]
    answers = train["answer"]

    texts = []
    for question, answer in zip(questions, answers):
        # TriviaQA answers are dicts (aliases, type, value, ...). Keep only
        # the answer text so the file does not contain a Python dict repr.
        if isinstance(answer, dict):
            answer = answer.get("value", answer)
        texts.append(f"Q: {question}\nA: {answer}")

    # Create the folder if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, file_name)

    with open(file_path, "w", encoding="utf-8") as file:
        file.write("\n".join(texts))
    print(f"File saved as {file_path}")

def parse_files(library, data_path):
    """Add the files found in ``data_path`` to ``library``.

    Delegates to ``library.add_files`` with ``chunk_size=400``,
    ``max_chunk_size=800`` and ``smart_chunking=2``.
    """
    print("\n > Parsing and adding dataset to library ...")
    chunking_options = {
        "chunk_size": 400,
        "max_chunk_size": 800,
        "smart_chunking": 2,
    }
    library.add_files(input_folder_path=data_path, **chunking_options)

In [26]:
 #APPLY THE FUNCTION
# Write the dataset to Data/TriviaQA.txt, then add the "Data" folder to the library
dataset_to_file(dataset)
parse_files(library, "Data")



 > Transferring dataset to file ...


[37mINFO: update:  Duplicate files (skipped): 1[39m
[37mINFO: update:  Total uploaded: 0[39m


File saved as Data\TriviaQA.txt

 > Parsing and adding dataset to library ...


RETRIEVER EMBEDDINGS : Turn text into numbers

In [None]:
# FUNCTION TO MAKE EMBEDDINGS

def make_embeddings(embedding_model_name, vector_db):
    """Configure the llmware backends and embed the module-level ``library``.

    Sets "sqlite" as the active database, sets the Milvus "lite" option to
    True, registers ``vector_db`` as the vector database, then installs the
    embedding ``embedding_model_name`` on ``library`` with a batch size of 200.
    """
    banner = "Generating Embeddings in {} db - with Model- {}"
    print(banner.format(vector_db, embedding_model_name))

    # Backend configuration must be in place before the embedding is installed
    LLMWareConfig().set_active_db("sqlite")
    MilvusConfig().set_config("lite", True)
    LLMWareConfig().set_vector_db(vector_db)

    library.install_new_embedding(
        embedding_model_name=embedding_model_name,
        vector_db=vector_db,
        batch_size=200,
    )

In [None]:
# LIST THE AVAILABLE EMBEDDING MODELS
# Collect the catalog entries, keep only their names, and display them
embedding_models = ModelCatalog().list_embedding_models()
model_names = [entry["model_name"] for entry in embedding_models]
model_names

In [None]:
# MAKE EMBEDDINGS
# Embed the library with the "mini-lm-sbert" model, using "faiss" as the vector db
embedding_model_name = "mini-lm-sbert"
vector_db = "faiss"
make_embeddings(embedding_model_name, vector_db)

GENERATOR : PROMPTS

In [None]:
# FUNCTION TO MAKE GENERATOR MODEL
def generator(llm_model_name, digit, data_path="Data"):
    """Answer one TriviaQA question with retrieval-augmented generation.

    Takes example ``digit`` of the train split of the module-level
    ``dataset``, runs a semantic query against the module-level ``library``,
    and, for every file found in ``data_path``, passes the retrieved chunks
    that come from that file to the LLM as context. The model answer is
    printed next to the reference answer, the prompt state is saved, and a
    CSV report is exported for human review.

    Args:
        llm_model_name: Name of the generative model to load.
        digit: Index of the example in ``dataset["train"]`` to ask about.
        data_path: Folder containing the source files that were added to the
            library. Defaults to ``"Data"``, the folder used when parsing.
    """
    print("Loading model for LLM inference - ", llm_model_name)

    example = dataset["train"][digit]
    query = example["question"]
    # Reference answer, printed next to each LLM response below
    answer = example["answer"]["value"]

    prompter = Prompt().load_model(llm_model_name, temperature=0.0, sample=False)
    results = Query(library).semantic_query(query, result_count=80, embedding_distance_threshold=1.0)

    # For each document in the library, collect its retrieved chunks and prompt the LLM
    for i, source_file in enumerate(os.listdir(data_path)):
        if source_file == ".DS_Store":
            continue

        print("\nContract Name: ", i, source_file)

        # Keep only the semantic query results that come from this file
        qr = []
        for j, entry in enumerate(results):
            # file_source may carry a directory prefix; compare on the file name only
            library_fn = os.path.basename(entry["file_source"])

            if library_fn == source_file:
                print("Top Retrieval: ", j, entry["distance"], entry["text"])
                qr.append(entry)

        # Add the query results to the prompt as source material
        prompter.add_source_query_results(query_results=qr)

        # Run the prompt
        # note: prompt_with_source returns a list of dictionary responses
        # -- depending upon the size of the source context, it may call the llm several times
        # -- each dict entry represents 1 call to the LLM
        response = prompter.prompt_with_source(query, prompt_name="default_with_context")

        # Post processing: show each LLM answer next to the reference answer
        for resp in response:
            if "llm_response" in resp:
                print("\nupdate: llm answer - ", resp["llm_response"])
                print("update: Right answer - ", answer)

        # Start fresh for next document
        prompter.clear_source_materials()

        # Save jsonl report with full transaction history to /prompt_history folder
        print("\nupdate: Prompt state saved at: ", os.path.join(LLMWareConfig.get_prompt_path(), prompter.prompt_id))

        prompter.save_state()

        # Generate CSV report for easy Human review in Excel
        csv_output = HumanInTheLoop(prompter).export_current_interaction_to_csv()
        print("\nupdate: CSV output for human review - ", csv_output)

In [None]:

# LIST THE AVAILABLE GENERATIVE MODELS
# Collect the catalog entries, keep only their names, and display them
generative_models = ModelCatalog().list_generative_models()
gen_model_names = [entry["model_name"] for entry in generative_models]
gen_model_names

In [None]:
# Run the generator on the example selected earlier (`digit`) with the chosen LLM
llm_model_name = "bling-phi-3-gguf"
generator(llm_model_name, digit)