Version 5 of reproduction of the following paper : Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks

What is added from V4 : 
- Based on LLMware, but applied to TriviaQA --> application to something else so that the learning sticks. 

IMPORTS

In [4]:
import os
import re
from llmware.library import Library
from llmware.retrieval import Query
from llmware.setup import Setup
from llmware.status import Status
from llmware.models import ModelCatalog
from llmware.configs import LLMWareConfig, MilvusConfig

import time
from llmware.prompts import Prompt, HumanInTheLoop
from llmware.models import ModelCatalog

from importlib import util

from datasets import Dataset, load_dataset


  from .autonotebook import tqdm as notebook_tqdm


LIBRARY : import documents

In [5]:
#IMPORT TRIVIAQA dataset

# Load TriviaQA (unfiltered for simplicity)
dataset = load_dataset("trivia_qa", "unfiltered")

# Take first 50 examples for quick testing
test_set = dataset["train"].select(range(50))

# List of the features of the dataset
features = dataset["train"].features; features

{'question': Value('string'),
 'question_id': Value('string'),
 'question_source': Value('string'),
 'entity_pages': {'doc_source': List(Value('string')),
  'filename': List(Value('string')),
  'title': List(Value('string')),
  'wiki_context': List(Value('string'))},
 'search_results': {'description': List(Value('string')),
  'filename': List(Value('string')),
  'rank': List(Value('int32')),
  'title': List(Value('string')),
  'url': List(Value('string')),
  'search_context': List(Value('string'))},
 'answer': {'aliases': List(Value('string')),
  'normalized_aliases': List(Value('string')),
  'matched_wiki_entity_name': Value('string'),
  'normalized_matched_wiki_entity_name': Value('string'),
  'normalized_value': Value('string'),
  'type': Value('string'),
  'value': Value('string')}}

In [6]:
#PRINT ONE QUESTION AND ANSWER
digit = 77
example = dataset["train"][digit]

# Print the question and its answer
print("Question:", example["question"])
print("Answer:", example["answer"]["value"])

Question: 1998 was the Chinese year of which creature?
Answer: Tiger


In [7]:
#FUNCTION THAT WILL CREATE A LIBRARY

def create_library(library_name):

    print (f"\n > Creating library '{library_name}'...")

    library = Library().create_new_library(library_name)

    return library

In [8]:
# CREATE LIBRARY

library_name = "RAG_V5_Lib"
library = create_library(library_name)


 > Creating library 'RAG_V5_Lib'...


In [9]:
# FUNCTION TO PARSE THE TRIVIAQA DATASET AND ADD THEM TO THE LIBRARY

def dataset_to_file(dataset):
    print("\n > Transferring dataset to file ...")

    # Extract text from dataset (e.g., questions and answers)
    texts = []

    # Create the folder if it doesn't exist
    os.makedirs("Data", exist_ok=True)


    for i in range(len(dataset["train"][1:10])):  # Limiting to first 10 for demonstration
        question = dataset["train"]["question"][i]
        answer = dataset["train"]["answer"][i]
        combined_text = f"Q: {question}\nA: {answer}"
        #texts.append(combined_text)

        # Define the full path to the file
        file_path = os.path.join("Data", f"TriviaQA_{i}.txt")

        # Write the combined text to a file
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(combined_text)
            print(f"File saved as {file_path}")

def parse_files(library, data_path):
    print (f"\n > Parsing and adding dataset to library ...")
    library.add_files(input_folder_path=data_path, chunk_size=400, max_chunk_size=800, smart_chunking=2)

In [10]:
 #APPLY THE FUNCTION
dataset_to_file(dataset)
parse_files(library, "Data")



 > Transferring dataset to file ...
File saved as Data\TriviaQA_0.txt
File saved as Data\TriviaQA_1.txt
File saved as Data\TriviaQA_2.txt
File saved as Data\TriviaQA_3.txt
File saved as Data\TriviaQA_4.txt
File saved as Data\TriviaQA_5.txt

 > Parsing and adding dataset to library ...


[37mINFO: update:  Duplicate files (skipped): 6[39m
[37mINFO: update:  Total uploaded: 0[39m


RETRIEVER EMBEDDINGS : Turn text into numbers

In [11]:
# FUNCTION TO MAKE EMBEDDINGS

def make_embeddings(embedding_model_name, vector_db):
    print("Generating Embeddings in {} db - with Model- {}".format(vector_db, embedding_model_name))
    LLMWareConfig().set_active_db("sqlite")
    MilvusConfig().set_config("lite", True)
    LLMWareConfig().set_vector_db(vector_db)
    library.install_new_embedding(embedding_model_name=embedding_model_name, vector_db=vector_db, batch_size=200)

In [12]:
# LIST OF THE EMBEDDING MODELS AVAILABLE
embedding_models = ModelCatalog().list_embedding_models()
model_names = [model['model_name'] for model in embedding_models]; model_names

['all-MiniLM-L6-v2',
 'all-mpnet-base-v2',
 'industry-bert-insurance',
 'industry-bert-contracts',
 'industry-bert-asset-management',
 'industry-bert-sec',
 'industry-bert-loans',
 'nomic-ai/nomic-embed-text-v1',
 'jinaai/jina-embeddings-v2-base-en',
 'jinaai/jina-embeddings-v2-small-en',
 'BAAI/bge-small-en-v1.5',
 'BAAI/bge-large-en-v1.5',
 'BAAI/bge-base-en-v1.5',
 'thenlper/gte-small',
 'thenlper/gte-base',
 'thenlper/gte-large',
 'llmrails/ember-v1',
 'WhereIsAI/UAE-Large-V1',
 'text-embedding-ada-002',
 'text-embedding-3-small',
 'text-embedding-3-large',
 'medium',
 'xlarge',
 'embed-english-v3.0',
 'embed-multilingual-v3.0',
 'embed-english-light-v3.0',
 'embed-multilingual-light-v3.0',
 'embed-english-v2.0',
 'embed-english-light-v2.0',
 'embed-multilingual-v2.0',
 'textembedding-gecko@latest']

In [13]:
# MAKE EMBEDDINGS
embedding_model_name = "mini-lm-sbert"
vector_db = "faiss"
make_embeddings(embedding_model_name, vector_db)

Generating Embeddings in faiss db - with Model- mini-lm-sbert


[37mINFO: update: EmbeddingHandler - FAISS - embedding_summary - {'embeddings_created': 0, 'embedded_blocks': 22, 'embedding_dims': 384, 'time_stamp': '2025-11-12_210755'}[39m


GENERATOR : PROMPTS

In [17]:
def generator(llm_model_name, digit):
    print("Loading model for LLM inference -", llm_model_name)
    query = dataset["train"][digit]["question"]
    prompter = Prompt().load_model(llm_model_name, temperature=0.0, sample=False)
    results = Query(library).semantic_query(query, result_count=80, embedding_distance_threshold=1.0)

    # iterate files in Data folder
    for idx, contract in enumerate(sorted(os.listdir("Data"))):
        if contract == ".DS_Store":
            continue

        print(f"\nContract {idx}: {contract}")
        qr = []

        # collect top retrievals that come from this contract
        for entry in results:
            library_fn = entry.get("file_source", "")
            # normalize file name split on os.sep to be robust
            if os.sep in library_fn:
                library_fn = library_fn.split(os.sep)[-1]
            if library_fn == contract:
                qr.append(entry)

        print(f"  - Retrieved {len(qr)} items for file {contract}")

        # skip if nothing was retrieved for this contract
        if not qr:
            print(f"  -> No retrievals for {contract}, skipping LLM call.")
            continue

        # add the query results as source/context to the prompter
        prompter.add_source_query_results(query_results=qr)

        # call the model (may return multiple call dicts)
        try:
            responses = prompter.prompt_with_source(query, prompt_name="default_with_context")
        except Exception as e:
            print(f"  !! LLM call failed for {contract}: {e}")
            prompter.clear_source_materials()
            continue

        # responses is a list of dicts; combine all llm_response parts into one string
        llm_text_parts = []
        for r in responses:
            # adapt to actual key names if different
            llm_resp = r.get("llm_response") or r.get("response") or r.get("text") or ""
            llm_text_parts.append(llm_resp.strip())

        llm_text = "\n\n".join([p for p in llm_text_parts if p])

        # get ground-truth answer (robust to different shapes)
        answer_obj = dataset["train"][digit].get("answer")
        if isinstance(answer_obj, dict):
            answer_val = answer_obj.get("value", "")
        else:
            answer_val = answer_obj

        # if answer_val is a list, join it; else leave as string
        if isinstance(answer_val, (list, tuple)):
            answer_text = " ||| ".join(map(str, answer_val))
        else:
            answer_text = str(answer_val)

        # Print the LLM answer and ground truth
        print(f"\n  LLM answer (file {contract}):\n{llm_text}\n")
        print(f"  Ground-truth answer:\n{answer_text}\n")

        # start fresh for next document
        prompter.clear_source_materials()

        # Save jsonl report with full transaction history to /prompt_history folder
        print("\nupdate: Prompt state saved at: ", os.path.join(LLMWareConfig.get_prompt_path(),prompter.prompt_id))

        prompter.save_state()

        # Generate CSV report for easy Human review in Excel
        csv_output = HumanInTheLoop(prompter).export_current_interaction_to_csv()

        print("\nupdate: CSV output for human review - ", csv_output)

    print("\nGenerator completed.")


In [18]:

# LIST OF THE EMBEDDING MODELS AVAILABLE
generative_models = ModelCatalog().list_generative_models()
gen_model_names = [model['model_name'] for model in generative_models]; gen_model_names

['Meta-Llama-3-8B',
 'Meta-Llama-3-8B-Instruct',
 'QuantFactory/Meta-Llama-3-8B-GGUF',
 'QuantFactory/Meta-Llama-3-8B-Instruct-GGUF',
 'TheBloke/Llama-2-7B-Chat-GGUF',
 'TheBloke/OpenHermes-2.5-Mistral-7B-GGUF',
 'TheBloke/Starling-LM-7B-alpha-GGUF',
 'TheBloke/zephyr-7B-beta-GGUF',
 'bartowski/Meta-Llama-3-8B-Instruct-GGUF',
 'bling-answer-tool',
 'bling-phi-2-gguf',
 'bling-phi-3-gguf',
 'bling-phi-3-onnx',
 'bling-phi-3-ov',
 'bling-phi-3.5-gguf',
 'bling-qwen-0.5b-gguf',
 'bling-qwen-1.5b-gguf',
 'bling-qwen-1.5b-ov',
 'bling-qwen-500m-ov',
 'bling-stablelm-3b-tool',
 'bling-tiny-llama-onnx',
 'bling-tiny-llama-ov',
 'chat-bison@001',
 'claude-2.0',
 'claude-2.1',
 'claude-3-5-haiku-20241022',
 'claude-3-5-sonnet-20240620',
 'claude-3-7-sonnet-20250219',
 'claude-3-haiku-20240307',
 'claude-3-opus-20240229',
 'claude-3-sonnet-20240229',
 'claude-instant-v1',
 'claude-v1',
 'codegemma-7b-it-ov',
 'command-medium-nightly',
 'command-xlarge-nightly',
 'deepseek-qwen-14b-gguf',
 'deeps

In [19]:
llm_model_name = "bling-phi-3-gguf"
generator(llm_model_name, 1)

Loading model for LLM inference - bling-phi-3-gguf

Contract 0: TriviaQA_0.txt
  - Retrieved 2 items for file TriviaQA_0.txt

  LLM answer (file TriviaQA_0.txt):
Not Found.

  Ground-truth answer:
Sinclair Lewis


update: Prompt state saved at:  C:\Users\xavie\llmware_data\prompt_history\093e6a50-e0d6-490b-83e3-dcf6a27971bc

update: CSV output for human review -  {'report_name': 'interaction_report_2025-11-12_211247.csv', 'report_fp': 'C:\\Users\\xavie\\llmware_data\\prompt_history\\interaction_report_2025-11-12_211247.csv', 'results': 1}

Contract 1: TriviaQA_1.txt
  - Retrieved 1 items for file TriviaQA_1.txt

  LLM answer (file TriviaQA_1.txt):
Sinclair Lewis

  Ground-truth answer:
Sinclair Lewis


update: Prompt state saved at:  C:\Users\xavie\llmware_data\prompt_history\093e6a50-e0d6-490b-83e3-dcf6a27971bc

update: CSV output for human review -  {'report_name': 'interaction_report_2025-11-12_211300.csv', 'report_fp': 'C:\\Users\\xavie\\llmware_data\\prompt_history\\interaction_re