Version 5 of reproduction of the following paper : Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks

What is added from V4 : 
- Based on LLMware, but applied to TriviaQA --> application to something else so that the learning sticks. 

IMPORTS

In [1]:
import os
import re
from llmware.library import Library
from llmware.retrieval import Query
from llmware.setup import Setup
from llmware.status import Status
from llmware.models import ModelCatalog
from llmware.configs import LLMWareConfig, MilvusConfig

import time
from llmware.prompts import Prompt, HumanInTheLoop
from llmware.models import ModelCatalog

from importlib import util

from datasets import Dataset, load_dataset


  from .autonotebook import tqdm as notebook_tqdm


LIBRARY : import documents

In [None]:
#IMPORT TRIVIAQA dataset

# Load TriviaQA (unfiltered for simplicity)
dataset = load_dataset("trivia_qa", "unfiltered")

# Take first 50 examples for quick testing
test_set = dataset["train"].select(range(50))

# List of the features of the dataset
features = dataset["train"].features; features

{'question': Value('string'),
 'question_id': Value('string'),
 'question_source': Value('string'),
 'entity_pages': {'doc_source': List(Value('string')),
  'filename': List(Value('string')),
  'title': List(Value('string')),
  'wiki_context': List(Value('string'))},
 'search_results': {'description': List(Value('string')),
  'filename': List(Value('string')),
  'rank': List(Value('int32')),
  'title': List(Value('string')),
  'url': List(Value('string')),
  'search_context': List(Value('string'))},
 'answer': {'aliases': List(Value('string')),
  'normalized_aliases': List(Value('string')),
  'matched_wiki_entity_name': Value('string'),
  'normalized_matched_wiki_entity_name': Value('string'),
  'normalized_value': Value('string'),
  'type': Value('string'),
  'value': Value('string')}}

In [None]:
#PRINT ONE QUESTION AND ANSWER

digit = 77

example = dataset["train"][digit]

 

# Print the question and its answer
print("Question:", example["question"])
print("Answer:", example["answer"]["value"])

Question: 1998 was the Chinese year of which creature?
Answer: Tiger


In [4]:
#FUNCTION THAT WILL CREATE A LIBRARY

def create_library(library_name):

    print (f"\n > Creating library '{library_name}'...")

    library = Library().create_new_library(library_name)

    return library

In [None]:
# CREATE LIBRARY

library_name = "RAG_V5_Lib"
library = create_library(library_name)


 > Creating library 'RAG_V5_Lib'...


In [59]:
# FUNCTION TO PARSE THE TRIVIAQA DATASET AND ADD THEM TO THE LIBRARY

def dataset_to_file(dataset):
    print("\n > Transferring dataset to file ...")

    # Extract text from dataset (e.g., questions and answers)
    texts = []

    # Create the folder if it doesn't exist
    os.makedirs("Data", exist_ok=True)


    for i in range(len(dataset["train"][1:10])):  # Limiting to first 10 for demonstration
        question = dataset["train"]["question"][i]
        answer = dataset["train"]["answer"][i]
        combined_text = f"Q: {question}\nA: {answer}"
        #texts.append(combined_text)

        # Define the full path to the file
        file_path = os.path.join("Data", f"TriviaQA_{i}.txt")

        # Write the combined text to a file
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(combined_text)
            print(f"File saved as {file_path}")

def parse_files(library, data_path):
    print (f"\n > Parsing and adding dataset to library ...")
    library.add_files(input_folder_path=data_path, chunk_size=400, max_chunk_size=800, smart_chunking=2)

In [60]:
 #APPLY THE FUNCTION
dataset_to_file(dataset)
parse_files(library, "Data")



 > Transferring dataset to file ...
File saved as Data\TriviaQA_0.txt
File saved as Data\TriviaQA_1.txt
File saved as Data\TriviaQA_2.txt
File saved as Data\TriviaQA_3.txt
File saved as Data\TriviaQA_4.txt
File saved as Data\TriviaQA_5.txt

 > Parsing and adding dataset to library ...


[37mINFO: update:  Duplicate files (skipped): 0[39m
[37mINFO: update:  Total uploaded: 6[39m
[37mINFO: Parser - parse_text file - processing - TriviaQA_0.txt[39m
[37mINFO: Parser - parse_text file - processing - TriviaQA_1.txt[39m
[37mINFO: Parser - parse_text file - processing - TriviaQA_2.txt[39m
[37mINFO: Parser - parse_text file - processing - TriviaQA_3.txt[39m
[37mINFO: Parser - parse_text file - processing - TriviaQA_4.txt[39m
[37mINFO: Parser - parse_text file - processing - TriviaQA_5.txt[39m


RETRIEVER EMBEDDINGS : Turn text into numbers

In [61]:
# FUNCTION TO MAKE EMBEDDINGS

def make_embeddings(embedding_model_name, vector_db):
    print("Generating Embeddings in {} db - with Model- {}".format(vector_db, embedding_model_name))
    LLMWareConfig().set_active_db("sqlite")
    MilvusConfig().set_config("lite", True)
    LLMWareConfig().set_vector_db(vector_db)
    library.install_new_embedding(embedding_model_name=embedding_model_name, vector_db=vector_db, batch_size=200)

In [62]:
# LIST OF THE EMBEDDING MODELS AVAILABLE
embedding_models = ModelCatalog().list_embedding_models()
model_names = [model['model_name'] for model in embedding_models]; model_names

['all-MiniLM-L6-v2',
 'all-mpnet-base-v2',
 'industry-bert-insurance',
 'industry-bert-contracts',
 'industry-bert-asset-management',
 'industry-bert-sec',
 'industry-bert-loans',
 'nomic-ai/nomic-embed-text-v1',
 'jinaai/jina-embeddings-v2-base-en',
 'jinaai/jina-embeddings-v2-small-en',
 'BAAI/bge-small-en-v1.5',
 'BAAI/bge-large-en-v1.5',
 'BAAI/bge-base-en-v1.5',
 'thenlper/gte-small',
 'thenlper/gte-base',
 'thenlper/gte-large',
 'llmrails/ember-v1',
 'WhereIsAI/UAE-Large-V1',
 'text-embedding-ada-002',
 'text-embedding-3-small',
 'text-embedding-3-large',
 'medium',
 'xlarge',
 'embed-english-v3.0',
 'embed-multilingual-v3.0',
 'embed-english-light-v3.0',
 'embed-multilingual-light-v3.0',
 'embed-english-v2.0',
 'embed-english-light-v2.0',
 'embed-multilingual-v2.0',
 'textembedding-gecko@latest']

In [63]:
# MAKE EMBEDDINGS
embedding_model_name = "mini-lm-sbert"
vector_db = "faiss"
make_embeddings(embedding_model_name, vector_db)

Generating Embeddings in faiss db - with Model- mini-lm-sbert


[37mINFO: update: embedding_handler - FAISS - Embeddings Created: 7 of 7[39m
[37mINFO: update: EmbeddingHandler - FAISS - embedding_summary - {'embeddings_created': 7, 'embedded_blocks': 22, 'embedding_dims': 384, 'time_stamp': '2025-11-12_202054'}[39m


GENERATOR : PROMPTS

In [82]:
# FUNCTION TO MAKE GENERATOR MODEL
def generator(llm_model_name, digit):
    print("Loading model for LLM inference - ", llm_model_name)
    query = dataset["train"][digit]["question"]
    prompter = Prompt().load_model(llm_model_name, temperature=0.0, sample=False)
    results = Query(library).semantic_query(query, result_count=80, embedding_distance_threshold=1.0)

    # for each document in the library, we will run a query and look at the results

    for i, contract in enumerate(os.listdir("Data")):
        qr = []
        if contract != ".DS_Store":
            print("\nContract Name: ", i, contract)

            #   we will look through the list of semantic query results, and pull the top results for each file
            for j, entries in enumerate(results):
                library_fn = entries["file_source"]

                if os.sep in library_fn:
                    # handles difference in windows file formats vs. mac / linux
                    library_fn = library_fn.split(os.sep)[-1]

                if library_fn == contract:
                   # print("Top Retrieval: ", j, entries["distance"], entries["text"])
                    qr.append(entries)

            #   we will add the query results to the prompt
            source = prompter.add_source_query_results(query_results=qr)

            #   run the prompt
            response = prompter.prompt_with_source(query, prompt_name="default_with_context")

            #   note: prompt_with_resource returns a list of dictionary responses
            #   -- depending upon the size of the source context, it may call the llm several times
            #   -- each dict entry represents 1 call to the LLM

            #   post processing fact checking
            answer = dataset["train"][digit]["answer"]["value"]

            print("\nupdate: llm answer - ", response[i]["llm_response"])
            print("update: Right answer - ", answer[i])

            # start fresh for next document
            prompter.clear_source_materials()

            # Save jsonl report with full transaction history to /prompt_history folder
            print("\nupdate: Prompt state saved at: ", os.path.join(LLMWareConfig.get_prompt_path(),prompter.prompt_id))

            prompter.save_state()

            # Generate CSV report for easy Human review in Excel
            csv_output = HumanInTheLoop(prompter).export_current_interaction_to_csv()
            print("\nupdate: CSV output for human review - ", csv_output)

In [83]:

# LIST OF THE EMBEDDING MODELS AVAILABLE
generative_models = ModelCatalog().list_generative_models()
gen_model_names = [model['model_name'] for model in generative_models]; gen_model_names

['Meta-Llama-3-8B',
 'Meta-Llama-3-8B-Instruct',
 'QuantFactory/Meta-Llama-3-8B-GGUF',
 'QuantFactory/Meta-Llama-3-8B-Instruct-GGUF',
 'TheBloke/Llama-2-7B-Chat-GGUF',
 'TheBloke/OpenHermes-2.5-Mistral-7B-GGUF',
 'TheBloke/Starling-LM-7B-alpha-GGUF',
 'TheBloke/zephyr-7B-beta-GGUF',
 'bartowski/Meta-Llama-3-8B-Instruct-GGUF',
 'bling-answer-tool',
 'bling-phi-2-gguf',
 'bling-phi-3-gguf',
 'bling-phi-3-onnx',
 'bling-phi-3-ov',
 'bling-phi-3.5-gguf',
 'bling-qwen-0.5b-gguf',
 'bling-qwen-1.5b-gguf',
 'bling-qwen-1.5b-ov',
 'bling-qwen-500m-ov',
 'bling-stablelm-3b-tool',
 'bling-tiny-llama-onnx',
 'bling-tiny-llama-ov',
 'chat-bison@001',
 'claude-2.0',
 'claude-2.1',
 'claude-3-5-haiku-20241022',
 'claude-3-5-sonnet-20240620',
 'claude-3-7-sonnet-20250219',
 'claude-3-haiku-20240307',
 'claude-3-opus-20240229',
 'claude-3-sonnet-20240229',
 'claude-instant-v1',
 'claude-v1',
 'codegemma-7b-it-ov',
 'command-medium-nightly',
 'command-xlarge-nightly',
 'deepseek-qwen-14b-gguf',
 'deeps

In [84]:
llm_model_name = "bling-phi-3-gguf"
generator(llm_model_name, 1)

Loading model for LLM inference -  bling-phi-3-gguf

Contract Name:  0 TriviaQA_0.txt

update: llm answer -  Not Found.
update: Right answer -  S

update: Prompt state saved at:  C:\Users\xavie\llmware_data\prompt_history\296c13dc-d7c7-444e-a087-7b7df00314ec

update: CSV output for human review -  {'report_name': 'interaction_report_2025-11-12_203431.csv', 'report_fp': 'C:\\Users\\xavie\\llmware_data\\prompt_history\\interaction_report_2025-11-12_203431.csv', 'results': 1}

Contract Name:  1 TriviaQA_1.txt


IndexError: list index out of range

In [68]:
# FUNCTION TO MAKE GENERATOR MODEL
def generator(llm_model_name, digit):
    print("Loading model for LLM inference - ", llm_model_name)
    query = dataset["train"][digit]["question"]
    prompter = Prompt().load_model(llm_model_name, temperature=0.0, sample=False)
    results = Query(library).semantic_query(query, result_count=80, embedding_distance_threshold=1.0)

    # for each document in the library, we will run a query and look at the results

    for i, contract in enumerate([1]):
        qr = []
        if contract != ".DS_Store":
            print("\nContract Name: ", i, contract)

            #   we will look through the list of semantic query results, and pull the top results for each file
            for j, entries in enumerate(results):
                library_fn = entries["file_source"]

                if os.sep in library_fn:
                    # handles difference in windows file formats vs. mac / linux
                    library_fn = library_fn.split(os.sep)[-1]

                if library_fn == contract:
                   # print("Top Retrieval: ", j, entries["distance"], entries["text"])
                    qr.append(entries)

            #   we will add the query results to the prompt
            source = prompter.add_source_query_results(query_results=qr)

            #   run the prompt
            response = prompter.prompt_with_source(query, prompt_name="default_with_context")

            return response

In [72]:
response = generator(llm_model_name, 1); response

Loading model for LLM inference -  bling-phi-3-gguf





Contract Name:  0 1


[{'llm_response': 'Sinclair Lewis.',
  'prompt': 'Which American-born Sinclair won the Nobel Prize for Literature in 1930?',
  'evidence': '',
  'instruction': 'default_with_context',
  'model': 'bling-phi-3-gguf',
  'usage': {'input': 30,
   'output': 6,
   'total': 36,
   'metric': 'tokens',
   'processing_time': 2.619739055633545},
  'time_stamp': '2025-11-12_202951',
  'calling_app_ID': '',
  'rating': '',
  'account_name': 'llmware',
  'prompt_id': 0,
  'batch_id': 0,
  'evidence_metadata': [{'evidence_start_char': 0,
    'evidence_stop_char': 0,
    'page_num': 'NA',
    'source_name': 'NA',
    'doc_id': 'NA',
    'block_id': 'NA'}]}]

In [75]:
response[0]["llm_response"]

'Sinclair Lewis.'