Version 4 of a reproduction of the following paper: Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks

What is new since V3:
- Based on LLMware, but entirely re-coded by hand from A to Z as a learning exercise

IMPORTS

In [1]:
import os
import re
from llmware.library import Library
from llmware.retrieval import Query
from llmware.setup import Setup
from llmware.status import Status
from llmware.models import ModelCatalog
from llmware.configs import LLMWareConfig, MilvusConfig

import time
from llmware.prompts import Prompt, HumanInTheLoop
from llmware.models import ModelCatalog

from importlib import util


LIBRARY : import documents

In [2]:
#FUNCTION THAT WILL CREATE A LIBRARY
def create_library(library_name):
    """Create an llmware library named ``library_name`` and return it.

    A progress line is printed first; the actual work is delegated to
    ``Library().create_new_library``.
    """
    print (f"\n > Creating library '{library_name}'...")
    return Library().create_new_library(library_name)


In [3]:
# CREATE LIBRARY
# `library` is kept at notebook level: make_embeddings() and generator()
# read it as a global rather than receiving it as an argument.
library_name = "RAG_V4_Lib"
library = create_library(library_name)


 > Creating library 'RAG_V4_Lib'...


In [4]:
# FUNCTION TO IMPORT SAMPLE FILES FOR TESTING 
def import_sample_files(folder):
    """Load the llmware sample files and return the path of one sub-folder.

    Args:
        folder: Name of the sub-folder of interest inside the sample files.

    Returns:
        The sample-files path returned by ``Setup().load_sample_files``
        joined with ``folder``.
    """
    # Sample files shipped with the llmware package (requested with over_write=True).
    sample_files_path = Setup().load_sample_files(over_write=True)
    # Folder holding the data we are interested in.
    selected_folder = os.path.join(sample_files_path, folder)
    print (f"\n > Loading the llmware sample files at: '{sample_files_path}'...")
    return selected_folder

# Another function, for when we want to use our own files
def import_files(folder):
    """Return the path of ``folder`` joined onto the current working directory.

    To use your own documents, pass the name of a folder in the working
    directory that contains them.
    """
    print (f"\n > Loading the files from folder: '{folder}'...")
    return os.path.join(os.getcwd(), folder)


In [5]:
# IMPORT SAMPLE FILES FOR TESTING
# "Agreements" is the sub-folder of the llmware sample files used in this notebook.
# `data_path` stays at notebook level: generator() lists this folder as a global.
folder = "Agreements"
data_path = import_sample_files(folder)

[37mINFO: Setup - sample_files - downloading requested sample files from AWS S3 bucket - may take a minute.[39m



 > Loading the llmware sample files at: 'C:\Users\xavie\llmware_data\sample_files'...


In [6]:
# Display the resolved folder path as a quick sanity check.
data_path

'C:\\Users\\xavie\\llmware_data\\sample_files\\Agreements'

In [7]:
# FUNCTION FOR PARSING THE FILES AND ADDING THEM TO THE LIBRARY

def parse_files(library, data_path, chunk_size=400, max_chunk_size=800, smart_chunking=2):
    """Parse the files found in ``data_path`` and add them to ``library``.

    Args:
        library: Object exposing ``add_files`` (the llmware library created
            earlier in the notebook).
        data_path: Folder whose files are handed to ``library.add_files``.
        chunk_size: Forwarded to ``add_files``; defaults to the value that
            was previously hard-coded (400).
        max_chunk_size: Forwarded to ``add_files``; previously hard-coded (800).
        smart_chunking: Forwarded to ``add_files``; previously hard-coded (2).

    Returns:
        None. The result of ``add_files`` is intentionally not returned so
        that calling this function as the last line of a cell displays nothing.
    """
    # The message now names the source folder (it used to read "from to library").
    print (f"\n > Parsing and adding files from '{data_path}' to library ...")
    library.add_files(
        input_folder_path=data_path,
        chunk_size=chunk_size,
        max_chunk_size=max_chunk_size,
        smart_chunking=smart_chunking,
    )
    

In [8]:
# PARSE THE FILES AND ADD THEM TO THE LIBRARY
# NOTE(review): the recorded output reports 15 duplicate files skipped and 0 uploaded,
# presumably because the library already held them from an earlier run - confirm.
parse_files(library, data_path)

[37mINFO: update:  Duplicate files (skipped): 15[39m
[37mINFO: update:  Total uploaded: 0[39m



 > Parsing and adding files from to library ...


RETRIEVER EMBEDDINGS : Turn text into numbers

In [9]:
# FUNCTION TO MAKE EMBEDDINGS
def make_embeddings(embedding_model_name, vector_db, target_library=None, batch_size=200):
    """Install a new embedding on a library using the given model and vector DB.

    Args:
        embedding_model_name: Name of the embedding model passed to
            ``install_new_embedding``.
        vector_db: Name of the vector database; it is set as the configured
            vector DB and also passed to ``install_new_embedding``.
        target_library: Library to embed. When omitted, the notebook-level
            ``library`` variable is used, which is what this function always
            did before the parameter existed.
        batch_size: Batch size passed to ``install_new_embedding``
            (previously hard-coded to 200).
    """
    if target_library is None:
        # Backward-compatible fallback on the global created by the
        # "CREATE LIBRARY" cell; fails fast with NameError if that cell was not run.
        target_library = library

    print("Generating Embeddings in {} db - with Model- {}".format(vector_db, embedding_model_name))

    # NOTE(review): the active DB is switched to sqlite here, i.e. after the library
    # has already been created earlier in the notebook - confirm this ordering is intended.
    LLMWareConfig().set_active_db("sqlite")
    # NOTE(review): presumably only relevant when vector_db is Milvus - confirm.
    MilvusConfig().set_config("lite", True)
    LLMWareConfig().set_vector_db(vector_db)

    target_library.install_new_embedding(
        embedding_model_name=embedding_model_name,
        vector_db=vector_db,
        batch_size=batch_size,
    )


In [10]:
# LIST OF THE EMBEDDING MODELS AVAILABLE
embedding_models = ModelCatalog().list_embedding_models()
# Keep only the catalog names, then display them as the cell output.
model_names = [model['model_name'] for model in embedding_models]
model_names

['all-MiniLM-L6-v2',
 'all-mpnet-base-v2',
 'industry-bert-insurance',
 'industry-bert-contracts',
 'industry-bert-asset-management',
 'industry-bert-sec',
 'industry-bert-loans',
 'nomic-ai/nomic-embed-text-v1',
 'jinaai/jina-embeddings-v2-base-en',
 'jinaai/jina-embeddings-v2-small-en',
 'BAAI/bge-small-en-v1.5',
 'BAAI/bge-large-en-v1.5',
 'BAAI/bge-base-en-v1.5',
 'thenlper/gte-small',
 'thenlper/gte-base',
 'thenlper/gte-large',
 'llmrails/ember-v1',
 'WhereIsAI/UAE-Large-V1',
 'text-embedding-ada-002',
 'text-embedding-3-small',
 'text-embedding-3-large',
 'medium',
 'xlarge',
 'embed-english-v3.0',
 'embed-multilingual-v3.0',
 'embed-english-light-v3.0',
 'embed-multilingual-light-v3.0',
 'embed-english-v2.0',
 'embed-english-light-v2.0',
 'embed-multilingual-v2.0',
 'textembedding-gecko@latest']

In [11]:
# MAKE EMBEDDINGS
# NOTE(review): "mini-lm-sbert" does not appear among the `model_name` values
# listed by the previous cell - presumably an alias accepted by the catalog; confirm.
embedding_model_name = "mini-lm-sbert"
vector_db = "faiss"
make_embeddings(embedding_model_name, vector_db)

Generating Embeddings in faiss db - with Model- mini-lm-sbert


  from .autonotebook import tqdm as notebook_tqdm
[37mINFO: update: embedding_handler - FAISS - Embeddings Created: 200 of 8734[39m
[37mINFO: update: embedding_handler - FAISS - Embeddings Created: 400 of 8734[39m
[37mINFO: update: embedding_handler - FAISS - Embeddings Created: 600 of 8734[39m
[37mINFO: update: embedding_handler - FAISS - Embeddings Created: 800 of 8734[39m
[37mINFO: update: embedding_handler - FAISS - Embeddings Created: 1000 of 8734[39m
[37mINFO: update: embedding_handler - FAISS - Embeddings Created: 1200 of 8734[39m
[37mINFO: update: embedding_handler - FAISS - Embeddings Created: 1400 of 8734[39m
[37mINFO: update: embedding_handler - FAISS - Embeddings Created: 1600 of 8734[39m
[37mINFO: update: embedding_handler - FAISS - Embeddings Created: 1800 of 8734[39m
[37mINFO: update: embedding_handler - FAISS - Embeddings Created: 2000 of 8734[39m
[37mINFO: update: embedding_handler - FAISS - Embeddings Created: 2200 of 8734[39m
[37mINFO: update: 

GENERATOR : PROMPTS

In [13]:
# FUNCTION TO MAKE GENERATOR MODEL
def generator(llm_model_name, query="what is the executive's base annual salary"):
    """Answer ``query`` once per document, using retrieved passages as context.

    Loads the LLM, runs one semantic query against the notebook-level
    ``library``, then, for every file listed in the notebook-level
    ``data_path`` folder, feeds that file's retrieved passages to the model
    and prints the answers.

    Args:
        llm_model_name: Name of the model loaded through ``Prompt().load_model``.
        query: Question used both for retrieval and for the prompt. Defaults
            to the question that was previously hard-coded.

    Returns:
        None; everything is reported through ``print`` so that calling this
        function as the last line of a cell does not display a return value.
    """
    print("Loading model for LLM inference - ", llm_model_name)
    prompter = Prompt().load_model(llm_model_name, temperature=0.0, sample=False)
    results = Query(library).semantic_query(query, result_count=80, embedding_distance_threshold=1.0)

    # Group the retrieval hits by source file name once, instead of rescanning
    # the whole result list for every document. The original rank `j` is kept
    # so the printed output is unchanged.
    hits_by_file = {}
    for j, entry in enumerate(results):
        # basename drops any directory part; on Windows it accepts both
        # "\\" and "/" separators, unlike a plain split on os.sep.
        file_name = os.path.basename(entry["file_source"])
        hits_by_file.setdefault(file_name, []).append((j, entry))

    # for each document in the library, look at its share of the query results
    for i, contract in enumerate(os.listdir(data_path)):

        # skip the macOS folder-metadata file
        if contract == ".DS_Store":
            continue

        print("\nContract Name: ", i, contract)

        # pull the top results that belong to this file
        qr = []
        for j, entry in hits_by_file.get(contract, []):
            print("Top Retrieval: ", j, entry["distance"], entry["text"])
            qr.append(entry)

        # add the query results to the prompt as source material
        prompter.add_source_query_results(query_results=qr)

        # run the prompt
        response = prompter.prompt_with_source(query, prompt_name="default_with_context")

        #   note: prompt_with_source returns a list of dictionary responses
        #   -- depending upon the size of the source context, it may call the llm several times
        #   -- each dict entry represents 1 call to the LLM
        for resp in response:
            if "llm_response" in resp:
                print("\nupdate: llm answer - ", resp["llm_response"])

        # start fresh for next document
        prompter.clear_source_materials()

In [14]:
# RUN THE GENERATOR: load the chosen LLM and print its answer for every contract
llm_model_name = "bling-phi-3-gguf"
generator(llm_model_name)

Loading model for LLM inference -  bling-phi-3-gguf

Contract Name:  0 Amphitrite EXECUTIVE EMPLOYMENT AGREEMENT.pdf
Top Retrieval:  1 0.11939159  Executive's base salary shall be reviewed annually   by the Board (or the compensation committee of the Board), pursuant to Employer's normal   compensation and performance review policies for senior level executives, and may be increased but not   decreased. The amount of any increase for each year shall be determined accordingly. For purposes of this   Agreement, the term “Base Salary” shall mean the amount of Executive's base salary established from   time to time pursuant to this Section 2.2.
Top Retrieval:  39 0.18179223  Executive may also serve on one or more corporate   boards of another company (and committees thereof) upon giving advance notice to the Board prior to   commencing service on any other corporate board.   2.2. Base Salary. For all the services rendered by Executive hereunder, during the Employment Period,   Employer sh

COMBINING RETRIEVER AND GENERATOR

STATE OF THE ART MODEL USAGE