#  Auto-Merging Improved RAG with a PostgreSQL Document Store

In [1]:
import time
# Start timer to time the notebook execution.
# `start` is read again in the last cell to report the total run time.
start = time.time()

import pandas as pd
import psycopg2
from sqlalchemy import make_url

import ipywidgets as widgets
# NOTE(review): this slider is created but never displayed or referenced again
# in this notebook -- presumably a workaround so that widget-based progress
# bars render correctly; confirm it is still needed or remove it.
widgets.IntSlider()

from llama_index.core import VectorStoreIndex
from llama_index.storage.docstore.postgres import PostgresDocumentStore
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
)
from llama_index.readers.file import PyMuPDFReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.openai_like import OpenAILike
from llama_index.core.node_parser import (
    HierarchicalNodeParser,
    get_leaf_nodes,
)
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.indices.postprocessor import SentenceTransformerRerank

import sys

# Append the relative utils directory to the module search path. The membership
# check keeps the entry from being added twice when the cell is re-run.
utils_path = "../../utils"
if utils_path not in sys.path:
    sys.path.append(utils_path)

# Project-local helpers; this import must stay below the sys.path change above
# (the `helpers` module is expected to live in `utils_path`).
from helpers import (
    get_indices_with_nulls,
    remove_elements,
    generate_responses_dict,
    get_short_docs,
)

In [2]:
# Evaluation test set: CSV file holding the Q/A pairs
TEST_SET = (
    "../../05-RAG_Dataset_Generation/LlamaIndex_generation/qa_datasets"
    "/Mixtral-8x7B-Instruct-v0.1/NASA_history_qa_only.csv"
)

# Retrieval and re-ranking settings
SIMIL_TOP_K = 8
RERANK_TOP_N = 5
RERANK_MODEL = "BAAI/bge-reranker-base"

# LLM service settings (passed to OpenAILike at the bottom of this cell)
LLM_MODEL = "HuggingFaceH4/zephyr-7b-alpha"
LLM_API_BASE = "http://localhost:8010/v1"
LLM_API_KEY = "NO_KEY"
GEN_TEMP = 0.1
MAX_TOKENS = 512
REP_PENALTY = 1.03

# Ingestion pipeline settings
NUM_WORKERS = 4
CHUNK_SIZE = 1024
MIN_DOC_LENGTH = 40  # Minimum number of words per document
PDF_FILES_PATH = "../../02-KB-Documents/NASA"

# LlamaIndex embedding model, registered globally through Settings
EMB_MODEL = "BAAI/bge-base-en-v1.5"  # The "large" variant can give better results
DEVICE = "cuda:0"  # Switch to "cpu" if the GPU runs out of RAM (slower)
Settings.embed_model = HuggingFaceEmbedding(model_name=EMB_MODEL, device=DEVICE)

# Embed a short probe string once and take the vector length as the embedding size
EMBEDDING_SIZE = len(Settings.embed_model.get_text_embedding("hi"))

# LlamaIndex LLM provider, registered globally through Settings
Settings.llm = OpenAILike(
    api_base=LLM_API_BASE,
    api_key=LLM_API_KEY,
    model=LLM_MODEL,
    temperature=GEN_TEMP,
    max_tokens=MAX_TOKENS,
    repetition_penalty=REP_PENALTY,
)

## For this notebook, we'll use a set of e-books (PDFs) about [NASA's history](https://www.nasa.gov/history/explore-nasas-history)

In [3]:
%%time
# >> Text extraction
# PyMuPDFReader needs roughly 1/20 of the time the default reader takes to
# ingest the PDF files.
# Note: PyMuPDFReader creates one document object per page of a PDF document.


def filename_fn(filename):
    """Return the metadata dict attached to each file at load time.

    Only the last "/"-separated component of `filename` is kept, stored
    under the "file_name" key.
    """
    return {"file_name": filename.rsplit("/", 1)[-1]}


pdf_extractors = {".pdf": PyMuPDFReader()}
reader = SimpleDirectoryReader(
    input_dir=PDF_FILES_PATH,
    required_exts=[".pdf"],
    file_extractor=pdf_extractors,
    file_metadata=filename_fn,
    num_files_limit=10,
)
documents = reader.load_data()

# Drop the documents that contain null ('\x00') characters, which are
# incompatible with PGVector, as well as the documents with fewer than
# MIN_DOC_LENGTH words.
short_docs = get_short_docs(documents, MIN_DOC_LENGTH)
bad_docs = get_indices_with_nulls(documents)
docs_to_remove = set(bad_docs + short_docs)
documents = remove_elements(documents, docs_to_remove)

CPU times: user 17.9 s, sys: 305 ms, total: 18.2 s
Wall time: 18.1 s


In [4]:
# Build the hierarchical node parser with three chunk-size levels
hierarchy_chunk_sizes = [2048, 512, 128]
node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=hierarchy_chunk_sizes)

# Parse the documents into nodes
nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)

# Keep the leaf nodes apart; they are the ones indexed later on
leaf_nodes = get_leaf_nodes(nodes)

Parsing documents into nodes:   0%|          | 0/3692 [00:00<?, ?it/s]

### Building the index on DB table NASA_HIST_BOOKS_DOCSTORE

In [5]:
%%time
# PostgreSQL connection params
DB_PORT = 5432
DB_USER = "demouser"
DB_PASSWD = "demopasswd"
DEFAULT_DB = "postgres"
DB_NAME = "vectordb"
DB_HOST = "localhost"
DOC_STORE_TABLE = "NASA_HIST_BOOKS_DOCSTORE"

# The document store is created in DB_NAME (see `from_params` below), so the
# clean-up has to connect to that same database. Connecting to DEFAULT_DB, as
# this cell used to do, drops nothing and leaves rows from earlier runs behind.
connection_string = f"postgresql://{DB_USER}:{DB_PASSWD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
url = make_url(connection_string)

# Names to drop so that every run starts from an empty store:
#  - the lower-cased configured name (what the original unquoted DROP targeted)
#  - the same name with the "data_" prefix.
# NOTE(review): the Postgres KV store behind PostgresDocumentStore is understood
# to lower-case `table_name` and create the physical table as "data_<name>";
# confirm against the installed llama-index version.
table_basename = DOC_STORE_TABLE.lower()
stale_tables = (table_basename, f"data_{table_basename}")

# Drop the tables if they exist; always release the connection, even on error.
conn = psycopg2.connect(connection_string)
try:
    with conn.cursor() as cursor:
        for stale_table in stale_tables:
            cursor.execute(f'DROP TABLE IF EXISTS "{stale_table}"')
    conn.commit()
finally:
    conn.close()

# Unlike the other 2 approaches, this time a PGVector index is not used;
# a PostgreSQL document store is used instead.
docstore = PostgresDocumentStore.from_params(
    database=DB_NAME,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name=DOC_STORE_TABLE,
)

# Add all the nodes (parents and leaves) to the docstore. This process is
# much slower than creating a PGVector store.
docstore.add_documents(nodes)
storage_context = StorageContext.from_defaults(docstore=docstore)

CPU times: user 2min 24s, sys: 23.6 s, total: 2min 48s
Wall time: 7min 49s


In [6]:
%%time
# Embed and index the leaf nodes only, reusing the storage context
# that wraps the PostgreSQL docstore.
base_index = VectorStoreIndex(
    nodes=leaf_nodes,
    show_progress=True,
    storage_context=storage_context,
)

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1260 [00:00<?, ?it/s]

CPU times: user 11min 58s, sys: 1min 1s, total: 13min
Wall time: 14min 1s


In [7]:
# Base retriever over the leaf-node index; this is the one being improved
base_retriever = base_index.as_retriever(similarity_top_k=SIMIL_TOP_K)

# Auto-merging retriever built from the base retriever and a storage
# context wrapping the docstore
merging_storage_context = StorageContext.from_defaults(docstore=docstore)
retriever = AutoMergingRetriever(
    vector_retriever=base_retriever,
    storage_context=merging_storage_context,
    verbose=True,
)

# Re-ranking post-processor
re_ranker = SentenceTransformerRerank(model=RERANK_MODEL, top_n=RERANK_TOP_N)

# Query engine combining the auto-merging retriever with the re-ranker
auto_merging_engine = RetrieverQueryEngine.from_args(
    retriever=retriever,
    node_postprocessors=[re_ranker],
)

In [8]:
%%time
# Smoke-test the query engine with a single question.
question = "What are the main Hubble telescope discoveries about exoplanets?"
print(">> Quick test on the RAG system.")
print(f" > Question: {question}")
response = auto_merging_engine.query(question)
print(" > Response:\n", response.response)

>> Quick test on the RAG system.
 > Question: What are the main Hubble telescope discoveries about exoplanets?
 > Response:
 

The Hubble telescope has made several significant discoveries about exoplanets, including:

1. Observing the two innermost planets and finding that they lack puffy, hydrogen-dominated atmospheres that are common for gaseous worlds like Neptune.

2. Demonstrating that the basic organic components for life can be detected and measured on planets orbiting other stars, setting the stage for more detailed studies with future observatories.

3. Confirming that a planet orbits two suns.

4. Making recent discoveries about strange new worlds, including the detection of helium in an exoplanet's atmosphere around a turbulent star.

5. Providing more information about the Hubble Space Telescope mission and its discoveries, as well as resources and social media sites for following Hubble's exploration of exoplanets.

Overall, Hubble's contributions are often in partnership

In [9]:
%%time

# Read the evaluation test set, keeping only the question and
# reference-answer columns
qa_columns = ["query", "reference_answer"]
test_set_df = pd.read_csv(TEST_SET, usecols=qa_columns)

# Answer every question with the auto-merging engine and keep the
# responses for later comparison against other RAG approaches.
responses = generate_responses_dict(
    query_engine=auto_merging_engine,
    test_set_df=test_set_df,
)

  0%|          | 0/137 [00:00<?, ?it/s]

> Merging 4 nodes into parent node.
> Parent node id: 47939186-0176-4e87-bf28-16bf844c3cb1.
> Parent node text: The Power for Flight
74
that noise was “the curse of modern times and a major environmental prob­...

> Merging 5 nodes into parent node.
> Parent node id: bdaf5ad9-4c49-4912-94c5-c5cf670d30ee.
> Parent node text: 813
Documents 5-49 (a–d)
In July 1985, an intensive flight evaluation of a three-axis sidestick c...

> Merging 5 nodes into parent node.
> Parent node id: 3e753de3-4051-4369-b230-25137794739b.
> Parent node text: 347
Document 5-23 (a–c)
foodstuffs and galley supplies is 256 pounds[,] making a total of 490 pou...

> Merging 1 nodes into parent node.
> Parent node id: 1188caad-d9b3-4bc5-b878-0bea4fdc8345.
> Parent node text: 25–26.
PLANING-TAIL HULLS
Hydrodynamic research on the planing-tail type of hull has been continu...

> Merging 2 nodes into parent node.
> Parent node id: b8632215-bb5c-45f1-9f04-8888e834c52a.
> Parent node text: Chapter 2: Big Dreams
89
Figure 

In [10]:
# Convert the responses into a Pandas data frame
responses_df = pd.DataFrame.from_dict(responses)

# Name the output file after the last path segment of each model id.
# Taking [-1] (instead of [1]) also works for ids without an "org/" prefix,
# which would otherwise raise IndexError, and for ids with several slashes.
llm_tag = LLM_MODEL.split("/")[-1]
emb_tag = EMB_MODEL.split("/")[-1]
file_name = f"Auto_Merging_RAG_{llm_tag}_{emb_tag}.csv"

# Serialize the inference results dataframe
responses_df.to_csv(
    path_or_buf=file_name,
    index=False,
)

# Report the total run time, measured from `start` set in the first cell
stop = time.time()
print(f"Notebook execution time: {(stop-start)/60:.1f} minutes")

Notebook execution time: 35.1 minutes
