# Auto-Merging Improved RAG with a PostgreSQL Document Store

In [1]:
import time
# Start timer to time the notebook execution
start = time.time()

import pandas as pd
import os
import yaml
from dotmap import DotMap
import psycopg2
from sqlalchemy import make_url
from IPython.display import display, Markdown
import ipywidgets as widgets
widgets.IntSlider()

from llama_index.core import VectorStoreIndex
from llama_index.storage.docstore.postgres import PostgresDocumentStore
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
)
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import Settings
from llama_index.llms.openai_like import OpenAILike
from llama_index.core.node_parser import (
    HierarchicalNodeParser,
    get_leaf_nodes,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.nvidia import NVIDIAEmbedding
from llama_index.postprocessor.nvidia_rerank import NVIDIARerank

import sys

# Register the shared utilities folder on the module search path (only once,
# so re-running this cell does not add duplicate entries) so that the
# `helpers` module imported below can be found.
utils_path = "../../08-Utils"
if sys.path.count(utils_path) == 0:
    sys.path.append(utils_path)

from helpers import (
    get_indices_with_nulls,
    remove_elements,
    generate_responses_dict,
    get_short_docs,
    TextCleaner,
    save_results,
)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Read the Starter Pack global configuration file and wrap the parsed YAML
# in a DotMap so nested settings can be read with attribute (dot) access.
with open('../../07-Starter_Pack_config/improved_rag_config.yaml', 'r') as config_file:
    config = DotMap(yaml.safe_load(config_file))

In [3]:
# LlamaIndex LLM provider: every generation parameter comes from the
# `ml_models.llm_generator` section of the configuration.
llm_cfg = config.ml_models.llm_generator
llm_kwargs = dict(
    model=llm_cfg.model,
    api_key=llm_cfg.api_key,
    api_base=llm_cfg.api_base,
    temperature=llm_cfg.temperature,
    max_tokens=llm_cfg.max_tokens,
    repetition_penalty=llm_cfg.repetition_penalty,
)
Settings.llm = OpenAILike(**llm_kwargs)

# LlamaIndex embedding model, configured from `ml_models.embedder`.
emb_cfg = config.ml_models.embedder
embedder_kwargs = dict(
    base_url=emb_cfg.api_base,
    model=emb_cfg.model,
    embed_batch_size=emb_cfg.batch_size,
    truncate="END",
)
Settings.embed_model = NVIDIAEmbedding(**embedder_kwargs)

# Measure the embedding dimensionality by embedding a short probe string.
probe_embedding = Settings.embed_model.get_text_embedding("hi")
EMBEDDING_SIZE = len(probe_embedding)

## For this notebook, we'll use a set of e-books (PDF) about [NASA's history](https://www.nasa.gov/history/explore-nasas-history).

In [4]:
%%time
# >> Text Extraction
# PyMuPDFReader takes ~1/20 of the time the default reader needs to ingest
# the PDF files.
# Note: PyMuPDFReader creates one document object per page of a PDF document.


def filename_fn(filename: str) -> dict:
    """Return the metadata attached to a file at loading time.

    Args:
        filename: Path of the file being loaded.

    Returns:
        A dict with a single ``file_name`` key holding the file's base name.
    """
    # os.path.basename understands the platform's path separator, whereas
    # splitting on "/" leaves the directory in place for Windows-style paths.
    return {"file_name": os.path.basename(filename)}


reader = SimpleDirectoryReader(
    input_dir="../" + config.file_paths.kb_doc_dir,
    required_exts=[".pdf"],
    file_extractor={".pdf": PyMuPDFReader()},
    file_metadata=filename_fn,
    num_files_limit=10,
)
documents = reader.load_data()

# Filter out documents containing null (`\x00`) characters, which are
# incompatible with PGVector, and documents whose number of words is below
# the configured minimum (`config.llama_index.min_doc_length`).
bad_docs = get_indices_with_nulls(documents)
short_docs = get_short_docs(
    documents,
    config.llama_index.min_doc_length,
)
# A document can be flagged by both filters; the set union removes duplicates.
docs_to_remove = set(bad_docs) | set(short_docs)
documents = remove_elements(documents, docs_to_remove)

CPU times: user 31.6 s, sys: 11.6 s, total: 43.3 s
Wall time: 44 s


In [5]:
# Build the ingestion pipeline: clean the text first, then split each
# document into a two-level node hierarchy using chunk sizes 512 and 256
# (explicit sizes, not the parser's defaults).
text_cleaner = TextCleaner()
hierarchical_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[512, 256])
pipeline = IngestionPipeline(transformations=[text_cleaner, hierarchical_parser])

nodes = pipeline.run(documents=documents, show_progress=True)

# Keep the leaf nodes of the hierarchy on their own; they are the ones
# embedded into the vector index in a later cell.
leaf_nodes = get_leaf_nodes(nodes)

Parsing documents into nodes:   0%|          | 0/3692 [00:00<?, ?it/s]

### Building the index on DB table NASA_HIST_BOOKS_DOCSTORE

In [6]:
%%time

# Connect to the PostgreSQL engine and initialize the DB that serves as the
# document store.
db_cfg = config.postgresql
connection_string = (f"postgresql://{db_cfg.user}:"
                     f"{db_cfg.password}@{db_cfg.db_host}:{db_cfg.port}/{db_cfg.default_db}")

# Create a url object to store DB connection parameters
url = make_url(connection_string)

# Drop the table left over from a previous run so the docstore starts empty.
# NOTE(review): presumably the docstore's backing table is named
# `data_<table_name>` -- confirm against the PostgresDocumentStore version in use.
# A single connection is opened (the original opened two and leaked the first)
# and it is closed even if the DROP statement fails.
conn = psycopg2.connect(connection_string)
try:
    with conn.cursor() as cursor:
        sql = f"DROP TABLE IF EXISTS public.data_{db_cfg.tables.auto_merging};"
        cursor.execute(sql)
    conn.commit()
finally:
    conn.close()

# Unlike the other 2 approaches, this time a PGVector index is not used;
# a PostgreSQL document store is used instead.
docstore = PostgresDocumentStore.from_params(
    database=url.database,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name=db_cfg.tables.auto_merging,
    debug=False,
)
# Add all the nodes (not only the leaves) to the docstore. This process is
# much slower than creating a PGVector store.
docstore.add_documents(
    docs=nodes,
    batch_size=512,
)

CPU times: user 3min 22s, sys: 2.6 s, total: 3min 24s
Wall time: 3min 51s


In [9]:
%%time
# Wrap the PostgreSQL docstore in a storage context and build the vector
# index from the leaf nodes only.
storage_context = StorageContext.from_defaults(docstore=docstore)
base_index = VectorStoreIndex(
    nodes=leaf_nodes,
    storage_context=storage_context,
    show_progress=True,
)

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/806 [00:00<?, ?it/s]

CPU times: user 44.5 s, sys: 7.23 s, total: 51.7 s
Wall time: 4min 15s


In [10]:
# Initialize the base (vector) retriever that auto-merging will improve on.
top_k = db_cfg.pgvector.sim_top_k
base_retriever = base_index.as_retriever(similarity_top_k=top_k)

# Initialize the auto-merging retriever from the base retriever and a
# storage context backed by the docstore.
merging_storage_context = StorageContext.from_defaults(docstore=docstore)
retriever = AutoMergingRetriever(
    vector_retriever=base_retriever,
    storage_context=merging_storage_context,
    verbose=True,
)

# Initialize the re-ranker post-processor from the `ml_models.re_ranker`
# section of the configuration.
rerank_cfg = config.ml_models.re_ranker
rerank_kwargs = dict(
    model=rerank_cfg.model,
    base_url=rerank_cfg.api_base,
    api_key="NONE",
    top_n=rerank_cfg.rerank_top_n,
    truncate="END",
)
re_ranker = NVIDIARerank(**rerank_kwargs)

# Initialize the query engine from the auto-merging retriever and the
# re-ranking post-processor.
response_mode = config.llama_index.auto_merge.response_mode
auto_merging_engine = RetrieverQueryEngine.from_args(
    retriever=retriever,
    node_postprocessors=[re_ranker],
    response_mode=response_mode,
)

In [11]:
%%time
# Smoke-test the query engine with a single question and render the
# answer as Markdown.
print(">> Quick test on the RAG system.\n")
question = (
    "As a research assistant, "
    "list the top 10 Hubble telescope discoveries about exoplanets. "
    "Highlight important text using markdown formatting."
)
print(f" > Query: {question}")
response = auto_merging_engine.query(question)
answer_markdown = Markdown(response.response)
display(answer_markdown)

>> Quick test on the RAG system.

 > Query: As a research assistant, list the top 10 Hubble telescope discoveries about exoplanets. Highlight important text using markdown formatting.
> Merging 5 nodes into parent node.
> Parent node id: a29825cd-eeb6-4463-ad2d-16f56a2ece9c.
> Parent node text: CONTENTS  About the Hubble Space Telescope                                                     2 ...

> Merging 1 nodes into parent node.
> Parent node id: 62d3d2d3-062e-442c-a601-d39c331fab35.
> Parent node text: Red Stars May Hinder Habitability                                    37  Exploring ‘Goldilocks’ S...



1. **Probing the Atmospheres of Rocky, Habitable-Zone Planets** - The Hubble Space Telescope has been used to study the atmospheres of rocky, habitable-zone exoplanets, which could potentially support life.
2. **Spotting a World with a Glowing Water Atmosphere** - Hubble has detected an exoplanet with a glowing water atmosphere, which could be a sign of life.
3. **Detecting Water Vapor on a Habitable-Zone Exoplanet** - Hubble has detected water vapor on a habitable-zone exoplanet, which is a key ingredient for life.
4. **Exposing the First Evidence of a Possible Exomoon** - Hubble has detected evidence of a possible exomoon, which could be a moon orbiting an exoplanet.
5. **Capturing a Blistering Pitch-Black Planet** - Hubble has captured an image of a blistering pitch-black planet, which is a rare type of exoplanet.
6. **Finding a Shrinking Planet** - Hubble has detected a planet that is shrinking, which could be due to the loss of mass or the contraction of the planet.
7. **Uncovering a Football-Shaped ‘Heavy Metal’ Exoplanet** - Hubble has detected a football-shaped exoplanet that is composed of heavy metals, which is a rare type of exoplanet.
8. **Unraveling Mysteries Surrounding ‘Cotton Candy’ Planets** - Hubble has studied the properties of "cotton candy" planets, which are exoplanets with low densities and high temperatures.
9. **Tracking an Exiled Exoplanet’s Far-Flung Orbit** - Hubble has tracked the orbit of an exiled exoplanet, which is a planet that has been ejected from its parent star system.
10. **Revealing a Volcanic World that May Be on its Second Atmosphere** - Hubble has detected a volcanic world that may be on its second atmosphere, which could be a sign of geological activity.

Note: The above list is based on the provided text and may not be an exhaustive list of all Hubble telescope discoveries about exoplanets.

CPU times: user 958 ms, sys: 54.5 ms, total: 1.01 s
Wall time: 4.27 s


In [12]:
%%time

# Load a test set to run inference on multiple Q/A pairs
test_set_cfg = config.data_files.test_sets

# Resolve `test_set_cfg.<active_set>.current` by walking the (possibly dotted)
# key path with plain item access instead of eval(), so a configuration value
# is never executed as Python code.
active_cfg = test_set_cfg
for key in str(test_set_cfg.active_set).split("."):
    active_cfg = active_cfg[key]
active_set = active_cfg.current
test_set_dir = test_set_cfg.base_path + active_set

# os.listdir() returns entries in arbitrary order and includes hidden files
# (e.g. `.ipynb_checkpoints`, `.DS_Store`); skip those and sort so the same
# file is selected on every run.
test_set_candidates = sorted(
    name for name in os.listdir(test_set_dir) if not name.startswith(".")
)
if not test_set_candidates:
    raise FileNotFoundError(f"No test set file found in {test_set_dir!r}")
# os.path.join works whether or not `test_set_dir` ends with a separator.
test_set = os.path.join(test_set_dir, test_set_candidates[0])

test_set_df = pd.read_csv(
    filepath_or_buffer=test_set,
    usecols=['query', 'reference_answer'],
)

# Run inference on the Q/A pairs and keep the responses
# for future comparison vs. other RAG approaches.
responses = generate_responses_dict(
    query_engine=auto_merging_engine,
    test_set_df=test_set_df,
)

  0%|          | 0/40 [00:00<?, ?it/s]

> Merging 1 nodes into parent node.
> Parent node id: 6d5c38b4-f655-4ad9-94b5-07dfc8d9ff7c.
> Parent node text: 271 Document 5-20 (a–c) where L is the length and B the beam of each float in ft., s the spacing ...

> Merging 2 nodes into parent node.
> Parent node id: d048e686-eeda-48e0-8d69-23aaaa14de4a.
> Parent node text: Soon after trans-lunar injection, which happened at 20:18:30 UT on launch  day, ground controller...

> Merging 2 nodes into parent node.
> Parent node id: 136c4fa4-295d-47e7-a873-58f32bb59fde.
> Parent node text: At the time, Zond 5 was  325,000 kilometers from Earth. The spacecraft successfully circled aroun...

> Merging 1 nodes into parent node.
> Parent node id: b66ba26f-ba0f-443a-bd42-8ddc0892d7ab.
> Parent node text: Overall, the F-111 experience was an extremely cautionary tale for the  American aerospace indust...

> Merging 3 nodes into parent node.
> Parent node id: f15162b7-eb18-468e-8d5e-524707ed2ff5.
> Parent node text: The Power for Flight 60 Secretar

In [13]:
# Turn the collected responses into a Pandas data frame and save it, passing
# along the pipeline name and the names of the three models involved.
responses_df = pd.DataFrame.from_dict(responses)
model_names = (llm_cfg.model, emb_cfg.model, rerank_cfg.model)
save_results(responses_df, "Auto_Merging_RAG", *model_names)

# Report the wall-clock time elapsed since the timer started in the first cell.
stop = time.time()
print(f"Notebook execution time: {(stop-start)/60:.1f} minutes")

Auto_Merging_RAG pipeline responses successfully saved to file: responses/current/Auto_Merging_RAG__llama3-8b-instruct__nv-embedqa-e5-v5__nv-rerankqa-mistral-4b-v3.csv.
Notebook execution time: 28.8 minutes
