# Sentence Window Improved RAG with PGVector

In [1]:
import time
# Start timer to time the notebook execution
start = time.time()

import pandas as pd
import psycopg2
from sqlalchemy import make_url

import ipywidgets as widgets
widgets.IntSlider()

from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
)
from llama_index.readers.file import PyMuPDFReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.openai_like import OpenAILike
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.indices.postprocessor import (
    MetadataReplacementPostProcessor,
    SentenceTransformerRerank,
)

import sys
# Make the shared helper modules importable: register the utils folder on
# the module search path, but only if it has not been registered already
# (keeps sys.path free of duplicates when the cell is re-run).
utils_path = "../../utils"
if sys.path.count(utils_path) == 0:
    sys.path.append(utils_path)

from helpers import (
    get_indices_with_nulls,
    remove_elements,
    generate_responses_dict,
    get_short_docs,
)

In [2]:
# ---------------------------------------------------------------------------
# Evaluation data
# ---------------------------------------------------------------------------
# CSV file with the question / reference-answer pairs used for evaluation.
TEST_SET = (
    "../../05-RAG_Dataset_Generation/LlamaIndex_generation/"
    "qa_datasets/Mixtral-8x7B-Instruct-v0.1/NASA_history_qa_only.csv"
)

# ---------------------------------------------------------------------------
# Retrieval and re-ranking
# ---------------------------------------------------------------------------
SIMIL_TOP_K = 8  # How many similar nodes to fetch from the PGVector store
RERANK_TOP_N = 5  # How many of the fetched nodes survive re-ranking
RERANK_MODEL = "BAAI/bge-reranker-base"  # Model used by the re-ranker

# ---------------------------------------------------------------------------
# LLM service
# ---------------------------------------------------------------------------
LLM_MODEL = "HuggingFaceH4/zephyr-7b-alpha"
LLM_API_BASE = "http://localhost:8010/v1"
LLM_API_KEY = "NO_KEY"
GEN_TEMP = 0.1
MAX_TOKENS = 512
REP_PENALTY = 1.03

# ---------------------------------------------------------------------------
# Ingestion pipeline
# ---------------------------------------------------------------------------
NUM_WORKERS = 4
CHUNK_SIZE = 1024
MIN_DOC_LENGTH = 40  # Documents with fewer words than this are discarded
PDF_FILES_PATH = "../../02-KB-Documents/NASA"

# ---------------------------------------------------------------------------
# Embedding model
# ---------------------------------------------------------------------------
# The "large" variant of the model can be used for better results.
EMB_MODEL = "BAAI/bge-base-en-v1.5"
# Switch to "cpu" if the GPU runs out of memory (slower).
DEVICE = "cuda:0"

# Register the LLM as the LlamaIndex-wide default.
# NOTE(review): confirm that OpenAILike actually forwards
# `repetition_penalty` to the serving endpoint.
Settings.llm = OpenAILike(
    model=LLM_MODEL,
    api_base=LLM_API_BASE,
    api_key=LLM_API_KEY,
    temperature=GEN_TEMP,
    max_tokens=MAX_TOKENS,
    repetition_penalty=REP_PENALTY,
)

# Register the embedding model as the LlamaIndex-wide default.
Settings.embed_model = HuggingFaceEmbedding(model_name=EMB_MODEL, device=DEVICE)

# Measure the embedding dimension by embedding a throw-away string, so the
# vector store can later be created with a matching vector size.
EMBEDDING_SIZE = len(Settings.embed_model.get_text_embedding("hi"))

## For this notebook, we'll use a set of E-books (PDF) about [NASA's history](https://www.nasa.gov/history/explore-nasas-history)

In [3]:
%%time
# >> Text extraction
# Author's note: PyMuPDFReader ingests the PDF files in roughly 1/20 of the
# time needed by the default reader, and it yields one document object per
# PDF page.


def filename_fn(filename):
    """Build the per-file metadata dict: the file name without its folders."""
    return {"file_name": filename.rsplit("/", 1)[-1]}


reader = SimpleDirectoryReader(
    input_dir=PDF_FILES_PATH,
    num_files_limit=10,
    required_exts=[".pdf"],
    file_extractor={".pdf": PyMuPDFReader()},
    file_metadata=filename_fn,
)
documents = reader.load_data()

# Discard two kinds of documents before indexing:
#   * those flagged by get_indices_with_nulls (null `\x00` characters are
#     incompatible with PGVector), and
#   * those flagged by get_short_docs for MIN_DOC_LENGTH (too few words).
short_docs = get_short_docs(documents, MIN_DOC_LENGTH)
bad_docs = get_indices_with_nulls(documents)
docs_to_remove = set(bad_docs + short_docs)
documents = remove_elements(documents, docs_to_remove)

CPU times: user 16.2 s, sys: 395 ms, total: 16.6 s
Wall time: 16.6 s


In [4]:
# Sentence-window parser: configured with a window size of 3; the window
# text is stored under the "window" metadata key and the original sentence
# under "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=3,
)

# Turn the loaded documents into nodes, showing a progress bar meanwhile.
nodes = node_parser.get_nodes_from_documents(documents, show_progress=True)

Parsing nodes:   0%|          | 0/3692 [00:00<?, ?it/s]

## Building the index on a new DB table called NASA_HISTORY_BOOKS_SENTENCE_INDEX

In [5]:
%%time
# PGVector DB params
# NOTE: demo credentials only -- do not reuse hard-coded secrets outside a
# local sandbox.
DB_PORT = 5432
DB_USER = "demouser"
DB_PASSWD = "demopasswd"
DEFAULT_DB = "postgres"
DB_NAME = "vectordb"
DB_HOST = "localhost"
TABLE_NAME = "NASA_HISTORY_BOOKS_SENTENCE_INDEX"

connection_string = f"postgresql://{DB_USER}:{DB_PASSWD}@{DB_HOST}:{DB_PORT}/{DEFAULT_DB}"
url = make_url(connection_string)

# Drop any previous copy of the index so that re-running the notebook does
# not append duplicate nodes.
# The clean-up must target the database the vector store writes to
# (DB_NAME), not the server's default database. PGVectorStore keeps its rows
# in a table named "data_<lower-cased table_name>"; the bare lower-cased name
# is dropped as well to cover tables created under that name.
conn = psycopg2.connect(
    dbname=DB_NAME,
    user=DB_USER,
    password=DB_PASSWD,
    host=DB_HOST,
    port=DB_PORT,
)
try:
    with conn.cursor() as cursor:
        for stale_table in (f"data_{TABLE_NAME.lower()}", TABLE_NAME.lower()):
            # Identifiers come from the constants above, not from user input.
            sql = f'DROP TABLE IF EXISTS "{stale_table}"'
            cursor.execute(sql)
    conn.commit()
finally:
    # Always release the connection, even if the DROP statement fails.
    conn.close()

# Open the connection to the DB engine
vector_store = PGVectorStore.from_params(
    database=DB_NAME,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name=TABLE_NAME,
    embed_dim=EMBEDDING_SIZE, # must match the embedding model dimension
    cache_ok=True,
    hybrid_search=True,
)

# Populate the new index with documents (nodes) and their embeddings.
# The nodes were already produced by the sentence-window parser, so no
# further transformations are requested here.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
sentence_index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    show_progress=True,
    transformations=None,
)

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1628 [00:00<?, ?it/s]

CPU times: user 11min 54s, sys: 46.6 s, total: 12min 40s
Wall time: 8min 44s


In [6]:
# Post-processor applied to retrieved nodes; it is configured to use the
# "window" metadata entry as its replacement source.
post_processor = MetadataReplacementPostProcessor(target_metadata_key="window")

# Re-ranker that keeps the RERANK_TOP_N best nodes according to RERANK_MODEL.
re_ranker = SentenceTransformerRerank(model=RERANK_MODEL, top_n=RERANK_TOP_N)

# Query engine on top of the index. The SIMIL_TOP_K retrieved nodes go
# through the metadata-replacement step first and the re-ranker second.
sentence_window_engine = sentence_index.as_query_engine(
    node_postprocessors=[post_processor, re_ranker],
    similarity_top_k=SIMIL_TOP_K,
    vector_store_kwargs={"hnsw_ef_search": 256},
)

In [7]:
%%time
# Smoke test: send a single question through the query engine and print
# both the question and the generated answer.
question = "What are the main Hubble telescope discoveries about exoplanets?"
print(">> Quick test on the RAG system.")
print(" > Question:", question)
response = sentence_window_engine.query(question)
print(" > Response:\n", response.response)

>> Quick test on the RAG system.
 > Question: What are the main Hubble telescope discoveries about exoplanets?
 > Response:
 

Hubble has revealed exceedingly valuable information about hundreds of other worlds, including the discovery of atmospheres that contain sodium, oxygen, carbon, hydrogen, carbon dioxide, methane, and water vapor. Hubble has confrmed that a planet orbits two suns, and made a detailed global map of another world showing the temperature at different layers in its atmosphere and the amount and distribution of its water vapor. Hubble has also identified water vapor on planets that orbit in their star’s habitable zone. Hubble observes the universe from Earth orbit, just outside our planet’s atmosphere, and can detect objects as faint as 31st magnitude, which is about 10 billion times fainter than the human eye can see. Hubble can see faint objects near bright objects, which is an important requirement for studying the regions around stars and close to the glowing nuc

In [8]:
%%time

# Read the evaluation set, keeping only the question and reference-answer
# columns.
test_set_df = pd.read_csv(TEST_SET, usecols=["query", "reference_answer"])

# Answer every question with the sentence-window engine and keep the
# responses for later comparison against other RAG approaches.
responses = generate_responses_dict(
    test_set_df=test_set_df,
    query_engine=sentence_window_engine,
)

  0%|          | 0/137 [00:00<?, ?it/s]

CPU times: user 28.6 s, sys: 2.16 s, total: 30.8 s
Wall time: 11min 8s


In [9]:
# Build a data frame out of the collected responses.
responses_df = pd.DataFrame.from_dict(responses)

# Name the output file after the LLM and the embedding model, using the part
# of each model id that follows the first "/".
file_name = f"Sentence_Window_RAG_{LLM_MODEL.split('/')[1]}_{EMB_MODEL.split('/')[1]}.csv"

# Write the inference results to CSV without the index column.
responses_df.to_csv(file_name, index=False)

# Report the total time elapsed since the timer was started.
stop = time.time()
print(f"Notebook execution time: {(stop-start)/60:.1f} minutes")

Notebook execution time: 20.6 minutes
