# Simple RAG (+ Re-ranker) from a PGVector Store

In [1]:
import time

# Take the wall-clock timestamp before anything else runs, so that the
# total notebook execution time can be computed from it later on.
start = time.time()

import pandas as pd
import torch
from sqlalchemy import make_url

from llama_index.core import Settings
from llama_index.core import VectorStoreIndex
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai_like import OpenAILike
from llama_index.vector_stores.postgres import PGVectorStore

import sys

# Register the shared utils folder on the module search path (only once,
# so re-running this cell does not add duplicates) and then import the
# helper used to run the test set through the query engine.
utils_path = "../../utils"
if sys.path.count(utils_path) == 0:
    sys.path.append(utils_path)

from helpers import generate_responses_dict

In [2]:
# Path of the evaluation test set with Q/A pairs
TEST_SET = ("../../05-RAG_Dataset_Generation/LlamaIndex_generation/"
            "qa_datasets/Mixtral-8x7B-Instruct-v0.1/NASA_history_qa_only.csv")

# vLLM service settings
LLM_MODEL = "HuggingFaceH4/zephyr-7b-alpha"  # You may replace this with a different model
LLM_API_BASE = "http://localhost:8010/v1"  # The URL the vLLM service is accessible from
LLM_API_KEY = "NO_KEY"  # By default, vLLM does not require a key
GEN_TEMP = 0.1  # Generation temperature
MAX_TOKENS = 512  # Max tokens the LLM should generate
REP_PENALTY = 1.03  # Word repetition penalty at generation time

# LlamaIndex LLM provider: the vLLM service is reached through its
# OpenAI-compatible endpoint.
# NOTE(review): `repetition_penalty` is not a standard OpenAI parameter --
# confirm that OpenAILike actually forwards it to the vLLM server.
Settings.llm = OpenAILike(
    model=LLM_MODEL,
    api_key=LLM_API_KEY,
    api_base=LLM_API_BASE,
    temperature=GEN_TEMP,
    max_tokens=MAX_TOKENS,
    repetition_penalty=REP_PENALTY,
)

# Retriever and re-ranker settings
SIMIL_TOP_K = 8  # Retrieve the TOP_K most similar chunks from the PGVector store
RERANK_TOP_N = 5  # Keep only the TOP_N best chunks after re-ranking
RERANK_MODEL = "BAAI/bge-reranker-base"  # Re-ranking model

# LlamaIndex embedding model
EMB_MODEL = "BAAI/bge-base-en-v1.5"  # For better results you can use the "large" variant
# Use the first GPU when CUDA is available and fall back to the CPU (slower)
# otherwise; a hard-coded "cuda:0" does not work on hosts without CUDA.
# If running out of GPU RAM, set DEVICE = "cpu" explicitly.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
Settings.embed_model = HuggingFaceEmbedding(
    model_name=EMB_MODEL,
    device=DEVICE,
)

# Embedding dimension, measured by embedding a short probe string
EMBEDDING_SIZE = len(Settings.embed_model.get_text_embedding("hi"))

In [3]:
%%time

# PGVector DB params as defined in the reference Docker compose file
## available from the "../PGvector" directory
DB_PORT = 5432
DB_USER = "demouser"
DB_PASSWD = "demopasswd"
DEFAULT_DB = "postgres"
DB_NAME = "vectordb"
DB_HOST = "localhost"
TABLE_NAME = "NASA_HISTORY_BOOKS"

# Value passed to the vector store at query time as `hnsw_ef_search`.
# Presumably this maps to pgvector's `hnsw.ef_search` setting (larger values
# trade query speed for recall) -- TODO confirm against the store's docs.
HNSW_EF_SEARCH = 256

# The URL is parsed only to split it into the host / port / credential fields
# handed to the vector store below. The database named in the URL (DEFAULT_DB)
# is not the one being queried: DB_NAME is passed explicitly instead.
connection_string = f"postgresql://{DB_USER}:{DB_PASSWD}@{DB_HOST}:{DB_PORT}/{DEFAULT_DB}"
url = make_url(connection_string)

# Open the connection to the Vector Store
# NOTE(review): hybrid search is enabled on the store, but the query engine
# below does not request a hybrid query mode -- confirm this is intended.
vector_store = PGVectorStore.from_params(
    database=DB_NAME,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name=TABLE_NAME,
    embed_dim=EMBEDDING_SIZE,  # embedding model dimension
    cache_ok=True,
    hybrid_search=True,
)

# Initialize the index object on top of the existing vector store
index = VectorStoreIndex.from_vector_store(vector_store)

# Initialize the re-ranker of retrieved chunks
re_ranker = SentenceTransformerRerank(
    top_n=RERANK_TOP_N,
    model=RERANK_MODEL,
)

# Expose the index as a query engine: retrieve SIMIL_TOP_K chunks, then let
# the re-ranker post-process them before the answer is generated.
query_engine = index.as_query_engine(
    similarity_top_k=SIMIL_TOP_K,
    node_postprocessors=[re_ranker],
    vector_store_kwargs={"hnsw_ef_search": HNSW_EF_SEARCH},
)

CPU times: user 8.85 s, sys: 2.08 s, total: 10.9 s
Wall time: 1.91 s


In [4]:
%%time

# Read the evaluation Q/A pairs, keeping only the question column and the
# reference answer column
test_set_df = pd.read_csv(TEST_SET, usecols=["query", "reference_answer"])

# Answer every question of the test set with the query engine. The responses
# are kept for future comparison vs. other RAG approaches.
responses = generate_responses_dict(
    test_set_df=test_set_df,
    query_engine=query_engine,
)

  0%|          | 0/137 [00:00<?, ?it/s]

CPU times: user 36.6 s, sys: 2.87 s, total: 39.5 s
Wall time: 22min 6s


In [5]:
# Convert the responses into a Pandas data frame
responses_df = pd.DataFrame.from_dict(responses)

# Name the output file after the LLM and the embedding model, using the last
# "/"-separated component of each model id. Taking [-1] (rather than [1])
# also works for ids without a "/" and for ids with more than one "/".
llm_name = LLM_MODEL.split("/")[-1]
emb_name = EMB_MODEL.split("/")[-1]
file_name = f"Standard_RAG_{llm_name}_{emb_name}.csv"

# Serialize the inference results dataframe
responses_df.to_csv(
    path_or_buf=file_name,
    index=False,
)

# Report the total run time, measured from the `start` timestamp
stop = time.time()
print(f"Notebook execution time: {(stop-start)/60:.1f} minutes")

Notebook execution time: 22.3 minutes
