# Sentence Window Improved RAG

In [1]:
import time
# Start timer to time the notebook execution
start = time.time()

import pandas as pd
import os
import ast
import yaml
from dotmap import DotMap
import psycopg2
from sqlalchemy import make_url
from IPython.display import display, Markdown
import ipywidgets as widgets
widgets.IntSlider()

from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
)
from llama_index.readers.file import PyMuPDFReader
from llama_index.embeddings.nvidia import NVIDIAEmbedding
from llama_index.postprocessor.nvidia_rerank import NVIDIARerank
from llama_index.core import Settings
from llama_index.llms.openai_like import OpenAILike
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor

import sys
# Make the shared utilities directory importable so the `helpers` module
# imported right below can be found. The membership check keeps a re-run
# of this cell from appending a duplicate entry to sys.path.
utils_path = "../../08-Utils"
if utils_path not in sys.path:
    sys.path.append(utils_path)

from helpers import (
    get_indices_with_nulls,
    remove_elements,
    generate_responses_dict,
    get_short_docs,
    TextCleaner,
    save_results,
)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Read the Starter Pack global configuration file and wrap the parsed
# mapping in a DotMap so settings can be reached with attribute access.
with open('../../07-Starter_Pack_config/improved_rag_config.yaml', 'r') as file:
    config = DotMap(yaml.safe_load(file))

In [3]:
# Model settings for the generator LLM and the embedder, read from the config
llm_cfg = config.ml_models.llm_generator
emb_cfg = config.ml_models.embedder

# LlamaIndex LLM provider
Settings.llm = OpenAILike(
    model=llm_cfg.model,
    api_key=llm_cfg.api_key,
    api_base=llm_cfg.api_base,
    temperature=llm_cfg.temperature,
    max_tokens=llm_cfg.max_tokens,
    repetition_penalty=llm_cfg.repetition_penalty,
)

# LlamaIndex embedding model
Settings.embed_model = NVIDIAEmbedding(
    base_url=emb_cfg.api_base,
    model=emb_cfg.model,
    embed_batch_size=emb_cfg.batch_size,
    truncate="END",
)

# Embed a throwaway string once; the length of the returned vector is the
# embedding dimension that the vector store is configured with later on.
probe_embedding = Settings.embed_model.get_text_embedding("hi")
EMBEDDING_SIZE = len(probe_embedding)

## This notebook uses a set of PDF e-books about [NASA's history](https://www.nasa.gov/history/explore-nasas-history)

In [4]:
%%time
# >> Text Extraction
# PyMuPDFReader ingests the PDF files in roughly 1/20 of the time taken by the default reader.
# Note: PyMuPDFReader creates a document object per page in a PDF document.

def filename_fn(filename):
    """Build the metadata attached to each document at loading time.

    Args:
        filename: Path of the file being loaded, as handed over by the reader.

    Returns:
        A dict with a single "file_name" key holding the last path component.
    """
    # os.path.basename honours the platform's path separator, whereas a
    # hard-coded split on "/" only works for POSIX-style paths.
    return {"file_name": os.path.basename(filename)}

reader = SimpleDirectoryReader(
    input_dir='../' + config.file_paths.kb_doc_dir,
    required_exts=[".pdf"],
    file_extractor={".pdf": PyMuPDFReader()},
    file_metadata=filename_fn,
    num_files_limit=10,
)
documents = reader.load_data()

# Filter out documents containing null (`\x00`) characters, which are
# incompatible with PGVector, as well as documents with fewer words than
# the configured minimum (config.llama_index.min_doc_length).
bad_docs = get_indices_with_nulls(documents)
short_docs = get_short_docs(
    documents,
    config.llama_index.min_doc_length,
)
# A set removes indices flagged by both filters.
docs_to_remove = set(bad_docs + short_docs)
documents = remove_elements(documents, docs_to_remove)

CPU times: user 33.3 s, sys: 12.2 s, total: 45.5 s
Wall time: 46.4 s


In [5]:
# Sentence-window settings (window size and metadata key names) from the config
sentence_window = config.llama_index.sentence_window

# Create the sentence window node parser
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=sentence_window.window_size,
    window_metadata_key=sentence_window.win_mdata_key,
    original_text_metadata_key=sentence_window.orig_mdata_key,
)

# Ingestion pipeline: clean the text first, then split it into sentence nodes.
# The parser created above is reused here instead of building a second,
# identically configured instance, so the settings live in one place only.
pipeline = IngestionPipeline(
    transformations=[
        TextCleaner(),
        node_parser,
    ],
)

nodes = pipeline.run(
    documents=documents,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/3692 [00:00<?, ?it/s]

## Building the index on a new DB table called NASA_HISTORY_BOOKS_SENTENCE_INDEX

In [6]:
%%time

# Connect to the PostgreSQL engine and initialize the DB that serves as vector/document store.
db_cfg = config.postgresql
connection_string = (f"postgresql://{db_cfg.user}:"
                     f"{db_cfg.password}@{db_cfg.db_host}:{db_cfg.port}/{db_cfg.default_db}")

# Create a url object to store DB connection parameters
url = make_url(connection_string)

# Drop the table left over from a previous run so the index is rebuilt from scratch.
# A single connection is opened and always closed, even when the statement fails;
# opening a second connection over the first one would leak the first.
# NOTE(review): the "data_" prefix presumably mirrors how PGVectorStore names its
# table for the given `table_name` — confirm against the installed version.
conn = psycopg2.connect(connection_string)
try:
    # DDL is committed immediately, so no explicit commit is required.
    conn.autocommit = True
    with conn.cursor() as cursor:
        cursor.execute(f"DROP TABLE IF EXISTS public.data_{db_cfg.tables.sentence_window};")
finally:
    conn.close()

# Connect to the PGVector extension
vector_store = PGVectorStore.from_params(
    database=url.database,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name=db_cfg.tables.sentence_window,
    embed_dim=EMBEDDING_SIZE,  # embedding model dimension
    cache_ok=True,
    hybrid_search=db_cfg.pgvector.hybrid_search,  # retrieve nodes based on vector values and keywords
)

# Populate the new index with documents (nodes) and their embeddings.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
sentence_index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    show_progress=True,
    transformations=None,
)

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1725 [00:00<?, ?it/s]

CPU times: user 1min 56s, sys: 18.5 s, total: 2min 15s
Wall time: 18min 57s


In [7]:
# Re-ranker settings from the config
rerank_cfg = config.ml_models.re_ranker

# Post processor applied to the nodes retrieved by the index; it is keyed
# on the sentence-window metadata entry written at parsing time.
post_processor = MetadataReplacementPostProcessor(
    target_metadata_key=sentence_window.win_mdata_key,
)

# Re-ranker that also post-processes the nodes retrieved by the index
re_ranker = NVIDIARerank(
    model=rerank_cfg.model,
    base_url=rerank_cfg.api_base,
    api_key="NONE",
    top_n=rerank_cfg.rerank_top_n,
    truncate="END",
)

# Build the query engine. Retrieved nodes are handed to the post-processor
# and the re-ranker listed below. The vector store keyword arguments are
# stored in the config as a Python literal string, hence literal_eval.
h_search = ast.literal_eval(db_cfg.pgvector.pgvector_kwargs)
sentence_window_engine = sentence_index.as_query_engine(
    similarity_top_k=db_cfg.pgvector.sim_top_k,
    node_postprocessors=[post_processor, re_ranker],
    vector_store_kwargs=h_search,
    response_mode=config.llama_index.sentence_window.response_mode,
)

In [8]:
%%time
# Smoke test: send one question through the query engine and render the
# answer as Markdown to confirm the whole pipeline is wired up.
print(">> Quick test on the RAG system.\n")
question = (
    "As a research assistant, "
    "list the top 10 Hubble telescope discoveries about exoplanets. "
    "Highlight important text using markdown formatting."
)
print(f" > Query: {question}")

response = sentence_window_engine.query(question)
rendered_answer = Markdown(response.response)
display(rendered_answer)

>> Quick test on the RAG system.

 > Query: As a research assistant, list the top 10 Hubble telescope discoveries about exoplanets. Highlight important text using markdown formatting.


1. **Water vapor on exoplanets**: Hubble has studied exoplanets' atmospheres and found several that contain water vapor, an essential ingredient for life as we know it.
2. **Habitable exoplanets**: Hubble has discovered exoplanets that orbit within their star's habitable zone, where temperatures are mild enough that liquid water could pool on planetary surfaces.
3. **Exoplanet growth**: Hubble has observed the growth of giant planets, including the young system PDS 70, where a giant world is building up mass at a slow rate.
4. **Exoplanet atmospheres**: Hubble has directly imaged the atmospheres of exoplanets, including the giant world PDS 70b, which was observed in ultraviolet light.
5. **Exoplanet moons**: Hubble may have even extended Galileo's discoveries of moons around Jupiter by potentially finding a moon orbiting a planet located 8,000 light-years from Earth.
6. **Exoplanet weather**: Hubble has observed the weather on exoplanets, including a world that snows sunscreen and another with yellow skies.
7. **Exoplanet sizes**: Hubble has discovered exoplanets of various sizes, including a sub-Neptune that is believed to have lost its primordial hydrogen and helium atmosphere due to the intense radiation of its hot, young star.
8. **Exoplanet orbits**: Hubble has studied the orbits of exoplanets, including a planet with two Suns and another with a star that is 3,000 times brighter than the planet in ultraviolet light.
9. **Exoplanet formation**: Hubble has provided insight into the formation of exoplanets, including the discovery of a planet that is still growing and another that may be nearing the end of its formation process.
10. **Exoplanet diversity**: Hubble has revealed the diversity of exoplanets, including worlds with thick atmospheres, rocky surfaces, and even those that may be capable of supporting life.

Note: The above answer is based on the provided text and may not be an exhaustive list of all Hubble telescope discoveries about exoplanets.

CPU times: user 69.2 ms, sys: 8.26 ms, total: 77.5 ms
Wall time: 4.11 s


In [9]:
%%time

# Load a test set to run inference on multiple Q/A pairs
test_set_cfg = config.data_files.test_sets

# Look the active test set up by key instead of eval()-ing a string built
# from the configuration, which would execute whatever the config contains.
active_set = test_set_cfg[test_set_cfg.active_set].current
test_set_dir = test_set_cfg.base_path + active_set

# os.listdir returns entries in arbitrary order; sort them so the same file
# is selected on every run, and fail with a clear message when none exists.
test_set_files = sorted(os.listdir(test_set_dir))
if not test_set_files:
    raise FileNotFoundError(f"No test set file found in {test_set_dir!r}")
test_set = os.path.join(test_set_dir, test_set_files[0])

# Only the question and reference answer columns are needed, and only the
# first `sample_size` rows are read.
test_set_df = pd.read_csv(
    filepath_or_buffer=test_set,
    usecols=['query', 'reference_answer'],
    nrows=test_set_cfg.sample_size,
)

# Run inference on the Q/A pairs and keep the responses
# for future comparison vs. other RAG approaches.
# `nrows` above already caps the frame, so no further truncation is needed.
responses = generate_responses_dict(
    query_engine=sentence_window_engine,
    test_set_df=test_set_df,
)

  0%|          | 0/40 [00:00<?, ?it/s]

CPU times: user 959 ms, sys: 181 ms, total: 1.14 s
Wall time: 3min 30s


In [10]:
# Turn the collected responses into a Pandas data frame and persist it,
# tagging the results with the pipeline name and the models that produced them.
responses_df = pd.DataFrame.from_dict(responses)
save_results(responses_df, "Sentence_Window_RAG", llm_cfg.model, emb_cfg.model, rerank_cfg.model)

# Report the wall-clock time elapsed since the timer started in the first cell.
stop = time.time()
print(f"Notebook execution time: {(stop-start)/60:.1f} minutes")

Sentence_Window_RAG pipeline responses successfully saved to file: responses/current/Sentence_Window_RAG__llama3-8b-instruct__nv-embedqa-e5-v5__nv-rerankqa-mistral-4b-v3.csv.
Notebook execution time: 23.7 minutes


In [11]:
def dump_col(df, col):
    """Print every value of one data frame column, one item at a time.

    Args:
        df: Data frame to read from.
        col: Label of the column whose values are printed.

    Each value is preceded by a " >>>> ITEM <position>" header line and
    followed by a blank line.
    """
    values = df[col].to_list()
    for position in range(len(values)):
        print(f" >>>> ITEM {position}")
        print(values[position], "\n")

In [13]:
# Preview the first ten saved responses
df = responses_df.iloc[:10]
display(df)

Unnamed: 0,query,answer,contexts,ground_truth
0,What was the USSR's primary launch vehicle and...,The USSR's primary launch vehicle for lunar an...,[19 1960 The 4-stage 8K78 launch vehicle (and ...,The USSR's primary launch vehicle for lunar an...
1,What is the function of the main longitudinal ...,The main longitudinal and continuous strength ...,"[The Wind and Beyond, Volume III 264 Keel. Th...","The main longitudinal strength member, known a..."
2,"What limited but reliable technology, develope...",The Russian Soyuz capsule. It was a limited v...,[Chapter 10: Toward a Permanent Human Presenc...,The Russian Soyuz capsule.
3,Compare the management styles and priorities o...,Daniel Goldin and Administrator Richard Truly ...,"[Although not known widely, in the “black” sp...",Daniel Goldin favored horizontal management st...
4,Analyze the role of end credits in documenting...,The role of end credits in documenting film pr...,[50 YEARS OF SOLAR SYSTEM EXPLORATION: HISTORI...,End credits document film production contribut...
5,Compare the objectives and outcomes of Zond 7 ...,Zond 7 was the first fully successful Soviet c...,[The spacecraft was the last 7K-L1 vehicle ma...,"Zond 7, 5, and 6 all had circumlunar objective..."
6,"Which American military aircraft, including th...",The research conducted to solve the inlet-engi...,"[The strategic bomber variant, the FB-111A, fl...","The Grumman F-14 Tomcat, McDonnell Douglas F-1..."
7,How do astronomers utilize a Super-Jupiter's c...,"Astronomers used Hubble to observe the planet,...",[ Using a Super-Jupiter’s Cloudy Skies ...,Astronomers use a Super-Jupiter's cloudy skies...
8,Compare the technological goals of the X-34 pr...,The X-34 program aimed to develop a reusable l...,[Promise Denied 130 The X-34 A-1 with its Lock...,The X-34 program aimed to demonstrate low-cost...
9,Compare Pratt & Whitney Aircraft's contributio...,Pratt & Whitney and General Electric Aircraft ...,[It resulted from considerable pioneering res...,Pratt & Whitney Aircraft contributed to aircra...
