# Data Crawling

In [1]:
import json
import os

import requests
from bs4 import BeautifulSoup

In [2]:
# Download the blog post and collect its body paragraphs as
# {'content': ..., 'id': 'paragraph_N'} records in p_list.
url = 'https://www.llamaindex.ai/blog/improving-vector-search-reranking-with-postgresml-and-llamaindex'
# A timeout stops the cell from hanging indefinitely if the server never answers.
r = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
r.raise_for_status()
s = BeautifulSoup(r.content, 'html.parser')
# Paragraphs are selected by the page's CSS class string.
p = s.find_all('p', class_='Text_text__zPO0D Text_text-size-16__PkjFu')

p_list = []
counter = 1
for item in p:
    text = item.text.strip()
    # Keep non-empty paragraphs whose next sibling exists and is not a <pre> element.
    # NOTE(review): a paragraph with no next sibling at all is also skipped by this
    # condition - confirm that is intended.
    if text and (item.next_sibling and item.next_sibling.name != 'pre'):
        p_list.append({'content': text, 'id': f'paragraph_{counter}'})
        counter += 1

# An empty result means the selector matched nothing; stop here rather than
# writing an empty corpus that only fails later in the pipeline.
if not p_list:
    raise RuntimeError(f'No paragraphs were extracted from {url}; check the CSS class selector.')

In [36]:
# Persist the scraped paragraph records as JSON indented by four spaces.
with open('sing_doc.json', 'w') as out_file:
    out_file.write(json.dumps(p_list, indent=4))

# Embeddings

In [17]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings
from llama_index.core import Document

In [3]:
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding

In [4]:
from llama_index.core import SummaryIndex, VectorStoreIndex
from llama_index.core.tools import QueryEngineTool
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector

In [14]:
# Read the paragraph records back from sing_doc.json into `data`.
with open('sing_doc.json', 'r') as src:
    data = json.loads(src.read())

In [18]:
# Build one Document per paragraph record, storing the paragraph id as metadata.
documents = []
for record in data:
    body = record['content'].strip()
    if not body:
        # Skip records whose content is empty or whitespace-only.
        continue
    documents.append(Document(text=body, metadata={"id": record['id']}))

print(len(documents))

16


In [20]:
# Run every document through a SentenceSplitter (chunk_size=1024) to obtain nodes.
splitter = SentenceSplitter(chunk_size=1024)
nodes = splitter.get_nodes_from_documents(documents)
node_count = len(nodes)
print("len of nodes:", node_count)

len of nodes: 16


In [23]:
# Load LLM Model
# Configure the global Settings with the Gemini LLM and embedding model.
# The API key is read from the environment so that the cell works after
# Restart Kernel -> Run All and no secret is stored in the notebook.
gemini_key = os.environ.get("GOOGLE_API_KEY")
if not gemini_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")

Settings.llm = Gemini(api_key=gemini_key, model='models/gemini-pro')
Settings.embed_model = GeminiEmbedding(api_key=gemini_key, model='models/embedding-001')
print('LLM model and embedding loaded')

LLM model and embedding loaded


In [24]:
# Build two indexes over the same nodes: a SummaryIndex and a VectorStoreIndex.
summary_index = SummaryIndex(nodes=nodes)
vector_index = VectorStoreIndex(nodes=nodes)
print('Summary and Vector Index loaded')

Summary and Vector Index loaded


In [25]:
# Summary Query Engine
# Query engine over the summary index. The keyword is `response_mode` and the
# mode name is "tree_summarize" (underscore, not a space).
summary_query_engine = summary_index.as_query_engine(
    response_mode="tree_summarize",
    use_async=True
)

In [27]:
# Vector Query Engine
vector_query_engine = vector_index.as_query_engine()

# Wrap each engine as a tool. The selector chooses between tools based on these
# descriptions, so they must describe the document that was actually indexed
# (the LlamaIndex blog post on reranking with PostgresML).
summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description=(
        "Useful for summarization questions about the LlamaIndex blog post on "
        "improving vector search with reranking using PostgresML."
    )
)

vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=(
        "Useful for retrieving specific context from the LlamaIndex blog post on "
        "improving vector search with reranking using PostgresML."
    )
)

# Router that asks the LLM to pick exactly one of the two tools per query;
# verbose=True prints which engine was selected and why.
query_engine = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=[summary_tool, vector_tool],
    verbose=True
)

In [28]:
# Send a whole-document summary question through the router query engine.
question1 = "What is the summary of the document?"
response1 = query_engine.query(question1)
print(f"Question 1: {question1}")
print(response1)

[1;3;38;5;200mSelecting query engine 0: The question asks for a summary of the document, which is a task related to summarization..
[0mQuestion 1: What is the summary of the document?
This document discusses the use of cross-encoders for reranking in search systems. Cross-encoders directly compare query-result pairs for similarity, making them effective for evaluating new, unseen data without the need for extensive user interaction data for fine-tuning. The document provides an example of implementing a simple reranking system using LlamaIndex and the PostgresML managed index, which handles storing, splitting, embedding, and querying documents. The example shows how to use the mixedbread-ai/mxbai-rerank-base-v1 model to rerank the top 100 results from a semantic search, resulting in more precise answers.


In [29]:
# Ask for the key points and which paragraphs they come from.
question2 = "What are the key points in the document? Which paragraph are you referring to?"
response2 = query_engine.query(question2)
print(f"Question 2: {question2}")
print(response2)

[1;3;38;5;200mSelecting query engine 0: The question asks for key points in the document, which is related to summarization..
[0mQuestion 2: What are the key points in the document? Which paragraph are you referring to?
**Key Points:**

* Search systems use keyword and semantic methods to match queries to content.
* Reranking can improve result relevance, especially for new content.
* Cross-encoders directly compare query-result pairs for similarity, making them effective for reranking.
* Cross-encoders complement traditional reranking systems by addressing their limitations in deep text analysis.
* PostgresML Managed Index can handle storing, splitting, embedding, and querying documents for reranking.

**Paragraph:** 1, 2, 3, 4, 6, 10, 11, 12, 15
