In [1]:
import warnings
warnings.filterwarnings('ignore')


In [2]:
import os

import openai
import utils

# Fetch the key once so it can be validated before anything tries to use it.
_openai_api_key = utils.get_openai_api_key()
if not _openai_api_key:
    # Fail here with an actionable message rather than letting a later OpenAI
    # client constructor raise "The api_key client option must be set ...".
    raise RuntimeError(
        "utils.get_openai_api_key() returned no key; set OPENAI_API_KEY "
        "(in the environment or wherever utils reads it from) and re-run this cell."
    )

openai.api_key = _openai_api_key
# OpenAI clients also accept the key through this environment variable;
# setdefault leaves an already-exported key untouched.
# NOTE(review): if the OpenAIError is raised inside `import utils` itself,
# OPENAI_API_KEY has to be exported before this cell runs - confirm.
os.environ.setdefault("OPENAI_API_KEY", _openai_api_key)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [6]:
from llama_index import SimpleDirectoryReader

# Load the blog export from a single PDF; `documents` is a list of
# llama_index Document objects (43 of them in the recorded run).
_pdf_reader = SimpleDirectoryReader(input_files=["./unmeshmaliWeeklyBlog.pdf"])
documents = _pdf_reader.load_data()

In [7]:
from llama_index import Document

# Collapse the loaded Documents into one Document whose text is every
# piece joined by a blank line.
document = Document(text="\n\n".join(doc.text for doc in documents))

In [8]:
# Quick look at what the reader returned: container type, number of
# Documents, element type, then a preview of the first Document.
# sep/end reproduce the " \n\n" spacing of printing each value with "\n".
print(
    type(documents),
    len(documents),
    type(documents[0]),
    sep=" \n\n",
    end=" \n\n",
)
print(documents[0])

<class 'list'> 

43 

<class 'llama_index.schema.Document'> 

Doc ID: 4ee6cb87-3dd2-417d-a3ef-80919f6fa4ef
Text: unmeshmali.com Posts published on February 18, 2024, from
unmeshmali.com. Printed on February 18, 2024 using Print My Blog Week
10, 2021 March 11, 2021 Categories: Weekly Notes Work Working on
multiple things like always. I have a couple technical projects in
hand that involve Python and Java programming. I also learned Docker
to be able to depl...


# Sentence-window retrieval setup

In [9]:
from llama_index.node_parser import SentenceWindowNodeParser

# Node parser built through from_defaults with explicit window settings.
# Each node's neighbouring sentences are stored under metadata["window"]
# and its own sentence under metadata["original_text"]; window_size
# controls how many neighbours are kept.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=2,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

In [10]:
# Small five-sentence sample used to see how the parser splits text.
text = (
    "Hi. This is Unmesh. I am in week 7 of 2024. "
    "This has been a busy year at work so far. "
    "I want to make this year extremely productive."
)

# Wrap the sample in a Document and parse it into nodes.
nodes = node_parser.get_nodes_from_documents([Document(text=text)])

In [11]:
# Show the text carried by each parsed node.
print([sentence_node.text for sentence_node in nodes])

['Hi. ', 'This is Unmesh. ', 'I am in week 7 of 2024. ', 'This has been a busy year at work so far. ', 'I want to make this year extremely productive.']


In [12]:
# Inspect the "window" metadata of the node at index 4 (each node is one
# sentence); the window holds that sentence together with its neighbours.
print(nodes[4].metadata["window"])

I am in week 7 of 2024.  This has been a busy year at work so far.  I want to make this year extremely productive.


# Building the Index

In [15]:
from llama_index.llms import OpenAI

# GPT-3.5 Turbo model object used by the service context and query engine;
# temperature is set to 0.1.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")

In [16]:
from llama_index import ServiceContext

# Bundle the pieces the index needs: the sentence-window node parser, the
# embedding model and the LLM defined above.
# NOTE(review): the "local:" prefix presumably selects a locally run
# embedding model rather than a hosted one - confirm.
sentence_context = ServiceContext.from_defaults(
    node_parser=node_parser,
    embed_model="local:BAAI/bge-small-en-v1.5",
    llm=llm,
)

In [17]:
from llama_index import VectorStoreIndex

# Build the vector index from the single merged Document, using the
# sentence-window service context for parsing and embedding.
sentence_index = VectorStoreIndex.from_documents(
    [document],
    service_context=sentence_context,
)

In [18]:
# Write the index's storage context to ./sentence_index
# (presumably so a later session can reload it instead of rebuilding - confirm).
sentence_index.storage_context.persist(persist_dir="./sentence_index")

# Building the post-processor

In [20]:
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Post-processor keyed on the "window" metadata entry written by the node
# parser; used below to swap a node's text for its surrounding window.
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")

In [21]:
from copy import deepcopy
from llama_index.schema import NodeWithScore

# Give every sample node the same dummy score so it can be fed to a
# post-processor.
scored_nodes = [NodeWithScore(node=sentence_node, score=1.0) for sentence_node in nodes]

# Independent copies of the nodes, kept for a before/after comparison once
# the post-processor has run.
nodes_old = list(map(deepcopy, nodes))

In [27]:
# Text of node 1 in the deep-copied list, i.e. the original single sentence.
nodes_old[1].text

'This is Unmesh. '

In [25]:
# Run the metadata-replacement post-processor over the scored sample nodes.
replaced_nodes = postproc.postprocess_nodes(scored_nodes)

In [26]:
# Node 1 after post-processing: its text is now the sentence window rather
# than the single original sentence.
print(replaced_nodes[1].text)

Hi.  This is Unmesh.  I am in week 7 of 2024. 


# Adding a reranker

In [31]:
from llama_index.indices.postprocessor import SentenceTransformerRerank

# Reranker backed by the BAAI/bge-reranker-base model; top_n=2 limits how
# many nodes it returns.
rerank = SentenceTransformerRerank(model="BAAI/bge-reranker-base", top_n=2)

config.json: 100%|██████████| 799/799 [00:00<00:00, 565kB/s]
model.safetensors: 100%|██████████| 1.11G/1.11G [00:24<00:00, 44.7MB/s]
tokenizer_config.json: 100%|██████████| 443/443 [00:00<00:00, 491kB/s]
sentencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:00<00:00, 36.0MB/s]
tokenizer.json: 100%|██████████| 17.1M/17.1M [00:00<00:00, 41.7MB/s]
special_tokens_map.json: 100%|██████████| 279/279 [00:00<00:00, 405kB/s]


from llama_index import QueryBundle
from llama_index.schema import TextNode, NodeWithScore

# Toy check of the reranker: two candidate sentences with hand-assigned
# initial scores, reranked against a query about a dog.
query = QueryBundle("I want a dog")

# NOTE: this rebinds `scored_nodes`, replacing the list built from the
# sample sentence nodes earlier in the notebook.
scored_nodes = [
    NodeWithScore(node=TextNode(text=sentence), score=initial_score)
    for sentence, initial_score in (("This is a cat", 0.3), ("This is a dog", 0.4))
]

reranked_nodes = rerank.postprocess_nodes(scored_nodes, query_bundle=query)

# Each reranked node's text alongside its score.
print([(hit.text, hit.score) for hit in reranked_nodes])

# Running the query engine

In [49]:
# Query engine over the sentence index: fetch the 6 most similar nodes,
# then apply the metadata-replacement post-processor followed by the
# reranker, in that order.
sentence_window_engine = sentence_index.as_query_engine(
    node_postprocessors=[postproc, rerank],
    similarity_top_k=6,
)

In [52]:
# Ask the sentence-window engine a question about the indexed blog.
# NOTE(review): the recorded run of this cell raised
# "RuntimeError: cannot reuse already awaited coroutine"; the cause is not
# visible in this notebook - possibly related to the missing OpenAI API key
# reported at the top. Confirm before relying on `window_response`.
window_response = sentence_window_engine.query("Where was Unmesh employed in 2022")

RuntimeError: cannot reuse already awaited coroutine

In [51]:
from llama_index.response.notebook_utils import display_response

# Render the query result. This needs `window_response` from the query cell:
# the recorded NameError comes from this cell (In [51]) having been executed
# before the query cell (In [52]), which itself failed.
display_response(window_response)

NameError: name 'window_response' is not defined