In [1]:
from llama_index import SimpleDirectoryReader, VectorStoreIndex, Document, SummaryIndex, StorageContext, load_index_from_storage, ServiceContext, set_global_service_context
from llama_index.llms import OpenAI
from llama_index.node_parser import SimpleNodeParser
from llama_index import (VectorStoreIndex, get_response_synthesizer)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.indices.postprocessor import SimilarityPostprocessor
# import logging, sys
# import openai

#### Enabling Debugging Mode

In [21]:
# Optional verbose logging. To enable it, uncomment these lines together with
# the commented-out `import logging, sys` and `import openai` lines in the
# first (imports) cell, since the names used here come from those imports.
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
# openai.log="debug"

#### One-Time Setup


In [2]:
# One-time setup: load the documents, choose the LLM, split the documents into
# nodes and build the service context.
#
# The chunking parameters live in one place so the node parser and the service
# context cannot drift apart (previously the parser used an overlap of 24 while
# the service context was given 20). The parser's value is kept because the
# parser is what actually produces `nodes`.
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 24

# Load every file found under ./data.
documents = SimpleDirectoryReader('./data').load_data()
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.8)

# Build the nodes.
parser = SimpleNodeParser.from_defaults(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
nodes = parser.get_nodes_from_documents(documents)

# The same parser (and the same chunking values) is handed to the service context.
service_context = ServiceContext.from_defaults(
    llm=llm,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    node_parser=parser,
)

Set up the global service context once and for all.

In [None]:
# Register the service context built in the setup cell as the global one, so
# later cells do not need to pass it explicitly.
set_global_service_context(service_context)

Using another LLM

In [None]:
# Alternative: build a service context around PaLM instead of OpenAI.
# NOTE(review): this rebinds `service_context`, replacing the OpenAI-based one
# created in the setup cell. This cell does not call
# set_global_service_context, so the global context presumably stays as it was
# until that call is re-run -- confirm the intended order of execution.
from llama_index.llms import PaLM
service_context=ServiceContext.from_defaults(llm=PaLM())

#### Document & Node Parser

We can also create each document manually and add metadata (and other fields, such as relationships) after the documents are loaded.

In [None]:
# Build a Document by hand. `text` must hold the document's content, so the
# file is read first; passing the path itself would create a document whose
# entire body is the string "./data/script.txt".
with open("./data/script.txt", encoding="utf-8") as script_file:
    document2 = Document(
        text=script_file.read(),
        metadata={"filename": "script.txt", "category": "paul_graham_essay"},
    )

# Metadata can also be set after loading. This assignment replaces the whole
# metadata dict of the first loaded document.
documents[0].metadata = {"category": "paul_graham_essays"}
print(documents[0].metadata)
print(documents[0].id_)

#### Creating shared Storage from nodes

We can build multiple indices from this shared storage

In [26]:
# Create a storage context with default settings and add the parsed nodes to
# its docstore, so the indices built from this storage context share them.
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)

#### Building Index (saved to disk)

In [27]:
# Only run once, at the beginning.
# Build a vector index and a summary index over the same nodes, both attached
# to the shared storage context.
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)
summary_index = SummaryIndex(nodes, storage_context=storage_context)
# Give each index an explicit id; the reload cell looks the indices up by
# these ids.
vector_index.set_index_id("vector_index")
summary_index.set_index_id("summary_index")

Save the storage (with both indices) to disk, in the `./storage` folder.

In [28]:
# Persist the storage context to the ./storage directory.
storage_context.persist(persist_dir="./storage")

#### Creating the Query Engine 
Reload both indices from the shared storage persisted on disk.

In [None]:
# Reopen the storage persisted under ./storage and load each index by the id
# assigned to it with set_index_id.
storage_context = StorageContext.from_defaults(persist_dir="./storage")
vector_index = load_index_from_storage(storage_context, index_id="vector_index")
summary_index = load_index_from_storage(storage_context, index_id="summary_index")


#### Configure the Retriever & Build Query Engine
An index can have a variety of index-specific retrieval modes. For instance, a summary index supports the default SummaryIndexRetriever that retrieves all nodes, and SummaryIndexEmbeddingRetriever that retrieves the top-k nodes by embedding similarity.

In [46]:
# --- Summary index -----------------------------------------------------------
# Retriever in 'default' mode (SummaryIndexRetriever), wrapped in a query engine.
summary_retriever = summary_index.as_retriever(retriever_mode="default")
summary_query_engine = RetrieverQueryEngine(retriever=summary_retriever)

# Embedding-based alternative (SummaryIndexEmbeddingRetriever):
# retriever = summary_index.as_retriever(retriever_mode='embedding')

# --- Vector index ------------------------------------------------------------
# VectorIndexRetriever with similarity_top_k=3; retrieved nodes additionally go
# through a SimilarityPostprocessor configured with a 0.7 cutoff.
response_synthesizer = get_response_synthesizer()
vector_retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=3)
vector_query_engine = RetrieverQueryEngine(
    retriever=vector_retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)

# Default option (no custom retriever configuration):
# summary_query_engine = summary_index.as_query_engine()
# vector_query_engine= vector_index.as_query_engine()

#### Run Queries

In [51]:
# Ask the vector query engine a question, then print the answer followed by
# the formatted source snippets of the response.
response = vector_query_engine.query("why do people love langchain?")
print(f"Here is the response: \n {response}\n Here is the source nodes:\n {response.get_formatted_sources()}")

Here is the response: 
 People love Langchain because it makes their life as a developer easier. It provides abstractions that allow for easy switching between vector datastores or embeddings with just a small code change. It also simplifies tasks like chunking files and loading them into a vector store. While there may be some challenges in getting started with Langchain, the documentation has improved and it is appreciated by many developers. Additionally, Langchain offers pluggability, which is useful for supporting different vendor LLMs and avoiding vendor lock-in. Overall, Langchain is seen as a valuable tool for developers in the LLM ecosystem.
 Here is the source nodes:
 > Source (Doc id: 8f7fdf70-18bb-4dfe-91e0-d44cad8eaf6b): We're working on building separate tooling to assist with this that we hope to launch soon.  morg...

> Source (Doc id: 6d9ead2a-c1e1-4737-b19a-1eb47d13b130): handonam 3 months ago  
              

the consistency and conventionality of the methods in the

In [45]:
# Ask the summary query engine for a summary, then print the answer followed
# by the formatted source snippets of the response.
# Note: this rebinds `response`, replacing the result of the previous query.
response = summary_query_engine.query("summarize the text")
print(f"Here is the response: \n {response}\n Here is the source nodes:\n {response.get_formatted_sources()}")

Here is the response: 
 The text is a discussion among individuals regarding their concerns and experiences with using a framework called LangChain. Some express frustration with the difficulty of debugging LangChain and the lack of transparency in its functionality. Others mention their reluctance to use LangChain for large complex projects due to its limitations. However, there is also a contrasting viewpoint that praises LangChain for its ability to merge different tactics from the ecosystem into a simpler and more intuitive framework. The text concludes with a playful narrative about a critic who initially dismisses LangChain but eventually realizes their own biases and narrow perspective.
 Here is the source nodes:
 > Source (Doc id: 2daff2ab-b77b-4ccb-ac11-2bfe3686aad7): messages.  ntindle 3 months ago  
              

This is our worry with building Auto-GPT as wel...
