In [None]:
! pip install llama_index
! pip install llama-parse
! pip install llmsherpa
! pip install llama-index-readers-pdf-marker
! pip install llama-index-readers-llama-parse
! pip install llama-index-readers-smart-pdf-loader
! pip install llama-index-indices-managed-postgresml
! pip install llama-index-storage-index-store-postgres
! pip install llama-index-storage-index-store-mongodb
! pip install llama-index-storage-index-store-postgres
! pip install llama-index-storage-docstore-postgres
! pip install llama-index-storage-docstore-mongodb
! pip install llama-index-vector-stores-postgres
! pip install llama-index-vector-stores-pinecone
! pip install llama-index-vector-stores-chroma
! pip install llama-index-llms-openai
! pip install llama-index-llms-ollama
! pip install llama-index-extractors-entity
! pip install llama-index-extractors-marvin
! pip install unstructured
! pip install lxml
! pip install spacy
! pip install jieba

In [None]:
from env import AppConfig
from llama_index.core import Settings
# Project-level configuration object; later cells read `config.pg_uri` and
# `config.logger` from it.
# NOTE(review): presumably AppConfig() also configures `Settings.llm` /
# `Settings.embed_model` — confirm in env.py.
config = AppConfig()
logger = config.logger

# Optional smoke test for the configured LLM (left disabled):
# resp = Settings.llm.complete("hello")
# config.logger.debug(resp)

# Shared query string used by the retriever and post-processor cells below.
# It was previously commented out, which made those cells fail with a
# NameError on a fresh "Restart & Run All".
question = 'who comes from Lawrence Berkeley National Laboratory in this book'

```01.01. Load vector index from database ```

In [None]:
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.core import Settings, VectorStoreIndex
from sqlalchemy import make_url
# Parse the configured Postgres URI so its parts can be passed individually.
url = make_url(config.pg_uri)

# HNSW index parameters handed through to the vector store unchanged.
hnsw_settings = {
    "hnsw_m": 16,
    "hnsw_ef_construction": 64,
    "hnsw_ef_search": 40,
    "hnsw_dist_method": "vector_cosine_ops",
}

# NOTE(review): embed_dim must equal the output dimension of the embedding
# model in use. 4096 does not match OpenAI's text-embedding models
# (1536 / 3072) — confirm which model Settings.embed_model is.
# NOTE(review): pgvector documents a 2,000-dimension limit for HNSW indexes
# on `vector` columns — verify that index creation succeeds with 4096.
pg_vec_store = PGVectorStore.from_params(
    host=url.host,
    port=url.port,
    database=url.database,
    user=url.username,
    password=url.password,
    table_name="vec_store",
    embed_dim=4096,
    hnsw_kwargs=hnsw_settings,
)

# Wrap the existing store in an index object; `from_vector_store` is given
# only the store and the embedding model.
pg_vec_index = VectorStoreIndex.from_vector_store(
    vector_store=pg_vec_store,
    embed_model=Settings.embed_model,
)

```01.02 Define metadata filters ```

In [None]:
from llama_index.core.vector_stores.types import (FilterOperator, 
                                                  FilterCondition, 
                                                  MetadataFilter, 
                                                  MetadataFilters) 

# Filter on the "department" metadata key; no operator is given, so the
# library default applies (presumably equality — confirm).
department_filter = MetadataFilter(key="department", value="Procurement")

# NOTE(review): this compares "security_classification" <= '' (empty string),
# which looks like a placeholder — confirm the intended threshold value.
classification_filter = MetadataFilter(
    key="security_classification",
    value='',
    operator=FilterOperator.LTE,
)

# Both conditions must hold (AND).
filters = MetadataFilters(
    filters=[department_filter, classification_filter],
    condition=FilterCondition.AND,
)

```01.03. Define Selectors ```

In [None]:
from llama_index.core.selectors import (LLMMultiSelector, 
                                        PydanticMultiSelector,
                                        LLMSingleSelector)
# Candidate choices, given as plain-text descriptions, that the selector
# picks from.
options = [
    "option 1: this is good for summarization questions",
    "option 2: this is useful for precise definitions",
    "option 3: this is useful for comparing concepts",
]

# Single-choice selector built with library defaults.
selector = LLMSingleSelector.from_defaults()

# Ask the selector which option fits the query and log the result's type.
selections = selector.select(options, query="What's the definition of space?")
logger.debug(type(selections))

```01.04.Retrievers```

In [None]:
from llama_index.core.retrievers import (AutoMergingRetriever, 
                                         BaseRetriever,
                                         BaseImageRetriever, 
                                         EmptyIndexRetriever, 
                                         KeywordTableSimpleRetriever,
                                         KGTableRetriever, 
                                         KnowledgeGraphRAGRetriever,
                                         LLMSynonymRetriever, ListIndexRetriever,
                                         RecursiveRetriever,
                                         RouterRetriever,
                                         TextToCypherRetriever,
                                         TransformRetriever,
                                         TreeRootRetriever,
                                         TreeSelectLeafRetriever,
                                         TreeSelectLeafEmbeddingRetriever,
                                         SummaryIndexEmbeddingRetriever, 
                                         SummaryIndexLLMRetriever,
                                         SummaryIndexRetriever,
                                         VectorIndexRetriever,
                                         VectorContextRetriever,
                                         VectorIndexAutoRetriever,
                                         )
from llama_index.core.vector_stores.types import VectorStoreQueryMode

# Vector retriever over the pgvector-backed index built earlier.
retriever = VectorIndexRetriever(index=pg_vec_index,
                                 vector_store_query_mode=VectorStoreQueryMode.DEFAULT,
                                 embed_model=Settings.embed_model,
                                 # Fix: `filters` expects a MetadataFilters object or None, not a
                                 # list. None means "no metadata filtering"; pass a MetadataFilters
                                 # instance here to restrict the results.
                                 filters=None,
                                 callback_manager=None,
                                 # NOTE(review): alpha is presumably only consulted in hybrid
                                 # query mode — confirm; the mode here is DEFAULT.
                                 alpha=0.9,
                                 verbose=True)

```01.05.Define Tools ```

In [None]:
from llama_index.core.tools import RetrieverTool
# Expose the vector retriever as a tool.
# NOTE(review): "...." is a placeholder description — the selector presumably
# chooses between tools by description, so replace it with a meaningful one
# before adding more tools.
vector_tool = RetrieverTool.from_defaults(
    retriever=retriever,
    description="....",
)

# Router that uses `selector` to choose among the registered retriever tools.
router_retriever = RouterRetriever(
    selector=selector,
    retriever_tools=[vector_tool],
    llm=Settings.llm,
)

# `question` must have been defined by an earlier cell.
resp = router_retriever.retrieve(question)
logger.debug(type(resp))

```01.06. DecomposeQueryTransform```

In [None]:
from llama_index.core.indices.query.query_transform.base import  DecomposeQueryTransform  
# Run the compound question through DecomposeQueryTransform and log the
# query string it produces.
decompose = DecomposeQueryTransform(llm=Settings.llm,)
query_bundle = decompose.run("Who comes from Lawrence Berkeley National Labaratory and when did the LBNL established")
logger.debug(f'bundle: {query_bundle.query_str}')

# Fix: the original referenced `idx`, which is never defined in this notebook
# (NameError). The only index built above is the pgvector-backed
# `pg_vec_index`, so the query engine is created from it.
query_engine = pg_vec_index.as_query_engine()
resp = query_engine.query(query_bundle.query_str)
logger.debug(resp)


```01.07. [OpenAIQuestionGenerator]() ```


```02.00 Postprocessors```
- Node Filtering Postprocessors
- Node Transforming Postprocessors
- Node Re-Ranking Postprocessors

```02.01. Postprocessors```

In [None]:
from llama_index.core.postprocessor import (AutoPrevNextNodePostprocessor, 
                                            EmbeddingRecencyPostprocessor,
                                            FixedRecencyPostprocessor,
                                            KeywordNodePostprocessor, #designed to refine the selection of nodes based on specific  keywords.
                                            LongContextReorder,
                                            MetadataReplacementPostProcessor,
                                            NERPIINodePostprocessor,
                                            PIINodePostprocessor, 
                                            PrevNextNodePostprocessor, # designed to enhance node retrieval by fetching additional  nodes based on their relational context in the document.
                                            SimilarityPostprocessor,
                                            SentenceEmbeddingOptimizer,
                                            TimeWeightedPostprocessor,)
# Retrieve candidate nodes for the shared query; `question` must have been
# defined by an earlier cell.
original_nodes = retriever.retrieve(question)

# Post-process the retrieved nodes with a similarity cutoff of 0.8.
pp = SimilarityPostprocessor(similarity_cutoff=0.8)
remaining_nodes = pp.postprocess_nodes(original_nodes)

# Log every node that survived the filter.
for node in remaining_nodes:
    logger.debug(node)

```02.02 ReRanking ```
* CohereRerank: [website](https://cohere.com/rerank)
* LongLLMLinguaPostprocessor: [Github](https://github.com/microsoft/LLMLingua/blob/main/examples/RAGLlamaIndex.ipynb)
* How can the quality of the re-ranking step be gauged?
* Understand the model-drift phenomenon.

In [None]:
from llama_index.core.postprocessor.llm_rerank import LLMRerank
from llama_index.core.postprocessor.rankGPT_rerank import RankGPTRerank
from llama_index.core.postprocessor.sbert_rerank import SentenceTransformerRerank

```03.00. Response Synthesizer```

In [None]:
from llama_index.core import get_response_synthesizer
from llama_index.core.schema import TextNode, NodeWithScore

# Three unrelated facts; only the first one answers the question asked below.
facts = (
    "The town square clock was built in 1895",
    "A turquoise parrot lives in the Amazon",
    "A rare orchid blooms only at midnight",
)
nodes = [TextNode(text=fact) for fact in facts]

# The synthesizer is given NodeWithScore wrappers; no score is supplied.
node_with_score_list = [NodeWithScore(node=node) for node in nodes]

# Synchronous, non-streaming synthesizer using the "refine" response mode.
synth = get_response_synthesizer(
    response_mode="refine",
    use_async=False,
    streaming=False,
)

response = synth.synthesize("When was the clock built?", nodes=node_with_score_list)
logger.debug(response)

```04.00 Implement output parsing techniques```
* [GuardrailsOutputParser](http://www.guardrailsai.com)

In [None]:
from llama_index.core.output_parsers.langchain import LangchainOutputParser
from llama_index.core.output_parsers.base import BaseOutputParser
from llama_index.core.output_parsers.pydantic import PydanticOutputParser
from llama_index.core.output_parsers.selection import SelectionOutputParser

```05.00. Exploring different methods of building query Engines```

In [None]:
# Query-engine classes imported from llama_index.core.query_engine in one
# place; none of them is used in this cell.
from llama_index.core.query_engine import (BaseQueryEngine,
                                           ComposableGraphQueryEngine,
                                           CogniswitchQueryEngine,
                                           CitationQueryEngine, 
                                           CustomQueryEngine, 
                                           JSONalyzeQueryEngine, 
                                           KnowledgeGraphQueryEngine,
                                           MultiStepQueryEngine,  
                                           NLSQLTableQueryEngine, 
                                           PandasQueryEngine, 
                                           PGVectorSQLQueryEngine, 
                                           RetrieverQueryEngine, 
                                           RetrieverRouterQueryEngine, 
                                           RetryGuidelineQueryEngine, 
                                           RetryQueryEngine, 
                                           RouterQueryEngine, # used to route each query to the particular retriever(s) it corresponds to
                                           RetrySourceQueryEngine, 
                                           SQLJoinQueryEngine, 
                                           SimpleMultiModalQueryEngine,
                                           SQLAutoVectorQueryEngine, 
                                           SQLTableRetrieverQueryEngine,
                                           SubQuestionQueryEngine, # divides a complex question into sub-questions and invokes the corresponding retriever for each
                                           TransformQueryEngine, 
                                           ToolRetrieverRouterQueryEngine,)

```05.01.implement advanced routing with RouterQueryEngine```

In [None]:
from llama_index.core.tools import QueryEngineTool
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import PydanticMultiSelector 
from llama_index.core import SummaryIndex, SimpleDirectoryReader
from llama_index.core.extractors import TitleExtractor
import nest_asyncio

# Allow nested use of the event loop inside the notebook kernel; the query
# engines below are created with use_async=True.
nest_asyncio.apply()

# Load everything under ./files.
documents = SimpleDirectoryReader("files").load_data()

# Merge the extractor's metadata for each document into that document;
# 'document_title' is read back from it further down.
title_extractor = TitleExtractor()
for doc in documents:
    doc.metadata.update(title_extractor.extract([doc])[0])

# Build one summary index, query engine and tool per document.
indexes, query_engines, tools = [], [], []
for doc in documents:
    document_title = doc.metadata['document_title']
    index = SummaryIndex.from_documents([doc])
    query_engine = index.as_query_engine(response_mode="tree_summarize", use_async=True)
    tool = QueryEngineTool.from_defaults(
        query_engine=query_engine,
        description=f"Contains data about {document_title}",
    )
    indexes.append(index)
    query_engines.append(query_engine)
    tools.append(tool)

# Router query engine over the per-document tools, using a multi-selector.
multi_selector = PydanticMultiSelector.from_defaults()
qe = RouterQueryEngine(selector=multi_selector, query_engine_tools=tools)

response = qe.query("Tell me about Rome and dogs")
print(response)

```05.02.Query multiple documents with SubQuestionQueryEngine```

In [None]:
from llama_index.core.tools import QueryEngineTool
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.selectors import PydanticMultiSelector
from llama_index.core.extractors import TitleExtractor
from llama_index.core import SummaryIndex, SimpleDirectoryReader

# Load the sample documents and merge the extractor's metadata into each one;
# 'document_title' is read back from it further down.
documents = SimpleDirectoryReader("files/sample").load_data()
title_extractor = TitleExtractor()
for doc in documents:
    title_metadata = title_extractor.extract([doc])
    doc.metadata.update(title_metadata[0])

indexes = []
query_engines = []
tools = []

# Counts how often each file name has been used as a tool name so far.
# Tools that share a name would collide when looked up by name; this happens
# when a single file yields several Documents (NOTE(review): e.g. presumably
# one Document per PDF page — confirm for the files in use).
name_occurrences = {}

for doc in documents:
    document_title = doc.metadata['document_title']
    file_name = doc.metadata['file_name']

    # The first Document of a file keeps the plain file name (unchanged
    # behaviour); later Documents from the same file get a numeric suffix.
    occurrence = name_occurrences.get(file_name, 0)
    name_occurrences[file_name] = occurrence + 1
    tool_name = file_name if occurrence == 0 else f"{file_name}_{occurrence}"

    index = SummaryIndex.from_documents([doc])
    query_engine = index.as_query_engine(
        response_mode="tree_summarize",
        use_async=True,
    )
    tool = QueryEngineTool.from_defaults(
        query_engine=query_engine,
        name=tool_name,
        description=f"Contains data about {document_title}",
    )
    indexes.append(index)
    query_engines.append(query_engine)
    tools.append(tool)

# Sub-question query engine built over the per-document tools.
qe = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=tools,
    use_async=True
)

response = qe.query(
    "Compare buildings from ancient Athens and ancient Rome"
)
print(response)