In [None]:
# Install the notebook's dependencies.
# %pip is used instead of "! pip" so packages are installed into the environment
# of the running kernel rather than whichever pip is first on the shell PATH.
%pip install llama_index
%pip install llama-parse
%pip install llmsherpa
%pip install llama-index-readers-pdf-marker
%pip install llama-index-readers-llama-parse
%pip install llama-index-readers-smart-pdf-loader
%pip install llama-index-indices-managed-postgresml
%pip install llama-index-storage-index-store-postgres
%pip install llama-index-storage-index-store-mongodb
%pip install llama-index-storage-docstore-postgres
%pip install llama-index-storage-docstore-mongodb
%pip install llama-index-vector-stores-postgres
%pip install llama-index-vector-stores-pinecone
%pip install llama-index-vector-stores-chroma
%pip install llama-index-llms-openai
%pip install llama-index-llms-ollama
%pip install llama-index-extractors-entity
%pip install llama-index-extractors-marvin
%pip install unstructured
%pip install lxml

```01.01✅.Set Application Runtime Environment```

In [None]:
from env import AppConfig
from llama_index.core import Settings

# Build the application configuration once and reuse its logger everywhere below.
config = AppConfig()
logger = config.logger

# Register the configured LLM and embedding model as llama_index's global defaults
# (the original author notes this setup uses ollama).
Settings.llm = config.llm
Settings.embed_model = config.embedding

```01.02✅.Test if local llm is working```

In [None]:
# Send a minimal prompt to the configured LLM and log whatever comes back.
greeting_reply = config.llm.complete("hi")
logger.debug(greeting_reply)

 ```02.01✅.Read Documents 1: Using SimpleDirectoryReader   ```

In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import (DocxReader, 
                                      HWPReader, 
                                      PDFReader, 
                                      EpubReader, 
                                      FlatReader, 
                                      HTMLTagReader, 
                                      ImageCaptionReader, 
                                      ImageReader, 
                                      ImageVisionLLMReader, 
                                      IPYNBReader, 
                                      MarkdownReader, 
                                      MboxReader, 
                                      PptxReader, 
                                      PandasCSVReader, 
                                      PandasExcelReader,
                                      VideoAudioReader, 
                                      UnstructuredReader, 
                                      PyMuPDFReader, 
                                      ImageTabularChartReader, 
                                      XMLReader, 
                                      PagedCSVReader, 
                                      CSVReader, 
                                      RTFReader,)
# Point a SimpleDirectoryReader at ./pdf_files and load its contents into `docs`.
reader = SimpleDirectoryReader(input_dir="./pdf_files")
docs = reader.load_data()

```02.02✅.Read Documents 2: Using LlamaParse```

In [None]:
import os
import nest_asyncio
from llama_parse import LlamaParse

from llama_index.readers.smart_pdf_loader import SmartPDFLoader
from llama_index.readers.pdf_marker import PDFMarkerReader
from pathlib import Path

# Patch asyncio via nest_asyncio before using LlamaParse inside the notebook.
nest_asyncio.apply()

# The API key is read from the LLAMA_CLOUD_API_KEY environment variable.
# result_type accepts "markdown" or "text".
parses = LlamaParse(
    verbose=True,
    result_type="markdown",
    api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
)

# Parse a single PDF located under the current working directory.
# NOTE: this rebinds `docs`, replacing whatever an earlier cell loaded.
pdf_path = f"{Path.cwd()}/pdf_files/2407.21290v1.pdf"
docs = parses.load_data(pdf_path)
logger.debug(len(docs))


# ❌
# llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
# pdf_url = "https://arxiv.org/pdf/1910.13461.pdf"  # also allowed is a file path e.g. /home/downloads/xyz.pdf
# pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)
# documents = pdf_loader.load_data(pdf_url)

# ❌
# path = Path("/Users/tju/Downloads/Books/1.pdf")
# reader = PDFMarkerReader()
# document = reader.load_data(path)

```03.01✅.Using splitter to separate documents to chunks```

In [None]:
from llama_index.core.node_parser import (TokenTextSplitter, 
                                          MetadataAwareTextSplitter,
                                          SentenceSplitter, 
                                          CodeSplitter)
# Chunk the loaded documents with a SentenceSplitter and log how many nodes result.
splitter = SentenceSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separator=" ",
    include_prev_next_rel=True,
)
nodes = splitter.get_nodes_from_documents(docs)
logger.debug(len(nodes))

```03.02✅.Using Parser to separate documents to chunks```

In [None]:
from llama_index.core.node_parser import(SentenceWindowNodeParser, 
                                         SemanticDoubleMergingSplitterNodeParser, 
                                         SimpleFileNodeParser, 
                                         SemanticSplitterNodeParser,
                                         NodeParser, 
                                         LlamaParseJsonNodeParser,
                                         HTMLNodeParser, 
                                         JSONNodeParser, 
                                         SimpleNodeParser, 
                                         MarkdownNodeParser,
                                         LangchainNodeParser, 
                                         HierarchicalNodeParser, 
                                         MarkdownElementNodeParser,
                                         UnstructuredElementNodeParser)
# UnstructuredElementNodeParser produced an error for the author, so it stays disabled:
# parser = UnstructuredElementNodeParser()

# Use SimpleFileNodeParser instead.
# NOTE: this rebinds `nodes`, replacing the result of any earlier splitting cell.
parser = SimpleFileNodeParser()
nodes = parser.get_nodes_from_documents(docs)
logger.debug(len(nodes))

```03.03✅.Using pipeline to separate documents to chunks```

In [None]:
from llama_index.extractors.entity import EntityExtractor
from llama_index.extractors.marvin import MarvinMetadataExtractor

# Run entity extraction over the current nodes and log one metadata entry per node.
# NOTE(review): model_name below is a placeholder string — presumably it must be
# replaced with a real Hugging Face model id before this cell can succeed.
entity_extractor = EntityExtractor(
    model_name="your model name on huggingface",
)
entity_meta_list = entity_extractor.extract(nodes)
for entity_meta in entity_meta_list:
    logger.debug(entity_meta)

``` 04.00. Using Extractor to refine meta_data ```

In [None]:
from llama_index.core.extractors import (
    TitleExtractor, 
    SummaryExtractor, 
    KeywordExtractor,
    PydanticProgramExtractor,
    QuestionsAnsweredExtractor,)



```04.01✅.Extract meta-data 1: TitleExtractor  ```

In [None]:
import nest_asyncio

# Patch asyncio via nest_asyncio before calling the extractor from the notebook.
nest_asyncio.apply()

# Extract title metadata for the current nodes.
# The variables are named title_* because this cell uses TitleExtractor; the
# previous keyword_* names described a different extractor.
title_extractor = TitleExtractor()
title_meta_list = title_extractor.extract(nodes)
for title_meta in title_meta_list:
    logger.debug(title_meta)

```04.02✅.Extract meta-data 2: KeywordExtractor```

In [None]:
import nest_asyncio

# Patch asyncio via nest_asyncio before calling the extractors from the notebook.
nest_asyncio.apply()

# Run the title extractor first, then the keyword extractor, on the same nodes.
title_extractor = TitleExtractor()
title_meta_list = title_extractor.extract(nodes)

keyword_extractor = KeywordExtractor()
keyword_meta_list = keyword_extractor.extract(nodes)

# Log the keyword metadata first, followed by the title metadata.
for meta in (*keyword_meta_list, *title_meta_list):
    logger.debug(meta)

```04.03✅. Read Document By Pipeline working with Extractors to refine meta-data```

In [None]:
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor

# Assemble the ingestion steps in the order they run:
# split -> title metadata -> keyword metadata -> embedding.
sentence_splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=200)
title_extractor = TitleExtractor()
keyword_extractor = KeywordExtractor()
transformations = [
    sentence_splitter,
    title_extractor,
    keyword_extractor,
    config.embedding,
]

# Run the pipeline over the loaded documents with four workers and log the node count.
pipeline = IngestionPipeline(transformations=transformations)
nodes = pipeline.run(documents=docs, num_workers=4)
logger.debug(f"nodes: {len(nodes)}")

```05.01✅.Prepare Storage Context: Mongo```

In [None]:
from llama_index.storage.docstore.mongodb import MongoDocumentStore
from llama_index.storage.index_store.mongodb import MongoIndexStore
from llama_index.storage.kvstore.mongodb import MongoDBKVStore
from llama_index.core import StorageContext
# Back the document store and index store with MongoDB.
# No vector_store is passed here: MongoDBKVStore is a key-value store, not a
# vector store, so supplying it as vector_store would break any index that tries
# to use it. Omitting the argument lets StorageContext fall back to its default
# vector store.
mongo_storage_context = StorageContext.from_defaults(
    docstore=MongoDocumentStore.from_uri(uri=config.mongo_uri),
    index_store=MongoIndexStore.from_uri(uri=config.mongo_uri),
)

```05.02✅. Prepare Storage Context: Postgres```

In [None]:
from llama_index.storage.index_store.postgres import PostgresIndexStore
from llama_index.storage.docstore.postgres import PostgresDocumentStore
from llama_index.storage.kvstore.postgres import PostgresKVStore
from llama_index.vector_stores.postgres.base import PGVectorStore
from llama_index.core import StorageContext
from sqlalchemy import make_url
# Parse the connection string once so its components can be passed to PGVectorStore.
url = make_url(config.pg_uri)
# Log the URL with the password masked so credentials never reach the log output.
logger.debug(url.render_as_string(hide_password=True))

# Index store and document store each get their own table in the same database.
pg_idx_store = PostgresIndexStore.from_uri(uri=config.pg_uri, table_name='idx_store')
pg_doc_store = PostgresDocumentStore.from_uri(uri=config.pg_uri, table_name='doc_store')

# Vector store backed by the same database, using an HNSW index.
pg_vec_store = PGVectorStore.from_params(
    database=url.database,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="vec_store",
    # Must equal the vector length produced by config.embedding.
    # TODO confirm 4096 is correct for the configured embedding model.
    embed_dim=4096,
    hnsw_kwargs={
        "hnsw_m": 16,
        "hnsw_ef_construction": 64,
        "hnsw_ef_search": 40,
        "hnsw_dist_method": "vector_cosine_ops",
    },
)

# Bundle the three Postgres-backed stores into one storage context.
pg_storage_context = StorageContext.from_defaults(
    index_store=pg_idx_store,
    docstore=pg_doc_store,
    vector_store=pg_vec_store,
)

```06.01✅.Persist Chunks```
* To avoid storing a document multiple times, query the store to check whether the document already exists before persisting any docs.
* If the document has already been stored, delete it first.

In [None]:
# Write the loaded documents into the Postgres-backed document store.
pg_docstore = pg_storage_context.docstore
pg_docstore.add_documents(docs=docs)
# MongoDB alternative, currently disabled:
# mongo_storage_context.docstore.add_documents(nodes)

```07.01✅.create document indices and persist indices```

In [None]:
from llama_index.core import (DocumentSummaryIndex, 
                              KeywordTableIndex, 
                              KnowledgeGraphIndex, 
                              PropertyGraphIndex,
                              RAKEKeywordTableIndex,
                              SimpleKeywordTableIndex,
                              SummaryIndex, 
                              TreeIndex, 
                              VectorStoreIndex,
                              ListIndex, 
                              GPTListIndex,
                              GPTVectorStoreIndex,
                              GPTTreeIndex,
                              GPTSimpleKeywordTableIndex,)

# Alternative index types, currently disabled:
# summary_index = SummaryIndex(nodes, storage_context=pg_storage_context)
# simple_keyword_index = SimpleKeywordTableIndex(nodes=nodes, storage_context=pg_storage_context)

# Build a vector index from the documents using the configured embedding model,
# with the Postgres storage context as its backing stores.
vector_index = VectorStoreIndex.from_documents(
    documents=docs,
    storage_context=pg_storage_context,
    embed_model=config.embedding,
)
# pg_storage_context.index_store.persist()


```07.02✅.Create Indices By ollama.ai, and persist data in online database.```

In [None]:
from llama_index.storage.docstore.postgres import PostgresDocumentStore
from llama_index.storage.kvstore.postgres import PostgresKVStore
from llama_index.indices.managed.postgresml import PostgresMLIndex
# conn_str = 'postgresql://llm:llm@192.168.1.3:5432/llm?sslmode=require'
# kv_store = PostgresKVStore(connection_string=conn_str, async_connection_string=conn_str, table_name="pdf",)
# doc_store = PostgresDocumentStore(postgres_kvstore=kv_store)
import os

# The PostgresML connection string contains credentials, so it must be supplied
# through the PGML_DATABASE_URL environment variable rather than being written
# into the notebook. Fail early with a clear message when it is missing.
if not os.environ.get("PGML_DATABASE_URL"):
    raise RuntimeError(
        "PGML_DATABASE_URL is not set; export your PostgresML connection string "
        "before running this cell."
    )

# Build a PostgresML-managed index from the loaded documents.
index = PostgresMLIndex.from_documents(collection_name="llama-index-test-1", documents=docs)
# retriever = index.as_retriever()
# results = retriever.retrieve("how many chapters in this book")
# logger.debug(results)

```08.01✅.Make query```

In [None]:
# Turn the vector index into a query engine and ask one question about the document.
query_engine = vector_index.as_query_engine()
# Earlier example question, currently disabled:
# resp = query_engine.query("what do they argue about?")
# logger.debug(resp)
question = "what are the Results and Discussions in the given context?"
resp = query_engine.query(question)
logger.debug(resp)