> Global Environment Settings.

In [None]:
! pip install llama-index-readers-smart-pdf-loader
! pip install llama_index
! pip install llmsherpa
! pip install llama-index-readers-pdf-marker
! pip install llama-index-readers-llama-parse
! pip install llama-parse
! pip install llama-index-storage-index-store-postgres
! pip install llama-index-indices-managed-postgresml
! pip install llama-index-storage-index-store-postgres
! pip install llama-index-storage-docstore-postgres
! pip install llama-index-storage-docstore-mongodb
! pip install llama-index-storage-index-store-mongodb
! pip install llama-index-llms-openai
! pip install llama-index-extractors-entity
! pip install llama-index-extractors-marvin
! pip install unstructured
! pip install lxml

```01.01✅.Set Application Runtime Environment```

In [36]:
import os
from env import AppConfig
from dotenv import load_dotenv
from llama_index.core import Settings
# Load the local ".env" file into the process environment first, so the
# lookups below can see the values it defines.
load_dotenv(".env")
mongo_uri, pg_uri = os.getenv("MONGO_URI"), os.getenv("PG_URI")

# AppConfig comes from the project-local `env` module and supplies the
# logger, the LLM and the embedding model used throughout this notebook.
config = AppConfig()
logger, llm, embedding = config.logger, config.llm, config.embedding

# using ollama
# Make both models the llama-index defaults via the global Settings object.
Settings.llm = llm
Settings.embed_model = embedding

DEBUG:env:host: http://localhost:11434, mode: mistral:latest
DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='/Users/tju/Workspace/books/llama-index-practise/.venv/lib/python3.11/site-packages/certifi/cacert.pem'
DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False
DEBUG:httpx:load_verify_locations cafile='/Users/tju/Workspace/books/llama-index-practise/.venv/lib/python3.11/site-packages/certifi/cacert.pem'
DEBUG:env:conn: postgresql://<user>:<password>@<host>:5432/llm?sslmode=require


```01.02✅.Test if local llm is working```

In [None]:
# Smoke test: send a trivial prompt to the configured LLM and log the reply.
greeting_response = llm.complete("hi")
logger.debug(greeting_response)

 ```02.01✅.Read Documents 1: Using SimpleDirectoryReader   ```

In [39]:
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import (DocxReader, 
                                      HWPReader, 
                                      PDFReader, 
                                      EpubReader, 
                                      FlatReader, 
                                      HTMLTagReader, 
                                      ImageCaptionReader, 
                                      ImageReader, 
                                      ImageVisionLLMReader, 
                                      IPYNBReader, 
                                      MarkdownReader, 
                                      MboxReader, 
                                      PptxReader, 
                                      PandasCSVReader, 
                                      PandasExcelReader,
                                      VideoAudioReader, 
                                      UnstructuredReader, 
                                      PyMuPDFReader, 
                                      ImageTabularChartReader, 
                                      XMLReader, 
                                      PagedCSVReader, 
                                      CSVReader, 
                                      RTFReader,)
# Read every supported file found under ./files into llama-index documents.
reader = SimpleDirectoryReader(input_dir="./files")
docs = reader.load_data()

DEBUG:llama_index.core.readers.file.base:> [SimpleDirectoryReader] Total files added: 1
DEBUG:fsspec.local:open file: /Users/tju/Workspace/books/llama-index-practise/files/2407.21290v1.pdf
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: TrackSorter: A Transformer-based sorting algori...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: variables from a continuous, multi-dimensional ...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: Figure 2: The detector schematic shows the top ...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: last predicted token is the [SEP] token. This t...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: a handful of samples. This adversely affects Wo...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: [12] L. Heinrich, T . Golling, M. Kagan, S. Kle...
DEBUG:env:Node ID: 65e5de3e-5ac7-41a9-b3df-bf879aec01dc
Text: TrackSorter: A Transformer-based sorting algorithm for track
finding in

```02.02✅.Read Documents 2: Using LlamaParse```

In [None]:
import os
import nest_asyncio
from llama_parse import LlamaParse

from llama_index.readers.smart_pdf_loader import SmartPDFLoader
from llama_index.readers.pdf_marker import PDFMarkerReader
from pathlib import Path
import numpy

# Allow asyncio event loops to nest; the notebook already runs one.
nest_asyncio.apply()

# PDF to parse. Set LLAMA_PARSE_PDF_PATH to point at a different file; the
# default keeps the path this notebook was originally written against.
pdf_path = os.getenv("LLAMA_PARSE_PDF_PATH", "/Users/tju/Downloads/Books/1.pdf")

parses = LlamaParse(
    api_key=os.getenv("LLAMA_CLOUD_API_KEY"),  # can also be set in your env as LLAMA_CLOUD_API_KEY
    result_type="markdown",  # "markdown" and "text" are available
    verbose=True,
)

# `document` is reused later when building the PostgresML index.
document = parses.load_data(pdf_path)
logger.debug(len(document))


# ❌
# llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
# pdf_url = "https://arxiv.org/pdf/1910.13461.pdf"  # also allowed is a file path e.g. /home/downloads/xyz.pdf
# pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)
# documents = pdf_loader.load_data(pdf_url)

# ❌
# path = Path("/Users/tju/Downloads/Books/1.pdf")
# reader = PDFMarkerReader()
# document = reader.load_data(path)

```03.01✅.Using a splitter to split documents into chunks```

In [44]:
from llama_index.core.node_parser import (TokenTextSplitter, 
                                          MetadataAwareTextSplitter,
                                          SentenceSplitter, 
                                          CodeSplitter)
# Split the loaded documents into overlapping chunks and log how many
# nodes were produced.
splitter = SentenceSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separator=" ",
    include_prev_next_rel=True,
)
nodes = splitter.get_nodes_from_documents(docs)
logger.debug(len(nodes))

DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: TrackSorter: A Transformer-based sorting algori...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: variables from a continuous, multi-dimensional ...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: Figure 2: The detector schematic shows the top ...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: last predicted token is the [SEP] token. This t...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: a handful of samples. This adversely affects Wo...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: [12] L. Heinrich, T . Golling, M. Kagan, S. Kle...
DEBUG:env:6


```03.02✅.Using a node parser to split documents into chunks```

In [48]:
from llama_index.core.node_parser import(SentenceWindowNodeParser, 
                                         SemanticDoubleMergingSplitterNodeParser, 
                                         SimpleFileNodeParser, 
                                         SemanticSplitterNodeParser,
                                         NodeParser, 
                                         LlamaParseJsonNodeParser,
                                         HTMLNodeParser, 
                                         JSONNodeParser, 
                                         SimpleNodeParser, 
                                         MarkdownNodeParser,
                                         LangchainNodeParser, 
                                         HierarchicalNodeParser, 
                                         MarkdownElementNodeParser,
                                         UnstructuredElementNodeParser)
# UnstructuredElementNodeParser produced an error message here, so it stays
# disabled and SimpleFileNodeParser is used instead.
# parser = UnstructuredElementNodeParser()
parser = SimpleFileNodeParser()
nodes = parser.get_nodes_from_documents(docs)
logger.debug(len(nodes))

DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: TrackSorter: A Transformer-based sorting algori...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: variables from a continuous, multi-dimensional ...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: Figure 2: The detector schematic shows the top ...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: last predicted token is the [SEP] token. This t...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: a handful of samples. This adversely affects Wo...
DEBUG:llama_index.core.node_parser.node_utils:> Adding chunk: [12] L. Heinrich, T . Golling, M. Kagan, S. Kle...
DEBUG:env:6


```03.03✅.Using a pipeline to split documents into chunks```

In [51]:
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor

# Transformations run in list order: the sentence splitter, then the title
# extractor, then the embedding model.
sentence_splitter = SentenceSplitter(chunk_size=1000, chunk_overlap=200)
title_extractor = TitleExtractor()
transformations = [
    sentence_splitter,
    title_extractor,
    embedding,
]
pipeline = IngestionPipeline(transformations=transformations)

# Run the pipeline over the loaded documents with four worker processes.
nodes = pipeline.run(documents=docs, num_workers=4)
logger.debug(f"nodes: {len(nodes)}")



100%|██████████| 1/1 [00:26<00:00, 26.85s/it]
100%|██████████| 1/1 [00:39<00:00, 39.93s/it]
100%|██████████| 1/1 [00:42<00:00, 42.65s/it]
100%|██████████| 1/1 [00:47<00:00, 47.70s/it]
100%|██████████| 1/1 [00:21<00:00, 21.90s/it]
100%|██████████| 1/1 [00:24<00:00, 24.14s/it]


DEBUG:env:nodes: 6


```04.01✅.Extract meta-data 1: TitleExtractor  ```

In [49]:
from llama_index.core.extractors import (
    TitleExtractor, 
    SummaryExtractor, 
    KeywordExtractor,
    PydanticProgramExtractor,
    QuestionsAnsweredExtractor,)

import nest_asyncio

# Allow asyncio event loops to nest; the notebook already runs one.
nest_asyncio.apply()

# Run the title extractor over the current nodes and log each metadata
# entry it returns.
title_extractor = TitleExtractor()
title_meta_list = title_extractor.extract(nodes)

for title_meta in title_meta_list:
    logger.debug(title_meta)

DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Content-Type', b'application/json; charset=utf-8'), (b'Date', b'Fri, 22 Nov 2024 16:55:41 GMT'), (b'Content-Length', b'1216')])
INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
DEBUG:httpcore.http11:receive_response_body.started request=<Request [b'POST']>
DEBUG:httpcore.http11:receive_response_body.complete
DEBUG:httpcore.http11:response_closed.started
DEBUG:httpcore.http11:response_closed.complete


100%|██████████| 1/1 [00:10<00:00, 10.76s/it]

DEBUG:httpcore.http11:send_request_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_headers.complete
DEBUG:httpcore.http11:send_request_body.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_body.complete
DEBUG:httpcore.http11:receive_response_headers.started request=<Request [b'POST']>





DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Content-Type', b'application/json; charset=utf-8'), (b'Date', b'Fri, 22 Nov 2024 16:55:47 GMT'), (b'Content-Length', b'1069')])
INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
DEBUG:httpcore.http11:receive_response_body.started request=<Request [b'POST']>
DEBUG:httpcore.http11:receive_response_body.complete
DEBUG:httpcore.http11:response_closed.started
DEBUG:httpcore.http11:response_closed.complete


  0%|          | 0/1 [00:00<?, ?it/s]

DEBUG:httpcore.http11:send_request_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_headers.complete
DEBUG:httpcore.http11:send_request_body.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_body.complete
DEBUG:httpcore.http11:receive_response_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Content-Type', b'application/json; charset=utf-8'), (b'Date', b'Fri, 22 Nov 2024 16:55:59 GMT'), (b'Content-Length', b'1353')])
INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
DEBUG:httpcore.http11:receive_response_body.started request=<Request [b'POST']>
DEBUG:httpcore.http11:receive_response_body.complete
DEBUG:httpcore.http11:response_closed.started
DEBUG:httpcore.http11:response_closed.complete


100%|██████████| 1/1 [00:11<00:00, 11.91s/it]

DEBUG:httpcore.http11:send_request_headers.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_headers.complete
DEBUG:httpcore.http11:send_request_body.started request=<Request [b'POST']>
DEBUG:httpcore.http11:send_request_body.complete
DEBUG:httpcore.http11:receive_response_headers.started request=<Request [b'POST']>





DEBUG:httpcore.http11:receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Content-Type', b'application/json; charset=utf-8'), (b'Date', b'Fri, 22 Nov 2024 16:56:03 GMT'), (b'Content-Length', b'757')])
INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
DEBUG:httpcore.http11:receive_response_body.started request=<Request [b'POST']>
DEBUG:httpcore.http11:receive_response_body.complete
DEBUG:httpcore.http11:response_closed.started
DEBUG:httpcore.http11:response_closed.complete
DEBUG:env:{'document_title': ' "Transformer-based Track Finding in High Energy Physics: A Novel Approach using Tokenization and Sorting with TRACK SORTER - Overcoming Challenges of Point Cloud Data Processing through LLM Capabilities and Efficient Algorithm Implementation"\n\nThis title reflects the main entities, themes, and contributions of the paper by providing a comprehensive description. It includes:\n\n1. The use of transformer-based models for track findin

```04.02✅.Extract meta-data 2: KeywordExtractor```

In [None]:
import nest_asyncio
from llama_index.core.extractors import KeywordExtractor

# Allow asyncio event loops to nest; the notebook already runs one.
nest_asyncio.apply()

# Fix: this section demonstrates keyword extraction, but the cell previously
# instantiated TitleExtractor, so `keyword_meta_list` contained titles.
keyword_extractor = KeywordExtractor()
keyword_meta_list = keyword_extractor.extract(nodes)
for item in keyword_meta_list:
    logger.debug(item)

```04.03❌.Extract meta-data : it needs huggingface api key.```

In [None]:
from llama_index.extractors.entity import EntityExtractor
from llama_index.extractors.marvin import MarvinMetadataExtractor

# NOTE: the model_name below is a placeholder and has to be replaced with a
# real Hugging Face model name before this cell can work.
entity_extractor = EntityExtractor(
    model_name="your model name on huggingface",
)
entity_meta_list = entity_extractor.extract(nodes)

for entity_meta in entity_meta_list:
    logger.debug(entity_meta)

```05.01✅.Prepare Storage Context: Mongo```

In [None]:
from llama_index.storage.docstore.mongodb import MongoDocumentStore
from llama_index.storage.index_store.mongodb import MongoIndexStore
from llama_index.core import StorageContext
# Back both the document store and the index store with the MongoDB
# instance addressed by `mongo_uri`.
mongo_docstore = MongoDocumentStore.from_uri(uri=mongo_uri)
mongo_index_store = MongoIndexStore.from_uri(uri=mongo_uri)
mongo_storage_context = StorageContext.from_defaults(
    docstore=mongo_docstore,
    index_store=mongo_index_store,
)

```05.02✅. Prepare Storage Context: Postgres```

In [None]:
from llama_index.storage.index_store.postgres import PostgresIndexStore
from llama_index.storage.docstore.postgres import PostgresDocumentStore
from llama_index.core import StorageContext

# Back both the index store and the document store with the Postgres
# database addressed by `pg_uri`.
pg_index_store = PostgresIndexStore.from_uri(uri=pg_uri)
pg_docstore = PostgresDocumentStore.from_uri(uri=pg_uri)
pg_storage_context = StorageContext.from_defaults(
    index_store=pg_index_store,
    docstore=pg_docstore,
)

```06.01✅.Persist Chunks```

In [None]:
# Store the same nodes in both document stores: Postgres first, then MongoDB.
for target_context in (pg_storage_context, mongo_storage_context):
    target_context.docstore.add_documents(nodes)

```07.01✅.create document indices and persist indices```

In [None]:
from llama_index.core import (DocumentSummaryIndex, 
                              KeywordTableIndex, 
                              KnowledgeGraphIndex, 
                              PropertyGraphIndex,
                              RAKEKeywordTableIndex,
                              SimpleKeywordTableIndex,
                              SummaryIndex, 
                              TreeIndex, 
                              VectorStoreIndex,
                              ListIndex, 
                              GPTListIndex,
                              GPTVectorStoreIndex,
                              GPTTreeIndex,
                              GPTSimpleKeywordTableIndex,)

# Build three index types over the same nodes, all attached to the
# Postgres-backed storage context, then persist that context.
summary_index = SummaryIndex(nodes=nodes, storage_context=pg_storage_context)
vector_index = VectorStoreIndex(
    nodes=nodes,
    embed_model=embedding,
    storage_context=pg_storage_context,
)
simple_keyword_index = SimpleKeywordTableIndex(
    nodes=nodes,
    storage_context=pg_storage_context,
)
pg_storage_context.persist()


```07.02✅.Create Indices By ollama.ai, and persist data in online database.```

In [8]:
from llama_index.storage.docstore.postgres import PostgresDocumentStore
from llama_index.storage.kvstore.postgres import PostgresKVStore
from llama_index.indices.managed.postgresml import PostgresMLIndex
# conn_str = pg_uri  # taken from the PG_URI environment variable; never inline credentials
# kv_store = PostgresKVStore(connection_string=conn_str, async_connection_string=conn_str, table_name="pdf",)
# doc_store = PostgresDocumentStore(postgres_kvstore=kv_store)

# Security fix: the PostgresML connection string (user name and password
# included) used to be hard-coded here. It must now come from the environment,
# e.g. a PGML_DATABASE_URL entry in the .env file loaded at the top of the
# notebook. The previously committed credentials should be rotated.
if not os.getenv("PGML_DATABASE_URL"):
    raise RuntimeError(
        "PGML_DATABASE_URL is not set; define it in .env or the process "
        "environment before building the PostgresML index."
    )

# Index the LlamaParse output (`document`) into the PostgresML collection.
index = PostgresMLIndex.from_documents(collection_name="llama-index-test-1", documents=document)
# retriever = index.as_retriever()
# results = retriever.retrieve("how many chapters in this book")
# logger.debug(results)

```08.01✅.Make query```

In [None]:
# Ask the vector index for a summary of the document and log the response.
summary_question = "please give the document summary."
query_engine = vector_index.as_query_engine()
resp = query_engine.query(summary_question)
logger.debug(resp)