#### 0.0.Pip

In [None]:
! pip install llama_index
! pip install llama-parse
! pip install llmsherpa
! pip install llama-index-readers-pdf-marker
! pip install llama-index-readers-llama-parse
! pip install llama-index-readers-smart-pdf-loader
! pip install llama-index-indices-managed-postgresml
! pip install llama-index-storage-index-store-postgres
! pip install llama-index-storage-index-store-mongodb
! pip install llama-index-storage-index-store-postgres
! pip install llama-index-storage-docstore-postgres
! pip install llama-index-storage-docstore-mongodb
! pip install llama-index-vector-stores-postgres
! pip install llama-index-vector-stores-pinecone
! pip install llama-index-vector-stores-mongodb
! pip install llama-index-vector-stores-chroma
! pip install llama-index-vector-stores-redis
! pip install llama-index-embeddings-huggingface
! pip install llama-index-embeddings-instructor
! pip install llama-index-llms-openai
! pip install llama-index-llms-ollama
! pip install llama-index-extractors-entity
! pip install llama-index-extractors-marvin
! pip install unstructured
! pip install lxml

#### 1.0.Set Application Runtime Environment

##### 1.1.Initialize AppConfig

In [None]:
import sys
# Add the parent directory to the import path. This must run before the
# `common.env` import below (presumably the `common` package lives one level up — confirm).
sys.path.append("../")

from llama_index.core import Settings
from common.env import AppConfig

# Build the application configuration once; later cells read `config.llm`,
# `config.embedding`, `config.mongo_uri` and `config.pg_uri` from it.
config = AppConfig()
# Logger exposed by the configuration object; every later cell logs through it.
logger = config.logger

##### 1.2.Test if local llm is working

In [None]:
# Smoke test: send the prompt "你好" ("hello") to `config.llm` and log the completion at debug level.
logger.debug(config.llm.complete("你好"))

#### 2.0 Using File or Directory Reader

##### 2.1.Reading documents using SimpleDirectoryReader

In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import (DocxReader, 
                                      HWPReader, 
                                      PDFReader, 
                                      EpubReader, 
                                      FlatReader, 
                                      HTMLTagReader, 
                                      ImageCaptionReader, 
                                      ImageReader, 
                                      ImageVisionLLMReader, 
                                      IPYNBReader, 
                                      MarkdownReader, 
                                      MboxReader, 
                                      PptxReader, 
                                      PandasCSVReader, 
                                      PandasExcelReader,
                                      VideoAudioReader, 
                                      UnstructuredReader, 
                                      PyMuPDFReader, 
                                      ImageTabularChartReader, 
                                      XMLReader, 
                                      PagedCSVReader, 
                                      CSVReader, 
                                      RTFReader,)
# Point the reader at the local PDF folder (relative to the notebook's working directory)
# and load everything it finds into `docs`.
reader = SimpleDirectoryReader(input_dir="./pdf_files")
docs = reader.load_data()

##### 2.2.Read Documents 2: Using LlamaParse

In [None]:
import os
import nest_asyncio
from llama_parse import LlamaParse

from llama_index.readers.smart_pdf_loader import SmartPDFLoader
from llama_index.readers.pdf_marker import PDFMarkerReader
from pathlib import Path

# Patch asyncio so an already-running event loop can be re-entered
# (presumably needed because LlamaParse drives async code inside the notebook kernel — confirm).
nest_asyncio.apply()

# LlamaParse client. The key is read from the environment; "markdown" and "text"
# are the result types listed as available.
parses = LlamaParse(
    result_type="markdown",
    verbose=True,
    api_key=os.getenv("LLAMA_CLOUD_API_KEY"),
)

# Parse a single PDF from ./pdf_files under the current working directory.
# This rebinds `docs`, replacing the documents loaded by SimpleDirectoryReader.
docs = parses.load_data(f"{Path.cwd()}/pdf_files/2407.21290v1.pdf")
logger.debug(len(docs))


# ❌
# llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
# pdf_url = "https://arxiv.org/pdf/1910.13461.pdf"  # also allowed is a file path e.g. /home/downloads/xyz.pdf
# pdf_loader = SmartPDFLoader(llmsherpa_api_url=llmsherpa_api_url)
# documents = pdf_loader.load_data(pdf_url)

# ❌
# path = Path("/Users/tju/Downloads/Books/1.pdf")
# reader = PDFMarkerReader()
# document = reader.load_data(path)

#### 3.0. Using Splitter, Parser, Pipeline to Separate Documents into Chunks

##### 3.1.SentenceSplitter

In [None]:
from llama_index.core.node_parser import (TokenTextSplitter, 
                                          MetadataAwareTextSplitter,
                                          SentenceSplitter, 
                                          CodeSplitter)
# Split the loaded documents into nodes with a sentence-aware splitter,
# keeping previous/next relationships between neighbouring nodes.
splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=128,
    separator=" ",
    include_prev_next_rel=True,
)
nodes = splitter.get_nodes_from_documents(docs)
logger.debug(len(nodes))

##### 3.2.Using Parser to separate documents to chunks

In [None]:
from llama_index.core.node_parser import(SentenceWindowNodeParser, 
                                         SemanticDoubleMergingSplitterNodeParser, 
                                         SimpleFileNodeParser, 
                                         SemanticSplitterNodeParser,
                                         NodeParser, 
                                         LlamaParseJsonNodeParser,
                                         HTMLNodeParser, 
                                         JSONNodeParser, 
                                         SimpleNodeParser, 
                                         MarkdownNodeParser,
                                         LangchainNodeParser, 
                                         HierarchicalNodeParser, 
                                         MarkdownElementNodeParser,
                                         UnstructuredElementNodeParser)
# nest_asyncio is left disabled in this cell.
# import nest_asyncio
# nest_asyncio.apply()
# Default-configured SentenceSplitter; it is not referenced again within this cell.
sentence_splitter = SentenceSplitter()
# Alternative parsers, kept commented out. The original note here read "has error message" —
# presumably about UnstructuredElementNodeParser; TODO confirm and record the actual error.
# parser = UnstructuredElementNodeParser()
# parser = SimpleFileNodeParser()
# Active choice: semantic splitting driven by the embedding model from the global Settings.
parser = SemanticSplitterNodeParser.from_defaults(embed_model=Settings.embed_model)
# parser = SentenceWindowNodeParser.from_defaults(window_size=2, window_metadata_key="text_window", original_text_metadata_key='original_sentence')
# Rebinds `nodes`, replacing the result of the SentenceSplitter cell.
nodes = parser.build_semantic_nodes_from_documents(docs)
logger.debug(len(nodes))

#### 4.0.Using Extractor to Refine meta_data

##### 4.1.TitleExtractor

In [None]:
import nest_asyncio
# Imported here so the cell runs on a fresh kernel; previously the name only
# became available after the ingestion-pipeline cell further down had been run.
from llama_index.core.extractors import TitleExtractor

# Patch asyncio so an already-running event loop can be re-entered
# (presumably required because the extractor runs async code inside the notebook kernel — confirm).
nest_asyncio.apply()

# This section demonstrates TitleExtractor, so bind it under a matching name.
title_extractor = TitleExtractor()
# `keyword_extractor` is kept as an alias because this notebook has historically
# bound this cell's extractor under that name.
keyword_extractor = title_extractor

# Extract title metadata for the current `nodes` and log each result at debug level.
title_meta_list = title_extractor.extract(nodes)
for item in title_meta_list:
    logger.debug(item)

##### 4.2.KeywordExtractor

In [None]:
import nest_asyncio
# Imported here so the cell does not rely on an import made by a later cell.
from llama_index.core.extractors import KeywordExtractor

# Patch asyncio so an already-running event loop can be re-entered
# (presumably required because the extractor runs async code inside the notebook kernel — confirm).
nest_asyncio.apply()

# Build the extractor this section is about. The cell used to create an unused
# TitleExtractor and then call whatever `keyword_extractor` an earlier cell had left
# behind, so no keyword extraction actually took place.
keyword_extractor = KeywordExtractor()

# Extract keyword metadata for the current `nodes` and log each result at debug level.
keyword_meta_list = keyword_extractor.extract(nodes=nodes)
for item in keyword_meta_list:
    logger.debug(item)

##### 4.3.Entity Extractor

In [None]:
from llama_index.extractors.entity import EntityExtractor
from llama_index.extractors.marvin import MarvinMetadataExtractor

# Entity extractor configured for CPU, with label_entities=True passed through.
entity_extractor = EntityExtractor(device="cpu", label_entities=True)
# Run it over the current `nodes`.
entity_meta_list = entity_extractor.extract(nodes=nodes)
# Log each item of the extractor's result at debug level.
for item in entity_meta_list:
    logger.debug(item)

##### 4.4. Using pipeline to ingest data

In [None]:
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter, SentenceSplitter
from llama_index.core.extractors import TitleExtractor, KeywordExtractor

# --- Splitters ---
# `token_text_splitter` is created but not placed in the pipeline below.
token_text_splitter = TokenTextSplitter(include_prev_next_rel=True)
sentence_splitter = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=20,
    include_prev_next_rel=True,
)

# --- Metadata extractors ---
title_extractor = TitleExtractor()
keyword_extractor = KeywordExtractor()
# entity_extractor = EntityExtractor(device="cpu", label_entities=True)

# --- Pipeline ---
# Stages in list order: sentence splitting, title extraction, keyword extraction,
# then `config.embedding` as the final stage.
transformations = [
    sentence_splitter,
    title_extractor,
    keyword_extractor,
    config.embedding,
]
pipeline = IngestionPipeline(transformations=transformations)

# Run over the loaded documents with 8 workers; rebinds `nodes` to the pipeline output.
nodes = pipeline.run(documents=docs, num_workers=8)
logger.debug(f"nodes: {len(nodes)}")

##### 4.5. Test Embedding

In [None]:
from llama_index.core import  VectorStoreIndex
# Build an in-memory vector index over the current `nodes`.
# index = VectorStoreIndex.from_documents(docs)
index = VectorStoreIndex(
    nodes=nodes,
    embed_model=Settings.embed_model,
)
query_engine = index.as_query_engine()

# Alternative questions that were tried (English gloss after each):
# resp = query_engine.query("林斌是谁")                      # "Who is Lin Bin?"
# resp = query_engine.query("林斌是哪年出生的")              # "What year was Lin Bin born?"
# resp = query_engine.query('详细介绍一下文件中的林斌')      # "Describe in detail the Lin Bin mentioned in the file."
# resp = query_engine.query('林斌的教育背景')                # "Lin Bin's educational background."
# resp = query_engine.query('林斌有硕士学历吗')              # "Does Lin Bin hold a master's degree?"
# resp = query_engine.query('林斌本科毕业于哪所大学')        # "Which university did Lin Bin get his bachelor's degree from?"

# Active question: "Which accounting firm is mentioned in the document content?"
resp = query_engine.query("文档内容中提到哪家会计师事务所")
logger.debug(resp)

#### 5.0 Preparing Store Context

##### 5.1.mongodb

In [None]:
from llama_index.storage.docstore.mongodb import MongoDocumentStore
from llama_index.storage.index_store.mongodb import MongoIndexStore
from llama_index.core import StorageContext
# Storage context whose docstore and index store are both created from `config.mongo_uri`.
# No vector store is passed here.
mongo_storage_context = StorageContext.from_defaults(
    docstore=MongoDocumentStore.from_uri(uri=config.mongo_uri),
    index_store=MongoIndexStore.from_uri(uri=config.mongo_uri),
)

##### 5.2.[postgresql](pg.sql)

In [None]:
from llama_index.storage.index_store.postgres import PostgresIndexStore
from llama_index.storage.docstore.postgres import PostgresDocumentStore
from llama_index.storage.kvstore.postgres import PostgresKVStore
from llama_index.vector_stores.postgres.base import PGVectorStore
from llama_index.core import StorageContext
from sqlalchemy import make_url
# Parse the configured Postgres URI once; its parts feed PGVectorStore.from_params below.
url = make_url(config.pg_uri)
# Log the connection target with the password masked explicitly, rather than relying on
# URL.__str__, whose masking behaviour differs between SQLAlchemy versions.
logger.debug(url.render_as_string(hide_password=True))

# All three stores live in the same database schema.
db_schema = 'qwen'

# Document store and index store are built straight from the URI.
pg_doc_store = PostgresDocumentStore.from_uri(
    uri=config.pg_uri,
    table_name='doc_store',
    schema_name=db_schema,
)

pg_idx_store = PostgresIndexStore.from_uri(
    uri=config.pg_uri,
    table_name='idx_store',
    schema_name=db_schema,
)

# The vector store takes discrete connection parameters instead of a URI.
pg_vec_store = PGVectorStore.from_params(
    database=url.database,
    host=url.host,
    port=url.port,
    password=url.password,
    user=url.username,
    schema_name=db_schema,
    table_name="vec_store_3584",
    # Vector width of the embedding model in use; the same number is encoded in the
    # table name. NOTE(review): the previous comment called this the OpenAI embedding
    # dimension, which 3584 is not — presumably it belongs to the Qwen model implied
    # by the schema name; confirm against config.embedding.
    embed_dim=3584,
    hnsw_kwargs={
        "hnsw_m": 16,
        "hnsw_ef_construction": 64,
        "hnsw_ef_search": 40,
        "hnsw_dist_method": "vector_cosine_ops",
    },
)

# Bundle the three Postgres-backed stores into one storage context for the cells below.
pg_storage_context = StorageContext.from_defaults(
    index_store=pg_idx_store,
    docstore=pg_doc_store,
    vector_store=pg_vec_store,
)

#### 6.0.Persist Indices

##### 6.1.Persist Chunks

In [None]:
# Write `docs` (the loaded documents) into the Postgres-backed docstore.
# NOTE(review): the section heading refers to chunks and the commented-out Mongo variant
# below passes `nodes` — confirm whether `nodes` was intended here as well.
pg_storage_context.docstore.add_documents(docs=docs)
# mongo_storage_context.docstore.add_documents(nodes)

##### 6.2. Persist Indices

In [None]:
from llama_index.core import (DocumentSummaryIndex, 
                              KeywordTableIndex, 
                              KnowledgeGraphIndex, 
                              PropertyGraphIndex,
                              RAKEKeywordTableIndex,
                              SimpleKeywordTableIndex,
                              SummaryIndex, 
                              TreeIndex, 
                              VectorStoreIndex,
                              ListIndex, 
                              GPTListIndex,
                              GPTVectorStoreIndex,
                              GPTTreeIndex,
                              GPTSimpleKeywordTableIndex,)

# Alternative index types are kept commented out; only the vector index is built.
# summary_index = SummaryIndex(nodes, storage_context=pg_storage_context)
# Build the vector index over `nodes` with `config.embedding`, backed by the Postgres storage context.
vector_index = VectorStoreIndex(nodes=nodes, embed_model=config.embedding, storage_context=pg_storage_context)
# simple_keyword_index = SimpleKeywordTableIndex(nodes=nodes, storage_context=pg_storage_context)
# pg_storage_context.index_store.persist()


In [None]:
! pip install jieba

##### 6.3.Create Indices By ollama.ai, and persist data in online database.

In [None]:
from llama_index.storage.docstore.postgres import PostgresDocumentStore
from llama_index.storage.kvstore.postgres import PostgresKVStore
from llama_index.indices.managed.postgresml import PostgresMLIndex
# Earlier experiment with an explicit KV store / docstore, kept for reference.
# Credentials are redacted here; keep real values in config or the environment, not in the notebook.
# conn_str = 'postgresql://<user>:<password>@192.168.1.3:5432/llm?sslmode=require'
# kv_store = PostgresKVStore(connection_string=conn_str, async_connection_string=conn_str, table_name="pdf",)
# doc_store = PostgresDocumentStore(postgres_kvstore=kv_store)

# Build a PostgresML-managed index from the loaded documents.
# This rebinds `index`, replacing the in-memory vector index created in the embedding test cell.
index = PostgresMLIndex.from_documents(
    collection_name="llama-index-test-1",
    documents=docs,
)

# Retrieval check, currently disabled.
# retriever = index.as_retriever()
# results = retriever.retrieve("how many chapters in this book")
# logger.debug(results)