In [None]:
! pip install llama_index
! pip install llama-parse
! pip install llmsherpa
! pip install llama-index-readers-pdf-marker
! pip install llama-index-readers-llama-parse
! pip install llama-index-readers-smart-pdf-loader
! pip install llama-index-indices-managed-postgresml
! pip install llama-index-storage-index-store-postgres
! pip install llama-index-storage-index-store-mongodb
! pip install llama-index-storage-index-store-postgres
! pip install llama-index-storage-docstore-postgres
! pip install llama-index-storage-docstore-mongodb
! pip install llama-index-vector-stores-postgres
! pip install llama-index-vector-stores-pinecone
! pip install llama-index-vector-stores-mongodb
! pip install llama-index-vector-stores-chroma
! pip install llama-index-vector-stores-redis
! pip install llama-index-embeddings-huggingface
! pip install llama-index-embeddings-instructor
! pip install llama-index-embeddings-ollama
! pip install llama-index-llms-openai
! pip install llama-index-llms-ollama
! pip install llama-index-extractors-entity
! pip install llama-index-extractors-marvin
! pip install unstructured
! pip install lxml

##### 1. Runtime Environment

- initialize llm

In [None]:
import sys

# Make the parent directory importable so the project-local `common` package resolves.
# Guarded so that re-running this cell does not append a duplicate entry.
if "../" not in sys.path:
    sys.path.append("../")

from llama_index.core import Settings
from common.llm import LlmConfig, LocalLLM

# Build the LLM configuration selected by LocalLLM.LM_STUDIO and reuse its logger
# for all output in the rest of the notebook.
config = LlmConfig(LocalLLM.LM_STUDIO)
logger = config.logger

- test local llm

In [None]:
# Smoke test: send a short prompt to the configured LLM and log the completion.
logger.debug(
    config.llm.complete("你好")
)

##### 2. read documents

- read docs from directory

In [None]:
# Standard library first, then the project-local reader helper.
from pathlib import Path

from common.reader import read_files_from_directory

# Read the documents under ./pdf (relative to the current working directory).
docs = read_files_from_directory(
    "./pdf",
)

- read docs by llama cloud

In [None]:
# Imported in this cell so it does not depend on another cell having been run first.
from pathlib import Path

from common.reader import read_files_by_llamaParse

# Absolute path of the PDF to parse, anchored at the current working directory.
file_name = str(Path.cwd() / "pdf" / "2023_530-576.pdf")

# Rebinds `docs` with the result of the LlamaParse-based reader.
docs = read_files_by_llamaParse(file_input=file_name)

##### 3. generate nodes

- splitter

In [None]:
from common.parser import sentence_splitter

# Turn the loaded documents into nodes with the project's sentence splitter helper.
nodes = sentence_splitter(
    docs=docs,
)

- parser

In [None]:
from common.parser import sematic_splitter_node_parse

# Use the embedding model carried by the notebook's `config`, the same one the
# pipeline and vector-index cells pass, rather than the global Settings.embed_model.
nodes = sematic_splitter_node_parse(docs=docs, embedding=config.embedding)

# Log how many nodes the splitter produced.
logger.debug(len(nodes))

- pipeline

In [None]:
from common.parser import gen_nodes_by_pipeline

# Build nodes through the project's pipeline helper, passing the configured embedding model.
nodes = gen_nodes_by_pipeline(
    docs=docs,
    embedding=config.embedding,
)

# Log the resulting node count.
logger.debug(f"nodes: {len(nodes)}")

##### 4. extract meta-data

- title extractor

In [None]:
from common.extractor import title_extractor

# Run the project's title extractor over the current nodes.
# The result is bound to `meta_data_list`, the same name the keyword and entity
# extractor cells use (previously misspelled as `meta_data_List`).
meta_data_list = title_extractor(nodes=nodes)

- keyword extractor

In [None]:
from common.extractor import keyword_extractor

# Run the project's keyword extractor over the current nodes; rebinds `meta_data_list`.
meta_data_list = keyword_extractor(
    nodes=nodes,
)

- entity extractor

In [None]:
from common.extractor import entity_extractor

# Run the project's entity extractor over the current nodes; rebinds `meta_data_list`.
meta_data_list = entity_extractor(
    nodes=nodes,
)

##### 5. Preparing Store Context

- mongo

In [None]:
from common.storage import get_mongo_storage_context

# Obtain the MongoDB storage context from the project helper, using its defaults.
mongo_storage_context = (
    get_mongo_storage_context()
)

- [pg](pg.sql)

In [None]:
from common.storage import get_pg_storage_context

# Obtain the Postgres storage context for schema "qwen".
# NOTE(review): dims=1536 presumably has to match the output size of the embedding
# model in use; confirm against config.embedding.
pg_storage_context = get_pg_storage_context(
    db_schema="qwen",
    dims=1536,
)

- neo4j

In [None]:
from common.storage import get_neo4j_storage_context

# Obtain the Neo4j storage context from the project helper, using its defaults.
neo4j_storage_context = (
    get_neo4j_storage_context()
)

##### 6. index

- vector index

In [None]:
# Index classes exported by llama_index.core. Only VectorStoreIndex is used in this
# cell; the other names are kept importable for the rest of the notebook.
from llama_index.core import (
    DocumentSummaryIndex,
    GPTListIndex,
    GPTSimpleKeywordTableIndex,
    GPTTreeIndex,
    GPTVectorStoreIndex,
    KeywordTableIndex,
    KnowledgeGraphIndex,
    ListIndex,
    PropertyGraphIndex,
    RAKEKeywordTableIndex,
    SimpleKeywordTableIndex,
    SummaryIndex,
    TreeIndex,
    VectorStoreIndex,
)

# Build a vector index over the current nodes with the configured embedding model,
# stored through the Postgres storage context.
vector_index = VectorStoreIndex(
    nodes=nodes,
    embed_model=config.embedding,
    storage_context=pg_storage_context,
)
# Explicit persistence of the index store is left disabled:
# pg_storage_context.index_store.persist()

In [None]:
# Candidate questions about the indexed document. Only one is sent per run;
# the original cell assigned all three to `q` in turn, so only the last took effect.
questions = [
    "用于临床检测的自身抗体主要有几类",
    "列出表20-2的内容",
    "请描述自身抗体检测的实验室分析路径",
]
q = questions[-1]  # pick a different entry to ask another question

# Query the vector index with its default query engine and log the response.
query_engine = vector_index.as_query_engine()
resp = query_engine.query(q)
logger.debug(resp)

- summary index

In [None]:
from llama_index.core import SummaryIndex

# Build a summary index over the current nodes, stored through the Postgres storage context.
summary_index = SummaryIndex(
    nodes=nodes,
    storage_context=pg_storage_context,
)

- simple keyword index

In [None]:
from llama_index.core import SimpleKeywordTableIndex

# Build a simple keyword-table index over the current nodes, stored through the
# Postgres storage context.
simple_keyword_index = SimpleKeywordTableIndex(
    nodes=nodes,
    storage_context=pg_storage_context,
)

- PostgresML managed index

In [None]:
# The two store classes are imported but not used in this cell.
from llama_index.storage.docstore.postgres import PostgresDocumentStore
from llama_index.storage.kvstore.postgres import PostgresKVStore
from llama_index.indices.managed.postgresml import PostgresMLIndex

# Build a PostgresML managed index from the loaded documents under the
# collection name "llama-index-test-1".
index = PostgresMLIndex.from_documents(
    collection_name="llama-index-test-1",
    documents=docs,
)