In [1]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from langgraph.checkpoint.memory import MemorySaver # checkpointer
# from langgraph.prebuilt import create_react_agent # tool?
# from langchain_community.tools.tavily_search import TavilySearchResults 
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.runnables import RunnableLambda
import os

# Read the LangSmith API key from a local file so the secret never appears in
# the notebook itself. Only the first line of the file is consulted.
with open('../key.txt', "r", encoding="utf-8") as file:
    content = file.readline()

# readline() keeps the trailing newline, so strip surrounding whitespace before
# taking the last space-separated token; otherwise the key would end in "\n".
api_key = content.strip().split(' ')[-1]

# LangSmith tracing configuration, read by LangChain from the environment.
os.environ['LANGCHAIN_TRACING_V2'] = "true"
os.environ['LANGCHAIN_API_KEY'] = api_key
os.environ['LANGCHAIN_ENDPOINT'] = "https://api.smith.langchain.com"
os.environ['LANGCHAIN_PROJECT'] = "project-test-1"


### build chroma vector-store

In [3]:
# Toy corpus as (page_content, source tag) pairs; each pair becomes one Document.
pet_facts = [
    ("Dogs are great companions, known for their loyalty and friendliness.", "mammal-pets-doc"),
    ("Cats are independent pets that often enjoy their own space.", "mammal-pets-doc"),
    ("Goldfish are popular pets for beginners, requiring relatively simple care.", "fish-pets-doc"),
    ("Parrots are intelligent birds capable of mimicking human speech.", "bird-pets-doc"),
    ("Rabbits are social animals that need plenty of space to hop around.", "mammal-pets-doc"),
]

documents = [
    Document(page_content=text, metadata={"source": source})
    for text, source in pet_facts
]

# Embed the documents with OpenAI embeddings and index them in a Chroma store.
# The persistence location can be inspected via vectorstore._client_settings.
vectorstore = Chroma.from_documents(documents, embedding=OpenAIEmbeddings())

# Example queries (left disabled):
# vectorstore.similarity_search("cat") # query
# await vectorstore.asimilarity_search_with_score("pet")

In [19]:
# Wrap the vector store's similarity_search in a Runnable so it can be piped
# into a chain; bind(k=1) passes k=1 on every call, i.e. one hit per query.
retriever = RunnableLambda(vectorstore.similarity_search).bind(k=1)  # select top result
# Run the retriever once per query in the list.
# NOTE(review): this is not the cell's last expression, so its result is not
# rendered; the saved output (two result lists) looks like it came from this
# call rather than from invoke() — confirm by re-running the cell.
retriever.batch(["cat", "shark"])
# Single-query call; as the last expression its return value is the cell output.
retriever.invoke('cat')

[[Document(metadata={'source': 'mammal-pets-doc'}, page_content='Cats are independent pets that often enjoy their own space.')],
 [Document(metadata={'source': 'fish-pets-doc'}, page_content='Goldfish are popular pets for beginners, requiring relatively simple care.')]]

### Build Chroma + LangChain RAG

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter
from langchain.globals import set_verbose, set_debug

# Turn off LangChain's verbose and debug console logging for this section.
set_debug(False)
set_verbose(False)
# NOTE(review): overwrites the LangSmith endpoint with an empty string —
# presumably to stop trace uploads from here on; confirm this is the intent.
os.environ['LANGCHAIN_ENDPOINT'] = ""

# Prompt template: the model must answer {question} using only {context}.
message = """
Answer this question using the provided context only.

{question}

Context:
{context}
"""

prompt = ChatPromptTemplate.from_messages([("human", message)])
model = ChatOpenAI(model_name='gpt-4o-mini')

# Two equivalent ways to wire retrieval into the chain; flip the booleans to
# choose which variant runs.
if False:  # Case: Input = dict
    # The caller passes {'question': ...}; assign() adds a 'context' key by
    # feeding the question through the retriever.
    chain = (
        RunnablePassthrough.assign(context=itemgetter("question") | retriever)
        | prompt
        | model
    )
    print(chain.invoke({'question': "tell me about cat"}).content)

if True:  # Case: Input = text
    # The caller passes the raw question string; it goes to the retriever for
    # 'context' and is passed through unchanged as 'question'.
    chain = {'context': retriever, 'question': RunnablePassthrough()} | prompt | model
    answer = chain.invoke("tell me about dog")
    print(answer.content)
    

### Parent document retriever

In [1]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Source files for the parent-document demo: one PDF and one UTF-8 text file.
# Raw string literals (r"...") are required for these Windows paths: in a
# normal literal, sequences such as "\数" are invalid escapes, which emit a
# DeprecationWarning/SyntaxWarning and are slated to become a syntax error.
# The resulting path strings are unchanged.
loaders = [
    PyPDFLoader(r"E:\数智项目资料\数字员工\卓越成效\卓越绩效管理.pdf"),
    TextLoader(r"E:\数智项目资料\数字员工\卓越成效\数字员工分析-1.txt", encoding="utf-8"),
]

# Merge the documents produced by every loader into one flat list.
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [16]:
# Storage layer that holds the parent documents.
store = InMemoryStore()

# Vector store in which the child chunks are indexed.
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    collection_name="full_documents",
)

# Splitter that cuts each document into the child chunks (chunk_size=400).
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Tie the three pieces together. No parent splitter is configured here.
retriever = ParentDocumentRetriever(
    docstore=store,
    vectorstore=vectorstore,
    child_splitter=child_splitter,
)

# 1. Add docs through the parent retriever, which fills the vector store and
#    the docstore. ids=None leaves document-id assignment to the retriever.
retriever.add_documents(docs, ids=None)

In [28]:
# Run the same query against both layers and compare how many hits come back.
query = "数字员工"

# 2. Search the small chunks directly in the vector store.
sub_docs = vectorstore.similarity_search(query)
print(len(sub_docs))

# The same query through the parent-document retriever.
parent_docs = retriever.invoke(query)
print(len(parent_docs))

4
1


### To be Continued
- Retrieve: + hard rules & filters
- Retrieve: Ensemble retrieved results
- Embedding: Multiple embeddings
- Model: Weigh returned documents
- Taxonomy: 