## Library imports

In [None]:
import os
from typing import List, Literal, Annotated
from typing_extensions import TypedDict
from pydantic import BaseModel, Field

from langchain import PromptTemplate, LLMChain
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_core.tools import tool
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, SystemMessage
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_chroma import Chroma
from langchain_text_splitters import TokenTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain.schema import Document
from langchain_community.retrievers import TavilySearchAPIRetriever
from langchain_huggingface import HuggingFaceEmbeddings

from langgraph.graph import START, END, MessagesState, StateGraph
from langgraph.checkpoint.memory import MemorySaver, InMemorySaver
from langgraph.graph import START, END, MessagesState, StateGraph
from langgraph.checkpoint.memory import InMemorySaver

## Load API keys

In [None]:
from dotenv import load_dotenv

# Run python-dotenv's loader first so the lookups below can see any variables
# it adds to the process environment.
load_dotenv()

# Each key is None when the corresponding environment variable is not set.
deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY")
silicon_api_key = os.environ.get("SILICON_API_KEY")
tavily_api_key = os.environ.get("TAVILY_API_KEY")

In [None]:
# SiliconFlow: OpenAI-compatible endpoint and the chat model requested from it.
silicon_base_url = "https://api.siliconflow.cn/v1"
silicon_llm_model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Hugging Face: sentence-embedding model name used for the vector store.
huggingface_embed_model = "sentence-transformers/all-MiniLM-L6-v2"

## LLM Init

In [None]:
# Chat model client: the OpenAI-compatible ChatOpenAI wrapper pointed at the
# SiliconFlow base URL, authenticated with the SiliconFlow API key.
llm = ChatOpenAI(
    base_url=silicon_base_url,
    openai_api_key=silicon_api_key,
    model=silicon_llm_model,
    # Sampling settings.
    max_tokens=2048,
    temperature=1.2,
)

## Vector store

In [None]:
# All data locations are resolved relative to the parent of the working directory.
root_path = "../"

# Downloaded website pages (input) and the persisted vector store (output).
WEB_DATA_DIR = os.path.join(root_path, "downloads/website")
VECTOR_STORE_DIR = os.path.join(root_path, "var/vector")

In [None]:
import chromadb
from chromadb.config import Settings

# Source directory of HTML pages and target directory of the persisted store.
html_directory, persist_directory = WEB_DATA_DIR, VECTOR_STORE_DIR

# Identifiers used for the vector store.
collection_id = "calerie-health"
tenant_id = "agent_tenant"

################################################################################
## Configure the embeddings model
################################################################################
# A single embeddings instance is shared by store creation and store loading.
print(f"Create HuggingFaceEmbeddings {huggingface_embed_model}")
embeddings = HuggingFaceEmbeddings(model_name=huggingface_embed_model)

def vector_create(persist_directory):
    """Build a Chroma vector store from the HTML files under ``html_directory``.

    Every ``*.html`` file found recursively under the module-level
    ``html_directory`` is loaded with ``UnstructuredHTMLLoader``, split into
    overlapping chunks, and the chunks are embedded with the module-level
    ``embeddings`` into the ``collection_id`` collection.

    Only the chunks are indexed. The unsplit pages are deliberately not
    embedded: indexing them next to their own chunks would store the same
    content twice and let a whole page compete with its chunks at query time.

    Args:
        persist_directory: Directory handed to Chroma as its persistence
            location.

    Returns:
        The populated ``Chroma`` vector store.

    Raises:
        ValueError: If no chunks could be produced from ``html_directory``.
    """
    # Load every HTML file below the source directory.
    loader = DirectoryLoader(
        path=html_directory, glob="**/*.html", loader_cls=UnstructuredHTMLLoader
    )
    documents = loader.load()

    # Split pages into chunks of up to 512 characters with 100 characters of overlap.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
    all_splits = text_splitter.split_documents(documents)

    # Fail before the store is created: the notebook's startup check treats an
    # existing persist directory as a ready-to-use store, so an empty one left
    # behind here would be silently reused on the next run.
    if not all_splits:
        raise ValueError(
            f"No HTML content found under {html_directory!r}; nothing to index."
        )

    # Create the collection and index the chunks in a single step.
    vectordb = Chroma.from_documents(
        documents=all_splits,
        collection_name=collection_id,
        embedding=embeddings,
        persist_directory=persist_directory,
    )

    return vectordb


def vector_get(persist_directory):
    """Return a ``Chroma`` store for the ``collection_id`` collection.

    The store is constructed with the module-level ``embeddings`` as its
    embedding function and ``persist_directory`` as its persistence location.
    No documents are loaded or added here.

    Args:
        persist_directory: Directory handed to Chroma as its persistence
            location.

    Returns:
        The ``Chroma`` vector store instance.
    """
    return Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings,
        collection_name=collection_id,
    )


################################################################################
## Create vector store
################################################################################
# Build the store only when its directory does not exist yet; otherwise open
# the one already on disk.
if not os.path.exists(persist_directory):
    print(f"Create Vector store to {persist_directory}")
    vector_store = vector_create(persist_directory=persist_directory)
    print("Create Vector finish.")
else:
    print(f"Get Vector store from {persist_directory}")
    vector_store = vector_get(persist_directory=persist_directory)

In [None]:
# Retriever over the vector store: threshold-based similarity search configured
# with a score threshold of 0.5 and k=1.
retriever = vector_store.as_retriever(
    search_kwargs=dict(score_threshold=0.5, k=1),
    search_type="similarity_score_threshold",
)

In [None]:
# Smoke test: run one sample question through the retriever; as the last
# expression in the cell, its result is displayed as the cell output.
retriever.invoke("What are the ingredients in Alpha Hope?")