## Library imports

In [None]:
import os
import logging
from typing import List, Literal, Annotated, Optional, Union
from typing_extensions import TypedDict
import chromadb
from chromadb.config import Settings
from pydantic import BaseModel, Field, validator

from langchain import PromptTemplate, LLMChain
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_deepseek import ChatDeepSeek
from langchain_core.tools import tool
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, SystemMessage
from langchain_core.runnables import RunnableSerializable
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_chroma import Chroma
from langchain_text_splitters import TokenTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain.schema import Document
from langchain_community.retrievers import TavilySearchAPIRetriever
from langchain_huggingface import HuggingFaceEmbeddings

from langgraph.graph import START, END, MessagesState, StateGraph
from langgraph.checkpoint.memory import MemorySaver, InMemorySaver
from langgraph.graph import START, END, MessagesState, StateGraph
from langgraph.checkpoint.memory import InMemorySaver

## Initialization and API keys

In [None]:
from dotenv import load_dotenv

# Run load_dotenv() before the lookups below so that values it provides
# (python-dotenv; presumably read from a local .env file) are visible.
load_dotenv()

# --- credentials: each one is None when its variable is not set ---
deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY")
silicon_api_key = os.environ.get("SILICON_API_KEY")
tavily_api_key = os.environ.get("TAVILY_API_KEY")

# --- DeepSeek: chat model name ---
deepseek_llm_model = "deepseek-chat"

# --- SiliconFlow: endpoint and model name ---
silicon_base_url = "https://api.siliconflow.cn/v1"
silicon_llm_model = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# --- HuggingFace: sentence-embedding model name ---
huggingface_embed_model = "sentence-transformers/all-MiniLM-L6-v2"

In [None]:
# Project root relative to the notebook's working directory.
root_path = "../"

# Source HTML directory and on-disk vector store location, both under root_path.
html_directory, persist_directory = (
    os.path.join(root_path, relative)
    for relative in ("downloads/website", "var/vector")
)

## Vector store

In [None]:
!pip install --upgrade nltk

os.environ["TOKENIZERS_PARALLELISM"] = "false"

import nltk
nltk.download("punkt_tab")
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# Embedding model used by the vector store.
embeddings = HuggingFaceEmbeddings(model_name=huggingface_embed_model)
print(f"Created HuggingFaceEmbeddings {huggingface_embed_model}")

# Name of the Chroma collection that holds the indexed content.
collection_id = "calerie-health"

def vector_create(persist_directory):
    """Build a persisted Chroma collection from the HTML files in ``html_directory``.

    Every ``*.html`` file under the module-level ``html_directory`` is loaded
    with ``UnstructuredHTMLLoader``, split into overlapping chunks, and the
    chunks are embedded with the module-level ``embeddings`` into the
    collection named by ``collection_id``.

    Only the chunks are indexed. The previous version seeded the collection
    with the full, un-split documents and then added the chunks on top, so
    each page was stored twice (once whole, once chunked).

    Args:
        persist_directory: Directory where Chroma stores the collection.

    Returns:
        The ``Chroma`` vector store containing the indexed chunks.

    Raises:
        ValueError: If no HTML documents are found, so that an empty store is
            not persisted and then mistaken for a finished index.
    """
    # Load every HTML file below html_directory (recursive glob).
    loader = DirectoryLoader(
        path=html_directory, glob="**/*.html", loader_cls=UnstructuredHTMLLoader
    )
    documents = loader.load()
    if not documents:
        raise ValueError(f"No HTML documents found under {html_directory!r}")

    # Split into chunks of 512 characters with 100 characters of overlap.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=100)
    all_splits = text_splitter.split_documents(documents)

    # Create the collection directly from the chunks.
    return Chroma.from_documents(
        documents=all_splits,
        collection_name=collection_id,
        embedding=embeddings,
        persist_directory=persist_directory,
    )


################################################################################
## Create vector store
################################################################################
# Build the vector store on first run; otherwise reopen the persisted one.
# Either way `vector_store` is bound afterwards. Previously the "exists"
# branch left it undefined.
if not os.path.exists(persist_directory):
    print(f"Create Vector store to {persist_directory}")
    vector_store = vector_create(persist_directory=persist_directory)
    print("Create Vector finish.")
else:
    print(f"Exist Vector store {persist_directory}")
    # Open the existing collection with the same name and embedding function
    # that vector_create() used when it was written.
    vector_store = Chroma(
        collection_name=collection_id,
        embedding_function=embeddings,
        persist_directory=persist_directory,
    )

In [None]:
# List the entries in the vector store directory and print them.
files = os.listdir(persist_directory)
print(files)