In [None]:
# Use the %pip magic so packages are installed into the environment of the running kernel.
%pip install langchain langchain-community langchain-openai huggingface_hub bs4 chromadb


In [None]:
# Use the %pip magic so the package is installed into the environment of the running kernel.
%pip install sentence-transformers

In [None]:
# Use the %pip magic so the package is installed into the environment of the running kernel.
%pip install hf_xet

In [None]:
import os
import bs4
from huggingface_hub import InferenceClient

from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.language_models import LLM
from langchain_core.outputs import Generation
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from typing import List, Optional
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.outputs import LLMResult

In [None]:
class KimiLLM(LLM):
    """LangChain ``LLM`` wrapper that sends prompts to a chat model via ``InferenceClient``.

    Every field can be overridden per instance, e.g.
    ``KimiLLM(temperature=0.2, max_tokens=1024)``.
    """

    # Model identifier sent as ``model`` on every chat-completion request.
    model: str = "moonshotai/Kimi-K2-Instruct"
    # Forwarded unchanged as ``temperature`` on every request.
    temperature: float = 0.7
    # Forwarded unchanged as ``max_tokens`` on every request.
    max_tokens: int = 512
    # Inference client; built in __init__ when the caller does not inject one.
    client: Optional[InferenceClient] = None

    def __init__(self, **kwargs):
        """Initialise the declared fields and create a default client if none was given.

        Raises:
            KeyError: if no ``client`` is supplied and ``HF_TOKEN`` is missing
                from the environment.
        """
        super().__init__(**kwargs)
        if self.client is None:
            self.client = InferenceClient(api_key=os.environ["HF_TOKEN"], provider="auto")

    @property
    def _llm_type(self) -> str:
        """Identifier string for this LLM implementation."""
        return "kimi-custom"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: Text placed in the ``content`` of one ``user`` message.
            stop: Optional stop sequences, forwarded to the chat-completion
                request. ``None`` or an empty list means no stop sequences.

        Returns:
            The ``content`` of the first choice's message.
        """
        messages = [{"role": "user", "content": prompt}]
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            # Previously ``stop`` was accepted but never sent, so callers'
            # stop sequences had no effect. An empty list is normalised to None.
            stop=stop or None,
        )
        return completion.choices[0].message.content

    def generate(
        self,
        prompts: List[str],
        stop: Optional[List[str]] = None,
        callbacks: Optional[List] = None,
        **kwargs,
    ) -> LLMResult:
        """Run ``_call`` once per prompt, sequentially, and wrap the replies.

        Args:
            prompts: Prompt strings to complete, in order.
            stop: Optional stop sequences passed through to ``_call``.
            callbacks: Accepted for signature compatibility.
            **kwargs: Accepted for signature compatibility.

        Returns:
            An ``LLMResult`` holding one single-element generation list per prompt.
        """
        # NOTE(review): ``callbacks`` and ``kwargs`` are not used by this
        # override, so anything passed through them is ignored here.
        generations = [[Generation(text=self._call(prompt, stop))] for prompt in prompts]
        return LLMResult(generations=generations)

In [None]:
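# If HF_TOKEN is not already set in the environment, uncomment the line below
# and supply your token (do not commit a real token with the notebook):
# os.environ["HF_TOKEN"] = "<your-token>"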

In [None]:
# Load the blog post, asking BeautifulSoup to parse only the elements whose
# class is one of the post's content, title, or header classes.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header"),
        ),
    },
)
docs = loader.load()

In [None]:
# Split the loaded documents into overlapping chunks (chunk_size=1000,
# chunk_overlap=200); these chunks are what gets embedded and indexed below.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

In [None]:
# Embedding model used to vectorise the chunks.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Index the chunks in a Chroma vector store and expose it as a retriever.
# NOTE(review): no persist directory or collection name is passed; confirm
# whether re-running this cell in the same kernel adds the chunks a second time.
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)
retriever = vectorstore.as_retriever()

In [None]:
from langchain.prompts import PromptTemplate

# Template with two placeholders: {context} receives the retrieved text and
# {question} receives the user's question.
prompt = PromptTemplate.from_template(
    "Answer the question based on the context below.\n\n"
    "Context:\n{context}\n\n"
    "Question:\n{question}"
)


In [None]:
# Instantiate the wrapper with its default field values. Because no client is
# passed in, the constructor reads os.environ["HF_TOKEN"], so the token must
# be set before this cell runs.
llm = KimiLLM()


In [None]:
def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)


In [None]:
# Compose the pipeline: the first step builds the prompt inputs from the
# incoming question ("context" comes from the retriever piped through
# format_docs, "question" is the input passed through unchanged), then the
# prompt is filled in, sent to the LLM, and the result is parsed to a string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the chain end to end on a sample question and show the answer text.
result = rag_chain.invoke("What is Task Decomposition?")
print(result)

In [None]:
# Diagnostic cell: show which module provides InferenceClient and the variable
# names of its __init__ (co_varnames lists the parameters followed by any
# local variables). Not needed for the pipeline above.
print(InferenceClient.__module__)
print(InferenceClient.__init__.__code__.co_varnames)