In [None]:
%pip install langchain==0.1.8 langchainhub==0.1.14

### Messages

In [None]:
from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    SystemMessage,
    HumanMessage,
    FunctionMessage,
    ToolMessage,
)

### Streaming Variant

All the chat messages have a streaming variant that contains `Chunk` in the name.

In [None]:
from langchain_core.messages import (
    AIMessageChunk,
    SystemMessageChunk,
    HumanMessageChunk,
    FunctionMessageChunk,
    ToolMessageChunk,
)

These chunks are used when streaming output from chat models, and they all define an additive property!

In [None]:
AIMessageChunk(content="Hello") + AIMessageChunk(content=" World!")
AIMessageChunk(content='Hello World!')

## Base Chat Model

Let’s implement a chat model that echoes back the first n characters of the last message in the prompt!

Let’s implement a chat model that echoes back the first n characters of the last message in the prompt!

To do so, we will inherit from BaseChatModel and we’ll need to implement the following methods/properties:

In addition, you have the option to specify the following:

To do so, inherit from `BaseChatModel`, which is a lower-level class, and implement the following methods:

- `_generate` - Used to generate a chat result from a prompt.

- The property `_llm_type` - Used to uniquely identify the type of the model. Used for logging.

Optional:

- `_stream` - Used to implement streaming.

- `_agenerate` - Used to implement a native async method.

- `_astream` - Used to implement an async version of `_stream`.

- The property `_identifying_params` - Represents the model parameterization for logging purposes.

In [None]:
from transformers import AutoModelForSeq2SeqLM
# Hugging Face Hub identifier of the checkpoint used by the cells in this notebook.
model_name = "lmsys/fastchat-t5-3b-v1.0"
# Load the seq2seq weights once; later cells (including the FastChat wrapper)
# refer to this object through the global name `root_model`.
root_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [None]:
from langchain_openai import ChatOpenAI
# Choose the LLM that will drive the agent
# Only certain models support this
# NOTE(review): `langchain_openai` is not listed in the `%pip install` cell at the
# top of the notebook, and ChatOpenAI presumably needs an OpenAI API key in the
# environment - confirm before running this cell.
llm = ChatOpenAI(model="gpt-3.5-turbo-1106", temperature=0)

In [None]:
# Test response JSON
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_name, use_fast=False)

# Hand-assembled prompt in the Thought/Action/Observation format whose scratchpad
# already contains one tool call and its response; used to inspect the raw model reply.
# NOTE(review): the text begins with "The _stream function", which matches the debug
# label printed by `FastChat._stream` - presumably pasted from its output; confirm
# whether that prefix is meant to be part of the prompt.
message = """The _stream function Answer the following questions as best you can. You have access to the following tools:
            check_order_status: check_order_status(order_id: str) -> str - System use this tool to check order status
            Use the following format:
            Question: the input question you must answer
            Thought: you should always think about what to do
            Action: the action to take, should be one of [check_order_status]
            Action Input: the input to the action
            Observation: the result of the action
            ... (this Thought/Action/Action Input/Observation can repeat N times)
            Thought: I now know the final answer
            Final Answer: the final answer to the original input question

            Begin!

            Question: Could you please check my order status?
            Thought: [
                AIMessage(content="Invoking: `check_order_status` with `{'order_id': 1}`"),
                HumanMessage(
                    content="TOOL RESPONSE: Delivered
                    USER'S INPUT: Okay, so what is the response to my last comment? If using information obtained from the tools you must mention it explicitly without mentioning the tool names and without mentioning the action - I have forgotten all TOOL RESPONSES! Remember to respond with a markdown code snippet of a json blob with a single action, and NOTHING else - even if you just want to respond to the user. Do NOT respond with anything except a JSON snippet no matter what!")
            ]"""

# Tokenize the prompt, generate at most 100 new tokens and decode the first
# returned sequence with special tokens removed.
inputs = tokenizer(message, return_tensors="pt")
output = root_model.generate(inputs["input_ids"], max_new_tokens=100)[0]
response = tokenizer.decode(output, skip_special_tokens=True)


In [None]:
# Show the decoded completion stored in `response`.
print(response)

In [None]:
from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, Iterator, List, Optional, cast
from transformers import AutoModelForSeq2SeqLM, T5Tokenizer

from langchain_core.pydantic_v1 import Field, root_validator
from langchain_core.callbacks import (
    AsyncCallbackManagerForLLMRun,
    CallbackManagerForLLMRun,
)
from langchain_core.language_models import BaseChatModel, SimpleChatModel
from langchain_core.messages import AIMessageChunk, BaseMessage, BaseMessageChunk
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
from langchain_core.runnables import run_in_executor, RunnableConfig
from langchain_core.language_models.base import BaseLanguageModel, LanguageModelInput


class AuthBaseMessage(BaseMessage):
    """Message type that declares an additional ``auth_config`` string.

    In the code shown in this notebook it is only used as a ``cast`` target
    (static typing) in ``FastChat.invoke``.
    """
    # Auth value attached to the message; empty string when none was provided.
    auth_config: str =  ""


class AuthBaseMessageChunk(BaseMessageChunk):
    """Streaming-chunk counterpart that declares an additional ``auth_config`` string.

    In the code shown in this notebook it is only used as a ``cast`` target
    (static typing) in ``FastChat.stream`` / ``FastChat.astream``.
    """
    # Auth value attached to the chunk; empty string when none was provided.
    auth_config: str =  ""


class FastChat(BaseChatModel):
    """Chat model backed by a locally loaded FastChat-T5 seq2seq checkpoint.

    The text of the *last* prompt message is tokenized, passed to the Hugging
    Face model's ``generate`` (at most 100 new tokens) and the decoded
    completion is returned as an ``AIMessage``. Earlier messages in the prompt
    are not sent to the model and ``stop`` sequences are not applied.

    ``invoke``, ``stream`` and ``astream`` additionally attach an
    ``auth_config`` attribute to every returned message / chunk. The value is
    taken from the input when the input exposes a non-empty ``auth_config``
    attribute (e.g. ``AuthChatPromptValue``) and otherwise falls back to this
    model's own ``auth_config`` field, so plain strings and message lists are
    accepted as input too.

    Example:

        .. code-block:: python

            model = FastChat(n=2)
            result = model.invoke([HumanMessage(content="hello")])
            result = model.batch([[HumanMessage(content="hello")],
                                 [HumanMessage(content="world")]])
    """

    # Required constructor argument, reported by `_identifying_params`.
    # It does not influence generation.
    n: int

    # Hugging Face checkpoint name (constructor keyword: `model`).
    model_name: str = Field(default="lmsys/fastchat-t5-3b-v1.0", alias="model")

    # Fallback auth value attached to outputs when the input carries none.
    auth_config: str = ""

    # Seq2seq model and tokenizer. Declared as fields so they can be injected
    # through the constructor; `validate_environment` fills them in otherwise.
    client: Any = Field(default=None, exclude=True)
    tokenizer: Any = Field(default=None, exclude=True)

    @root_validator
    def validate_environment(cls, values: Dict) -> Dict:
        """Populate ``client`` and ``tokenizer`` when they were not supplied.

        The model reuses the notebook-level ``root_model`` when that global
        exists (avoids loading the weights twice) and loads ``model_name``
        from the Hugging Face Hub otherwise. The tokenizer is always loaded
        for ``model_name``.
        """
        model_name = values.get("model_name") or cls.__fields__["model_name"].default
        if values.get("client") is None:
            try:
                values["client"] = root_model
            except NameError:
                values["client"] = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        if values.get("tokenizer") is None:
            values["tokenizer"] = T5Tokenizer.from_pretrained(model_name, use_fast=False)

        return values

    def _resolve_auth_config(self, input: LanguageModelInput) -> str:
        """Return the input's ``auth_config`` if it has a non-empty one, else the model's."""
        value = getattr(input, "auth_config", None)
        return value if value else self.auth_config

    def invoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> BaseMessage:
        """Run the model and attach the resolved ``auth_config`` to the reply."""
        message = super().invoke(input=input, config=config, stop=stop, **kwargs)
        # `cast` is for type checkers only; the attribute is set dynamically.
        auth_message = cast(AuthBaseMessage, message)
        auth_message.auth_config = self._resolve_auth_config(input)
        return auth_message

    def stream(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Iterator[BaseMessageChunk]:
        """Stream the reply, attaching the resolved ``auth_config`` to each chunk."""
        auth_config = self._resolve_auth_config(input)
        for message_chunk in super().stream(input, config, stop=stop, **kwargs):
            auth_message_chunk = cast(AuthBaseMessageChunk, message_chunk)
            auth_message_chunk.auth_config = auth_config
            yield auth_message_chunk

    async def astream(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> AsyncIterator[BaseMessageChunk]:
        """Async variant of ``stream``; attaches the resolved ``auth_config`` to each chunk."""
        auth_config = self._resolve_auth_config(input)
        async for message_chunk in super().astream(input, config, stop=stop, **kwargs):
            auth_message_chunk = cast(AuthBaseMessageChunk, message_chunk)
            auth_message_chunk.auth_config = auth_config
            yield auth_message_chunk

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Generate one completion for the last message of the prompt.

        Args:
            messages: the prompt composed of a list of messages; only the
                content of the last one is sent to the model.
            stop: accepted for interface compatibility; not applied.
            run_manager: A run manager with callbacks for the LLM (unused here).

        Returns:
            A ``ChatResult`` holding a single ``AIMessage`` generation.
        """
        last_message = messages[-1]

        # Invoke model to generate the completion
        inputs = self.tokenizer(last_message.content, return_tensors="pt")
        output = self.client.generate(inputs["input_ids"], max_new_tokens=100)[0]
        response = self.tokenizer.decode(output, skip_special_tokens=True)

        print("RESPONSE: ", response)
        # Pass the response to the message output
        message = AIMessage(content=response)
        generation = ChatGeneration(message=message)
        return ChatResult(generations=[generation])

    def _stream(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[ChatGenerationChunk]:
        """Yield the completion for the last prompt message piece by piece.

        ``generate`` returns the finished sequence, so this replays a complete
        result in chunks rather than emitting tokens as they are produced.

        Args:
            messages: the prompt composed of a list of messages; only the
                content of the last one is sent to the model.
            stop: accepted for interface compatibility; not applied.
            run_manager: A run manager with callbacks for the LLM; notified of
                every emitted piece of text.
        """
        last_message = messages[-1]

        print("The _stream function", last_message.content)

        # Invoke model to generate the completion
        inputs = self.tokenizer(last_message.content, return_tensors="pt")
        output_ids = self.client.generate(inputs["input_ids"], max_new_tokens=100)[0]

        # Decode the growing prefix and emit only the newly added text. Decoding
        # each token id on its own loses the word-boundary spaces of the
        # SentencePiece vocabulary, so the concatenated chunks would not match
        # the text produced by `_generate`.
        emitted = ""
        for end in range(1, len(output_ids) + 1):
            text_so_far = self.tokenizer.decode(output_ids[:end], skip_special_tokens=True)
            delta = text_so_far[len(emitted):]
            emitted = text_so_far
            chunk = ChatGenerationChunk(message=AIMessageChunk(content=delta))

            if run_manager:
                run_manager.on_llm_new_token(delta, chunk=chunk)

            yield chunk

    async def _astream(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> AsyncIterator[ChatGenerationChunk]:
        """Async wrapper around ``_stream`` that keeps the event loop responsive.

        Calling ``_stream`` only creates the generator; the model runs when the
        generator is advanced. Each ``next`` call is therefore dispatched to the
        executor as well, instead of iterating the generator on the event loop.
        """
        iterator = await run_in_executor(
            None,
            self._stream,
            messages,
            stop=stop,
            run_manager=run_manager.get_sync() if run_manager else None,
            **kwargs,
        )
        # Sentinel default for `next` so exhaustion does not raise StopIteration
        # inside the executor.
        done = object()
        while True:
            item = await run_in_executor(None, next, iterator, done)
            if item is done:
                break
            yield item

    def _load_fastchat_model(self):
        """Load the fastchat model locally"""
        return AutoModelForSeq2SeqLM.from_pretrained(self.model_name)

    @property
    def _llm_type(self) -> str:
        """Get the type of language model used by this chat model."""
        return "echoing-chat-model-advanced"

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Return a dictionary of identifying parameters."""
        return {"n": self.n, "model_name": self.model_name}

In [None]:
# Instantiate the wrapper; `n` is the only constructor argument without a default.
model = FastChat(n=3)

In [None]:
# Smoke-test the wrapper with a plain string prompt.
model.invoke("Hi how are you?")

In [None]:
from transformers import T5Tokenizer
# Load the tokenizer for the same checkpoint as `root_model`.
tokenizer = T5Tokenizer.from_pretrained(model_name, use_fast=False)

message = "Hello, are you a chat bot?"

inputs = tokenizer(message, return_tensors="pt")

# Iterate over the elements of the first generated sequence and show each raw
# element next to its individually decoded text.
for chunk in root_model.generate(inputs["input_ids"], max_new_tokens=100)[0]:
  print(chunk)
  response = tokenizer.decode(chunk, skip_special_tokens=True)
  print(response)

# Whole-sequence alternative (disabled): decode the complete
# `root_model.generate(...)[0]` result with a single `tokenizer.decode` call
# and print it.

## Create Tools

In [55]:
from langchain.tools import tool

# Demo tool for the agent: accepts an order id and an auth value and always
# returns "Delivered" (neither argument is inspected).
# The docstring is left as written because the tool description rendered into
# the prompt is taken from it.
@tool
def check_order_status(order_id: str, auth_config: str) -> str:
  """System use this tool to check order status"""
  return "Delivered"


# Tools made available to the agent.
tools = [check_order_status]

## Create Prompt

In [None]:
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain.tools.render import render_text_description
from langchain_core.prompt_values import PromptValue, ChatPromptValue

# Thought/Action/Observation prompt. `{tools}` and `{tool_names}` are filled in
# by `prompt.partial(...)` further down in this cell; `{input}` and
# `{agent_scratchpad}` are supplied when the prompt is invoked.
template = '''Answer the following questions as best you can. You have access to the following tools:

            {tools}

            Use the following format:

            Question: the input question you must answer
            Thought: you should always think about what to do
            Action: the action to take, should be one of [{tool_names}]
            Action Input: the input to the action
            Observation: the result of the action
            ... (this Thought/Action/Action Input/Observation can repeat N times)
            Thought: I now know the final answer
            Final Answer: the final answer to the original input question

            Begin!

            Question: {input}
            Thought: {agent_scratchpad}'''

class AuthChatPromptValue(ChatPromptValue):
    """Chat prompt value with an extra ``auth_config`` field.

    Behaves like ``ChatPromptValue`` (a prompt built from messages) and adds an
    ``auth_config`` string that ``ChatPromptTemplateWithAuth.format_prompt``
    sets and ``FastChat`` reads from its input.
    """
    # Auth value carried along with the prompt; empty string by default.
    auth_config: str = ''

# Custom prompt template class to assign the authentication config
class ChatPromptTemplateWithAuth(ChatPromptTemplate):
    """Chat prompt template whose prompt value carries the caller's ``auth_config``."""

    def format_prompt(self, **kwargs: Any) -> PromptValue:
        """Format the messages and wrap them in an ``AuthChatPromptValue``.

        Args:
            **kwargs: Keyword arguments to use for formatting. An optional
                ``auth_config`` entry is copied onto the returned prompt value.

        Returns:
            An ``AuthChatPromptValue`` holding the formatted messages and the
            ``auth_config`` (empty string when none was supplied).
        """
        messages = self.format_messages(**kwargs)
        prompt_value = AuthChatPromptValue(messages=messages)
        # The field is declared as `str` with an empty-string default; keep that
        # default instead of storing None when the caller passes no auth_config.
        prompt_value.auth_config = kwargs.get("auth_config") or ""
        return prompt_value

# Build the prompt from the template string, then pre-fill the tool
# descriptions and the comma-separated tool names so that only `input` and
# `agent_scratchpad` remain to be provided per call.
prompt = ChatPromptTemplateWithAuth.from_template(template)
prompt = prompt.partial(
    tools=render_text_description(list(tools)),
    tool_names=", ".join([t.name for t in tools]),
)

## Create FastChatAgent Action

In [None]:
# Refer from libs/langchain/langchain/agents/output_parsers/openai_tools.py
import re
import random
from typing import List, Union

from langchain_core.agents import AgentAction, AgentActionMessageLog, AgentFinish
from langchain_core.messages import (
    AIMessage,
    BaseMessage,
)
from langchain_core.outputs import ChatGeneration, Generation

from langchain.agents.agent import MultiActionAgentOutputParser

from langchain.output_parsers import RegexParser


class FastChatToolAgentAction(AgentActionMessageLog):
    """Agent action that additionally records the id of the tool call it represents."""
    # Identifier later copied onto the ToolMessage built for this action.
    tool_call_id: str
    """Tool call that this message is responding to."""

def parse_ai_message_to_fastchat_tool_action(message: BaseMessage) -> Union[List[AgentAction], AgentFinish]:
    """Turn a model reply into a tool invocation or a final answer.

    The reply text is searched for the ``Action: <tool>`` and
    ``Action Input: <input>`` markers requested by the prompt format.

    Args:
        message: the model reply; must be an ``AIMessage``.

    Returns:
        A one-element list with a ``FastChatToolAgentAction`` when an action
        name is found; otherwise an ``AgentFinish`` whose output is the
        unmodified message content.

    Raises:
        TypeError: if ``message`` is not an ``AIMessage``.
    """
    # Function-scope imports: neither module is imported by this cell.
    import json
    import uuid

    if not isinstance(message, AIMessage):
        raise TypeError(f"Expected an AI message got {type(message)}")

    # Collapse runs of spaces so the markers match regardless of padding.
    response = re.sub(' +', ' ', message.content)

    # Stop the tool name at "Action Input:" or at the end of the line, so a
    # reply written on a single line does not swallow the input marker.
    action_match = re.search(r"Action: (.*?)\s*(?=Action Input:|\n|$)", response)
    function_name = action_match.group(1).strip() if action_match else ""
    if not function_name:
        #  Finish
        return AgentFinish(
            return_values={"output": message.content}, log=str(message.content)
        )

    # Placeholder arguments, used whenever the reply / message provide nothing better.
    tool_input = {"order_id": 1, "auth_config": "test"}

    # Use the model-provided input when it is a JSON object. The model is never
    # allowed to supply credentials, so an `auth_config` key from it is dropped.
    input_match = re.search(r"Action Input: (.*)", response)
    if input_match:
        try:
            parsed_input = json.loads(input_match.group(1))
        except ValueError:
            parsed_input = None
        if isinstance(parsed_input, dict):
            tool_input.update(
                {key: value for key, value in parsed_input.items() if key != "auth_config"}
            )

    # Forward the auth value attached to the message, when there is one.
    message_auth = getattr(message, "auth_config", None)
    if message_auth:
        tool_input["auth_config"] = message_auth

    # A fresh id per call; seeding the global RNG here would both clobber the
    # caller's random state and produce the same id every time.
    tool_call_id = str(uuid.uuid4())
    log = f"\nInvoking: `{function_name}` with `{tool_input}`"

    return [
        FastChatToolAgentAction(
            tool=function_name,
            tool_input=tool_input,
            log=log,
            message_log=[message],
            tool_call_id=tool_call_id,
        )
    ]


class FastChatToolAgentOutputParser(MultiActionAgentOutputParser):
    """Output parser mapping a FastChat chat generation to agent actions or a finish."""

    @property
    def _type(self) -> str:
        """Identifier string of this parser."""
        return "fastchat-tools-agent-output-parser"

    def parse_result(
        self, result: List[Generation], *, partial: bool = False
    ) -> Union[List[AgentAction], AgentFinish]:
        """Parse the first generation's message; only ``ChatGeneration`` is accepted.

        Prints the message content between BEGIN/END markers for debugging
        before delegating to ``parse_ai_message_to_fastchat_tool_action``.
        """
        if isinstance(result[0], ChatGeneration):
            ai_message = result[0].message
            print(FastChatToolAgentOutputParser)
            print("---------------------BEGIN------------")
            print(ai_message.content)
            print("---------------------END------------")
            return parse_ai_message_to_fastchat_tool_action(ai_message)
        raise ValueError("This output parser only works on ChatGeneration output")

    def parse(self, text: str) -> Union[List[AgentAction], AgentFinish]:
        """Plain-text parsing is not supported; always raises ``ValueError``."""
        raise ValueError("Can only parse messages")

## Create chain and agent executor

In [None]:
# Create a custom agent executor for debugging
from langchain_core.tools import BaseTool
from typing import (
    Any,
    AsyncIterator,
    Callable,
    Dict,
    Iterator,
    List,
    Optional,
    Sequence,
    Tuple,
    Union,
)
from langchain_core.agents import AgentAction, AgentFinish, AgentStep
from langchain_core.callbacks import (
    AsyncCallbackManagerForChainRun,
    AsyncCallbackManagerForToolRun,
    BaseCallbackManager,
    CallbackManagerForChainRun,
    CallbackManagerForToolRun,
    Callbacks,
)
from langchain.agents import AgentExecutor
from langchain_core.exceptions import OutputParserException

class CustomAgentExecutor(AgentExecutor):
    """``AgentExecutor`` variant that prints debugging information for every step."""

    def _iter_next_step(
        self,
        name_to_tool_map: Dict[str, BaseTool],
        color_mapping: Dict[str, str],
        inputs: Dict[str, str],
        intermediate_steps: List[Tuple[AgentAction, str]],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Iterator[Union[AgentFinish, AgentAction, AgentStep]]:
        """Take a single step in the thought-action-observation loop.

        Asks the agent to plan, then yields either the ``AgentFinish``, or each
        planned ``AgentAction`` followed by the result of performing each
        action. When planning raises ``OutputParserException`` the error is
        handled according to ``handle_parsing_errors`` and reported back as an
        ``AgentStep`` for the ``_Exception`` pseudo-tool.

        Raises:
            ValueError: when a parsing error occurs and ``handle_parsing_errors``
                is ``False``, or when ``handle_parsing_errors`` has an
                unsupported type.
        """
        print("CustomAgentExecutor._iter_next_step", run_manager)
        try:
            intermediate_steps = self._prepare_intermediate_steps(intermediate_steps)

            print("Agent instance type:", type(self.agent))
            # Call the LLM to see what to do.
            output = self.agent.plan(
                intermediate_steps,
                callbacks=run_manager.get_child() if run_manager else None,
                **inputs,
            )
        except OutputParserException as e:
            # `ExceptionTool` is not imported by this cell; without this import
            # the error-handling path below fails with NameError.
            from langchain.agents.agent import ExceptionTool

            if isinstance(self.handle_parsing_errors, bool):
                raise_error = not self.handle_parsing_errors
            else:
                raise_error = False
            if raise_error:
                raise ValueError(
                    "An output parsing error occurred. "
                    "In order to pass this error back to the agent and have it try "
                    "again, pass `handle_parsing_errors=True` to the AgentExecutor. "
                    f"This is the error: {str(e)}"
                )
            text = str(e)
            if isinstance(self.handle_parsing_errors, bool):
                if e.send_to_llm:
                    observation = str(e.observation)
                    text = str(e.llm_output)
                else:
                    observation = "Invalid or incomplete response"
            elif isinstance(self.handle_parsing_errors, str):
                observation = self.handle_parsing_errors
            elif callable(self.handle_parsing_errors):
                observation = self.handle_parsing_errors(e)
            else:
                raise ValueError("Got unexpected type of `handle_parsing_errors`")
            output = AgentAction("_Exception", observation, text)
            if run_manager:
                run_manager.on_agent_action(output, color="green")
            tool_run_kwargs = self.agent.tool_run_logging_kwargs()
            observation = ExceptionTool().run(
                output.tool_input,
                verbose=self.verbose,
                color=None,
                callbacks=run_manager.get_child() if run_manager else None,
                **tool_run_kwargs,
            )
            yield AgentStep(action=output, observation=observation)
            return

        print("Start Debug")
        print("output type: ", type(output))
        print("output=", output)
        print("End Debug")
        # If the tool chosen is the finishing tool, then we end and return.
        if isinstance(output, AgentFinish):
            yield output
            return

        actions: List[AgentAction]
        if isinstance(output, AgentAction):
            actions = [output]
        else:
            actions = output
        # Announce every planned action first, then perform them one by one.
        for agent_action in actions:
            yield agent_action
        for agent_action in actions:
            print("self._perform_agent_action: ", agent_action)
            yield self._perform_agent_action(
                name_to_tool_map, color_mapping, agent_action, run_manager
            )

## Format to FastChatTool Message

In [None]:
import json
from typing import List, Sequence, Tuple

from langchain_core.agents import AgentAction
from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    ToolMessage,
)

from langchain.agents.output_parsers.openai_tools import OpenAIToolAgentAction


def _create_tool_message(
    agent_action: FastChatToolAgentAction, observation: str
) -> ToolMessage:
    """Build the ``ToolMessage`` that reports a tool result.

    Args:
        agent_action: the tool invocation request from the agent
        observation: the result of the tool invocation
    Returns:
        ToolMessage carrying the observation, tagged with the action's
        ``tool_call_id`` and with the tool name under ``additional_kwargs["name"]``
    """
    if isinstance(observation, str):
        payload = observation
    else:
        # Non-string results are serialised as JSON; anything that cannot be
        # encoded falls back to its ``str()`` form.
        try:
            payload = json.dumps(observation, ensure_ascii=False)
        except Exception:
            payload = str(observation)

    return ToolMessage(
        content=payload,
        tool_call_id=agent_action.tool_call_id,
        additional_kwargs={"name": agent_action.tool},
    )


def format_to_fastchat_tool_messages(
    intermediate_steps: Sequence[Tuple[AgentAction, str]],
) -> List[BaseMessage]:
    """Convert (AgentAction, tool output) pairs into chat messages.

    For a ``FastChatToolAgentAction`` the action's message log plus a
    ``ToolMessage`` with the observation are added, leaving out messages that
    are already in the result. Any other action contributes an ``AIMessage``
    containing its log.

    Args:
        intermediate_steps: Steps the LLM has taken to date, along with observations

    Returns:
        list of messages to send to the LLM for the next prediction

    """
    collected = []
    for step in intermediate_steps:
        agent_action, observation = step
        if not isinstance(agent_action, FastChatToolAgentAction):
            collected.append(AIMessage(content=agent_action.log))
            continue
        candidates = [
            *agent_action.message_log,
            _create_tool_message(agent_action, observation),
        ]
        # Filter against what was collected before this step, then add in one go.
        unseen = [candidate for candidate in candidates if candidate not in collected]
        collected.extend(unseen)
    return collected


In [56]:
from langchain_core.runnables import Runnable, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.agents.format_scratchpad import format_log_to_messages
from langchain.agents.json_chat.prompt import TEMPLATE_TOOL_RESPONSE

from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad.openai_tools import (
    format_to_openai_tool_messages,
)


# Bind the tool list as an extra keyword argument passed along with model calls.
llm_with_tools = model.bind(tools=tools)
# Agent pipeline: build `agent_scratchpad` messages from the intermediate steps,
# format the prompt, call the model, then parse the reply into agent
# actions or a finish.
agent = (
    RunnablePassthrough.assign(
        agent_scratchpad=lambda x: format_to_fastchat_tool_messages(
            x["intermediate_steps"],
        )
    )
    | prompt
    | llm_with_tools
    # Alternative model step (disabled): `| llm`
    | FastChatToolAgentOutputParser()
)

# Create an agent executor by passing in the agent and tools
agent_executor = CustomAgentExecutor(agent=agent, tools=tools, verbose=True, max_iterations=2, return_intermediate_steps=True)
# Stock executor alternative (disabled):
# agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, max_iterations=5, return_intermediate_steps=True)

## Executor Agent

In [58]:
# NOTE(review): this dict is not referenced by the call below, which passes the
# token as a plain string under the "auth_config" key.
auth_config = {"access_token": "my_bearer_token"}
# Run the agent on a sample question and print the returned result.
result = agent_executor.invoke({"input": "My order id is 12345. Could you please check my order status?", "auth_config": "my_bearer_token"})
print("Result", result)
# model.bind



[1m> Entering new CustomAgentExecutor chain...[0m
CustomAgentExecutor._iter_next_step <langchain_core.callbacks.manager.CallbackManagerForChainRun object at 0x78c87a2c4f50>
Agent instance type: <class 'langchain.agents.agent.RunnableMultiActionAgent'>
The _stream function Answer the following questions as best you can. You have access to the following tools:

            check_order_status: check_order_status(order_id: str, **kwargs) -> str - System use this tool to check order status

            Use the following format:

            Question: the input question you must answer
            Thought: you should always think about what to do
            Action: the action to take, should be one of [check_order_status]
            Action Input: the input to the action
            Observation: the result of the action
            ... (this Thought/Action/Action Input/Observation can repeat N times)
            Thought: I now know the final answer
            Final Answer: the final an

In [None]:
# result["intermediate_steps"][1]
# type(llm_with_tools)

In [None]:
# Format the prompt on its own (empty scratchpad) to inspect the resulting prompt value.
prompt_value = prompt.invoke({"input": "Could you please check my order status?", "agent_scratchpad": "", "auth_config": "my_bearer_token"})
# prompt_value.auth_config

In [None]:
# Run the model on the formatted prompt. Without this assignment `message`
# still holds the plain string assigned by an earlier cell, and `.dict()`
# fails on it.
message = model.invoke(input=prompt_value)
message.dict().get("auth_config")


In [None]:
# Feed `message` through the output parser on its own to inspect the parsed result.
action = FastChatToolAgentOutputParser().invoke(input=message)
# action[0]

## Create Agent

In [None]:
# `initial_agent` does not exist in `langchain.agents` (the helper is
# `initialize_agent`); `hub` and `create_structured_chat_agent` are used below
# and therefore have to be imported as well.
from langchain import hub
from langchain.agents import AgentExecutor, create_structured_chat_agent, initialize_agent

# Get the prompt to use - you can modify this!
prompt = hub.pull("hwchase17/structured-chat-agent")

# Choose the LLM that will drive the agent
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106")

# Construct the structured chat agent
agent = create_structured_chat_agent(llm, tools, prompt)

In [None]:
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoConfig, AutoModel

model_name = "lmsys/fastchat-t5-3b-v1.0"
# NOTE(review): `model_cache` is not referenced anywhere in the code visible in
# this notebook, and the snapshot hash in the path is machine-specific - confirm
# whether it is still needed.
model_cache = "~/.cache/huggingface/hub/models--lmsys--fastchat-t5-3b-v1.0/snapshots/0b1da230a891854102d749b93f7ddf1f18a81024"
# NOTE(review): this rebinds `model`, which earlier cells use for the FastChat
# wrapper instance; from here on `model` is the raw Hugging Face model.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
from transformers import T5Tokenizer
# Load the tokenizer for the checkpoint named by `model_name`.
tokenizer = T5Tokenizer.from_pretrained(model_name, use_fast=False)

message = "Hello, are you a chat bot?"

# Tokenize the prompt, generate at most 100 new tokens with the raw model and
# decode the first returned sequence with special tokens removed.
inputs = tokenizer(message, return_tensors="pt")
output = model.generate(inputs["input_ids"], max_new_tokens=100)[0]
response = tokenizer.decode(output, skip_special_tokens=True)