In [None]:
# Pinned LangChain packages for this notebook.
# NOTE(review): later cells also import `transformers` and request PyTorch
# tensors (return_tensors="pt"); neither is installed by this line - confirm
# the environment provides them.
%pip install langchain==0.1.8 langchainhub==0.1.14

### Messages

In [None]:
from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    SystemMessage,
    HumanMessage,
    FunctionMessage,
    ToolMessage,
)

### Streaming Variant

All the chat messages have a streaming variant that contains `Chunk` in the name.

In [None]:
from langchain_core.messages import (
    AIMessageChunk,
    SystemMessageChunk,
    HumanMessageChunk,
    FunctionMessageChunk,
    ToolMessageChunk,
)

These chunks are used when streaming output from chat models, and they all define an additive property!

In [None]:
# Adding two chunks yields a single chunk whose content is the concatenation.
AIMessageChunk(content="Hello") + AIMessageChunk(content=" World!")
AIMessageChunk(content='Hello World!')

## Base Chat Model

Let’s implement a custom chat model. The class below started from LangChain’s template for a model that echoes back the first n characters of the last message in the prompt, but it produces its reply with a locally loaded FastChat-T5 model.

To do so, we inherit from `BaseChatModel`, a lower-level class, and implement the following required methods/properties:

- `_generate` - Used to generate a chat result from a prompt.

- The property `_llm_type` - Used to uniquely identify the type of the model. Used for logging.

Optional:

- `_stream` - Use to implement streaming.

- `_agenerate` - Use to implement a native async method.

- `_astream` - Use to implement async version of `_stream`.

- The property `_identifying_params` - Represent model parameterization for logging purposes.

In [None]:
from transformers import AutoModelForSeq2SeqLM
# Checkpoint identifier passed to from_pretrained below.
model_name = "lmsys/fastchat-t5-3b-v1.0"
# Load the seq2seq model once at notebook level; FastChat.validate_environment
# (defined in a later cell) assigns this object as the model's `client`.
root_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional
from transformers import AutoModelForSeq2SeqLM, T5Tokenizer

from langchain_core.pydantic_v1 import Field, root_validator
from langchain_core.callbacks import (
    AsyncCallbackManagerForLLMRun,
    CallbackManagerForLLMRun,
)
from langchain_core.language_models import BaseChatModel, SimpleChatModel
from langchain_core.messages import AIMessageChunk, BaseMessage, HumanMessage
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
from langchain_core.runnables import run_in_executor

class FastChat(BaseChatModel):
    """Chat model that answers with a locally loaded FastChat-T5 seq2seq model.

    Only the content of the last message in the prompt is sent to the model.
    The weights are not loaded by this class: unless a ``client`` is supplied,
    the notebook-level ``root_model`` is reused so that creating an instance
    does not load the checkpoint a second time.

    Example:

        .. code-block:: python

            model = FastChat(n=2)
            result = model.invoke([HumanMessage(content="hello")])
            result = model.batch([[HumanMessage(content="hello")],
                                 [HumanMessage(content="world")]])
    """

    n: int
    """Required field kept from the echo-model template this class was adapted
    from. It is not used when generating a reply; it is only reported in
    ``_identifying_params``."""

    model_name: str = Field(default="lmsys/fastchat-t5-3b-v1.0", alias="model")
    """Model name to use"""

    max_new_tokens: int = 100
    """Upper bound on the number of tokens generated per call."""

    client: Any = Field(default=None, exclude=True)
    """Object whose ``generate`` method produces token ids. Filled in by
    ``validate_environment`` when not supplied."""

    tokenizer: Any = Field(default=None, exclude=True)
    """Tokenizer used to encode the prompt and decode the generated ids.
    Filled in by ``validate_environment`` when not supplied."""

    @root_validator
    def validate_environment(cls, values: Dict) -> Dict:
        """Fill in ``client`` and ``tokenizer`` when the caller did not supply them.

        Args:
            values: the field values collected for the instance being built.

        Returns:
            The same mapping with ``client`` and ``tokenizer`` populated.
        """
        if values.get("client") is None:
            # Reuse the model loaded once at notebook level rather than
            # loading the checkpoint again for every instance.
            values["client"] = root_model
        if values.get("tokenizer") is None:
            # Kept in lock-step with `root_model`, which the notebook loads
            # from this same checkpoint; the `model_name` field is not
            # consulted here.
            model_name = "lmsys/fastchat-t5-3b-v1.0"
            values["tokenizer"] = T5Tokenizer.from_pretrained(model_name, use_fast=False)

        return values

    def _generate_token_ids(self, messages: List[BaseMessage]) -> Any:
        """Run the model on the last message and return the generated token ids.

        Raises:
            ValueError: if ``messages`` is empty.
        """
        if not messages:
            raise ValueError("FastChat needs at least one message to respond to.")
        inputs = self.tokenizer(messages[-1].content, return_tensors="pt")
        # A single prompt is encoded, so the first row of the generate()
        # result is the only generated sequence.
        return self.client.generate(
            inputs["input_ids"], max_new_tokens=self.max_new_tokens
        )[0]

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Generate one reply to the last message of the prompt.

        Args:
            messages: the prompt composed of a list of messages. Only the
                content of the last message is passed to the model.
            stop: accepted for interface compatibility; not applied by this
                implementation.
            run_manager: accepted for interface compatibility; not used here.

        Returns:
            A ChatResult holding a single AIMessage with the decoded reply.

        Raises:
            ValueError: if ``messages`` is empty.
        """
        token_ids = self._generate_token_ids(messages)
        response = self.tokenizer.decode(token_ids, skip_special_tokens=True)

        message = AIMessage(content=response)
        generation = ChatGeneration(message=message)
        return ChatResult(generations=[generation])

    def _stream(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[ChatGenerationChunk]:
        """Yield the reply to the last message as a sequence of text chunks.

        ``generate`` is called once and runs to completion before the first
        chunk is yielded; the chunks are produced by decoding growing prefixes
        of the generated ids.

        Args:
            messages: the prompt composed of a list of messages. Only the
                content of the last message is passed to the model.
            stop: accepted for interface compatibility; not applied by this
                implementation.
            run_manager: A run manager with callbacks for the LLM; notified
                of every emitted chunk.

        Raises:
            ValueError: if ``messages`` is empty.
        """
        token_ids = self._generate_token_ids(messages)

        # Decode the growing prefix and emit only the newly added text.
        # Decoding each id on its own would drop the whitespace between
        # words, so the concatenated chunks would not match `_generate`.
        emitted = ""
        for end in range(1, len(token_ids) + 1):
            text = self.tokenizer.decode(token_ids[:end], skip_special_tokens=True)
            delta = text[len(emitted):]
            if not delta:
                # Special tokens (skipped by the decoder) add no text.
                continue
            emitted = text
            chunk = ChatGenerationChunk(message=AIMessageChunk(content=delta))

            if run_manager:
                run_manager.on_llm_new_token(delta, chunk=chunk)

            yield chunk

        if not emitted:
            # Always produce at least one chunk so callers that aggregate the
            # stream have something to combine.
            yield ChatGenerationChunk(message=AIMessageChunk(content=""))

    async def _astream(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> AsyncIterator[ChatGenerationChunk]:
        """An async variant of ``_stream``.

        Delegates to ``_stream`` and advances it in an executor so that the
        model call and the decoding do not run on the event loop.

        If you're able to natively support async, then by all means do so!
        """
        iterator = self._stream(
            messages,
            stop=stop,
            run_manager=run_manager.get_sync() if run_manager else None,
            **kwargs,
        )
        # A sentinel default keeps StopIteration from being raised inside the
        # executor when the generator is exhausted.
        done = object()
        while True:
            item = await run_in_executor(None, next, iterator, done)
            if item is done:
                break
            yield item

    def _load_fastchat_model(self):
        """Load the checkpoint named by ``model_name``.

        Not called by the class itself; ``validate_environment`` reuses the
        notebook-level ``root_model`` instead.
        """
        return AutoModelForSeq2SeqLM.from_pretrained(self.model_name)

    @property
    def _llm_type(self) -> str:
        """Get the type of language model used by this chat model."""
        return "fastchat-t5"

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Return a dictionary of identifying parameters."""
        return {
            "n": self.n,
            "model_name": self.model_name,
            "max_new_tokens": self.max_new_tokens,
        }

In [None]:
# Instantiate the custom chat model; `n` is a required field of FastChat.
model = FastChat(n=3)

In [None]:
# Send a plain string prompt through the chat model.
model.invoke("Hi how are you?")

In [None]:
from transformers import T5Tokenizer

# Experiment: print every generated token id next to its individual decoding.
tokenizer = T5Tokenizer.from_pretrained(model_name, use_fast=False)
message = "Hello, are you a chat bot?"
inputs = tokenizer(message, return_tensors="pt")

# generate() is called once; the loop walks the ids of the first sequence.
generated_ids = root_model.generate(inputs["input_ids"], max_new_tokens=100)[0]
for token_id in generated_ids:
    print(token_id)
    response = tokenizer.decode(token_id, skip_special_tokens=True)
    print(response)

## Create Tools

In [None]:
from langchain.tools import tool

# Stub tool: ignores `order_id` and always reports the same status.
# NOTE(review): with `@tool`, the docstring presumably doubles as the tool
# description shown to the model - confirm before rewording it.
@tool
def check_order_status(order_id: str) -> str:
  """System use this tool to check order status"""
  return "Delivered"


# Tool list consumed by the prompt and agent executor cells.
tools = [check_order_status]

## Create Prompt

In [None]:
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain.tools.render import render_text_description


# Text prompt in Thought / Action / Observation form. `tools` and `tool_names`
# are filled in below; `input` and `agent_scratchpad` are supplied per call.
template = '''Answer the following questions as best you can. You have access to the following tools:

            {tools}

            Use the following format:

            Question: the input question you must answer
            Thought: you should always think about what to do
            Action: the action to take, should be one of [{tool_names}]
            Action Input: the input to the action
            Observation: the result of the action
            ... (this Thought/Action/Action Input/Observation can repeat N times)
            Thought: I now know the final answer
            Final Answer: the final answer to the original input question

            Begin!

            Question: {input}
            Thought: {agent_scratchpad}'''

# Pre-fill the tool-related variables once, since the tool list is fixed.
tool_descriptions = render_text_description(list(tools))
tool_name_list = ", ".join(t.name for t in tools)
prompt = ChatPromptTemplate.from_template(template).partial(
    tools=tool_descriptions,
    tool_names=tool_name_list,
)

## Create chain and agent executor

In [None]:
# Create a custom agent executor for debugging
from typing import (
    Any,
    AsyncIterator,
    Callable,
    Dict,
    Iterator,
    List,
    Optional,
    Sequence,
    Tuple,
    Union,
)

from langchain.agents import AgentExecutor
from langchain.agents.agent import ExceptionTool
from langchain_core.agents import AgentAction, AgentFinish, AgentStep
from langchain_core.callbacks import (
    AsyncCallbackManagerForChainRun,
    AsyncCallbackManagerForToolRun,
    BaseCallbackManager,
    CallbackManagerForChainRun,
    CallbackManagerForToolRun,
    Callbacks,
)
from langchain_core.exceptions import OutputParserException
from langchain_core.tools import BaseTool

class CustomAgentExecutor(AgentExecutor):
    """AgentExecutor that prints the planner output of every step for debugging."""

    def _iter_next_step(
        self,
        name_to_tool_map: Dict[str, BaseTool],
        color_mapping: Dict[str, str],
        inputs: Dict[str, str],
        intermediate_steps: List[Tuple[AgentAction, str]],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Iterator[Union[AgentFinish, AgentAction, AgentStep]]:
        """Take a single step in the thought-action-observation loop.

        Asks the agent to plan, prints the planner output, then yields either
        the AgentFinish, or each planned action followed by the result of
        performing it.

        Args:
            name_to_tool_map: tools keyed by name, forwarded to the action runner.
            color_mapping: per-tool colors, forwarded to the action runner.
            inputs: the chain inputs, passed to the planner as keyword arguments.
            intermediate_steps: the (action, observation) pairs collected so far.
            run_manager: optional callback manager for this chain run.

        Raises:
            ValueError: on an output parsing error when `handle_parsing_errors`
                is False, or when `handle_parsing_errors` has an unsupported type.
            TypeError: when the planner returns something other than an
                AgentFinish, an AgentAction, or a list of AgentAction.
        """
        print("CustomAgentExecutor._iter_next_step")
        try:
            intermediate_steps = self._prepare_intermediate_steps(intermediate_steps)

            # Call the LLM to see what to do.
            output = self.agent.plan(
                intermediate_steps,
                callbacks=run_manager.get_child() if run_manager else None,
                **inputs,
            )
        except OutputParserException as e:
            if isinstance(self.handle_parsing_errors, bool):
                raise_error = not self.handle_parsing_errors
            else:
                raise_error = False
            if raise_error:
                # Chain the original exception so its traceback is preserved.
                raise ValueError(
                    "An output parsing error occurred. "
                    "In order to pass this error back to the agent and have it try "
                    "again, pass `handle_parsing_errors=True` to the AgentExecutor. "
                    f"This is the error: {str(e)}"
                ) from e
            text = str(e)
            if isinstance(self.handle_parsing_errors, bool):
                if e.send_to_llm:
                    observation = str(e.observation)
                    text = str(e.llm_output)
                else:
                    observation = "Invalid or incomplete response"
            elif isinstance(self.handle_parsing_errors, str):
                observation = self.handle_parsing_errors
            elif callable(self.handle_parsing_errors):
                observation = self.handle_parsing_errors(e)
            else:
                raise ValueError("Got unexpected type of `handle_parsing_errors`")
            # Report the parsing failure back to the agent as a pseudo-action
            # so that the next planning step can see it.
            output = AgentAction("_Exception", observation, text)
            if run_manager:
                run_manager.on_agent_action(output, color="green")
            tool_run_kwargs = self.agent.tool_run_logging_kwargs()
            observation = ExceptionTool().run(
                output.tool_input,
                verbose=self.verbose,
                color=None,
                callbacks=run_manager.get_child() if run_manager else None,
                **tool_run_kwargs,
            )
            yield AgentStep(action=output, observation=observation)
            return

        print("Start Debug")
        print("output type: ", type(output))
        print("output=", output)
        print("End Debug")
        # If the tool chosen is the finishing tool, then we end and return.
        if isinstance(output, AgentFinish):
            yield output
            return

        actions: List[AgentAction]
        if isinstance(output, AgentAction):
            actions = [output]
        elif isinstance(output, (list, tuple)):
            actions = list(output)
        else:
            # Without this check a plain string would be iterated character
            # by character and fail later with an unrelated AttributeError.
            raise TypeError(
                f"Agent returned {type(output).__name__}; expected AgentFinish, "
                "AgentAction, or a list of AgentAction. Make sure the agent chain "
                "ends with an output parser that produces agent actions."
            )
        for agent_action in actions:
            yield agent_action
        for agent_action in actions:
            yield self._perform_agent_action(
                name_to_tool_map, color_mapping, agent_action, run_manager
            )

In [None]:
from langchain_core.runnables import Runnable, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.agents.format_scratchpad import format_log_to_messages, format_log_to_str
from langchain.agents.json_chat.prompt import TEMPLATE_TOOL_RESPONSE
from langchain.agents.output_parsers import ReActSingleInputOutputParser
from langchain.agents import AgentExecutor

# The tool list is attached as a call-time keyword argument; the tools are
# also described to the model through the prompt text.
llm_with_tools = model.bind(tools=tools)

agent = (
    RunnablePassthrough.assign(
        # The prompt is a plain text template, so the scratchpad must be
        # rendered as a string rather than as a list of messages.
        agent_scratchpad=lambda x: format_log_to_str(x["intermediate_steps"])
    )
    | prompt
    | llm_with_tools
    # Parse the "Action: / Action Input: / Final Answer:" text into
    # AgentAction / AgentFinish objects, which is what the executor expects.
    | ReActSingleInputOutputParser()
)

# Create an agent executor by passing in the agent and tools
agent_executor = CustomAgentExecutor(agent=agent, tools=tools, verbose=True)

## Executor Agent

In [None]:
# Run the agent loop on a sample question; the input dict supplies the
# prompt's `input` variable.
agent_executor.invoke({"input": "Could you please check my order status?"})

## Create Agent

In [None]:
from langchain import hub
from langchain.agents import AgentExecutor, create_structured_chat_agent, initialize_agent
from langchain_community.chat_models import ChatOpenAI

# Get the prompt to use - you can modify this!
prompt = hub.pull("hwchase17/structured-chat-agent")

# Choose the LLM that will drive the agent.
# NOTE(review): this needs the `openai` package and an OpenAI API key in the
# environment; neither is set up elsewhere in this notebook.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106")

# Construct the structured chat agent
agent = create_structured_chat_agent(llm, tools, prompt)

In [None]:
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoConfig, AutoModel

model_name = "lmsys/fastchat-t5-3b-v1.0"
# NOTE(review): `model_cache` is not read anywhere in the visible notebook and
# points at a machine-specific snapshot directory - confirm whether it is needed.
model_cache = "~/.cache/huggingface/hub/models--lmsys--fastchat-t5-3b-v1.0/snapshots/0b1da230a891854102d749b93f7ddf1f18a81024"
# NOTE(review): this rebinds `model`, which earlier cells use for the FastChat
# instance (model.invoke / model.bind), and loads the same checkpoint that
# `root_model` was already loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_name, use_fast=False)

message = "Hello, are you a chat bot?"

# Encode the prompt, generate up to 100 new tokens, and decode the first
# (only) generated sequence back to text.
inputs = tokenizer(message, return_tensors="pt")
output = model.generate(inputs["input_ids"], max_new_tokens=100)[0]
response = tokenizer.decode(output, skip_special_tokens=True)

# End the cell with the value so the decoded reply is displayed.
response