In [None]:
# Install the pinned LangChain packages used by this notebook.
# NOTE(review): later cells import `transformers` and call `ChatOpenAI`, neither of
# which is installed here — presumably they come from the environment; confirm.
%pip install langchain==0.1.8 langchainhub==0.1.14

### Messages

In [None]:
from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    SystemMessage,
    HumanMessage,
    FunctionMessage,
    ToolMessage,
)

### Streaming Variant

All the chat messages have a streaming variant that contains `Chunk` in the name.

In [None]:
from langchain_core.messages import (
    AIMessageChunk,
    SystemMessageChunk,
    HumanMessageChunk,
    FunctionMessageChunk,
    ToolMessageChunk,
)

These chunks are used when streaming output from chat models, and they all support addition (`+`), which concatenates their content!

In [None]:
# Adding two chunks yields a single AIMessageChunk whose content is the concatenation.
AIMessageChunk(content="Hello") + AIMessageChunk(content=" World!")
AIMessageChunk(content='Hello World!')

## Base Chat Model

Let’s implement a chat model that echoes back the first n characters of the last message in the prompt!

In this notebook the echo template is adapted: `_generate` sends the last message of the prompt to a locally loaded FastChat-T5 model (`lmsys/fastchat-t5-3b-v1.0`) and returns its completion.

To do so, we will inherit from BaseChatModel and we’ll need to implement the following methods/properties:

In addition, you have the option to specify the methods and properties listed under **Optional** below.

To do so, inherit from `BaseChatModel`, which is a lower-level class, and implement the following methods:

- `_generate` - Used to generate a chat result from a prompt.

- The property `_llm_type` - Used to uniquely identify the type of the model; used for logging.

Optional:

- `_stream` - Use to implement streaming.

- `_agenerate` - Use to implement a native async method.

- `_astream` - Use to implement async version of `_stream`.

- The property `_identifying_params` - Represent model parameterization for logging purposes.

In [None]:
# Load the FastChat-T5 weights once at notebook level; the FastChat class below
# reads this `root_model` global as its default client instead of loading its own copy.
from transformers import AutoModelForSeq2SeqLM
model_name = "lmsys/fastchat-t5-3b-v1.0"
root_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
from threading import Thread
from typing import Any, AsyncIterator, Dict, Iterator, List, Optional

from langchain_core.callbacks import (
    AsyncCallbackManagerForLLMRun,
    CallbackManagerForLLMRun,
)
from langchain_core.language_models import BaseChatModel, SimpleChatModel
from langchain_core.messages import (
    AIMessage,
    AIMessageChunk,
    BaseMessage,
    HumanMessage,
)
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
from langchain_core.pydantic_v1 import Field, root_validator
from langchain_core.runnables import run_in_executor
from transformers import AutoModelForSeq2SeqLM, T5Tokenizer, TextIteratorStreamer

class FastChat(BaseChatModel):
    """Chat model backed by a locally loaded FastChat-T5 seq2seq model.

    The content of the last message in the prompt is tokenized, passed to the
    Hugging Face model stored in ``client``, and the decoded completion is
    returned as an ``AIMessage``. Streaming yields the same completion
    incrementally.

    Current limitations: earlier messages in the prompt and the ``stop``
    argument are accepted but not used.

    Example:

        .. code-block:: python

            model = FastChat(n=2)
            result = model.invoke([HumanMessage(content="hello")])
            result = model.batch([[HumanMessage(content="hello")],
                                 [HumanMessage(content="world")]])
    """

    n: int
    """Left over from the echo-model template. Still required so existing
    ``FastChat(n=...)`` calls keep working; it does not influence generation."""

    model_name: str = Field(default="lmsys/fastchat-t5-3b-v1.0", alias="model")
    """Model name to use"""

    max_new_tokens: int = 100
    """Upper bound on the number of tokens generated per call."""

    client: Any = Field(default=None, exclude=True)
    """Hugging Face seq2seq model used for generation. Defaults to the
    notebook-level ``root_model``."""

    tokenizer: Any = Field(default=None, exclude=True)
    """Tokenizer matching ``client``. Loaded from ``model_name`` when omitted."""

    @root_validator
    def validate_environment(cls, values: Dict) -> Dict:
        """Fill in ``client`` and ``tokenizer`` when the caller did not pass them.

        Args:
            values: the field values collected by pydantic.

        Returns:
            The same mapping with ``client`` and ``tokenizer`` populated.
        """
        model_name = values.get("model_name") or "lmsys/fastchat-t5-3b-v1.0"

        # Compare against None instead of relying on truthiness: tokenizers
        # define __len__, so a truthiness test depends on the vocabulary size.
        if values.get("client") is None:
            # Reuse the model loaded once at notebook level rather than loading
            # several GB of weights again for every FastChat instance.
            # NOTE(review): `root_model` was loaded from the default model name;
            # pass `client=` explicitly when `model` is set to something else.
            values["client"] = root_model
        if values.get("tokenizer") is None:
            values["tokenizer"] = T5Tokenizer.from_pretrained(
                model_name, use_fast=False
            )

        return values

    def _encode_prompt(self, messages: List[BaseMessage]) -> Any:
        """Tokenize the last message of the prompt and return its input ids.

        Raises:
            ValueError: if ``messages`` is empty.
        """
        if not messages:
            raise ValueError("FastChat needs at least one message to respond to.")
        encoded = self.tokenizer(messages[-1].content, return_tensors="pt")
        return encoded["input_ids"]

    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        """Generate a completion for the last message of the prompt.

        Args:
            messages: the prompt composed of a list of messages. Only the last
                message is sent to the model.
            stop: a list of strings on which the model should stop generating.
                Accepted for interface compatibility; not applied yet.
            run_manager: A run manager with callbacks for the LLM.

        Returns:
            A ``ChatResult`` holding one ``AIMessage`` generation.

        Raises:
            ValueError: if ``messages`` is empty.
        """
        input_ids = self._encode_prompt(messages)

        # Invoke the model to generate the completion.
        output = self.client.generate(
            input_ids, max_new_tokens=self.max_new_tokens
        )[0]
        response = self.tokenizer.decode(output, skip_special_tokens=True)

        # Wrap the decoded text in the LangChain result types.
        message = AIMessage(content=response)
        return ChatResult(generations=[ChatGeneration(message=message)])

    def _stream(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[ChatGenerationChunk]:
        """Stream the model's completion as it is generated.

        Generation runs in a background thread and pushes decoded text into a
        ``TextIteratorStreamer``; this generator turns each piece of text into a
        ``ChatGenerationChunk``. The concatenated chunks correspond to the
        completion produced by ``_generate`` for the same prompt.

        Args:
            messages: the prompt composed of a list of messages. Only the last
                message is sent to the model.
            stop: a list of strings on which the model should stop generating.
                Accepted for interface compatibility; not applied yet.
            run_manager: A run manager with callbacks for the LLM.

        Raises:
            ValueError: if ``messages`` is empty.
        """
        input_ids = self._encode_prompt(messages)
        streamer = TextIteratorStreamer(self.tokenizer, skip_special_tokens=True)
        failures: List[BaseException] = []

        def _run_generation() -> None:
            try:
                self.client.generate(
                    input_ids,
                    max_new_tokens=self.max_new_tokens,
                    streamer=streamer,
                )
            except BaseException as exc:  # re-raised in the consuming thread
                failures.append(exc)
                # Unblock the consumer, which would otherwise wait forever
                # for an end-of-stream signal that never arrives.
                streamer.end()

        worker = Thread(target=_run_generation, daemon=True)
        worker.start()

        for text in streamer:
            if not text:
                # The streamer emits empty strings while it buffers partial words.
                continue
            chunk = ChatGenerationChunk(message=AIMessageChunk(content=text))

            if run_manager:
                run_manager.on_llm_new_token(text, chunk=chunk)

            yield chunk

        worker.join()
        if failures:
            raise failures[0]

    async def _astream(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[AsyncCallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> AsyncIterator[ChatGenerationChunk]:
        """Async variant of ``_stream``.

        Delegates to ``_stream`` and pulls every chunk in an executor thread, so
        the event loop is never blocked while the model is producing tokens.

        If you're able to natively support async, then by all means do so!
        """
        iterator = await run_in_executor(
            None,
            self._stream,
            messages,
            stop=stop,
            run_manager=run_manager.get_sync() if run_manager else None,
            **kwargs,
        )
        # Sentinel default for next(): avoids raising StopIteration inside the
        # executor, which does not propagate cleanly through a future.
        exhausted = object()
        while True:
            chunk = await run_in_executor(None, next, iterator, exhausted)
            if chunk is exhausted:
                break
            yield chunk

    def _load_fastchat_model(self):
        """Load the fastchat model locally"""
        return AutoModelForSeq2SeqLM.from_pretrained(self.model_name)

    @property
    def _llm_type(self) -> str:
        """Get the type of language model used by this chat model."""
        return "fastchat-t5"

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Return a dictionary of identifying parameters."""
        return {
            "n": self.n,
            "model_name": self.model_name,
            "max_new_tokens": self.max_new_tokens,
        }

In [None]:
model = FastChat(n=3)

In [None]:
model.invoke("Hi how are you?")

## Create chain

In [None]:
  from langchain_core.runnables import Runnable, RunnablePassthrough
  from langchain.agents.format_scratchpad import format_log_to_messages
  from langchain.agents.json_chat.prompt import TEMPLATE_TOOL_RESPONSE
  from langchain.agents.output_parsers import JSONAgentOutputParser

  # Build the agent as a runnable pipeline: render the intermediate steps into
  # scratchpad messages, fill the prompt, call the model, parse its JSON reply.
  # NOTE(review): `prompt` and `llm_to_use` are not defined by any cell above this
  # one; they must be bound before this cell is run — confirm the intended values.
  agent = (
      RunnablePassthrough.assign(
          agent_scratchpad=lambda x: format_log_to_messages(
              x["intermediate_steps"], template_tool_response=TEMPLATE_TOOL_RESPONSE
          )
      )
      | prompt
      | llm_to_use
      | JSONAgentOutputParser()
  )

## Create Agent

In [None]:
from langchain import hub
from langchain.agents import AgentExecutor, create_structured_chat_agent

# NOTE(review): this import needs the `openai` package, which is not installed by
# the first cell; `langchain_openai.ChatOpenAI` is the maintained home of this class.
from langchain_community.chat_models import ChatOpenAI

# Get the prompt to use - you can modify this!
prompt = hub.pull("hwchase17/structured-chat-agent")

# Choose the LLM that will drive the agent
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106")

# Construct the structured chat agent
# NOTE(review): `tools` is not defined by any cell above; bind a list of tools first.
agent = create_structured_chat_agent(llm, tools, prompt)

In [None]:
# Stand-alone load of the FastChat-T5 weights for the raw tokenizer/generate check below.
# NOTE(review): this rebinds `model`, which an earlier cell bound to the FastChat
# chat-model instance, and loads the same checkpoint that `root_model` already holds.
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoConfig, AutoModel  # not used in the visible cells

model_name = "lmsys/fastchat-t5-3b-v1.0"
# Not referenced in the visible cells; the model is loaded by name on the next line.
model_cache = "~/.cache/huggingface/hub/models--lmsys--fastchat-t5-3b-v1.0/snapshots/0b1da230a891854102d749b93f7ddf1f18a81024"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Round-trip one message through the raw Hugging Face model, without LangChain:
# tokenize -> generate -> decode.
from transformers import T5Tokenizer
tokenizer = T5Tokenizer.from_pretrained(model_name, use_fast=False)

message = "Hello, are you a chat bot?"

# Encode the message as PyTorch tensors ("pt").
inputs = tokenizer(message, return_tensors="pt")
# Generate at most 100 new tokens and keep the first returned sequence.
output = model.generate(inputs["input_ids"], max_new_tokens=100)[0]
# Decode the generated ids back to text, dropping special tokens.
response = tokenizer.decode(output, skip_special_tokens=True)