
[Issue]: how to deploy the local model correctly and run the agentchat_custom_model.ipynb? #3264

Closed

Description

@lambda7xx

Describe the issue

I used the Local-LLMs/ guide to deploy my local model, but the output returned by the LLM is garbled.

Steps to reproduce

Launch the local model:

In terminal 1, the command is: python -m fastchat.serve.controller
In terminal 2, the command is: python -m fastchat.serve.model_worker --model-path chatglm2-6b
In terminal 3, the command is: python -m fastchat.serve.openai_api_server --host localhost --port 8000
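As a sanity check, here is a minimal sketch (assuming the three FastChat processes above are running and the worker has registered chatglm2-6b) that queries the OpenAI-compatible endpoint on port 8000 directly, bypassing autogen, to confirm the deployment itself returns sensible text:

import requests

# Send one chat completion request straight to the FastChat OpenAI-compatible server.
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "chatglm2-6b",
        "messages": [{"role": "user", "content": "Say hello"}],
    },
    timeout=120,
)
print(resp.status_code)
print(resp.json())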

Code

My script is:

from types import SimpleNamespace

from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

import autogen
from autogen import AssistantAgent, UserProxyAgent



# custom client with custom model loader


class CustomModelClient:
    def __init__(self, config, **kwargs):
        print(f"CustomModelClient config: {config}")
        self.device = config.get("device", "cpu")
        self.model = AutoModelForCausalLM.from_pretrained(config["model"], trust_remote_code=True).to(self.device)
        self.model_name = config["model"]
        self.tokenizer = AutoTokenizer.from_pretrained(config["model"], use_fast=False)
        # self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

        # params are set by the user and consumed by the user since they are providing a custom model
        # so anything can be done here
        gen_config_params = config.get("params", {})
        self.max_length = gen_config_params.get("max_length", 256)

        print(f"Loaded model {config['model']} to {self.device}")

    def create(self, params):
        if params.get("stream", False) and "messages" in params:
            raise NotImplementedError("Local models do not support streaming.")
        else:
            num_of_responses = params.get("n", 1)

            # can create my own data response class
            # here using SimpleNamespace for simplicity
            # as long as it adheres to the ClientResponseProtocol

            response = SimpleNamespace()

            inputs = self.tokenizer.apply_chat_template(
                params["messages"], return_tensors="pt", add_generation_prompt=True
            ).to(self.device)
            inputs_length = inputs.shape[-1]

            # add inputs_length to max_length
            max_length = self.max_length + inputs_length
            generation_config = GenerationConfig(
                max_length=max_length,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.eos_token_id,
            )

            response.choices = []
            response.model = self.model_name

            for _ in range(num_of_responses):
                outputs = self.model.generate(inputs, generation_config=generation_config)
                # Decode only the newly generated text, excluding the prompt
                text = self.tokenizer.decode(outputs[0, inputs_length:])
                choice = SimpleNamespace()
                choice.message = SimpleNamespace()
                choice.message.content = text
                choice.message.function_call = None
                response.choices.append(choice)

            return response

    def message_retrieval(self, response):
        """Retrieve the messages from the response."""
        choices = response.choices
        return [choice.message.content for choice in choices]

    def cost(self, response) -> float:
        """Calculate the cost of the response."""
        response.cost = 0
        return 0

    @staticmethod
    def get_usage(response):
        # returns a dict of prompt_tokens, completion_tokens, total_tokens, cost, model
        # if usage needs to be tracked, else None
        return {}


config_list_custom = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={"model_client_cls": ["CustomModelClient"]},
)


assistant = AssistantAgent("assistant", llm_config={"config_list": config_list_custom})
user_proxy = UserProxyAgent(
    "user_proxy",
    code_execution_config={
        "work_dir": "coding",
        "use_docker": False,  # Please set use_docker=True if docker is available to run the generated code. Using docker is safer than running the generated code directly.
    },
)

assistant.register_model_client(model_client_cls=CustomModelClient)


user_proxy.initiate_chat(assistant, message="Write python code to print Hello World!")


# custom client with custom model loader


# class CustomModelClientWithArguments(CustomModelClient):
#     def __init__(self, config, loaded_model, tokenizer, **kwargs):
#         print(f"CustomModelClientWithArguments config: {config}")

#         self.model_name = config["model"]
#         self.model = loaded_model
#         self.tokenizer = tokenizer

#         self.device = config.get("device", "cpu")

#         gen_config_params = config.get("params", {})
#         self.max_length = gen_config_params.get("max_length", 256)
#         print(f"Loaded model {config['model']} to {self.device}")



# # load model here


# config = config_list_custom[0]
# device = config.get("device", "cpu")
# loaded_model = AutoModelForCausalLM.from_pretrained(config["model"]).to(device)
# tokenizer = AutoTokenizer.from_pretrained(config["model"], use_fast=False)
# tokenizer.pad_token_id = tokenizer.eos_token_id


# config_list_custom = autogen.config_list_from_json(
#     "OAI_CONFIG_LIST",
#     filter_dict={"model_client_cls": ["CustomModelClientWithArguments"]},
# )


# assistant = AssistantAgent("assistant", llm_config={"config_list": config_list_custom})


# assistant.register_model_client(
#     model_client_cls=CustomModelClientWithArguments,
#     loaded_model=loaded_model,
#     tokenizer=tokenizer,
# )


# user_proxy.initiate_chat(assistant, message="Write python code to print Hello World!")
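To look at the raw generation outside of the agent loop, a minimal smoke test (a sketch, reusing the CustomModelClient class and config_list_custom defined above) is:

# Instantiate the custom client directly and print what the model generates for a
# single prompt, without the assistant/user_proxy round trips.
client = CustomModelClient(config_list_custom[0])
raw = client.create(
    {
        "messages": [{"role": "user", "content": "Write python code to print Hello World!"}],
        "n": 1,
    }
)
print(client.message_retrieval(raw)[0])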

My OAI_CONFIG_LIST is

[
    {
        "model": "chatglm2-6b",
        "base_url": "http://localhost:8000/v1",
        "api_type": "openai",
        "api_key": "NULL",
        "model_client_cls": "CustomModelClient",
        "device": "cuda"
    }
]
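For completeness, a small sketch (assuming the OAI_CONFIG_LIST file sits in the working directory) that checks the filter_dict used in the script actually picks up this entry:

import autogen

# Load and filter the config list the same way the script does; the printed list
# should contain exactly one dict with model "chatglm2-6b" and device "cuda".
config_list = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={"model_client_cls": ["CustomModelClient"]},
)
print(config_list)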

Screenshots and logs

My log is:

Loaded model chatglm2-6b to cuda
user_proxy (to assistant):

Write python code to print Hello World!

--------------------------------------------------------------------------------
No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues.
assistant (to user_proxy):

"A_S    A_Savor.
.

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: 

>>>>>>>> NO HUMAN INPUT RECEIVED.

>>>>>>>> USING AUTO REPLY...
user_proxy (to assistant):



--------------------------------------------------------------------------------
assistant (to user_proxy):

A_S     A_Savor.
.

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: 

>>>>>>>> NO HUMAN INPUT RECEIVED.

>>>>>>>> USING AUTO REPLY...
user_proxy (to assistant):



--------------------------------------------------------------------------------
assistant (to user_proxy):

<=D                                                                                                                                                                                             >|

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: Write python code to print Hello World!
user_proxy (to assistant):

Write python code to print Hello World!

--------------------------------------------------------------------------------
assistant (to user_proxy):

=_=_=_=_=

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: I need code
user_proxy (to assistant):

I need code

--------------------------------------------------------------------------------
assistant (to user_proxy):

=                                                                               ~

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: python code
user_proxy (to assistant):

python code

--------------------------------------------------------------------------------
assistant (to user_proxy):

===

--------------------------------------------------------------------------------
Replying as user_proxy. Provide feedback to assistant. Press enter to skip and use auto-reply, or type 'exit' to end the conversation: 

I think the assistant agent should write Python code, but it does not.

Additional Information

No response


Labels

0.2 (Issues which are related to the pre 0.4 codebase), models (Pertains to using alternate, non-GPT, models, e.g., local models, llama, etc.), needs-triage
