In [1]:
from IPython.display import HTML, display

def set_css():
  """Emit a small stylesheet so that ``<pre>`` blocks in cell output wrap long lines."""
  wrap_stylesheet = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_stylesheet))


# Hook the stylesheet into IPython's 'pre_run_cell' event so it is re-emitted
# for every cell that gets executed.
get_ipython().events.register('pre_run_cell', set_css)


In [2]:
!pip install --quiet --upgrade pyautogen~=0.2.0b4 torch git+https://github.com/huggingface/transformers sentencepiece

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
!pip install --quiet --upgrade accelerate bitsandbytes

In [4]:
import autogen
from autogen import AssistantAgent, UserProxyAgent
from transformers import AutoTokenizer, GenerationConfig, AutoModelForCausalLM, BitsAndBytesConfig
from types import SimpleNamespace
import torch

In [5]:
import os
# JSON list with a single model entry. "model_client_cls" names the custom
# client class defined later in this notebook; "params" is read by that class.
# The pieces below are adjacent string literals that concatenate into one JSON string.
os.environ['OAI_CONFIG_LIST'] = (
    '[{"model": "google/gemma-7b-it",'
    '"model_client_cls": "CustomModelClient",'
    '"device": "cuda",'
    '"n": 1,'
    '"params": {"max_length": 1000}}]'
)

In [6]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; in this notebook the call renders an
# interactive login widget (see the cell output).
# NOTE(review): presumably required because the Gemma checkpoint is gated — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
# bitsandbytes settings: load the weights in 4-bit, compute in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# The model and tokenizer are loaded once here and shared by every
# CustomModelClient instance created later in the notebook.
custom_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-7b-it",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=quantization_config,
)
custom_tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b-it")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:

def replace_system_role_with_user(structure):
    """Rewrite a chat history so it contains alternating user/assistant turns only.

    NOTE(review): this appears to exist because the chat template of the model
    used in this notebook has no system role and expects strictly alternating
    user/assistant turns — confirm against the tokenizer's chat template.

    Rules applied, in order, to each message:

    * ``system`` messages are re-labelled as ``user`` messages.
    * Every ``user`` message is followed by a placeholder assistant reply
      (``{'content': 'ok', 'role': 'assistant'}``) so two user turns never
      touch each other.
    * An ``assistant`` message replaces an immediately preceding assistant
      entry (normally the placeholder), so the real reply is what the model
      sees. A leading assistant message is kept as-is.
    * Messages with any other role are passed through unchanged.
    * A trailing assistant entry is removed so the history ends on a
      non-assistant turn.

    Args:
        structure: Iterable of message dicts with ``role`` and ``content`` keys.
            The input dicts are never modified.

    Returns:
        A new list of new (shallow-copied) message dicts.
    """
    updated_structure = []
    for item in structure:
        # Shallow copy so the caller's message history is left untouched.
        message = dict(item)
        role = message.get('role')

        if role == 'system':
            role = message['role'] = 'user'

        if role == 'user':
            updated_structure.append(message)
            updated_structure.append({'content': 'ok', 'role': 'assistant'})
        elif role == 'assistant':
            follows_assistant = (
                bool(updated_structure)
                and updated_structure[-1].get('role') == 'assistant'
            )
            if follows_assistant:
                # Keep the real reply and discard the entry before it
                # (typically the 'ok' placeholder).
                updated_structure[-1] = message
            else:
                updated_structure.append(message)
        else:
            updated_structure.append(message)

    # The history must not end on an assistant turn.
    if updated_structure and updated_structure[-1].get('role') == 'assistant':
        updated_structure.pop()
    return updated_structure

# custom client with custom model loader
class CustomModelClient:
    """AutoGen model client that answers with the locally loaded Hugging Face model.

    The client reuses the module-level ``custom_model`` and ``custom_tokenizer``
    objects instead of loading its own copy. The response objects it builds are
    plain ``SimpleNamespace`` instances carrying ``choices`` and ``model``;
    they are meant to adhere to AutoGen's ClientResponseProtocol.
    """

    def __init__(self, config, **kwargs):
        """Store the model, tokenizer and generation settings.

        Args:
            config: One entry of the config list. ``config["model"]`` is
                required; ``config["device"]`` defaults to ``"cpu"`` and
                ``config["params"]["max_length"]`` defaults to 256.
            **kwargs: Accepted and ignored.
        """
        print(f"CustomModelClient config: {config}")
        self.device = config.get("device", "cpu")

        self.model = custom_model
        self.model_name = config["model"]
        self.tokenizer = custom_tokenizer
        # Note: this sets the pad token on the shared tokenizer object.
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id

        # "params" is both set and consumed by the notebook author, so its
        # contents are free-form; only "max_length" is read here.
        gen_config_params = config.get("params", {})
        self.max_length = gen_config_params.get("max_length", 256)

        print(f"Loaded model {config['model']} to {self.device}")

    def create(self, params):
        """Generate ``params["n"]`` completions for ``params["messages"]``.

        Args:
            params: Request dict. Reads ``"messages"`` (required), ``"n"``
                (default 1) and ``"stream"``.

        Returns:
            A ``SimpleNamespace`` with ``model`` and ``choices``; each choice
            has ``message.content`` (the generated text) and
            ``message.function_call`` (always ``None``).

        Raises:
            NotImplementedError: If streaming is requested.
        """
        if params.get("stream", False) and "messages" in params:
            raise NotImplementedError("Local models do not support streaming.")

        num_of_responses = params.get("n", 1)

        # Normalise the roles before the tokenizer's chat template is applied.
        chat_template = replace_system_role_with_user(params["messages"])

        inputs = self.tokenizer.apply_chat_template(
            chat_template, return_tensors="pt", add_generation_prompt=True
        ).to(self.device)
        inputs_length = inputs.shape[-1]

        # The configured max_length is a budget for *new* tokens, so the
        # prompt length is added on top of it.
        max_length = self.max_length + inputs_length
        generation_config = GenerationConfig(
            max_length=max_length,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Any object works as a response as long as it exposes the attributes
        # AutoGen reads; SimpleNamespace keeps it simple.
        response = SimpleNamespace()
        response.choices = []
        response.model = self.model_name

        for _ in range(num_of_responses):
            outputs = self.model.generate(inputs, generation_config=generation_config)
            # Decode only the newly generated tokens (the prompt is sliced off)
            # and drop special tokens, so markers such as the end-of-sequence
            # token do not end up in the message text. A trailing marker would
            # defeat checks like ``content.rstrip().endswith("TERMINATE")``.
            text = self.tokenizer.decode(
                outputs[0, inputs_length:], skip_special_tokens=True
            )
            choice = SimpleNamespace()
            choice.message = SimpleNamespace()
            choice.message.content = text
            choice.message.function_call = None
            response.choices.append(choice)

        return response

    def message_retrieval(self, response):
        """Retrieve the messages from the response.

        Returns:
            A list with the ``message.content`` of every choice.
        """
        return [choice.message.content for choice in response.choices]

    def cost(self, response) -> float:
        """Calculate the cost of the response.

        Always 0 for this client; the value is also stored on ``response.cost``.
        """
        response.cost = 0
        return 0

    @staticmethod
    def get_usage(response):
        """Return usage information for the response.

        A full implementation would return a dict of prompt_tokens,
        completion_tokens, total_tokens, cost and model. Usage is not tracked
        here, so an empty dict is returned.
        """
        return {}

In [9]:
# Build the model config list from "OAI_CONFIG_LIST", the name under which an
# earlier cell stored the JSON configuration as an environment variable.
config_list_custom = autogen.config_list_from_json("OAI_CONFIG_LIST")

# Settings shared by every LLM-backed agent below.
llm_config = {
    "config_list": config_list_custom,
    "cache_seed": 43,
    "temperature": 0.7,
}





# Case - Group Orchestration

In [10]:
def _ends_with_terminate(x):
    """Return a truthy value when the message content ends with "TERMINATE"."""
    return x.get("content", "") and x.get("content", "").rstrip().endswith("TERMINATE")


# Stand-in for the human: no LLM config and no code execution.
user_proxy = autogen.UserProxyAgent(
    name="User_proxy",
    system_message="A human admin.",
    human_input_mode="TERMINATE",
    code_execution_config=False,
    is_termination_msg=_ends_with_terminate,
)

# Writes and revises the article.
writer = autogen.AssistantAgent(
    name="Writer",
    llm_config=llm_config,
    system_message="""You are a blog post writer who is capable of writing a travel blog.
                      You generate one iteration of an article once at a time.
                      You never provide review comments.
                      You are open to comments and willing to make changes in your article based on these comments.
                    """,
    description="""This is a blog post writer who is capable of writing travel blogs.
                   The writer is open to any comments and recommendations for improving the article.
                   Ask writer to iterate article every time when there is a new change recommendation from editor.

                """
)

# Reviews the article and proposes changes; never writes it.
editor = autogen.AssistantAgent(
    name="Editor",
    llm_config=llm_config,
    system_message="""You review blog posts and give change recommendations to make the article more viral on social media, for example, adding hashtag.
                      You never write or revise blogs by yourself.
                       """,
    description="""This is an editor who reviews the blogs of writers and provides change ideas.
                    The editor should be called every time when the writer provides a version of a blog post.

                """
)

# Group chat over the three agents, capped at six rounds, run by an LLM-backed manager.
groupchat = autogen.GroupChat(
    agents=[user_proxy, writer, editor],
    messages=[],
    max_round=6,
)
manager = autogen.GroupChatManager(llm_config=llm_config, groupchat=groupchat)



[autogen.oai.client: 03-14 05:27:58] {417} INFO - Detected custom model client in config: CustomModelClient, model client can not be used until register_model_client is called.


INFO:autogen.oai.client:Detected custom model client in config: CustomModelClient, model client can not be used until register_model_client is called.


[autogen.oai.client: 03-14 05:27:58] {417} INFO - Detected custom model client in config: CustomModelClient, model client can not be used until register_model_client is called.


INFO:autogen.oai.client:Detected custom model client in config: CustomModelClient, model client can not be used until register_model_client is called.


[autogen.oai.client: 03-14 05:27:58] {417} INFO - Detected custom model client in config: CustomModelClient, model client can not be used until register_model_client is called.


INFO:autogen.oai.client:Detected custom model client in config: CustomModelClient, model client can not be used until register_model_client is called.


In [11]:
# The custom client class has to be registered on every agent whose config
# names it before that agent can call the model (see the log output above).
for agent in (writer, editor, manager):
    agent.register_model_client(model_client_cls=CustomModelClient)

CustomModelClient config: {'model': 'google/gemma-7b-it', 'model_client_cls': 'CustomModelClient', 'device': 'cuda', 'n': 1, 'params': {'max_length': 1000}}
Loaded model google/gemma-7b-it to cuda
CustomModelClient config: {'model': 'google/gemma-7b-it', 'model_client_cls': 'CustomModelClient', 'device': 'cuda', 'n': 1, 'params': {'max_length': 1000}}
Loaded model google/gemma-7b-it to cuda
CustomModelClient config: {'model': 'google/gemma-7b-it', 'model_client_cls': 'CustomModelClient', 'device': 'cuda', 'n': 1, 'params': {'max_length': 1000}}
Loaded model google/gemma-7b-it to cuda


In [12]:
# Task handed to the group chat manager by the user proxy.
task_message = """Generate a 100-word blog post about traveling in Bohol Island.
                      """
# Kept as the last expression so the returned ChatResult is shown as cell output.
user_proxy.initiate_chat(manager, message=task_message)

User_proxy (to chat_manager):

Generate a 100-word blog post about traveling in Bohol Island.
                      

--------------------------------------------------------------------------------
Writer (to chat_manager):

## Island Dreams: Exploring the Jewel of Bohol

Bohol Island, a gem nestled in the heart of the Philippines, is a paradise teeming with natural beauty and cultural treasures. From the breathtaking Chocolate Hills to the serene Loboc River, Bohol offers a captivating experience for travelers of all ages.

Upon arrival, you'll be greeted by the warm hospitality of the locals, known for their vibrant smiles and genuine warmth. Immerse yourself in the vibrant history of the island at the historic Alona Church, or marvel at the intricate architecture of the Baclayon Church.

For a glimpse into the natural wonders of Bohol, head to the Chocolate Hills, a surreal landscape adorned with hundreds of cone-shaped hills. Take a boat ride down the Loboc River, surrounded by lu

ChatResult(chat_id=None, chat_history=[{'content': 'Generate a 100-word blog post about traveling in Bohol Island.\n                      ', 'role': 'assistant'}, {'content': "## Island Dreams: Exploring the Jewel of Bohol\n\nBohol Island, a gem nestled in the heart of the Philippines, is a paradise teeming with natural beauty and cultural treasures. From the breathtaking Chocolate Hills to the serene Loboc River, Bohol offers a captivating experience for travelers of all ages.\n\nUpon arrival, you'll be greeted by the warm hospitality of the locals, known for their vibrant smiles and genuine warmth. Immerse yourself in the vibrant history of the island at the historic Alona Church, or marvel at the intricate architecture of the Baclayon Church.\n\nFor a glimpse into the natural wonders of Bohol, head to the Chocolate Hills, a surreal landscape adorned with hundreds of cone-shaped hills. Take a boat ride down the Loboc River, surrounded by lush greenery and cascading waterfalls.\n\nBoh

In [13]:
# Print the GPU status table (memory in use, utilisation) after the chat run.
!nvidia-smi

Thu Mar 14 05:28:34 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0              72W /  70W |   8293MiB / 15360MiB |     71%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    