# Introduction

This notebook demonstrates local function calling (tool calling) with an LLM served on an MPS device (Apple Silicon GPU), using torch 2.x.

References:
* https://ollama.com/blog/tool-support
* torch 2.x MPS backend: https://pytorch.org/docs/stable/notes/mps.html
* https://blog.pamelafox.org/2024/08/making-ollama-compatible-rag-app.html
* https://platform.openai.com/docs/guides/function-calling

To follow the Ollama server log on macOS, run `tail -f $HOME/.ollama/logs/server.log`.

In [1]:
import os
import torch
import applyllm as apl
# from dotenv import load_dotenv
from dotenv import dotenv_values

print(apl.__version__)

0.0.7


In [2]:
# Report whether the Metal Performance Shaders (MPS) backend can be used and,
# when it can, create and show the corresponding torch device.
if torch.backends.mps.is_available():
    print("MPS is available")
    mps_device = torch.device("mps")
    print(mps_device)
else:
    print("MPS is not available")



MPS is available
mps


## Define global variables

In [3]:
# model_name = "ollama3.2"
# model_name = "mistral-nemo"
# model_name = "mistral-nemo-inst"
# model_name = "ollama-inst"
# model_name = "lmstudio"
# model_name = "openwebui"

# Backend identifiers. `ModelChoice.backend_type` is compared against these
# further down in this cell to pick the server endpoint.
OLLAMA = "ollama"
LMSTUDIO = "lmstudio"
OPENWEBUI = "openwebui"

from dataclasses import dataclass, field
@dataclass
class ModelChoice:
    """Settings describing which model to call and where it is served.

    The values are read later in the notebook when the request is sent with
    `get_llm_response`.
    """
    # Preset key; matched by the if/elif chain in this cell to fill in `model`
    # (and, for some presets, `backend_type`).
    model_name: str = "ollama3.2"
    # Model identifier passed as `model` in the chat completion request.
    model: str = ""
    # Base URL handed to the OpenAI client; reassigned in this cell according
    # to `backend_type`.
    endpoint: str = "http://localhost:11434/v1"
    # One of the backend identifiers above (OLLAMA, LMSTUDIO, OPENWEBUI).
    backend_type: str = OLLAMA
    # API key handed to the OpenAI client. NOTE(review): "EMPTY" looks like a
    # placeholder for local servers that ignore the key -- confirm.
    api_key: str = "EMPTY"
    # Optional system message dict; when non-empty it is sent ahead of the user
    # message. default_factory gives every instance its own dict.
    sys_msg: dict = field(default_factory=dict)

# Select the model for this run.
# Here an explicit `model` is given together with an empty `model_name`, so none
# of the presets below match and "qwen2.5-coder:14b" is used as-is.
# qwen2.5-coder:14b served by ollama has tool calling ability; the qwen instruct
# models from LMStudio are not tool calling compatible.
model_choice = ModelChoice(
    model_name="",
    model="qwen2.5-coder:14b",
    backend_type=OLLAMA
)


# Notes on mistral-nemo:
# - it needs a system message
# - it works with ollama 0.3.14, not with 0.3.13
#   https://github.com/ollama/ollama/issues/6713#issuecomment-2338933604


# Presets: map `model_name` to the concrete model identifier and, for models
# served by LMStudio, switch the backend type as well.
# Both llama3.1:8b and llama3.1:8b-instruct-q3_K_M work with OpenAI-compatible
# function calling on device.
if model_choice.model_name == "ollama3.2":
    # model="gorilla-openfunctions-v2-q4_K_M"
    model_choice.model = "llama3.2:3b"
elif model_choice.model_name == "ollama-inst":
    model_choice.model = "llama3.1:8b-instruct-q3_K_M"
# Compare the preset key, not the ModelChoice instance itself: an instance never
# equals a string, which would make this preset unreachable.
elif model_choice.model_name == "mistral-nemo-inst":
    model_choice.model = "mistral-nemo:12b-instruct-2407-fp16"
elif model_choice.model_name == "mistral-nemo":
    model_choice.model = "mistral-nemo:12b"
elif model_choice.model_name == "gorilla-v1":
    model_choice.model = "TheBloke/gorilla-openfunctions-v1-GGUF"
    model_choice.backend_type = LMSTUDIO
elif model_choice.model_name == "qwencoder32b":
    model_choice.model = "qwen2.5-coder-32b-instruct"
    model_choice.backend_type = LMSTUDIO
# elif model_name == "openwebui":
#     model="llama3.1:70b"
#     endpoint="https://core-llmtest.med.uni-muenchen.de/ollama/v1"
#     config = dotenv_values(dotenv_path="envs/openwebui.env")
#     api_key = config["API_KEY"]

# Point the client at the server that matches the chosen backend; an
# unrecognised backend type keeps whatever endpoint is already configured.
# Note: the openwebui endpoint is not a plain pass-through to ollama and is
# currently not OpenAI compatible.
model_choice.endpoint = {
    OLLAMA: "http://localhost:11434/v1",
    LMSTUDIO: "http://localhost:1234/v1",
    OPENWEBUI: "https://localhost:34567/ollama/v1",
}.get(model_choice.backend_type, model_choice.endpoint)

In [4]:
# from openai import OpenAI

# client = OpenAI(
#     base_url = endpoint,
#     api_key=api_key, # required, but unused
# )

# response = client.chat.completions.create(
#   model=model,
#   messages=[
#     {"role": "system", "content": "You are a helpful assistant."},
#     {"role": "user", "content": "Who won the world series in 2020?"},
#     {"role": "assistant", "content": "The LA Dodgers won in 2020."},
#     {"role": "user", "content": "Where was it played?"}
#   ]
# )
# print(response.choices[0].message.content)

In [5]:
from openai import OpenAI
import json

def get_llm_response(
    prompt="Call me an Uber ride type \"Plus\" in Berkeley at zipcode 94704 in 10 minutes",
    model="gorilla-openfunctions-v2-q4_K_M",
    endpoint="http://localhost:11434/v1",
    api_key="EMPTY",
    sys_msg=None,
    functions=None,
    max_tokens=80,
    temperature=0.01):
    """Send one chat completion request and return the first choice.

    Args:
        prompt: Text of the user message.
        model: Model identifier as known to the server (e.g. from the ollama list).
        endpoint: Base URL of the OpenAI-compatible server.
        api_key: API key passed to the OpenAI client.
        sys_msg: Optional system message dict placed before the user message.
            None or an empty dict means no system message is sent.
        functions: Optional list of tool definitions. When None or empty, the
            request is sent without `tools` / `tool_choice`.
        max_tokens: Upper bound on generated tokens. Several parallel tool
            calls can come close to the default, so raise it if the tool-call
            output gets truncated.
        temperature: Sampling temperature.

    Returns:
        The first choice of the completion, or None when the request failed
        (the error is printed together with the model and prompt).
    """
    client = OpenAI(
        api_key=api_key,
        base_url=endpoint,
    )

    messages = [{"role": "user", "content": prompt}]
    if sys_msg:
        messages.insert(0, sys_msg)

    request = {
        "model": model,
        "temperature": temperature,
        "top_p": 1,
        "max_tokens": max_tokens,
        "messages": messages,
        "stream": False,
    }
    # Only advertise tools when there are any: an empty `tools` array, or a
    # `tool_choice` without tools, can be rejected by OpenAI-compatible servers.
    if functions:
        request["tools"] = functions
        request["tool_choice"] = "auto"

    try:
        # extend with custom prompt
        # prompt = f'<<question>> {prompt} <<function>> {json.dumps(functions)}'
        completion = client.chat.completions.create(**request)
        print(completion)
        return completion.choices[0]
    except Exception as e:
        # Best effort for interactive use: report the failure and hand back
        # None so that later cells can test `response is not None`.
        print(e, model, prompt)
        return None

### Example of Parallel Function calling
Human: 
```text
What's the weather like in the two cities of Boston and San Francisco?
```
Agent: 
```json
...
        "tool_calls": [
          {
            "id": "1",
            "function": {
              "arguments": "{\"location\": \"Boston, MA\"}",
              "name": "get_current_weather"
            },
            "type": "function"
          },
          {
            "id": "1",
            "function": {
              "arguments": "{\"location\": \"San Francisco, CA\"}",
              "name": "get_current_weather"
            },
            "type": "function"
          }
        ]
```

In [6]:
# response = get_llm_response("hi", functions=[], model=model, endpoint=endpoint, api_key=api_key)
# print(response)

In [7]:
# User question that should trigger two parallel calls of the same tool.
query = "What's the weather like in the two cities of Boston and San Francisco?"

# Tool definitions offered to the model: a single weather lookup that requires
# a location and optionally accepts a temperature unit.
functions = [
    dict(
        type="function",
        function=dict(
            name="get_current_weather",
            description="Get the current weather in a given location",
            parameters=dict(
                type="object",
                properties=dict(
                    location=dict(
                        type="string",
                        description="The city and state, e.g. San Francisco, CA",
                    ),
                    unit=dict(type="string", enum=["celsius", "fahrenheit"]),
                ),
                required=["location"],
            ),
        ),
    ),
]

# from pydantic import BaseModel
# import openai
# class get_current_weather(BaseModel):
#     """Get the current weather in a given location"""
#     location: str # The city and state, e.g. San Francisco, CA
#     unit: str

# functions = [openai.pydantic_function_tool(get_current_weather)]

# Ask the configured model the weather question, offering the tool definitions.
response = get_llm_response(
    prompt=query,
    model=model_choice.model,
    endpoint=model_choice.endpoint,
    api_key=model_choice.api_key,
    sys_msg=model_choice.sys_msg,
    functions=functions,
)

ChatCompletion(id='chatcmpl-640', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_x3we59v2', function=Function(arguments='{"location":"Boston, MA"}', name='get_current_weather'), type='function', index=0), ChatCompletionMessageToolCall(id='call_c0z6sls7', function=Function(arguments='{"location":"San Francisco, CA"}', name='get_current_weather'), type='function', index=0)]))], created=1733959735, model='qwen2.5-coder:14b', object='chat.completion', service_tier=None, system_fingerprint='fp_ollama', usage=CompletionUsage(completion_tokens=74, prompt_tokens=200, total_tokens=274, completion_tokens_details=None, prompt_tokens_details=None))


In [8]:
# Show the text content of the model's answer (parallel function calling
# succeeded for this run; the tool calls themselves are printed in the next cell).
# For the gorilla-openfunctions model the message content is first passed through
# the project's CustomLLMResponseAdapter and the first adapted choice is used.
# `response` is None when the request failed, so it must be checked before it
# is dereferenced.
if response is not None and model_choice.model == "gorilla-openfunctions-v2-q4_K_M":
    from utils.openai import CustomLLMResponseAdapter
    completion = CustomLLMResponseAdapter.adapt_response(response=response.message.content)
    response = completion.choices[0]

if response is not None:
    print(response.message.content)
else:
    print("No response")




In [9]:
# Serialise the selected choice (including any tool calls) to JSON text and
# print it. Skipped when the request failed and `response` is None.
if response is not None:
    openai_json_response = response.to_json()
    print(openai_json_response)

{
  "finish_reason": "tool_calls",
  "index": 0,
  "message": {
    "content": "",
    "role": "assistant",
    "tool_calls": [
      {
        "id": "call_x3we59v2",
        "function": {
          "arguments": "{\"location\":\"Boston, MA\"}",
          "name": "get_current_weather"
        },
        "type": "function",
        "index": 0
      },
      {
        "id": "call_c0z6sls7",
        "function": {
          "arguments": "{\"location\":\"San Francisco, CA\"}",
          "name": "get_current_weather"
        },
        "type": "function",
        "index": 0
      }
    ]
  }
}
