# Introduction

This notebook demonstrates local function calling with an LLM on the MPS backend (Apple Silicon GPU) using torch 2.x.

References:
* https://ollama.com/blog/tool-support
* torch 2.x MPS Backend: https://pytorch.org/docs/stable/notes/mps.html
* https://blog.pamelafox.org/2024/08/making-ollama-compatible-rag-app.html
* https://platform.openai.com/docs/guides/function-calling

In [1]:
import os
import torch
import applyllm as apl

# Print the installed applyllm version so the notebook output records which release was used.
print(apl.__version__)

0.0.7


In [2]:
# Report whether the MPS (Metal Performance Shaders) backend is usable.
# `mps_device` is only defined when the backend is available.
if torch.backends.mps.is_available():
    print("MPS is available")
    mps_device = torch.device("mps")
    print(mps_device)
else:
    print("MPS is not available")



MPS is available
mps


## Define global variables

In [3]:
# Select which local OpenAI-compatible server the notebook talks to.
# Supported values: "ollama", "lmstudio".
backend = "ollama"
# backend = "lmstudio"

if backend == "ollama":
    # Alternative models that can be swapped in:
    # model = "llama3-groq-tool-use:8b"
    # model = "gorilla-openfunctions-v2-q4_K_M"
    model = "llama3.1:8b"
    endpoint = "http://localhost:11434/v1"
elif backend == "lmstudio":
    model = "TheBloke/gorilla-openfunctions-v1-GGUF"
    endpoint = "http://localhost:1234/v1"
else:
    # Fail here rather than later with a NameError (fresh kernel) or, worse,
    # silently reuse `model`/`endpoint` left over from a previous run.
    raise ValueError(
        f"Unsupported backend {backend!r}; expected 'ollama' or 'lmstudio'"
    )

In [4]:
from openai import OpenAI
import json

def get_llm_response(
    prompt="Call me an Uber ride type \"Plus\" in Berkeley at zipcode 94704 in 10 minutes",
    model="gorilla-openfunctions-v2-q4_K_M",
    endpoint="http://localhost:11434/v1",
    functions=None,
    max_tokens=80):
    """Send a single-turn chat request to an OpenAI-compatible endpoint.

    Args:
        prompt: Text sent as the only user message.
        model: Model name passed to the server.
        endpoint: Base URL of the OpenAI-compatible API.
        functions: Optional list of tool definitions in the OpenAI ``tools``
            format. When omitted or empty, the request is sent without
            ``tools``/``tool_choice``.
        max_tokens: Upper bound on generated tokens. Raise it if tool-call
            output gets truncated.

    Returns:
        The first choice of the completion (``completion.choices[0]``), or
        ``None`` if the request raised; the error is printed in that case.
    """
    # The key is a placeholder value; presumably the local server does not
    # validate it -- TODO confirm for the backend in use.
    client = OpenAI(
        api_key="EMPTY",
        base_url=endpoint,
    )

    request = {
        "model": model,  # must be a model known to the server
        "temperature": 0.01,
        "top_p": 1,
        "max_tokens": max_tokens,
        "messages": [
            {"role": "user", "content": prompt},
        ],
        "stream": False,
    }
    # Only advertise tools when there are any: sending an empty tool list
    # together with tool_choice is rejected by strict OpenAI-style servers.
    if functions:
        request["tools"] = functions
        request["tool_choice"] = "auto"

    try:
        completion = client.chat.completions.create(**request)
        print(completion)
        return completion.choices[0]
    except Exception as e:
        # Best-effort for interactive use: report the failure and let the
        # caller decide what to do with a missing response.
        print(e, model, prompt)
        return None

### Example of Parallel Function calling
Human: 
```text
What's the weather like in the two cities of Boston and San Francisco?
```
Agent: 
```json
...
        "tool_calls": [
          {
            "id": "1",
            "function": {
              "arguments": "{\"location\": \"Boston, MA\"}",
              "name": "get_current_weather"
            },
            "type": "function"
          },
          {
            "id": "1",
            "function": {
              "arguments": "{\"location\": \"San Francisco, CA\"}",
              "name": "get_current_weather"
            },
            "type": "function"
          }
        ]
```

In [5]:
# Prompt that names two cities, so the model can answer with one tool call per city.
query = "What's the weather like in the two cities of Boston and San Francisco?"

# JSON-schema description of the arguments accepted by get_current_weather.
weather_parameters = {
    "type": "object",
    "properties": {
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA",
        },
        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
    },
    "required": ["location"],
}

# Tool list in the OpenAI `tools` format: a single function tool.
functions = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": weather_parameters,
        },
    }
]

# Alternative, not enabled: derive the same tool definition from a pydantic model.
# from pydantic import BaseModel
# import openai
# class get_current_weather(BaseModel):
#     """Get the current weather in a given location"""
#     location: str # The city and state, e.g. San Francisco, CA
#     unit: str
#
# functions = [openai.pydantic_function_tool(get_current_weather)]

response = get_llm_response(
    query,
    model=model,
    endpoint=endpoint,
    functions=functions,
)

ChatCompletion(id='chatcmpl-776', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content='', refusal=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_br52p4f5', function=Function(arguments='{"location":"Boston, MA","unit":"fahrenheit"}', name='get_current_weather'), type='function'), ChatCompletionMessageToolCall(id='call_kc98lgwb', function=Function(arguments='{"location":"San Francisco, CA","unit":"fahrenheit"}', name='get_current_weather'), type='function')]))], created=1723757789, model='llama3.1:8b', object='chat.completion', service_tier=None, system_fingerprint='fp_ollama', usage=CompletionUsage(completion_tokens=55, prompt_tokens=190, total_tokens=245))


In [6]:
# The model answered with one tool call per city in a single assistant message,
# i.e. parallel function calling worked.
# NOTE: get_llm_response returns None when the request fails; in that case this
# line raises AttributeError.
print(response.message.tool_calls)

[ChatCompletionMessageToolCall(id='call_br52p4f5', function=Function(arguments='{"location":"Boston, MA","unit":"fahrenheit"}', name='get_current_weather'), type='function'), ChatCompletionMessageToolCall(id='call_kc98lgwb', function=Function(arguments='{"location":"San Francisco, CA","unit":"fahrenheit"}', name='get_current_weather'), type='function')]
