# Introduction

This notebook demonstrates using an LLM with the MPS backend (Apple silicon GPU) in torch 2.x.

Reference:
* torch 2.x MPS Backend: https://pytorch.org/docs/stable/notes/mps.html

In [1]:
import os
import torch
import applyllm as apl
from utils.openai import CustomOpenAIClient

# Show the applyllm version this notebook was run with.
print(apl.__version__)

0.0.6


In [2]:
# Check whether the MPS (Metal Performance Shaders) backend is available.
if torch.backends.mps.is_available():
    print("MPS is available")
    # Create the device handle only when the backend can actually be used.
    mps_device = torch.device("mps")
    print(mps_device)
else:
    print("MPS is not available")



MPS is available
mps


## Define global variables

In [3]:
# Select which local OpenAI-compatible server the notebook talks to.
backend = "ollama"
# backend = "lmstudio"

# Model name and base URL for the selected backend.
if backend == "ollama":
    model = "gorilla-openfunctions-v2-q4_K_M"
    endpoint = "http://localhost:11434/v1"
elif backend == "lmstudio":
    model = "TheBloke/gorilla-openfunctions-v1-GGUF"
    endpoint = "http://localhost:1234/v1"
else:
    # Fail fast: otherwise `model`/`endpoint` would be undefined, or silently
    # keep stale values from a previous execution of this cell.
    raise ValueError(
        f"Unsupported backend: {backend!r} (expected 'ollama' or 'lmstudio')"
    )

In [4]:
# Legacy variant for SDK openai==0.28.1 (kept commented out for reference)


# import openai
# import json

# def get_gorilla_response(
#     prompt="Call me an Uber ride type \"Plus\" in Berkeley at zipcode 94704 in 10 minutes", 
#     model="gorilla-openfunctions-v2-q4_K_M",
#     endpoint="http://localhost:11434/v1",
#     functions=[]):
#   openai.api_key = "EMPTY"
#   openai.api_base = endpoint
#   try:
#     # extend with custom prompt
#     prompt = f'<<question>> {prompt} <<function>> {json.dumps(functions)}'
#     completion = openai.ChatCompletion.create(
#       # model="adrienbrault/gorilla-openfunctions-v2:Q4_K_M",
#       model=model, # model from the ollama list
#       temperature=0.01,
#       top_p=1,
#       messages=[{"role": "user", "content": prompt}],
#       # stop=["<|EOT|>"],
#       # start=["<｜begin▁of▁sentence｜>"],
#       functions=functions,
#     )
#     return completion.choices[0]
#   except Exception as e:
#     print(e, model, prompt)

In [5]:
# Variant for SDK openai==1.17.0


from openai import OpenAI
import json

def get_gorilla_response(
    prompt="Call me an Uber ride type \"Plus\" in Berkeley at zipcode 94704 in 10 minutes",
    model="gorilla-openfunctions-v2-q4_K_M",
    endpoint="http://localhost:11434/v1",
    functions=None):
    """Send a function-calling prompt to an OpenAI-compatible chat endpoint.

    The user prompt is wrapped as
    ``<<question>> {prompt} <<function>> {json.dumps(functions)}`` and sent
    as a single user message.

    Args:
        prompt: Natural-language request from the user.
        model: Model name as known to the server at ``endpoint``.
        endpoint: Base URL of the OpenAI-compatible API.
        functions: List of function descriptions (JSON-serialisable dicts).
            ``None`` (the default) is treated as an empty list.

    Returns:
        The first choice of the chat completion, or ``None`` if the request
        (or building it) raised an exception; the error is printed in that
        case.
    """
    # Avoid a shared mutable default argument.
    functions = [] if functions is None else functions

    client = OpenAI(
        api_key="EMPTY",
        base_url=endpoint,
    )
    # client = CustomOpenAIClient(
    #     api_key="EMPTY",
    #     base_url=endpoint
    # )
    try:
        # extend with custom prompt
        prompt = f'<<question>> {prompt} <<function>> {json.dumps(functions)}'
        request = {
            "model": model,  # model from the ollama list
            "temperature": 0.01,
            "top_p": 1,
            "max_tokens": 80,
            "messages": [{"role": "user", "content": prompt}],
        }
        if functions:
            # The legacy `functions` parameter is paired with `function_call`;
            # `tool_choice` belongs to the newer `tools` parameter. Both are
            # omitted when there is nothing to offer the model.
            request["functions"] = functions
            request["function_call"] = "auto"
        completion = client.chat.completions.create(**request)
        print(completion)
        return completion.choices[0]
    except Exception as e:
        # Best-effort for interactive use: report the failure and return None.
        print(e, model, prompt)
        return None

### Example of Parallel Function Calling
Human: 
```text
What's the weather like in the two cities of Boston and San Francisco?
```
Agent: 
```json
...
        "tool_calls": [
          {
            "id": "1",
            "function": {
              "arguments": "{\"location\": \"Boston, MA\"}",
              "name": "get_current_weather"
            },
            "type": "function"
          },
          {
            "id": "1",
            "function": {
              "arguments": "{\"location\": \"San Francisco, CA\"}",
              "name": "get_current_weather"
            },
            "type": "function"
          }
        ]
```

In [6]:
query = "What's the weather like in the two cities of Boston and San Francisco?"

# One function description offered to the model. Key order is kept as-is
# because the schema is serialised into the prompt.
functions = [
    dict(
        name="get_current_weather",
        description="Get the current weather in a given location",
        parameters=dict(
            type="object",
            properties=dict(
                location=dict(
                    type="string",
                    description="The city and state, e.g. San Francisco, CA",
                ),
                unit=dict(type="string", enum=["celsius", "fahrenheit"]),
            ),
            required=["location"],
        ),
    ),
]

# Ask the configured backend/model to answer the query with function calls.
response = get_gorilla_response(
    prompt=query,
    model=model,
    endpoint=endpoint,
    functions=functions,
)

ChatCompletion(id='chatcmpl-3', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="<<function>>get_current_weather(location='Boston, MA')<<function>>get_current_weather(location='San Francisco, CA')", role='assistant', function_call=None, tool_calls=None))], created=1715786929, model='gorilla-openfunctions-v2-q4_K_M', object='chat.completion', system_fingerprint='fp_ollama', usage=CompletionUsage(completion_tokens=32, prompt_tokens=138, total_tokens=170))


In [7]:
# The model returned parallel function calls: one <<function>> entry per city.
print(response.message.content)

<<function>>get_current_weather(location='Boston, MA')<<function>>get_current_weather(location='San Francisco, CA')


### Use a custom response adapter to convert the Gorilla response to the OpenAI response schema

In [8]:
from utils.openai import CustomLLMResponseAdapter
# Feed the raw text content of the model reply to the adapter and keep the
# result of its to_json() call for display.
openai_json_response = CustomLLMResponseAdapter.adapt_response(response=response.message.content).to_json()

In [9]:
# Display the adapted response.
print(openai_json_response)

{
  "id": "chatcmpl-default-id",
  "choices": [
    {
      "finish_reason": "tool_calls",
      "index": 0,
      "logprobs": null,
      "message": {
        "content": "",
        "role": "assistant",
        "function_call": null,
        "tool_calls": [
          {
            "id": "1",
            "function": {
              "arguments": "{\"location\": \"Boston, MA\"}",
              "name": "get_current_weather"
            },
            "type": "function"
          },
          {
            "id": "1",
            "function": {
              "arguments": "{\"location\": \"San Francisco, CA\"}",
              "name": "get_current_weather"
            },
            "type": "function"
          }
        ]
      }
    }
  ],
  "created": 0,
  "model": "default-model",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 0,
    "prompt_tokens": 0,
    "total_tokens": 0
  }
}
