In [12]:
!pip install openai==1.11.1 tiktoken litellm

Defaulting to user installation because normal site-packages is not writeable


# Hosted inference
<img src="hosted_inference.JPG"
     alt="Markdown Monster icon"
     style="float: left; margin-right: 10px;" />
- 
-
## Infrastructure
- Azure, AWS, GCP
- note on huggingface

## AI Research
- Training the models
- Often times provide endpoints for inference with abstraction for their features

## Proxies
- Hosted service features
- OpenAI client as a standard
  - see below how Braintrust does it (https://www.braintrustdata.com/docs/guides/proxy)
- HTTP requests


In [2]:
from openai import OpenAI
import os
import time
 
client = OpenAI(
  base_url="https://braintrustproxy.com/v1",
  api_key=os.environ["BRAINTRUST_API_KEY"]
)

# [Litellm](https://github.com/BerriAI/litellm)
Proxy for accessing other models through the same OpenAI methods and responses.

In [9]:
import os
import litellm
from litellm import completion
litellm.set_verbose = True
from openai import OpenAI

In [13]:
os.environ["HUGGINGFACE_API_KEY"] = ""
os.environ["OPENAI_API_KEY"] = ""

In [14]:
openai_client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

# General Usage
- same inputs and outputs (opeai standard)

In [15]:
prompt = "What is the weather in Gainesville?"
messages = [{ "content": prompt,"role": "user"}]

In [21]:
openai_response = openai_client.chat.completions.create(
  model="gpt-3.5-turbo", 
  messages=messages, 
)
litellm_response = completion(
  model="huggingface/mistralai/Mixtral-8x7B-Instruct-v0.1", 
  messages=messages, 
)

print("\n\nOpenAI Response:\n", openai_response.choices[0].message.content)
print("\n\nLitellm Response:\n", litellm_response.choices[0].message.content)

LiteLLM: checking params for mistralai/Mixtral-8x7B-Instruct-v0.1
LiteLLM: params passed in {'functions': [], 'function_call': '', 'temperature': None, 'top_p': None, 'stream': None, 'stop': None, 'max_tokens': None, 'presence_penalty': None, 'frequency_penalty': None, 'logit_bias': {}, 'user': '', 'request_timeout': None, 'deployment_id': None, 'custom_llm_provider': 'huggingface', 'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'n': None}
LiteLLM: non-default params passed in {}
LiteLLM: self.optional_params: {}
LiteLLM: mistralai/Mixtral-8x7B-Instruct-v0.1, text-generation-inference
LiteLLM: Logging Details Pre-API Call for call id 033bac2f-1900-40bf-88d9-d5bb3a79aca6
LiteLLM: model call details: {'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'messages': [{'content': 'What is the weather in Gainesville?', 'role': 'user'}], 'optional_params': {'details': True, 'return_full_text': False}, 'litellm_params': {'return_async': False, 'api_key': None, 'force_timeout': 600, 'logger_fn':

- works for usecases other than completions

In [31]:
import numpy as np
from litellm import embedding
import os
openai_response = openai_client.embeddings.create(
  model="text-embedding-ada-002",
  input=[prompt],
)
litellm_response = embedding(
    model='huggingface/microsoft/codebert-base',
    input=[prompt]
)

print("\n", openai_response.data[0].embedding[:10])
print("\nOpenAI shape", np.array(openai_response.data[0].embedding).shape)
print("\nLitellm shape", np.array(litellm_response.data[0]["embedding"]).shape)


LiteLLM: self.optional_params: {}
LiteLLM: Logging Details Pre-API Call for call id 71ade537-dcd1-48b6-a7b7-0d6aa2688759
LiteLLM: model call details: {'model': 'microsoft/codebert-base', 'messages': ['What is the weather in Gainesville?'], 'optional_params': {}, 'litellm_params': {'force_timeout': 60, 'azure': False, 'litellm_call_id': '71ade537-dcd1-48b6-a7b7-0d6aa2688759', 'logger_fn': None}, 'start_time': datetime.datetime(2024, 2, 5, 17, 28, 38, 53492), 'input': ['What is the weather in Gainesville?'], 'api_key': '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', 'additional_args': {'complete_input_dict': {'inputs': ['What is the weather in Gainesville?']}}}
LiteLLM: model call details: {'model': 'microsoft/codebert-base', 'messages': ['What is the weather in Gainesville?'], 'optional_params': {}, 'litellm_params': {'force_timeout': 60, 'azure': False, 'litellm_call_id': '71ade537-dcd1-48b6-a7b7-0d6aa2688759', 'logger_fn': None}, 'start_time': datetime.datetime(2024, 2, 5, 17, 28, 38, 53492)

- can do streaming

In [42]:
litellm.set_verbose = False

stream_response = completion(
  model="huggingface/mistralai/Mixtral-8x7B-Instruct-v0.1", 
  messages=[{ "content": prompt, "role": "user"}], 
  stream=True,
)

for chunk in stream_response:
  print(chunk.choices[0].delta.content, end="")

litellm.set_verbose = True

stream result: {
  "object": "chat.completion.chunk",
  "choices": [
    {
      "finish_reason": null,
      "index": 0,
      "delta": {
        "content": " I",
        "role": "assistant"
      }
    }
  ],
  "id": "chatcmpl-c255a120-ebae-4380-83cb-1be4a2bfe9dc",
  "created": 1707172391,
  "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
  "usage": {
    "prompt_tokens": null,
    "completion_tokens": null,
    "total_tokens": null
  }
}
{
  "finish_reason": null,
  "index": 0,
  "delta": {
    "content": " I",
    "role": "assistant"
  }
}stream result: {
  "object": "chat.completion.chunk",
  "choices": [
    {
      "finish_reason": null,
      "index": 0,
      "delta": {
        "content": " don"
      }
    }
  ],
  "id": "chatcmpl-d13f62b6-31e0-4ccb-a90e-304488d4597f",
  "created": 1707172391,
  "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
  "usage": {
    "prompt_tokens": null,
    "completion_tokens": null,
    "total_tokens": null
  }
}
{
  "finish_reason": null,
  "i

- How was it instant?
- What is contained in the chunks of the stream (full generatio or tokens)?

- By superimposing to the prompt the model is able to exhibit functionalities of other LLMs
https://github.com/BerriAI/litellm/blob/d69edac11ba4acdb03116cde253cc0d7caadcf68/litellm/llms/prompt_templates/factory.py#L531-L545

In [47]:
litellm.add_function_to_prompt = True 
prompt_engineered_messages = [
  { "content": "Answer in spanish", "role": "system"},
  { "content": "Produce the function call response and nothing else, here is the prompt:"+prompt,"role": "user"}
]
response = completion(
  model="huggingface/mistralai/Mixtral-8x7B-Instruct-v0.1", 
  messages= prompt_engineered_messages,
  functions = [
    {
      "name": "get_current_weather",
      "description": "Get the current weather in a given location",
      "parameters": {
        "type": "object",
        "properties": {
          "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA"
          },
          "unit": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"]
          }
        },
        "required": ["location"]
      }
    }
  ],
  temperature=0.01
)
content = response.choices[0].message.content
print("\n\ncontent:", content)
import json
content = response.choices[0].message.content


modified_content = content.replace('\_', '_') # This is a workaround for the Mixtral generation
json_content = json.loads(modified_content)
print("json_content:", json.dumps(json_content, indent=2))

LiteLLM: checking params for mistralai/Mixtral-8x7B-Instruct-v0.1
LiteLLM: params passed in {'functions': [{'name': 'get_current_weather', 'description': 'Get the current weather in a given location', 'parameters': {'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'}, 'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']}}, 'required': ['location']}}], 'function_call': '', 'temperature': 0.01, 'top_p': None, 'stream': None, 'stop': None, 'max_tokens': None, 'presence_penalty': None, 'frequency_penalty': None, 'logit_bias': {}, 'user': '', 'request_timeout': None, 'deployment_id': None, 'custom_llm_provider': 'huggingface', 'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'n': None}
LiteLLM: non-default params passed in {'temperature': 0.01}
LiteLLM: self.optional_params: {'temperature': 0.01}
LiteLLM: mistralai/Mixtral-8x7B-Instruct-v0.1, text-generation-inference
LiteLLM: Logging Details Pre-API Call for c

In [49]:
litellm.add_function_to_prompt = True 

response = completion(
  model="huggingface/tiiuae/falcon-7b-instruct", 
  messages=prompt_engineered_messages,
  functions = [
    {
      "name": "get_current_weather",
      "description": "Get the current weather in a given location",
      "parameters": {
        "type": "object",
        "properties": {
          "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA"
          },
          "unit": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"]
          }
        },
        "required": ["location"]
      }
    }
  ],
)
content = response.choices[0].message.content
print("\n\ncontent:", content)
import json
content = response.choices[0].message.content


modified_content = content.replace('\_', '_')
json_content = json.loads(modified_content)
print("json_content:", json.dumps(json_content, indent=2))

LiteLLM: checking params for tiiuae/falcon-7b-instruct
LiteLLM: params passed in {'functions': [{'name': 'get_current_weather', 'description': 'Get the current weather in a given location', 'parameters': {'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'}, 'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']}}, 'required': ['location']}}], 'function_call': '', 'temperature': None, 'top_p': None, 'stream': None, 'stop': None, 'max_tokens': None, 'presence_penalty': None, 'frequency_penalty': None, 'logit_bias': {}, 'user': '', 'request_timeout': None, 'deployment_id': None, 'custom_llm_provider': 'huggingface', 'model': 'tiiuae/falcon-7b-instruct', 'n': None}
LiteLLM: non-default params passed in {}
LiteLLM: self.optional_params: {}
LiteLLM: tiiuae/falcon-7b-instruct, text-generation-inference
LiteLLM: Logging Details Pre-API Call for call id a03d2ed4-16a5-4e0b-b04d-3d40a3aae827
LiteLLM: model call details

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)