In [1]:
!pip install openai==1.11.1 tiktoken litellm

Defaulting to user installation because normal site-packages is not writeable
Collecting openai==1.11.1
  Using cached openai-1.11.1-py3-none-any.whl (226 kB)
INFO: pip is looking at multiple versions of litellm to determine which version is compatible with other requirements. This could take a while.
Collecting litellm
  Downloading litellm-1.22.5-py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting requests>=2.26.0 (from tiktoken)
  Downloading requests-2.31.0-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: requests, openai, litellm
  Attempting uninstall: requests
    Found existing installation: requests 2.29.0
    Uninstalling requests-2.29.0:
      Successfully uninstalled requests-2.29.0
  Attempting uninstall: openai
    Found existi

# Hosted inference
<img src="hosted_inference.JPG"
     alt="Markdown Monster icon"
     style="float: left; margin-right: 10px;" />
- 
-
## Infrastructure
- Azure, AWS, GCP
- note on huggingface

## AI Research
- Training the models
- Often times provide endpoints for inference with abstraction for their features

## Proxies
- Hosted service features
- OpenAI client as a standard
  - see below how Braintrust does it (https://www.braintrustdata.com/docs/guides/proxy)
- HTTP requests


In [2]:
from openai import OpenAI
import os
import time
 
client = OpenAI(
  base_url="https://braintrustproxy.com/v1",
  api_key=os.environ["BRAINTRUST_API_KEY"]
)

# [Litellm](https://github.com/BerriAI/litellm)
Proxy for accessing other models through the same OpenAI methods and responses.

In [4]:
import os
import litellm
from litellm import completion
litellm.set_verbose = True
from openai import OpenAI

In [5]:
# os.environ["HUGGINGFACE_API_KEY"] = ""
# os.environ["OPENAI_API_KEY"] = ""

In [6]:
openai_client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

# General Usage
- same inputs and outputs (opeai standard)

In [7]:
prompt = "What is the weather in Gainesville?"
messages = [{ "content": prompt,"role": "user"}]

In [8]:
openai_response = openai_client.chat.completions.create(
  model="gpt-3.5-turbo", 
  messages=messages, 
)
litellm_response = completion(
  model="huggingface/mistralai/Mixtral-8x7B-Instruct-v0.1", 
  messages=messages, 
)

print("\n\nOpenAI Response:\n", openai_response.choices[0].message.content)
print("\n\nLitellm Response:\n", litellm_response.choices[0].message.content)



[92mRequest to litellm:[0m
[92mlitellm.completion(model='huggingface/mistralai/Mixtral-8x7B-Instruct-v0.1', messages=[{'content': 'What is the weather in Gainesville?', 'role': 'user'}])[0m


self.optional_params: {}
kwargs[caching]: False; litellm.cache: None
self.optional_params: {}
mistralai/Mixtral-8x7B-Instruct-v0.1, text-generation-inference
[92m

POST Request Sent from LiteLLM:
curl -X POST \
https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1 \
-H 'content-type: application/json' -H 'Authorization: Bearer hf_xjjahKPGcXInSc********************' \
-d '{'inputs': '<s>[INST] What is the weather in Gainesville? [/INST]', 'parameters': {'details': True, 'return_full_text': False}, 'stream': False}'
[0m

response: [{'generated_text': " I don't have real-time data or location tracking capabilities, so I can't provide you with the current weather in Gainesville. However, I can tell you that Gainesville is located in the state of Florida, USA, and its

- works for usecases other than completions

In [18]:
import numpy as np
from litellm import embedding
import os
openai_response = openai_client.embeddings.create(
  model="text-embedding-ada-002",
  input=[prompt],
)
litellm_response = embedding(
    model='huggingface/microsoft/codebert-base',
    input=[prompt]
)

print("\n", openai_response.data[0].embedding[:10])
print("\nOpenAI shape", np.array(openai_response.data[0].embedding).shape)
print("\nLitellm shape", np.array(litellm_response.data[0]["embedding"]).shape)




[92mRequest to litellm:[0m
[92mlitellm.embedding(model='huggingface/microsoft/codebert-base', input=['What is the weather in Gainesville?'])[0m


self.optional_params: {}
kwargs[caching]: False; litellm.cache: None
self.optional_params: {}
[92m

POST Request Sent from LiteLLM:
curl -X POST \
https://api-inference.huggingface.co/models/microsoft/codebert-base \
-H 'content-type: application/json' -H 'Authorization: Bearer hf_xjjahKPGcXInSc********************' \
-d '{'inputs': ['What is the weather in Gainesville?']}'
[0m

Looking up model=microsoft/codebert-base in model_cost_map

 [0.004790938924998045, 0.0064730034209787846, 0.007729643490165472, -0.011499562300741673, -0.0208261851221323, 0.007814728654921055, -0.00651554623618722, -0.005255633965134621, -0.004542228765785694, -0.04207124933600426]

OpenAI shape (1536,)

Litellm shape (768,)


- can do streaming

In [13]:
litellm.set_verbose = False

stream_response = completion(
  model="huggingface/mistralai/Mixtral-8x7B-Instruct-v0.1", 
  messages=[{ "content": prompt, "role": "user"}], 
  stream=True,
)

for chunk in stream_response:
  print(chunk.choices[0].delta.content, end="\n")

litellm.set_verbose = True

I
 don
'
t
 have
 real
-
time
 data
 or
 location
 tracking
 capabilities
,
 so
 I
 can
'
t
 provide
 you
 with
 the
 current
 weather
 in
 G
aines
ville
.
 However
,
 I
 can
 tell
 you
 that
 G
aines
ville
 is
 located
 in
 the
 state
 of
 Florida
,
 USA
,
 and
 its
 climate
 is
 characterized
 as
 hum
id
 sub
t
rop
ical
,
 with
 hot
,
 hum
id
 sum
mers
 and
 mild
,
 dry
 win
ters
.
 The
 city
'
s
 weather
 can
 be
 quite
 variable
,
 with
 occasional
 cold
 front
s
 bringing
 cool
er
 temperatures
 in
 the
 winter
 and
Goes into checking if chunk has hiddden created at param
Chunks have a created at hidden param
Chunks sorted
token_counter messages received: [{'content': 'What is the weather in Gainesville?', 'role': 'user'}]
Token Counter - using generic token counter, for model=mistralai/Mixtral-8x7B-Instruct-v0.1
LiteLLM: Utils - Counting tokens for OpenAI model=gpt-3.5-turbo
Token Counter - using generic token counter, for model=mistralai/Mixtral-8x7B-Instruct-v0.1
LiteLLM: Utils

- How was it instant?
- What is contained in the chunks of the stream (full generatio or tokens)?

- By superimposing to the prompt the model is able to exhibit functionalities of other LLMs
https://github.com/BerriAI/litellm/blob/d69edac11ba4acdb03116cde253cc0d7caadcf68/litellm/llms/prompt_templates/factory.py#L531-L545

In [14]:
litellm.add_function_to_prompt = True 
prompt_engineered_messages = [
  { "content": "Answer in spanish", "role": "system"},
  { "content": "Produce the function call response and nothing else, here is the prompt:"+prompt,"role": "user"}
]
response = completion(
  model="huggingface/mistralai/Mixtral-8x7B-Instruct-v0.1", 
  messages= prompt_engineered_messages,
  functions = [
    {
      "name": "get_current_weather",
      "description": "Get the current weather in a given location",
      "parameters": {
        "type": "object",
        "properties": {
          "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA"
          },
          "unit": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"]
          }
        },
        "required": ["location"]
      }
    }
  ],
  temperature=0.01
)
content = response.choices[0].message.content
print("\n\ncontent:", content)
import json
content = response.choices[0].message.content


modified_content = content.replace('\_', '_') # This is a workaround for the Mixtral generation
json_content = json.loads(modified_content)
print("json_content:", json.dumps(json_content, indent=2))



[92mRequest to litellm:[0m
[92mlitellm.completion(model='huggingface/mistralai/Mixtral-8x7B-Instruct-v0.1', messages=[{'content': 'Answer in spanish', 'role': 'system'}, {'content': 'Produce the function call response and nothing else, here is the prompt:What is the weather in Gainesville?', 'role': 'user'}], functions=[{'name': 'get_current_weather', 'description': 'Get the current weather in a given location', 'parameters': {'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'}, 'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']}}, 'required': ['location']}}], temperature=0.01)[0m


self.optional_params: {}
kwargs[caching]: False; litellm.cache: None
self.optional_params: {'temperature': 0.01}
mistralai/Mixtral-8x7B-Instruct-v0.1, text-generation-inference
[92m

POST Request Sent from LiteLLM:
curl -X POST \
https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1 \
-H 'con

In [17]:
litellm.add_function_to_prompt = True 

response = completion(
  model="huggingface/tiiuae/falcon-7b-instruct", 
  messages=prompt_engineered_messages,
  functions = [
    {
      "name": "get_current_weather",
      "description": "Get the current weather in a given location",
      "parameters": {
        "type": "object",
        "properties": {
          "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA"
          },
          "unit": {
            "type": "string",
            "enum": ["celsius", "fahrenheit"]
          }
        },
        "required": ["location"]
      }
    }
  ],
)
content = response.choices[0].message.content
print("\n\ncontent:", content)
import json
content = response.choices[0].message.content


modified_content = content.replace('\_', '_')
json_content = json.loads(modified_content)
print("json_content:", json.dumps(json_content, indent=2))



[92mRequest to litellm:[0m
[92mlitellm.completion(model='huggingface/tiiuae/falcon-7b-instruct', messages=[{'content': "Answer in spanishProduce JSON OUTPUT ONLY! The following functions are available to you:\n{'name': 'get_current_weather', 'description': 'Get the current weather in a given location', 'parameters': {'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'}, 'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']}}, 'required': ['location']}}\nProduce JSON OUTPUT ONLY! The following functions are available to you:\n{'name': 'get_current_weather', 'description': 'Get the current weather in a given location', 'parameters': {'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'The city and state, e.g. San Francisco, CA'}, 'unit': {'type': 'string', 'enum': ['celsius', 'fahrenheit']}}, 'required': ['location']}}\nProduce JSON OUTPUT ONLY! The following functions are avai

APIError: HuggingfaceException - response is not in expected format - [{'error': 'Model tiiuae/falcon-7b-instruct is currently loading', 'estimated_time': 1685.13330078125}]