In [1]:
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# Order GPUs by PCI bus id and expose only device 2 to this kernel.
# NOTE(review): presumably these must be set before any CUDA-using import runs — keep this cell first.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=2

from pydantic import BaseModel


from encourage.llm.inference_runner import BatchInferenceRunner
from encourage.prompts.prompt_collection import PromptCollection
from vllm import LLM, SamplingParams

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=2


### Initialize vLLM

In [2]:
# Identifier of the model to load; reused below when building the prompt collections.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Build the vLLM engine for that model with a 0.95 GPU memory utilization setting.
llm = LLM(
    model=model_name,
    gpu_memory_utilization=0.95,
)

# Sampling settings shared by every inference run in this notebook.
# NOTE(review): max_tokens=10 looks tight for the JSON objects requested in the
# structured-output cells further down — confirm the runner does not rely on it there.
sampling_params = SamplingParams(
    temperature=0.5,
    max_tokens=10,
)


INFO 10-29 15:38:11 config.py:1010] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 10-29 15:38:11 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, 

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 10-29 15:38:21 model_runner.py:1025] Loading model weights took 14.9888 GB
INFO 10-29 15:38:23 gpu_executor.py:122] # GPU blocks: 14673, # CPU blocks: 2048
INFO 10-29 15:38:27 model_runner.py:1329] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 10-29 15:38:27 model_runner.py:1333] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 10-29 15:38:53 model_runner.py:1456] Graph capturing finished in 27 secs.


### Define the toy data structure

In [3]:
# A single system prompt shared by every request.
sys_prompts = "You are a helpful AI."

# Two toy questions, repeated five times to form a batch of ten user prompts.
user_prompts = 5 * [
    "What is the capital of France?",
    "What is the capital of Germany?",
]

# One context dictionary per prompt (additional data or background info).
contexts = 5 * [
    {"key1": "value1"},
    {"key2": "value2"},
]

# One metadata dictionary per prompt (e.g., priority, tags).
meta_datas = 5 * [
    {"meta": "data1"},
    {"meta": "data2"},
]

# Bundle everything into a PromptCollection. The system prompt, contexts,
# metadata and model name are passed alongside the user prompts.
prompt_collection = PromptCollection.create_prompts(
    sys_prompts=sys_prompts,
    user_prompts=user_prompts,
    contexts=contexts,
    meta_datas=meta_datas,
    model_name=model_name,
)


### Init the Inference Runner with no structured output

In [None]:
# Wrap the loaded engine and the shared sampling settings in a batch runner.
runner = BatchInferenceRunner(llm, sampling_params)
# Run every prompt in the collection; no schema is passed in this call.
responses = runner.run(prompt_collection)
# Print a per-prompt summary of the collected responses.
responses.print_response_summary()

### Change the user request and add structured output with a Pydantic model

In [5]:

# Target structure for each answer: a name, an age and an id.
# Documented with comments rather than a docstring, because a Pydantic class
# docstring would be included in the generated JSON schema.
class User(BaseModel):
    name: str
    age: int
    id: str


# Alternate between asking for a male and a female user, ten prompts in total.
user_prompts = 5 * ["Return a male User", "Return a female User"]

# Rebuild the collection with the new user prompts; everything else is reused.
prompt_collection = PromptCollection.create_prompts(
    sys_prompts=sys_prompts,
    user_prompts=user_prompts,
    contexts=contexts,
    meta_datas=meta_datas,
    model_name=model_name,
)

# Hand the Pydantic class to the runner as the output schema.
runner = BatchInferenceRunner(llm, sampling_params)
responses = runner.run(prompt_collection, schema=User)
responses.print_response_summary()

Processed prompts: 100%|██████████| 10/10 [00:01<00:00,  7.37it/s, est. speed input: 184.21 toks/s, output: 151.05 toks/s]

--------------------------------------------------
🧑‍💻 User Prompt:
Return a male User
📚 Added Context keys: key1 (See Template for details.)

💬 Response:
name='John' age=30 id='user_001'

🤖 System Prompt:
You are a helpful AI.

🗂️ Metadata: {'meta': 'data1'}
🆔 Request ID: b4541705-4e0d-40bb-a9ad-f75a313e33f4
🆔 Prompt ID: f4668b41-a599-4175-9787-fa5261e2ef9d
🆔 Conversation ID: 0
⏳ Processing Time: 0.0 seconds

--------------------------------------------------
🧑‍💻 User Prompt:
Return a female User
📚 Added Context keys: key2 (See Template for details.)

💬 Response:
name='Emily' age=28 id='user_1'

🤖 System Prompt:
You are a helpful AI.

🗂️ Metadata: {'meta': 'data2'}
🆔 Request ID: a653a049-77fc-40f0-a359-9d60cbd3a086
🆔 Prompt ID: f24e1f2a-bbbf-4e7b-b1a9-8e406bc0c797
🆔 Conversation ID: 0
⏳ Processing Time: 0.0 seconds

--------------------------------------------------
🧑‍💻 User Prompt:
Return a male User
📚 Added Context keys: key1 (See Template for details.)

💬 Response:
name='John' age=




### Change the user request and add structured output with a custom JSON schema

In [None]:

# Hand-written JSON schema describing the expected object: two string fields
# and two integer fields, all of them required.
schema = """
{
  "title": "User",
  "type": "object",
  "properties": {
    "name": {"type": "string"},
    "last_name": {"type": "string"},
    "id": {"type": "integer"},
    "height": {"type": "integer"}
  },
  "required": ["name", "last_name", "id", "height"]
}
"""

# Alternate between asking for a male and a female user, ten prompts in total.
user_prompts = 5 * ["Return a male User", "Return a female User"]

# Rebuild the collection with the new user prompts; everything else is reused.
prompt_collection = PromptCollection.create_prompts(
    sys_prompts=sys_prompts,
    user_prompts=user_prompts,
    contexts=contexts,
    meta_datas=meta_datas,
    model_name=model_name,
)

# Pass the raw JSON schema string to the runner as the output schema.
runner = BatchInferenceRunner(llm, sampling_params)
responses = runner.run(prompt_collection, schema=schema)
responses.print_response_summary()

### Change the user request and derive the structured output from a function signature

In [None]:

# Two variants of the same request, with and without an explicit 10-100 range,
# repeated five times to form a batch of ten prompts.
user_prompts = 5 * [
    "Return two integers named a and b respectively between 10-100. a is odd and b even.",
    "Return two integers named a and b respectively. a is odd and b even.",
]

# Rebuild the collection with the new user prompts; everything else is reused.
prompt_collection = PromptCollection.create_prompts(
    sys_prompts=sys_prompts,
    user_prompts=user_prompts,
    contexts=contexts,
    meta_datas=meta_datas,
    model_name=model_name,
)


def add(a: int, b: int) -> int:
    """Return the sum of ``a`` and ``b``.

    The typed parameters (``a: int``, ``b: int``) are what this cell uses to
    describe the structured output requested from the model.
    """
    return a + b


runner = BatchInferenceRunner(llm, sampling_params)
# Pass the function itself as the schema. Passing the name `schema` here would
# silently reuse the JSON schema string from the JSON-schema cell (or raise a
# NameError on a fresh kernel if that cell was skipped) and leave `add` unused.
# NOTE(review): assumes the runner accepts a callable as `schema` — confirm.
responses = runner.run(prompt_collection, schema=add)
responses.print_response_summary()