In [1]:
# Reload edited modules automatically before each cell runs, so changes to
# local library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# GPU selection, set before the imports below: enumerate devices in PCI bus
# order and expose only device index 2 to this kernel.
# NOTE(review): the index is specific to the machine this was run on — adjust as needed.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=2

from pydantic import BaseModel


from encourage.llm import BatchInferenceRunner
from encourage.prompts.prompt_collection import PromptCollection
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
from encourage.prompts.context import Context
from encourage.prompts.meta_data import MetaData

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=2


### Initialize vLLM

In [2]:
# Model identifier; reused further down when building prompt collections.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Build the vLLM engine once; every runner in this notebook shares it.
llm = LLM(
    model=model_name,
    gpu_memory_utilization=0.95,
)


INFO 12-06 09:49:01 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
INFO 12-06 09:49:01 config.py:1136] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 12-06 09:49:01 llm_engine.py:249] Initializing an LLM engine (v0.6.4) with config: model='meta-llama/Meta-Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 12-06 09:49:10 model_runner.py:1077] Loading model weights took 14.9888 GB
INFO 12-06 09:49:12 worker.py:232] Memory profiling results: total_gpu_memory=47.32GiB initial_memory_usage=15.31GiB peak_torch_memory=16.16GiB memory_usage_post_profile=15.35Gib non_torch_memory=0.35GiB kv_cache_size=28.44GiB gpu_memory_utilization=0.95
INFO 12-06 09:49:12 gpu_executor.py:113] # GPU blocks: 14560, # CPU blocks: 2048
INFO 12-06 09:49:12 gpu_executor.py:117] Maximum concurrency for 131072 tokens per request: 1.78x
INFO 12-06 09:49:16 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-06 09:49:16 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.


### Define the toy data structure

In [11]:
# One system prompt string, passed alongside the list of user prompts.
sys_prompts = "You are a helpful AI."

# Two alternating questions, repeated five times -> ten user prompts.
user_prompts = 5 * ["What is the capital of France?", "What is the capital of Germany?"]

# Context information for each prompt (additional data or background info).
# NOTE: list repetition reuses the same two Context objects; it does not copy them.
contexts = 5 * [
    Context.from_prompt_vars({"key1": "value1"}),
    Context.from_prompt_vars({"key2": "value2"}),
]

# Metadata associated with each prompt (e.g., priority, tags).
# NOTE: as above, the ten entries refer to only two distinct MetaData objects.
meta_datas = 5 * [
    MetaData({"meta": "data1"}),
    MetaData({"meta": "data2"}),
]

# Bundle prompts, contexts and metadata into a single PromptCollection.
prompt_collection = PromptCollection.create_prompts(
    sys_prompts=sys_prompts,
    user_prompts=user_prompts,
    contexts=contexts,
    meta_datas=meta_datas,
    model_name=model_name,
)


### Init the Inference Runner with no structured output

In [12]:
# Free-text generation: no guided decoding is configured for this run.
sampling_params = SamplingParams(
    temperature=0.5,
    max_tokens=100,
)
runner = BatchInferenceRunner(llm, sampling_params)

# Run the whole prompt collection through the runner and print a summary per prompt.
responses = runner.run(prompt_collection)
responses.print_response_summary()

Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts: 100%|██████████| 10/10 [00:00<00:00, 26.03it/s, est. speed input: 729.11 toks/s, output: 234.35 toks/s]

--------------------------------------------------
🧑‍💻 User Prompt:
What is the capital of France?

💬 Response:
The capital of France is Paris.

🤖 System Prompt:
You are a helpful AI.

🗂️ Metadata: MetaData(tags={'meta': 'data1'})
🆔 Request ID: 40
🆔 Prompt ID: 0448c5af-e644-4d77-a62e-c28ed1bf2888
🆔 Conversation ID: 0
⏳ Processing Time: 0.3626 seconds

--------------------------------------------------
🧑‍💻 User Prompt:
What is the capital of Germany?

💬 Response:
The capital of Germany is Berlin.

🤖 System Prompt:
You are a helpful AI.

🗂️ Metadata: MetaData(tags={'meta': 'data2'})
🆔 Request ID: 41
🆔 Prompt ID: 24c15e3c-22cc-47a6-803c-0594cca9cf6e
🆔 Conversation ID: 0
⏳ Processing Time: 0.3617 seconds

--------------------------------------------------
🧑‍💻 User Prompt:
What is the capital of France?

💬 Response:
The capital of France is Paris.

🤖 System Prompt:
You are a helpful AI.

🗂️ Metadata: MetaData(tags={'meta': 'data1'})
🆔 Request ID: 42
🆔 Prompt ID: f0e5c24c-65a7-48e8-94a4-5f85




### Change the user request and add structured output with a Pydantic model

In [None]:
# Pydantic model describing the JSON object each response must conform to.
# Documented with comments rather than a class docstring on purpose: pydantic
# copies a docstring into the generated JSON schema, which would alter the
# schema handed to guided decoding.
class User(BaseModel):
    name: str
    age: int
    id: str


# New user requests; system prompt, contexts and metadata are reused from above.
user_prompts = 5 * ["Return a male User", "Return a female User"]
prompt_collection = PromptCollection.create_prompts(
    sys_prompts=sys_prompts,
    user_prompts=user_prompts,
    contexts=contexts,
    meta_datas=meta_datas,
    model_name=model_name,
)

# Constrain generation to the JSON schema derived from the User model.
guided_decoding_params = GuidedDecodingParams(json=User.model_json_schema())
sampling_params = SamplingParams(temperature=0.5, max_tokens=1000)
sampling_params.guided_decoding = guided_decoding_params

runner = BatchInferenceRunner(llm, sampling_params)
responses = runner.run(prompt_collection)
responses.print_response_summary()

Processed prompts: 100%|██████████| 10/10 [00:00<00:00, 11.49it/s, est. speed input: 287.30 toks/s, output: 235.58 toks/s]

--------------------------------------------------
🧑‍💻 User Prompt:
Return a male User

💬 Response:
{"name": "John", "age": 30, "id": "user123"}

🤖 System Prompt:
You are a helpful AI.

🗂️ Metadata: MetaData(tags={'meta': 'data1'})
🆔 Request ID: 50
🆔 Prompt ID: 72f7a9a6-b5be-4c2c-94f1-65f2961d970c
🆔 Conversation ID: 0
⏳ Processing Time: 0.9543 seconds

--------------------------------------------------
🧑‍💻 User Prompt:
Return a female User

💬 Response:
{"name": "Sarah", "age": 32, "id": "user1"}

🤖 System Prompt:
You are a helpful AI.

🗂️ Metadata: MetaData(tags={'meta': 'data2'})
🆔 Request ID: 51
🆔 Prompt ID: 72f512db-cfa1-4c06-b607-3a5c5b937640
🆔 Conversation ID: 0
⏳ Processing Time: 0.8228 seconds

--------------------------------------------------
🧑‍💻 User Prompt:
Return a male User

💬 Response:
{"name": "John", "age": 30, "id": "user1"}

🤖 System Prompt:
You are a helpful AI.

🗂️ Metadata: MetaData(tags={'meta': 'data1'})
🆔 Request ID: 52
🆔 Prompt ID: 2583f033-6858-41dd-a5b2-196f9




### Change the user request and add structured output with a custom JSON schema

In [14]:

# Hand-written JSON schema (as a string) that each response must satisfy.
schema = """
{
  "title": "User",
  "type": "object",
  "properties": {
    "name": {"type": "string"},
    "last_name": {"type": "string"},
    "id": {"type": "integer"},
    "height": {"type": "integer"}
  },
  "required": ["name", "last_name", "id", "height"]
}
"""

# Same user requests as in the pydantic example; shared inputs are reused.
user_prompts = 5 * ["Return a male User", "Return a female User"]
prompt_collection = PromptCollection.create_prompts(
    sys_prompts=sys_prompts,
    user_prompts=user_prompts,
    model_name=model_name,
    contexts=contexts,
    meta_datas=meta_datas,
)

# Attach the raw schema string as the guided-decoding constraint.
sampling_params = SamplingParams(temperature=0.5, max_tokens=1000)
guided_decoding_params = GuidedDecodingParams(json=schema)
sampling_params.guided_decoding = guided_decoding_params

runner = BatchInferenceRunner(llm, sampling_params)
responses = runner.run(prompt_collection)
responses.print_response_summary()

Compiling FSM index for all state transitions: 100%|██████████| 75/75 [00:01<00:00, 38.40it/s]
Processed prompts: 100%|██████████| 10/10 [00:01<00:00,  9.29it/s, est. speed input: 232.33 toks/s, output: 255.56 toks/s]

--------------------------------------------------
🧑‍💻 User Prompt:
Return a male User

💬 Response:
{"name": "John", "last_name": "Doe", "id": 12345, "height": 180}

🤖 System Prompt:
You are a helpful AI.

🗂️ Metadata: MetaData(tags={'meta': 'data1'})
🆔 Request ID: 60
🆔 Prompt ID: 93bcac05-afb4-4bbc-afc0-819d7363d7f0
🆔 Conversation ID: 0
⏳ Processing Time: 4.4886 seconds

--------------------------------------------------
🧑‍💻 User Prompt:
Return a female User

💬 Response:
{"name": "Emily", "last_name": "Johnson", "id": 12345, "height": 5}

🤖 System Prompt:
You are a helpful AI.

🗂️ Metadata: MetaData(tags={'meta': 'data2'})
🆔 Request ID: 61
🆔 Prompt ID: dfc6e411-e7d1-42d2-915f-8a1113686e65
🆔 Conversation ID: 0
⏳ Processing Time: 1.0281 seconds

--------------------------------------------------
🧑‍💻 User Prompt:
Return a male User

💬 Response:
{"name": "John", "last_name": "Doe", "id": 1, "height": 180}

🤖 System Prompt:
You are a helpful AI.

🗂️ Metadata: MetaData(tags={'meta': 'data1'




### You can also add structured output to an inference runner that is already initialized

In [None]:
# JSON schema (as a string) to be attached to the runner after it is created.
schema = """
{
  "title": "User",
  "type": "object",
  "properties": {
    "name": {"type": "string"},
    "last_name": {"type": "string"},
    "id": {"type": "integer"},
    "height": {"type": "integer"}
  },
  "required": ["name", "last_name", "id", "height"]
}
"""

# Rebuild the prompt collection so this cell does not depend on the previous one's prompts.
user_prompts = 5 * ["Return a male User", "Return a female User"]
prompt_collection = PromptCollection.create_prompts(
    sys_prompts=sys_prompts,
    user_prompts=user_prompts,
    model_name=model_name,
    contexts=contexts,
    meta_datas=meta_datas,
)

# Create the runner with plain sampling parameters first ...
sampling_params = SamplingParams(temperature=0.5, max_tokens=1000)
runner = BatchInferenceRunner(llm, sampling_params)

# ... then add the schema to the already-initialized runner before running it.
runner.add_schema(schema)
responses = runner.run(prompt_collection)
responses.print_response_summary()