# Demo of LoRA adapter for query rewrite

This notebook shows the usage of the IO processor for the Granite query rewrite
intrisic, also known as the [LoRA Adapter for Query Rewrite](
    https://huggingface.co/ibm-granite/granite-3.2-8b-lora-rag-query-rewrite
)

This notebook can run its own vLLM server to perform inference, or you can host the 
models on your own server. To use your own server, set the `run_server` variable below
to `False` and set appropriate values for the constants 
`openai_base_url`, `openai_base_model_name` and `openai_lora_model_name`.

In [1]:
# Imports go here
from granite_io.io.query_rewrite import QueryRewriteIOProcessor
from granite_io.io.granite_3_3.input_processors.granite_3_3_input_processor import (
    Granite3Point3Inputs,
)
from granite_io.backend.vllm_server import LocalVLLMServer
from granite_io import make_backend
from granite_io.io.rag_agent_lib import obtain_lora

In [3]:
# Constants go here
base_model_name = "ibm-granite/granite-3.3-8b-instruct"
# TEMPORARY: Load LoRA adapter locally
lora_model_name = "query_rewrite"
run_server = True

In [4]:
if run_server:
    # Start by firing up a local vLLM server and connecting a backend instance to it.
    # Download and cache the model's LoRA adapter.
    lora_model_path = obtain_lora(lora_model_name)
    print(f"Local path to LoRA adapter: {lora_model_path}")
    server = LocalVLLMServer(
        base_model_name, lora_adapters=[(lora_model_name, lora_model_path)]
    )
    server.wait_for_startup(200)
    lora_backend = server.make_lora_backend(lora_model_name)
    backend = server.make_backend()
else:  # if not run_server
    # Use an existing server.
    # Modify the constants here as needed.
    openai_base_url = "http://localhost:55555/v1"
    openai_api_key = "granite_intrinsics_1234"
    openai_base_model_name = base_model_name
    openai_lora_model_name = lora_model_name
    backend = make_backend(
        "openai",
        {
            "model_name": openai_base_model_name,
            "openai_base_url": openai_base_url,
            "openai_api_key": openai_api_key,
        },
    )
    lora_backend = make_backend(
        "openai",
        {
            "model_name": openai_lora_model_name,
            "openai_base_url": openai_base_url,
            "openai_api_key": openai_api_key,
        },
    )

INFO 17:18:46 Running: /proj/dmfexp/8cc/krishna/miniforge3/envs/granite-io/bin/vllm serve ibm-granite/granite-3.3-8b-instruct --port 43151 --gpu-memory-utilization 0.45 --max-model-len 32768 --guided_decoding_backend outlines --device auto --enforce-eager --enable-lora --max_lora_rank 64 --lora-modules /proj/dmfexp/8cc/hf_home/hub/models--ibm-granite--granite-3.3-8b-rag-agent-lib/snapshots/8023ff6dfdbbc28633b15181477c6504b28c2a8e/query_rewrite_lora=/proj/dmfexp/8cc/hf_home/hub/models--ibm-granite--granite-3.3-8b-rag-agent-lib/snapshots/8023ff6dfdbbc28633b15181477c6504b28c2a8e/query_rewrite_lora
INFO 06-19 17:18:50 __init__.py:207] Automatically detected platform cuda.
INFO 06-19 17:18:51 api_server.py:912] vLLM API server version 0.7.3
INFO 06-19 17:18:51 api_server.py:913] args: Namespace(subparser='serve', model_tag='ibm-granite/granite-3.3-8b-instruct', config='', host=None, port=43151, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], 

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:01<00:05,  1.86s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:03<00:03,  1.90s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:04<00:01,  1.31s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:06<00:00,  1.51s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:06<00:00,  1.55s/it]



INFO 06-19 17:19:11 model_runner.py:1115] Loading model weights took 15.2531 GB
INFO 06-19 17:19:11 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 06-19 17:19:13 worker.py:267] Memory profiling takes 2.31 seconds
INFO 06-19 17:19:13 worker.py:267] the current vLLM instance can use total_gpu_memory (79.21GiB) x gpu_memory_utilization (0.45) = 35.64GiB
INFO 06-19 17:19:13 worker.py:267] model weights take 15.25GiB; non_torch_memory takes 0.16GiB; PyTorch activation peak memory takes 3.38GiB; the rest of the memory reserved for KV Cache is 16.85GiB.
INFO 06-19 17:19:13 executor_base.py:111] # cuda blocks: 6902, # CPU blocks: 1638
INFO 06-19 17:19:13 executor_base.py:116] Maximum concurrency for 32768 tokens per request: 3.37x
INFO 06-19 17:19:15 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 4.18 seconds
INFO 06-19 17:19:15 serving_models.py:174] Loaded new LoRA adapter: name '/proj/dmfexp/8cc/hf_home/hub/models--ibm-granite--granite-3.3-8b-rag-agent-lib

INFO:     Started server process [2143533]
INFO:     Waiting for application startup.
INFO:     Application startup complete.


INFO:     127.0.0.1:39580 - "GET /ping HTTP/1.1" 200 OK


In [5]:
# Create an example chat completion with a short conversation.
chat_input = Granite3Point3Inputs.model_validate(
    {
        "messages": [
            {"role": "assistant", "content": "Welcome to pet questions!"},
            {
                "role": "user",
                "content": "I have two pets, a dog named Rex and a cat named Lucy.",
            },
            {
                "role": "assistant",
                "content": "Great, what would you like to share about them?",
            },
            {
                "role": "user",
                "content": "Rex spends a lot of time in the backyard and outdoors, "
                "and Luna is always inside.",
            },
            {
                "role": "assistant",
                "content": "Sounds good! Rex must love exploring outside, while Lucy "
                "probably enjoys her cozy indoor life.",
            },
            {
                "role": "user",
                "content": "But is he more likely to get fleas because of that?",
            },
        ],
        "generate_inputs": {"temperature": 0.0},
    }
)
chat_input

Granite3Point3Inputs(messages=[AssistantMessage(content='Welcome to pet questions!', role='assistant', tool_calls=[], reasoning_content=None, citations=None, documents=None, hallucinations=None, stop_reason=None), UserMessage(content='I have two pets, a dog named Rex and a cat named Lucy.', role='user'), AssistantMessage(content='Great, what would you like to share about them?', role='assistant', tool_calls=[], reasoning_content=None, citations=None, documents=None, hallucinations=None, stop_reason=None), UserMessage(content='Rex spends a lot of time in the backyard and outdoors, and Luna is always inside.', role='user'), AssistantMessage(content='Sounds good! Rex must love exploring outside, while Lucy probably enjoys her cozy indoor life.', role='assistant', tool_calls=[], reasoning_content=None, citations=None, documents=None, hallucinations=None, stop_reason=None), UserMessage(content='But is he more likely to get fleas because of that?', role='user')], tools=[], generate_inputs=Ge

In [6]:
# Instantiate the I/O processor for the LoRA adapter
io_proc = QueryRewriteIOProcessor(backend)

# Pass our example input through the I/O processor and retrieve the result
chat_result = await io_proc.acreate_chat_completion(chat_input)
print(chat_result.results[0].next_message.model_dump_json(indent=2))

INFO 06-19 17:19:16 logger.py:39] Received request cmpl-c7095b33458d4ecb9d52c6ea8a334c14-0: prompt: '<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday\'s Date: June 19, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant.<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Welcome to pet questions!<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>I have two pets, a dog named Rex and a cat named Lucy.<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Great, what would you like to share about them?<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>Rex spends a lot of time in the backyard and outdoors, and Luna is always inside.<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>Sounds good! Rex must love exploring outside, while Lucy probably enjoys her cozy indoor life.<|end_of_text|>\n<|start_of_role|>user<|end_of_role|>But is he more likely to get fleas because of that?<|end_of_text|>\n<|start_of_role|>query_to_

In [7]:
# Free up GPU resources
if "server" in locals():
    server.shutdown()