# Demo of Granite hallucinations intrinsic

This notebook shows the usage of the IO processor for the Granite hallucinations
intrinsic, also known as the [LoRA Adapter for Hallucination Detection in RAG outputs](
    https://huggingface.co/ibm-granite/granite-3.3-8b-rag-agent-lib/blob/main/hallucination_detection_lora/README.md
)

This notebook can run its own vLLM server to perform inference, or you can host the 
models on your own server. 

To use your own server, set the `run_server` variable below
to `False` and set appropriate values for the constants 
`openai_base_url`, `openai_base_model_name` and `openai_lora_model_name`.

In [1]:
# Imports go here
from IPython.display import display, Markdown

from granite_io import make_io_processor, make_backend
from granite_io.backend.vllm_server import LocalVLLMServer
from granite_io.io.granite_3_3.input_processors.granite_3_3_input_processor import (
    Granite3Point3Inputs,
)
from granite_io.io.hallucinations import (
    HallucinationsIOProcessor,
    HallucinationsCompositeIOProcessor,
)
from granite_io.io.rag_agent_lib import obtain_lora
from granite_io.types import AssistantMessage

In [2]:
# Notebook configuration.
# Set `run_server` to False to use an existing server instead of launching a local
# vLLM server from this notebook.
run_server = True
# Name of the LoRA adapter to fetch, and of the base model it is served with.
lora_model_name = "hallucination_detection"
base_model_name = "ibm-granite/granite-3.3-8b-instruct"

In [3]:
if not run_server:
    # Connect to a server that already exists instead of starting one here.
    # Adjust the constants below to match that server.
    openai_base_url = "http://localhost:55555/v1"
    # NOTE(review): looks like a placeholder key for a local server — confirm, and
    # avoid committing real credentials here.
    openai_api_key = "granite_intrinsics_1234"
    openai_base_model_name = base_model_name
    openai_lora_model_name = lora_model_name

    # Both backends share the same endpoint settings; only the model name differs.
    openai_connection = {
        "openai_base_url": openai_base_url,
        "openai_api_key": openai_api_key,
    }
    backend = make_backend(
        "openai", {"model_name": openai_base_model_name, **openai_connection}
    )
    lora_backend = make_backend(
        "openai", {"model_name": openai_lora_model_name, **openai_connection}
    )
else:
    # Launch a local vLLM server and attach backend instances to it.
    # The LoRA adapter is downloaded and cached first so the server can load it.
    lora_model_path = obtain_lora(lora_model_name)
    print(f"Local path to LoRA adapter: {lora_model_path}")
    lora_adapters = [(lora_model_name, lora_model_path)]
    server = LocalVLLMServer(base_model_name, lora_adapters=lora_adapters)
    # presumably a startup timeout in seconds — TODO confirm against LocalVLLMServer
    server.wait_for_startup(200)
    lora_backend = server.make_lora_backend(lora_model_name)
    backend = server.make_backend()

Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

Local path to LoRA adapter: /home/freiss/.cache/huggingface/hub/models--ibm-granite--granite-3.3-8b-rag-agent-lib/snapshots/310479c72458e1ebbad00baa010a37b0003f89c8/hallucination_detection_lora
INFO 11:32:20 Running: /mnt/nvmedisk/freiss/granite/env/bin/vllm serve ibm-granite/granite-3.3-8b-instruct --port 32979 --gpu-memory-utilization 0.45 --max-model-len 32768 --guided_decoding_backend outlines --device auto --enforce-eager --enable-lora --max_lora_rank 64 --lora-modules hallucination_detection=/home/freiss/.cache/huggingface/hub/models--ibm-granite--granite-3.3-8b-rag-agent-lib/snapshots/310479c72458e1ebbad00baa010a37b0003f89c8/hallucination_detection_lora
INFO 06-13 11:32:26 [__init__.py:243] Automatically detected platform cuda.
INFO 06-13 11:32:30 [__init__.py:31] Available plugins for group vllm.general_plugins:
INFO 06-13 11:32:30 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
INFO 06-13 11:32:30 [__i

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:01,  1.96it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:02<00:02,  1.25s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:04<00:01,  1.47s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00,  1.56s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:05<00:00,  1.43s/it]



INFO 06-13 11:33:07 [default_loader.py:280] Loading weights took 5.88 seconds
INFO 06-13 11:33:07 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 06-13 11:33:08 [model_runner.py:1202] Model loading took 15.6407 GiB and 6.912470 seconds
INFO 06-13 11:33:13 [worker.py:291] Memory profiling takes 4.96 seconds
INFO 06-13 11:33:13 [worker.py:291] the current vLLM instance can use total_gpu_memory (79.25GiB) x gpu_memory_utilization (0.45) = 35.66GiB
INFO 06-13 11:33:13 [worker.py:291] model weights take 15.64GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 3.36GiB; the rest of the memory reserved for KV Cache is 16.57GiB.
INFO 06-13 11:33:13 [executor_base.py:112] # cuda blocks: 6787, # CPU blocks: 1638
INFO 06-13 11:33:13 [executor_base.py:117] Maximum concurrency for 32768 tokens per request: 3.31x
INFO 06-13 11:33:16 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 8.12 seconds
INFO 06-13 11:33:16 [serving_models.py:185] Loaded n

INFO:     Started server process [922820]
INFO:     Waiting for application startup.
INFO:     Application startup complete.


INFO:     127.0.0.1:49338 - "GET /ping HTTP/1.1" 200 OK


In [4]:
# Create an example chat completion request: a single user question plus two
# documents to ground the answer in.
# The backslash-newline sequences inside the string literals are line
# continuations, so each string is a single long line at runtime.
# "generate_inputs" pins the sampling parameters (temperature 0.0, max 1024 tokens).
chat_input = Granite3Point3Inputs.model_validate(
    {
        "messages": [
            {
                "role": "user",
                "content": "What is the visibility level of Git Repos and Issue \
Tracking projects?",
            }
        ],
        "documents": [
            {
                "text": "Git Repos and Issue Tracking is an IBM-hosted component of \
the Continuous Delivery service. All of the data that you provide to Git Repos and \
Issue Tracking, including but not limited to source files, issues, pull requests, and \
project configuration properties, is managed securely within Continuous Delivery. \
However, Git Repos and Issue Tracking supports various mechanisms for exporting, \
sending, or otherwise sharing data to users and third parties. The ability of Git \
Repos and Issue Tracking to share information is typical of many social coding \
platforms. However, such sharing might conflict with regulatory controls that \
apply to your business. After you create a project in Git Repos and Issue Tracking, \
but before you entrust any files, issues, records, or other data with the project, \
review the project settings and change any settings that you deem necessary to \
protect your data. Settings to review include visibility levels, email notifications, \
integrations, web hooks, access tokens, deploy tokens, and deploy keys. Project \
visibility levels \n\nGit Repos and Issue Tracking projects can have one of the \
following visibility levels: private, internal, or public. * Private projects are \
visible only to project members. This setting is the default visibility level for new \
projects, and is the most secure visibility level for your data. * Internal projects \
are visible to all users that are logged in to IBM Cloud. * Public projects are \
visible to anyone. To limit project access to only project members, complete the \
following steps:\n\n\n\n1. From the project sidebar, click Settings > General. \
2. On the General Settings page, click Visibility > project features > permissions. \
3. Locate the Project visibility setting. 4. Select Private, if it is not already \
selected. 5. Click Save changes. Project membership \n\nGit Repos and Issue Tracking \
is a cloud hosted social coding environment that is available to all Continuous \
Delivery users. If you are a Git Repos and Issue Tracking project Maintainer or Owner, \
you can invite any user and group members to the project. IBM Cloud places no \
restrictions on who you can invite to a project."
            },
            {
                "text": "After you create a project in Git Repos and Issue Tracking, \
but before you entrust any files, issues, records, or other data with the project, \
review the project settings and change any settings that are necessary to protect your \
data. \
Settings to review include visibility levels, email notifications, integrations, web \
hooks, access tokens, deploy tokens, and deploy keys. Project visibility levels \
\n\nGit Repos and Issue Tracking projects can have one of the following visibility \
levels: private, internal, or public. * Private projects are visible only to \
project members. This setting is the default visibility level for new projects, and \
is the most secure visibility level for your data. * Internal projects are visible to \
all users that are logged in to IBM Cloud. * Public projects are visible to anyone. \
To limit project access to only project members, complete the following \
steps:\n\n\n\n1. From the project sidebar, click Settings > General. 2. On the \
General Settings page, click Visibility > project features > permissions. 3. Locate \
the Project visibility setting. 4. Select Private, if it is not already selected. \
5. Click Save changes. Project email settings \n\nBy default, Git Repos and Issue \
Tracking notifies project members by way of email about project activities. These \
emails typically include customer-owned data that was provided to Git Repos and Issue \
Tracking by users. For example, if a user posts a comment to an issue, Git Repos and \
Issue Tracking sends an email to all subscribers. The email includes information such \
as a copy of the comment, the user who posted it, and when the comment was posted. \
To turn off all email notifications for your project, complete the following \
steps:\n\n\n\n1. From the project sidebar, click Settings > General. 2. On the \
**General Settings **page, click Visibility > project features > permissions. \
3. Select the Disable email notifications checkbox. 4. Click Save changes. Project \
integrations and webhooks"
            },
        ],
        "generate_inputs": {"temperature": 0.0, "max_tokens": 1024},
    }
)
# Display the validated input object as the cell's output.
chat_input

Granite3Point3Inputs(messages=[UserMessage(content='What is the visibility level of Git Repos and Issue Tracking projects?', role='user')], tools=[], generate_inputs=GenerateInputs(prompt=None, model=None, best_of=None, echo=None, frequency_penalty=None, logit_bias=None, logprobs=None, max_tokens=1024, n=None, presence_penalty=None, stop=None, stream=None, stream_options=None, suffix=None, temperature=0.0, top_p=None, user=None, extra_headers=None, extra_body={}), documents=[Document(text='Git Repos and Issue Tracking is an IBM-hosted component of the Continuous Delivery service. All of the data that you provide to Git Repos and Issue Tracking, including but not limited to source files, issues, pull requests, and project configuration properties, is managed securely within Continuous Delivery. However, Git Repos and Issue Tracking supports various mechanisms for exporting, sending, or otherwise sharing data to users and third parties. The ability of Git Repos and Issue Tracking to shar

In [5]:
# Pass the example input through Granite 3.3 to get an answer
granite_io_proc = make_io_processor("Granite 3.3", backend=backend)
result = await granite_io_proc.acreate_chat_completion(chat_input)

display(Markdown(result.results[0].next_message.content))

INFO 06-13 11:33:18 [logger.py:42] Received request cmpl-cfd8fa0e9aed4d5ba543c3370e6d6a79-0: prompt: '<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday\'s Date: June 13, 2025.\nYou are Granite, developed by IBM. Write the response to the user\'s input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.<|end_of_text|>\n<|start_of_role|>document {"document_id": "1"}<|end_of_role|>\nGit Repos and Issue Tracking is an IBM-hosted component of the Continuous Delivery service. All of the data that you provide to Git Repos and Issue Tracking, including but not limited to source files, issues, pull requests, and project configuration properties, is managed securely within Continuous Delivery. However, Git Repos and Issue Tracking supports various mechanisms for exporting, sending, or otherwi

Git Repos and Issue Tracking projects can have one of three visibility levels: private, internal, or public. Private projects are visible only to project members, internal projects are visible to all logged-in IBM Cloud users, and public projects are visible to anyone.

In [6]:
# Extend the chat with the assistant message the model just produced, then show
# the resulting list of messages.
assistant_reply = result.results[0].next_message
next_chat_input = chat_input.with_next_message(assistant_reply)
next_chat_input.messages

[UserMessage(content='What is the visibility level of Git Repos and Issue Tracking projects?', role='user'),
 AssistantMessage(content='Git Repos and Issue Tracking projects can have one of three visibility levels: private, internal, or public. Private projects are visible only to project members, internal projects are visible to all logged-in IBM Cloud users, and public projects are visible to anyone.', role='assistant', tool_calls=[], reasoning_content=None, citations=None, documents=None, hallucinations=None, stop_reason='stop')]

In [7]:
# Instantiate the I/O processor for the hallucinations LoRA adapter
io_proc = HallucinationsIOProcessor(lora_backend)

# Pass our example input thorugh the I/O processor and retrieve the result
chat_result = await io_proc.acreate_chat_completion(next_chat_input)

next_message = chat_result.results[0].next_message
print(next_message.model_dump_json(indent=2))

INFO 06-13 11:33:23 [logger.py:42] Received request cmpl-d97864314bac4130aa5d40ed92bb27df-0: prompt: "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: June 13, 2025.\nYou are Granite, developed by IBM.Write the response to the user's input by strictly aligning with the facts in the provided documents. If the information needed to answer the question is not available in the documents, inform the user that the question cannot be answered based on the available data.<|end_of_text|>\n<|start_of_role|>documents<|end_of_role|>Document 0\nGit Repos and Issue Tracking is an IBM-hosted component of the Continuous Delivery service. All of the data that you provide to Git Repos and Issue Tracking, including but not limited to source files, issues, pull requests, and project configuration properties, is managed securely within Continuous Delivery. However, Git Repos and Issue Tracking supports various mechanisms for exporting, sending, or otherwise sharing da

ValueError: ERROR: the JSON object must be str, bytes or bytearray, not list (raw output: completion_string='[{"i": 0, "r": "This sentence makes a factual claim about the visibility levels of Git Repos and Issue Tracking projects. The document states \'Git Repos and Issue Tracking projects can have one of the following visibility levels: private, internal, or public.\' This matches exactly with the claim in the sentence.", "f": "faithful"}, {"i": 1, "r": "This sentence makes factual claims about the visibility of each level. The document states \'Private projects are visible only to project members,\' \'Internal projects are visible to all users that are logged in to IBM Cloud,\' and \'Public projects are visible to anyone.\' This matches exactly with the claims in the sentence.", "f": "faithful"}]' completion_tokens=[] stop_reason='stop')

INFO 06-13 11:33:48 [metrics.py:486] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 1.8 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
INFO 06-13 11:33:58 [metrics.py:486] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.


In [None]:
# Try with an artifical poor-quality assistant response.
from granite_io.types import AssistantMessage

chat_result_2 = await io_proc.acreate_chat_completion(
    chat_input.with_next_message(
        AssistantMessage(
            content="Git repos are generally only visible in the infrared "
            "spectrum, due to their natural camouflage. Issue Tracking projects "
            "are much easier to see; their bright colors warn predators of the "
            "poisonous technical debt that they secrete."
        )
    ).with_addl_generate_params({"temperature": 0.0})
)
chat_result_2.results[0].next_message

In [None]:
# Create a composite citations processor that generates a response and runs a
# hallucinations check on the response.
composite_proc = HallucinationsCompositeIOProcessor(granite_io_proc, lora_backend)

# Note that this codes passes in the original chat input, without an assistant response
chat_result_4 = await composite_proc.acreate_chat_completion(chat_input)
print(chat_result_4.model_dump_json(indent=2))

In [None]:
# We can also ask the composite IO processor to generate multiple completions, in
# which case it will run the hallucinations check for all completions in parallel.
chat_result_5 = await composite_proc.acreate_chat_completion(
    chat_input.with_addl_generate_params({"n": 5, "temperature": 0.7})
)

for result in chat_result_5.results:
    print(f"Assistant: {result.next_message.content}")
    print(
        f"           (hallucination scores: "
        f"{[h.risk for h in result.next_message.hallucinations]})"
    )

In [None]:
# Release GPU resources by shutting down the vLLM server, but only when a
# `server` object exists in this session.
has_local_server = "server" in locals()
if has_local_server:
    server.shutdown()