In [20]:
from loguru import logger


# Method 1: vLLM with server

In [25]:
import requests
import base64
from io import BytesIO
from PIL import Image

SERVER_URL = "http://localhost:8000/v1/chat/completions"
# Upper bound (seconds) on how long to wait for the server. Without a timeout,
# requests.post can block the kernel indefinitely if the server hangs.
REQUEST_TIMEOUT_S = 180

# Load the page image, re-encode it as PNG in memory, and base64-encode the
# bytes so they can be embedded in a data URL inside the JSON payload.
image_path = "/home/ubuntu/nim-dev/models/nemotron-parse-prod-hf/output_results/resized_images/soa_1_page_1_fitz_resized.png"
image = Image.open(image_path)
buffered = BytesIO()
image.save(buffered, format="PNG")
img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')

# Chat-completions request: one user message holding the task prompt and the image.
payload = {
    "model": "nvidia/NVIDIA-Nemotron-Parse-v1.1",
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "</s><s><predict_bbox><predict_classes><output_markdown>"},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}}
            ]
        }
    ],
    "temperature": 0,
    "top_k": 1,
    "repetition_penalty": 1.1,
    "max_tokens": 8000,
    # NOTE(review): presumably required so the <x_..>/<y_..>/<class_..> tags
    # survive in the returned text -- confirm against the server docs.
    "skip_special_tokens": False
}

logger.info(f"Sending request to {SERVER_URL}...")
http_response = requests.post(SERVER_URL, json=payload, timeout=REQUEST_TIMEOUT_S)
if http_response.status_code != 200:
    # Fail here, at the request, instead of leaving a Response object in
    # `response` and letting a later cell crash on response["choices"].
    logger.error(
        f"Request failed with status {http_response.status_code}: {http_response.text[:500]}"
    )
    http_response.raise_for_status()  # raises for 4xx/5xx
    raise RuntimeError(f"Unexpected status code {http_response.status_code} from {SERVER_URL}")

# `response` is always the decoded JSON dict from here on.
response = http_response.json()
logger.success("Request successful")


[32m2025-11-24 21:55:40.326[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m33[0m - [1mSending request to http://localhost:8000/v1/chat/completions...[0m
[32m2025-11-24 21:55:44.537[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [32m[1mRequest successful[0m


In [27]:
# Pull the generated text out of the first choice of the chat-completion response.
content = response["choices"][0]["message"]["content"]
content

'<x_0.1123><y_0.2578>**_Symphogen Trial ID: Sym004-09_** # Clinical Trial Protocol Version 4.0<x_0.8857><y_0.2805><class_Page-header>\n\n<x_0.1113><y_0.6672>Abbreviations (in alphabetical order): ADA, anti-drug antibody; C, Cycle; CT, computed tomography scan; D/d, day(s); DLT, dose-limiting toxicity; EOT, End of trial treatment Visit (Sym004 and FOLFIRI); ECG, electrocardiography; ECOG PS, Eastern Cooperative Oncology Group performance status; MRI, magnetic resonance imaging; 1M FUP, One Month Follow-up Visit; PK, pharmacokinetic; (S)AE, (serious) adverse event; TX, therapy<x_0.8789><y_0.6977><class_Text>\n\n<x_0.1133><y_0.7055>\\*Omitted with Amendment 3<x_0.248><y_0.7141><class_Footnote>\n\n<x_0.1123><y_0.7328>Confidential Page 35 of 122<x_0.8838><y_0.743><class_Page-footer>\n\n<x_0.1143><y_0.3063>\\begin{tabular}{ccccccccccc}\n\\multicolumn{2}{c}{Pre-Treatment Phase} & \\multicolumn{7}{c}{Treatment Phase1} & \\multicolumn{2}{c}{Post-Treatment Phase} \\\\\nCycle Number & Screening &

In [28]:
import json
import os
import re
from typing import List, Dict, Any, Union

# One block in the model output looks like:
#   <x_XMIN><y_YMIN> ...text... <x_XMAX><y_YMAX><class_LABEL>
# DOTALL lets the (non-greedy) text group span newlines.
_PATTERN = re.compile(
    r"<x_(\d+(?:\.\d+)?)><y_(\d+(?:\.\d+)?)>(.*?)<x_(\d+(?:\.\d+)?)><y_(\d+(?:\.\d+)?)><class_([^>]+)>",
    flags=re.DOTALL,
)


def _match_to_block(match: "re.Match") -> Dict[str, Any]:
    """Convert one _PATTERN match into a block dict."""
    left, top, body, right, bottom, label = match.groups()
    corners = {
        "xmin": float(left),
        "ymin": float(top),
        "xmax": float(right),
        "ymax": float(bottom),
    }
    return {"type": label, "text": body.strip(), "bbox": corners}


def parse_content_to_blocks(content: str) -> List[Dict[str, Any]]:
    """
    Split tagged model output into a list of block dictionaries.

    Args:
        content: Text containing zero or more
            ``<x_..><y_..>text<x_..><y_..><class_..>`` segments.
    Returns:
        One dict per matched segment, in order of appearance, with keys
        ``type`` (the class label), ``text`` (whitespace-stripped body) and
        ``bbox`` (``xmin``/``ymin``/``xmax``/``ymax`` as floats). Text that
        does not match the pattern is ignored; no matches yields ``[]``.
    """
    return [_match_to_block(found) for found in _PATTERN.finditer(content)]

def parse_response(resp: Union[List, Dict]) -> List[Dict[str, Any]]:
    """
    Extract the parsed blocks from an inference response.

    Two response layouts are noted for this endpoint:

    * rc1.5: the tagged text is at ``resp['choices'][0]["message"]["content"]``
      and would be parsed with ``parse_content_to_blocks(content)``.
      This path is NOT active here.
    * GA 1.0 (active): the first tool call's ``function.arguments`` is a JSON
      string; it is decoded and its first element is returned.

    Args:
        resp: Decoded JSON response body.
    Returns:
        The first element of the decoded ``arguments`` JSON.
    """
    first_message = resp['choices'][0]['message']
    raw_arguments = first_message['tool_calls'][0]['function']['arguments']
    decoded = json.loads(raw_arguments)
    return decoded[0]

def encode_file_to_base64(image_path: str) -> str:
    """
    Read an image file and return its contents as a base64 ``data:`` URL.

    The MIME type is derived from the file extension (case-insensitive):
    ``.jpg``/``.jpeg`` -> ``image/jpeg``, ``.png`` -> ``image/png``. Any other
    extension prints a warning and is mapped to ``image/<extension>``; a file
    with no extension falls back to ``application/octet-stream``.

    Args:
        image_path: Path to the image file on disk.
    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.
    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode('utf-8')

    ext = os.path.splitext(image_path)[1].lower()
    if ext in ('.jpg', '.jpeg'):
        mime = 'image/jpeg'
    elif ext == '.png':
        mime = 'image/png'
    else:
        # os.path.splitext keeps the leading dot; strip it so '.gif' becomes
        # 'image/gif' rather than the invalid 'image/.gif'.
        subtype = ext.lstrip('.')
        mime = 'image/' + subtype if subtype else 'application/octet-stream'
        print(f"Warning: Image extension is {ext}. Not all image types are supported. It might be best to convert to .png, .jpg, .jpeg instead. For now we will try to encode it as {mime}")

    return "data:" + mime + ";base64," + image_b64

def call_eclair_inference(image_path: str, temperature: float = 0.5):
    """
    Sends an image to the local Docker inference endpoint and transforms the
    output to the format expected by the rest of the script.

    NOTE(review): relies on a module-level ``ECLAIR_ENDPOINT_URL`` that must be
    defined before this function is called.

    Args:
        image_path (str): The path to the image to send to the Docker endpoint.
        temperature (float): Sampling temperature forwarded in the request body.
    Returns:
        The value produced by ``parse_response`` for the endpoint's JSON reply,
        or ``None`` if the request or the parsing of its response failed (the
        failure is printed, not raised).
    Raises:
        OSError: If ``image_path`` cannot be read (encoding happens before the
            guarded request section).
    """
    image_filename = os.path.basename(image_path)

    # convert to base64 encoded image
    image_b64 = encode_file_to_base64(image_path)
    print(f"    - Sending {image_filename} to Docker endpoint: {ECLAIR_ENDPOINT_URL}")

    headers = {
        'accept': 'application/json',
        'Content-Type': 'application/json'
    }
    data = {
        "model": "nvidia/nemotron-parse",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": image_b64
                        }
                    }
                ]
            }
        ],
        "temperature": temperature
    }

    try:
        response = requests.post(ECLAIR_ENDPOINT_URL, headers=headers, json=data, timeout=180)
        response.raise_for_status()  # Raise an exception for bad status codes
        return parse_response(response.json())

    # Report each failure mode accurately: an HTTP error status or a timeout
    # means the server WAS reachable, so "could not connect" would mislead.
    except requests.exceptions.HTTPError as e:
        print(f"    - ERROR: Docker endpoint at {ECLAIR_ENDPOINT_URL} returned an HTTP error. Details: {e}")
        return None
    except requests.exceptions.Timeout as e:
        print(f"    - ERROR: Request to Docker endpoint at {ECLAIR_ENDPOINT_URL} timed out. Details: {e}")
        return None
    except requests.exceptions.ConnectionError as e:
        print(f"    - ERROR: Could not connect to Docker endpoint at {ECLAIR_ENDPOINT_URL}. Please ensure it is running. Details: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"    - ERROR: Request to Docker endpoint at {ECLAIR_ENDPOINT_URL} failed. Details: {e}")
        return None
    except Exception as e:
        # Best-effort: covers e.g. an unexpected response shape in parse_response.
        print(f"    - An unexpected error occurred during Docker inference: {e}")
        return None

In [29]:
# Parse the raw tagged model output into a list of block dicts (type, text, bbox).
parse_content_to_blocks(content)

[{'type': 'Page-header',
  'text': '**_Symphogen Trial ID: Sym004-09_** # Clinical Trial Protocol Version 4.0',
  'bbox': {'xmin': 0.1123, 'ymin': 0.2578, 'xmax': 0.8857, 'ymax': 0.2805}},
 {'type': 'Text',
  'text': 'Abbreviations (in alphabetical order): ADA, anti-drug antibody; C, Cycle; CT, computed tomography scan; D/d, day(s); DLT, dose-limiting toxicity; EOT, End of trial treatment Visit (Sym004 and FOLFIRI); ECG, electrocardiography; ECOG PS, Eastern Cooperative Oncology Group performance status; MRI, magnetic resonance imaging; 1M FUP, One Month Follow-up Visit; PK, pharmacokinetic; (S)AE, (serious) adverse event; TX, therapy',
  'bbox': {'xmin': 0.1113, 'ymin': 0.6672, 'xmax': 0.8789, 'ymax': 0.6977}},
 {'type': 'Footnote',
  'text': '\\*Omitted with Amendment 3',
  'bbox': {'xmin': 0.1133, 'ymin': 0.7055, 'xmax': 0.248, 'ymax': 0.7141}},
 {'type': 'Page-footer',
  'text': 'Confidential Page 35 of 122',
  'bbox': {'xmin': 0.1123, 'ymin': 0.7328, 'xmax': 0.8838, 'ymax': 0.743}

# Method 2: vLLM without server

In [None]:

# import requests
# import base64
# from io import BytesIO
# from vllm import LLM, SamplingParams
# from PIL import Image

# sampling_params = SamplingParams(
#     temperature=0,
#     top_k=1,
#     repetition_penalty=1.1,
#     max_tokens=9000,
#     skip_special_tokens=False,
# )

# llm = LLM(
#     model="nvidia/NVIDIA-Nemotron-Parse-v1.1",
#     max_num_seqs=64,
#     limit_mm_per_prompt={"image": 1},
#     dtype="bfloat16",
#     trust_remote_code=True,
# )

# image = Image.open("/home/ubuntu/nim-dev/models/nemotron-parse-prod-hf/output_results/resized_images/soa_1_page_1_fitz_resized.png")

# prompts = [
#     {  # Implicit prompt
#         "prompt": "</s><s><predict_bbox><predict_classes><output_markdown>",
#         "multi_modal_data": {
#             "image": image
#         },
#     }
# ]

# outputs = llm.generate(prompts, sampling_params)

# for output in outputs:
#     prompt = output.prompt
#     generated_text = output.outputs[0].text
#     print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}")