# 3D Grounding with Qwen3-VL (Together AI)
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/togethercomputer/together-cookbook/blob/main/Multimodal/Vision/3D_Grounding.ipynb)


## Introduction

In this notebook, we'll explore Qwen3-VL's 3D spatial understanding capabilities using Together AI's API. We'll cover:

1. Detecting objects with 3D bounding boxes
2. Using camera parameters for accurate projections
3. Multi-object 3D localization

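3D bounding boxes are represented as: `[x_center, y_center, z_center, x_size, y_size, z_size, roll, pitch, yaw]`. The drawing helper in this notebook multiplies the three rotation values by 180 and then converts them from degrees to radians, i.e. it treats them as fractions of π.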


### Install required libraries


In [None]:
!pip install openai pillow numpy matplotlib opencv-python


In [1]:
import os
import json
import math
import random
import base64
import cv2
import numpy as np
import matplotlib.pyplot as plt
import openai
from PIL import Image

# Together AI configuration: the OpenAI SDK client is pointed at the Together
# API base URL, and the API key is taken from the TOGETHER_API_KEY variable.
MODEL_ID = "Qwen/Qwen3-VL-32B-Instruct"

client = openai.OpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=os.getenv("TOGETHER_API_KEY"),
)

# Report the setup without ever printing the key itself.
print(f"Using model: {MODEL_ID}")
print(f"API Key configured: {bool(os.getenv('TOGETHER_API_KEY'))}")


ModuleNotFoundError: No module named 'cv2'

In [None]:
# Utility functions

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, mode="rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def inference_with_api(image_path, prompt, max_tokens=4096):
    """Send one image plus a text prompt to the chat-completions endpoint.

    Args:
        image_path: Path of the image file. The file is base64-encoded and
            embedded in the request as a ``data:image/...`` URL.
        prompt: Text instruction sent alongside the image.
        max_tokens: Value forwarded as the request's ``max_tokens``.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    base64_image = encode_image(image_path)

    # splitext only inspects the final path component, so dots in directory
    # names (e.g. "../assets/photo") are never mistaken for an extension, and
    # path-like objects are accepted as well as plain strings.
    ext = os.path.splitext(image_path)[1].lstrip(".").lower()
    if ext in ("jpg", "jpeg"):
        mime_type = "jpeg"
    elif ext:
        mime_type = ext
    else:
        # No extension to go on: fall back to the most common image subtype
        # rather than emitting an invalid "data:image/;base64,..." URL.
        mime_type = "jpeg"

    response = client.chat.completions.create(
        model=MODEL_ID,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/{mime_type};base64,{base64_image}"}},
                {"type": "text", "text": prompt},
            ],
        }],
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

def parse_bbox_3d_from_text(text):
    """Extract the JSON payload with 3D boxes from an assistant reply.

    Lookup order:
      1. The body of the first ```` ```json ```` fenced block, if present.
      2. Otherwise, a single JSON object that starts before any ``[`` and
         carries a ``"bbox_3d"`` key (a bare one-box reply).
      3. Otherwise, the span from the first ``[`` to the last ``]``.

    Args:
        text: Raw reply text. Non-string values (e.g. ``None``) are accepted
            and yield an empty result.

    Returns:
        A list of parsed entries; a non-list payload is wrapped in a
        one-element list. Returns ``[]`` when nothing can be parsed.
    """
    if not isinstance(text, str):
        return []

    fence = "```json"
    if fence in text:
        body_start = text.find(fence) + len(fence)
        body_end = text.find("```", body_start)
        json_str = text[body_start:body_end] if body_end != -1 else text[body_start:]
        try:
            bbox_data = json.loads(json_str.strip())
        except json.JSONDecodeError:
            return []
        return bbox_data if isinstance(bbox_data, list) else [bbox_data]

    obj_start, obj_end = text.find("{"), text.rfind("}")
    arr_start, arr_end = text.find("["), text.rfind("]")

    # A bare object reply would otherwise be cut down to its inner coordinate
    # list by the bracket search below, losing the label and the list-of-boxes
    # shape that callers iterate over.
    if obj_start != -1 and (arr_start == -1 or obj_start < arr_start):
        try:
            record = json.loads(text[obj_start:obj_end + 1])
        except json.JSONDecodeError:
            record = None
        if isinstance(record, dict) and "bbox_3d" in record:
            return [record]

    if arr_start == -1 or arr_end == -1:
        return []
    try:
        return json.loads(text[arr_start:arr_end + 1])
    except json.JSONDecodeError:
        return []

def convert_3dbbox(point, cam_params):
    """Project the corners of a rotated 3D box onto the image plane.

    Args:
        point: Nine values: box centre (x, y, z), box size along each axis,
            then three rotation angles in degrees.
        cam_params: Mapping with the keys ``fx``, ``fy``, ``cx`` and ``cy``.

    Returns:
        A list of ``[x, y]`` pixel coordinates, one per corner whose depth is
        strictly positive. Corners with non-positive depth are left out, so
        the list may hold fewer than eight entries.

    NOTE(review): the last three entries are unpacked as (pitch, yaw, roll),
    while the prompts in this notebook describe them as roll, pitch, yaw.
    Confirm which order the model actually emits.
    """
    center_x, center_y, center_z, x_size, y_size, z_size, pitch, yaw, roll = point
    half_x, half_y, half_z = x_size / 2, y_size / 2, z_size / 2

    # Corner offsets relative to the box centre; the order matters to callers
    # that connect corners by index.
    corner_offsets = [
        [half_x, half_y, half_z],
        [half_x, half_y, -half_z],
        [half_x, -half_y, half_z],
        [half_x, -half_y, -half_z],
        [-half_x, half_y, half_z],
        [-half_x, half_y, -half_z],
        [-half_x, -half_y, half_z],
        [-half_x, -half_y, -half_z],
    ]

    # The angles are the same for every corner, so evaluate the trig once.
    pitch_rad, yaw_rad, roll_rad = np.deg2rad(pitch), np.deg2rad(yaw), np.deg2rad(roll)
    cos_p, sin_p = math.cos(pitch_rad), math.sin(pitch_rad)
    cos_y, sin_y = math.cos(yaw_rad), math.sin(yaw_rad)
    cos_r, sin_r = math.cos(roll_rad), math.sin(roll_rad)

    def _rotate(offset):
        ox, oy, oz = offset
        # Rotation about the x axis by the pitch angle.
        ay = oy * cos_p - oz * sin_p
        az = oy * sin_p + oz * cos_p
        # Rotation about the y axis by the yaw angle.
        bx = ox * cos_y + az * sin_y
        bz = -ox * sin_y + az * cos_y
        # Rotation about the z axis by the roll angle.
        rx = bx * cos_r - ay * sin_r
        ry = bx * sin_r + ay * cos_r
        return [rx, ry, bz]

    projected = []
    for offset in corner_offsets:
        rot_x, rot_y, rot_z = _rotate(offset)
        cam_x, cam_y, cam_z = rot_x + center_x, rot_y + center_y, rot_z + center_z
        if not cam_z > 0:
            # Non-positive depth cannot be projected.
            continue
        pixel_x = cam_params['fx'] * (cam_x / cam_z) + cam_params['cx']
        pixel_y = cam_params['fy'] * (cam_y / cam_z) + cam_params['cy']
        projected.append([pixel_x, pixel_y])
    return projected

def draw_3dbboxes(image_path, cam_params, bbox_3d_list):
    """Draw the wireframes of several 3D boxes over an image.

    Args:
        image_path: Path of the image to annotate; read with ``cv2.imread``.
        cam_params: Camera intrinsics passed through to ``convert_3dbbox``.
        bbox_3d_list: Iterable of entries, each either a dict with a
            ``"bbox_3d"`` key or the nine-value box sequence itself.

    Returns:
        A matplotlib figure showing the annotated image, or ``None`` when the
        image cannot be read.

    Entries that are malformed (wrong length, non-numeric values, not a
    sequence) are reported and skipped instead of aborting the whole drawing.
    """
    annotated_image = cv2.imread(image_path)
    if annotated_image is None:
        print(f"Error reading image: {image_path}")
        return None

    # Index pairs into the corner list returned by convert_3dbbox.
    edges = [[0,1], [2,3], [4,5], [6,7], [0,2], [1,3], [4,6], [5,7], [0,4], [1,5], [2,6], [3,7]]

    for bbox_data in bbox_3d_list:
        bbox_3d = bbox_data['bbox_3d'] if isinstance(bbox_data, dict) and 'bbox_3d' in bbox_data else bbox_data
        try:
            bbox_3d = list(bbox_3d)
            # The last three values are scaled by 180 before convert_3dbbox
            # applies deg2rad to them; presumably the model reports angles in
            # units of pi -- TODO confirm against the model documentation.
            bbox_3d[-3:] = [angle * 180 for angle in bbox_3d[-3:]]
            bbox_2d = convert_3dbbox(bbox_3d, cam_params)
        except (TypeError, ValueError) as err:
            # Model output is free-form; one bad entry should not discard the
            # boxes that did parse correctly.
            print(f"Skipping malformed 3D box {bbox_data!r}: {err}")
            continue

        if len(bbox_2d) < 8:
            # At least one corner was dropped by the projection, so the edge
            # indices no longer line up with the corners.
            continue

        box_color = tuple(random.randint(0, 255) for _ in range(3))
        for start, end in edges:
            try:
                pt1 = tuple(int(coord) for coord in bbox_2d[start])
                pt2 = tuple(int(coord) for coord in bbox_2d[end])
                cv2.line(annotated_image, pt1, pt2, box_color, 2)
            except (ValueError, OverflowError, TypeError, cv2.error):
                # Non-finite or out-of-range coordinates: skip this edge only.
                continue

    annotated_image_rgb = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    ax.imshow(annotated_image_rgb)
    ax.axis('off')
    return fig

def load_camera_params(image_name, cam_infos_path='../assets/spatial_understanding/cam_infos.json'):
    """Look up the stored camera parameters for one image.

    Args:
        image_name: Key to look up in the JSON file.
        cam_infos_path: Location of the JSON file that maps image names to
            camera parameters. Defaults to the notebook's asset file.

    Returns:
        The value stored under ``image_name``, or ``None`` when the file is
        missing, unreadable, not valid JSON, not a JSON object, or has no
        entry for that name.
    """
    try:
        with open(cam_infos_path, 'r', encoding='utf-8') as f:
            cam_infos = json.load(f)
    except (OSError, ValueError):
        # ValueError covers both invalid JSON and undecodable bytes; callers
        # treat None as "no stored parameters" and fall back to generated ones.
        return None

    if not isinstance(cam_infos, dict):
        return None
    return cam_infos.get(image_name)

def generate_camera_params(image_path, fov=60):
    """Build approximate camera intrinsics from the image size and a field of view.

    Args:
        image_path: Path of the image; only its pixel dimensions are used.
        fov: Field of view in degrees. The same angle is applied to both the
            width and the height, so ``fx`` and ``fy`` differ for non-square
            images.

    Returns:
        A dict with ``fx``, ``fy`` (focal lengths) and ``cx``, ``cy`` (image
        centre), each rounded to two decimals.
    """
    # The context manager closes the underlying file as soon as the size has
    # been read, instead of leaving the handle open until garbage collection.
    with Image.open(image_path) as image:
        w, h = image.size

    # Both focal lengths share the same tan(fov / 2) term.
    half_fov_tan = np.tan(np.deg2rad(fov) / 2)
    fx = round(w / (2 * half_fov_tan), 2)
    fy = round(h / (2 * half_fov_tan), 2)
    cx, cy = round(w / 2, 2), round(h / 2, 2)
    return {'fx': fx, 'fy': fy, 'cx': cx, 'cy': cy}


## 1. Detect Objects of Specific Categories


In [None]:
# Example 1: Detect all cars in autonomous driving scene
image_path = "../assets/spatial_understanding/autonomous_driving.jpg"
prompt = (
    'Find all cars in this image. For each car, provide its 3D bounding box. '
    'The output format required is JSON: '
    '`[{"bbox_3d":[x_center, y_center, z_center, x_size, y_size, z_size, roll, pitch, yaw],"label":"category"}]`.'
)

# Use the stored intrinsics when there is an entry for this image; otherwise
# derive approximate ones from the image size.
cam_params = load_camera_params("autonomous_driving.jpg")
if cam_params is None:
    cam_params = generate_camera_params(image_path)
    print("Using generated camera params:", cam_params)

# Query the model, then turn its reply into a list of box entries.
response = inference_with_api(image_path, prompt)
bbox_3d_results = parse_bbox_3d_from_text(response)
print("Parsed bbox_3d_results:", bbox_3d_results)

# draw_3dbboxes returns None when the image cannot be read.
fig = draw_3dbboxes(image_path, cam_params, bbox_3d_results)
if fig is not None:
    plt.show()


## 2. Detect a Specific Object Using Descriptions


In [None]:
# Example 2: Detect a specific object using descriptions
image_path = "../assets/spatial_understanding/office.jpg"
prompt = 'Locate the black chair in image and provide 3D bounding boxes results in JSON format.'

# Stored intrinsics first; generated ones as the fallback.
cam_params = load_camera_params("office.jpg")
if cam_params is None:
    cam_params = generate_camera_params(image_path)

# Query the model, then turn its reply into a list of box entries.
response = inference_with_api(image_path, prompt)
bbox_3d_results = parse_bbox_3d_from_text(response)
print("Parsed bbox_3d_results:", bbox_3d_results)

# draw_3dbboxes returns None when the image cannot be read.
fig = draw_3dbboxes(image_path, cam_params, bbox_3d_results)
if fig is not None:
    plt.show()


## 3. Detect Multiple Objects


In [None]:
# Example 3: Detect multiple objects simultaneously
image_path = "../assets/spatial_understanding/lounge.jpg"
prompt = (
    'Locate tables, chairs, and sofas in the image and output their 3D bounding boxes. '
    'Format: [{"bbox_3d":[x_center, y_center, z_center, x_size, y_size, z_size, roll, pitch, yaw],"label":"category"}].'
)

# Stored intrinsics first; generated ones as the fallback.
cam_params = load_camera_params("lounge.jpg")
if cam_params is None:
    cam_params = generate_camera_params(image_path)

# Query the model, then turn its reply into a list of box entries.
response = inference_with_api(image_path, prompt)
bbox_3d_results = parse_bbox_3d_from_text(response)
print("Parsed bbox_3d_results:", bbox_3d_results)

# draw_3dbboxes returns None when the image cannot be read.
fig = draw_3dbboxes(image_path, cam_params, bbox_3d_results)
if fig is not None:
    plt.show()


## 4. Using Custom Camera Parameters

When the original camera intrinsic parameters are not available, you can approximate them from the image size by assuming a field of view (60° in this example).


In [None]:
# Example 4: Using custom camera parameters
image_path = "../assets/spatial_understanding/manipulation.jpg"
prompt = (
    'Detect the bottle in the image and predict the 3D box. '
    'Output JSON: [{"bbox_3d":[x_center, y_center, z_center, x_size, y_size, z_size, roll, pitch, yaw],"label":"category"}].'
)

# No stored intrinsics are consulted here: they are derived from the image
# size with a 60° field of view.
cam_params = generate_camera_params(image_path, fov=60)
print("Generated camera params:", cam_params)

# Query the model, then turn its reply into a list of box entries.
response = inference_with_api(image_path, prompt)
bbox_3d_results = parse_bbox_3d_from_text(response)
print("Parsed bbox_3d_results:", bbox_3d_results)

# draw_3dbboxes returns None when the image cannot be read.
fig = draw_3dbboxes(image_path, cam_params, bbox_3d_results)
if fig is not None:
    plt.show()
