# Step 3: Define Answer Bounding Boxes
1. Convert the exam PDF into page images.
2. Auto-detect bounding boxes with AI.
3. Manually review and adjust each answer region.

In [1]:
from grading_utils import setup_paths, create_directories

# Name of the exam being processed; handed to setup_paths together with "sample".
# NOTE(review): the meaning of the second argument is defined in grading_utils (not shown here).
prefix = "VTC Test"
paths = setup_paths(prefix, "sample")

# Path of the exam PDF; the "Convert PDF to JPG" cell reads this file.
pdf_file = paths["pdf_file"]

## Configure Processing Settings

Set the number of pages to process from the PDF. 
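- Use `len(pages)` to process all pages (`pages` only exists after the "Convert PDF to JPG" cell below has run, so set this value after that cell)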
- Use a specific number (e.g., `1`, `2`, `3`) to process only that many pages
- Useful for testing on a subset before processing the entire document

In [2]:
# Number of pages to send to the model: pages 0 .. number_of_pages - 1 are processed.
# Keep this small (e.g. 1) while testing.
# NOTE: `pages` is created by the "Convert PDF to JPG" cell further down, so
# `len(pages)` cannot be used here on a fresh kernel; re-assign this value after
# that cell if every page should be processed.
number_of_pages = 2

In [3]:
import os

# Unpack the individual path settings from the mapping returned by setup_paths.
file_name, base_path, base_path_images, base_path_annotations = (
    paths[key]
    for key in ("file_name", "base_path", "base_path_images", "base_path_annotations")
)

# Hand the full mapping to the grading_utils helper.
# NOTE(review): presumably this creates the image/annotation folders — confirm in grading_utils.
create_directories(paths)

## Convert PDF to JPG

In [4]:
# read pdf and convert to images
# https://stackoverflow.com/questions/46184239/how-to-convert-pdf-to-image-using-python
from pdf2image import convert_from_path
import os

# Render the PDF into a list of page images (one entry per page).
pages = convert_from_path(pdf_file, fmt='jpeg')

# Extract the PDF's base name without directory or extension.
file_name = os.path.splitext(os.path.basename(pdf_file))[0]

# Write each page as "<index>.jpg" (0-based) under the images path prefix.
for count, page in enumerate(pages):
    page.save(f'{base_path_images}{count}.jpg', format='JPEG')

In [5]:
import base64
import json

def update_json_file(annotations, path):
    """Overwrite the file at `path` with `annotations` serialized as JSON (4-space indent)."""
    with open(path, "w") as handle:
        handle.write(json.dumps(annotations, indent=4))

def image_to_data_url(filename):
    """Return the image file at `filename` encoded as a base64 ``data:`` URL.

    The image subtype is taken from the file extension. The extension is
    lower-cased and common aliases are mapped to their registered subtype
    (for example ".jpg" -> "image/jpeg"), because "image/jpg" is not a
    registered media type.

    Args:
        filename: Path to an image file on disk; must have an extension.

    Returns:
        A string of the form "data:image/<subtype>;base64,<payload>".
    """
    # Extensions whose registered image subtype differs from the extension itself.
    subtype_aliases = {"jpg": "jpeg", "jpe": "jpeg", "tif": "tiff", "svg": "svg+xml"}

    ext = filename.split(".")[-1].lower()
    subtype = subtype_aliases.get(ext, ext)

    with open(filename, "rb") as f:
        payload = base64.b64encode(f.read()).decode("utf-8")
    return f"data:image/{subtype};base64,{payload}"

## Setup Vertex AI Express Mode with API Key

This notebook now uses **Vertex AI Express Mode** with API key authentication instead of OAuth/ADC.

**Steps to get your API key:**
1. Visit https://aistudio.google.com/apikey
2. Create or select your API key
3. Copy the API key and add it to the `.env` file in the parent directory:
   ```
   GOOGLE_GENAI_API_KEY=your-actual-api-key-here
   ```

**Benefits of Express Mode:**
- ✓ Simpler authentication (just an API key)
- ✓ No need for gcloud CLI authentication
- ✓ No service account JSON files
- ✓ Easy to use in notebooks and scripts

In [6]:
from grading_utils import init_gemini_client

# No client is created in this cell; the next code cell builds it with genai.Client.
# NOTE(review): init_gemini_client is imported above but is not called in the cells shown here.
print("✓ Ready to initialize Gemini client")

✓ Ready to initialize Gemini client


## Initialize Vertex AI Client

Initialize the Gemini API client using the API key from the `.env` file.

In [7]:
from google import genai
from google.genai import types

# Read the API key from the process environment; surrounding whitespace is dropped
# so a key pasted with a trailing newline/space still works.
# NOTE(review): this cell does not load the `.env` file itself — presumably
# grading_utils does so on import; confirm, otherwise os.getenv returns None.
API_KEY = (os.getenv("GOOGLE_GENAI_API_KEY") or "").strip()

# Template values that mean "not configured yet". The setup instructions in this
# notebook show "your-actual-api-key-here", so both spellings are rejected.
_PLACEHOLDER_API_KEYS = {"your-api-key-here", "your-actual-api-key-here"}

if not API_KEY or API_KEY in _PLACEHOLDER_API_KEYS:
    raise ValueError(
        "Please set your GOOGLE_GENAI_API_KEY in the .env file. "
        "Get your API key from: https://aistudio.google.com/apikey"
    )

# Initialize client with Vertex AI Express Mode
client = genai.Client(vertexai=True, api_key=API_KEY)

print("✓ Vertex AI Express Mode initialized successfully!")

✓ Vertex AI Express Mode initialized successfully!


In [8]:
from pydantic import BaseModel, Field
from typing import List

# Schema for one detected answer region. It is nested in BoundingBoxResponse, which
# is passed as the response_schema of the Gemini request.
# NOTE(review): the docstring and Field descriptions may be forwarded to the model
# as part of that schema — treat edits to them as prompt changes, not comment changes.
class BoundingBox(BaseModel):
    """Represents a single bounding box annotation"""
    # x / width are later rescaled by image_width / 1000 and y / height by
    # image_height / 1000 (see the rescaling cell), i.e. the notebook treats the
    # model's values as lying on a 0-1000 grid. The schema itself states no units.
    x: int = Field(description="X coordinate of the top-left corner")
    y: int = Field(description="Y coordinate of the top-left corner")
    width: int = Field(description="Width of the bounding box")
    height: int = Field(description="Height of the bounding box")
    # Free-text label; the widget cell attaches its "Question:" text box to this key.
    label: str = Field(description="Question number (e.g., '1', '2', '3')")

# Top-level response schema: an object with a single `boxes` list.
# ocr_structured relies on this field name (it reads `.boxes` and builds
# `response_schema(boxes=[])` as its empty fallback).
class BoundingBoxResponse(BaseModel):
    """Wrapper class for list of bounding boxes"""
    boxes: List[BoundingBox] = Field(description="List of bounding boxes for question cells")

print("✓ Pydantic models defined for structured output")

✓ Pydantic models defined for structured output


In [9]:
def ocr_structured(
    prompt: str,
    filePath: str,
    response_schema: type[BaseModel],
    model: str = "gemini-3-flash-preview",
):
    """
    Send one image plus a prompt to Gemini and return the structured result.

    The request asks for JSON matching `response_schema`. The SDK's `parsed`
    value is used when present; otherwise the raw response text is decoded and
    validated manually. Any failure on the manual path is reported with `print`
    and an empty result is returned instead of raising.

    Args:
        prompt: The prompt describing what to extract.
        filePath: Path to the image file; its bytes are sent as "image/jpeg".
        response_schema: Pydantic model *class* describing the expected response.
            It must define a list field named `boxes`, because this function
            reads `.boxes` and builds `response_schema(boxes=[])` as its empty
            fallback.
        model: Name of the Gemini model to call.

    Returns:
        An instance of `response_schema`; `boxes` is empty when the response
        was empty, was not valid JSON, or did not match the schema.

    Raises:
        OSError: If `filePath` cannot be read.
        Exception: Errors raised by the API call itself are not caught here.
    """
    # Local import kept so the function does not depend on which cell imported json.
    import json

    # Read the image file
    with open(filePath, "rb") as f:
        data = f.read()

    # Same threshold for every harm category the notebook configures.
    harm_categories = (
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_HARASSMENT",
    )

    # Create configuration with structured output
    config = types.GenerateContentConfig(
        temperature=0,
        top_p=0.5,
        max_output_tokens=65535,
        response_mime_type="application/json",
        response_schema=response_schema,
        safety_settings=[
            types.SafetySetting(category=category, threshold="BLOCK_ONLY_HIGH")
            for category in harm_categories
        ],
    )

    # Generate content with structured output
    response = client.models.generate_content(
        model=model,
        contents=[
            {
                "role": "user",
                "parts": [
                    {"inline_data": {"mime_type": "image/jpeg", "data": data}},
                    {"text": prompt},
                ],
            }
        ],
        config=config,
    )

    # Prefer the SDK-parsed object when the response carries one.
    parsed = getattr(response, "parsed", None)
    if parsed is not None:
        print(f"✓ Response parsed successfully - found {len(parsed.boxes)} boxes")
        return parsed

    # Fall back to text-based parsing
    if not response.text:
        print(f"⚠️ Warning: Empty response received for {filePath}")
        print(f"Response attributes: {dir(response)}")
        # Return empty response matching schema
        return response_schema(boxes=[])

    print(f"✓ Response received ({len(response.text)} chars)")

    try:
        result = json.loads(response.text)
    except json.JSONDecodeError as e:
        print(f"❌ JSON parsing error: {e}")
        print(f"Response text (first 1000 chars):\n{response.text[:1000]}")
        # Return empty response on parse error
        return response_schema(boxes=[])

    try:
        parsed = response_schema(**result)
    except (TypeError, ValueError) as e:
        # TypeError: the JSON was not an object (e.g. a bare list), so ** fails.
        # ValueError: pydantic's ValidationError (a ValueError subclass) for
        # well-formed JSON that does not match the schema.
        print(f"❌ Schema validation error: {e}")
        print(f"Response text (first 1000 chars):\n{response.text[:1000]}")
        return response_schema(boxes=[])

    print(f"✓ Successfully parsed {len(parsed.boxes)} boxes from text")
    return parsed

print("✓ Structured OCR function defined")

✓ Structured OCR function defined


## Extract Bounding Boxes with Structured Output

Using **Pydantic models** with Gemini's structured output feature ensures:
- ✓ Consistent JSON format (no parsing errors)
- ✓ Type safety and validation
- ✓ No need for manual JSON parsing or cleanup
- ✓ Automatic schema enforcement by the model

The model will return responses that match the `BoundingBoxResponse` schema exactly.

In [10]:
import json
import copy

# Prompt sent together with each page image. It asks for x / y / width / height / label
# per answer cell, matching the fields of the BoundingBox schema.
# NOTE(review): the prompt does not say which units the coordinates are in; the
# rescaling cell later multiplies by image_size / 1000, i.e. it assumes values on a
# 0-1000 grid — confirm this matches what the model actually returns.
prompt = """Extract the coordinates of bounding boxes for each question/answer cell from the table in the image.

Instructions:
- Identify all table cells that contain question numbers (like "20", "21", "22a", "22b", "23", "24", etc.)
- Question numbers are typically located in the top-left corner or top area of each cell
- Each bounding box should cover the entire cell area where a student would write their answer
- Include cells with sub-questions (like 22a, 22b, 22c, etc.) as separate bounding boxes
- Do NOT include cells that only contain "XXXXXXX" or are marked as non-answer areas
- Bounding boxes may be adjacent but should not overlap
- For merged cells spanning multiple rows/columns, create one bounding box covering the entire merged area

For each bounding box, provide:
- x: X coordinate of the top-left corner of the cell
- y: Y coordinate of the top-left corner of the cell
- width: Width of the entire cell (including answer space)
- height: Height of the entire cell (including answer space)
- label: The question number only (e.g., "20", "21", "22a", "22b", "23", "24", etc.)

Important: 
- Extract the question number text exactly as shown (including letters like "a", "b", "c" for sub-questions)
- Do not include the period after the question number in the label
- Focus on cells where students write answers, not header cells or instruction text
"""

# Page index (as a string) -> list of box dicts returned by the model.
aiAnnoation = {}
# Indices of pages whose request raised; they are stored with an empty box list.
failed_pages = []

for i in range(number_of_pages):
    image_path = base_path_images + f"{i}.jpg"
    print(f"\n{'='*60}")
    print(f"Processing page {i} ({image_path})")
    print(f"{'='*60}")

    try:
        # Use structured output with Pydantic schema
        result = ocr_structured(prompt, image_path, BoundingBoxResponse)

        # Convert Pydantic model to dict and extract boxes
        boxes_dict = [box.model_dump() for box in result.boxes]
        aiAnnoation[str(i)] = boxes_dict

        print(f"✓ Page {i}: Found {len(boxes_dict)} bounding boxes")
        if boxes_dict:
            print(json.dumps(boxes_dict, indent=2))
        else:
            print("  (No bounding boxes detected)")

    except Exception as e:
        # Best effort: keep going with the remaining pages, but remember the failure
        # so the summary below does not count this page as a success.
        print(f"❌ Error processing page {i}: {type(e).__name__}: {e}")
        aiAnnoation[str(i)] = []
        failed_pages.append(i)

print(f"\n{'='*60}")
print("✓ All annotations extraction completed!")
print(f"Successfully processed {len(aiAnnoation) - len(failed_pages)} pages")
if failed_pages:
    print(f"⚠️ Failed pages (stored with no boxes): {failed_pages}")
print(f"{'='*60}")

# Untouched copy of the raw model output; the rescaling cell restarts from it.
backup = copy.deepcopy(aiAnnoation)


Processing page 0 (../marking_form/VTC Test/images/0.jpg)


✓ Response parsed successfully - found 3 boxes
✓ Page 0: Found 3 bounding boxes
[
  {
    "x": 134,
    "y": 260,
    "width": 719,
    "height": 110,
    "label": "Q1"
  },
  {
    "x": 134,
    "y": 370,
    "width": 719,
    "height": 125,
    "label": "Q2"
  },
  {
    "x": 134,
    "y": 495,
    "width": 719,
    "height": 108,
    "label": "Q3"
  }
]

Processing page 1 (../marking_form/VTC Test/images/1.jpg)
✓ Response parsed successfully - found 2 boxes
✓ Page 1: Found 2 bounding boxes
[
  {
    "x": 135,
    "y": 234,
    "width": 719,
    "height": 111,
    "label": "4"
  },
  {
    "x": 135,
    "y": 345,
    "width": 719,
    "height": 123,
    "label": "5"
  }
]

✓ All annotations extraction completed!
Successfully processed 2 pages


In [11]:
from PIL import Image

# Measure one page image. `image_path` is the value left behind by the extraction
# loop in the previous code cell, i.e. the last page it visited.
with Image.open(image_path) as img:
    width = img.width
    height = img.height

print(f"Width: {width}, Height: {height}")

Width: 1654, Height: 2338


In [12]:
import json
import copy
# Restart from the raw model output so re-running this cell never scales twice.
aiAnnoation = copy.deepcopy(backup)

# The model's coordinates are treated as values on a 0-1000 grid and converted to
# pixels. NOTE(review): this matches the original width / 1000.0 scaling; confirm
# the model really returns normalized values for this schema.
NORMALIZED_RANGE = 1000.0

for i in range(number_of_pages):
    page_boxes = aiAnnoation.get(str(i), [])
    if not page_boxes:
        # Nothing to rescale; also avoids opening an image that may not exist.
        continue

    # Scale against this page's own pixel size: pages of one PDF can differ in
    # size or orientation, so a single width/height is not safe for all of them.
    with Image.open(base_path_images + f"{i}.jpg") as page_img:
        page_width, page_height = page_img.size
    x_scale = page_width / NORMALIZED_RANGE
    y_scale = page_height / NORMALIZED_RANGE

    for item in page_boxes:
        item['x'] = int(round(item['x'] * x_scale))
        item['y'] = int(round(item['y'] * y_scale))
        item['width'] = int(round(item['width'] * x_scale))
        item['height'] = int(round(item['height'] * y_scale))


ai_annotations_path = base_path_annotations + "ai_annotations.json"

# Save the aiAnnoation variable to a JSON file
with open(ai_annotations_path, "w") as f:
    json.dump(aiAnnoation, f)


Please ensure the following are clearly marked on each page before grading:
- ID
- NAME
- CLASS

In [13]:
from jupyter_bbox_widget import BBoxWidget
import ipywidgets as widgets
import json
import glob

page = 1
# Latest boxes seen per page index; filled in by the bbox-change observer below.
pageAndBoxingBoxes={}


def _page_sort_key(path):
    """Sort key that orders numerically named page images by page number.

    A plain string sort puts "10.jpg" before "2.jpg"; annotations are keyed by the
    position in `files`, so the list must be in true page order. Non-numeric names
    are kept, sorted alphabetically after the numbered pages.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    if stem.isascii() and stem.isdigit():
        return (0, int(stem), "")
    return (1, 0, stem)


files = sorted(glob.glob(base_path_images + "*.jpg"), key=_page_sort_key)
if not files:
    # Fail with an actionable message instead of an IndexError on files[0] below.
    raise FileNotFoundError(
        f"No page images (*.jpg) found under {base_path_images}; run the PDF conversion cell first."
    )

w_progress = widgets.IntProgress(value=0, max=len(files), description="Progress")
annotations_path = base_path_annotations + "annotations.json"
ai_annotations_path = base_path_annotations + "ai_annotations.json"

annotations = {}
# Load AI annotations first (as base)
if os.path.exists(ai_annotations_path):
    with open(ai_annotations_path, "r") as f:
        annotations = json.load(f)
    print(f"✓ Loaded AI annotations for {len(annotations)} pages")

# Then merge/override with manual annotations if they exist
if os.path.exists(annotations_path):
    with open(annotations_path, "r") as f:
        manual_annotations = json.load(f)
        # Per-page override: a manually saved page replaces the AI result for that page.
        annotations.update(manual_annotations)
    print(f"✓ Merged manual annotations for {len(manual_annotations)} pages")

print(f"Total pages with annotations: {list(annotations.keys())}")

# Text box bound to each box's "label" key in the annotation widget.
question_widget = widgets.Text(value="", placeholder="", description="Question:")

w_bbox = BBoxWidget(
    image=image_to_data_url(files[0])
)
w_bbox.attach(question_widget, name="label")
# Pre-populate the first page with its stored boxes, if any.
w_bbox.bboxes = annotations.get(str(w_progress.value), [])

# Skip handler: advance to the next page image without saving the current boxes.
def on_skip():
    """Show the next page with its stored boxes; only report when already on the last page."""
    last_index = len(files) - 1
    if w_progress.value >= last_index:
        print(f"Already at the last page ({last_index})")
        return

    w_progress.value += 1
    page_index = w_progress.value
    current_page = str(page_index)

    # Swap the displayed image for the new page.
    w_bbox.image = image_to_data_url(files[page_index])

    # Restore that page's boxes, or start empty when none are stored.
    if current_page not in annotations:
        w_bbox.bboxes = []
        print(f"⚠️ No annotations found for page {w_progress.value}")
        return

    w_bbox.bboxes = annotations[current_page]
    print(f"✓ Loaded {len(annotations[current_page])} bounding boxes for page {w_progress.value}")


w_bbox.on_skip(on_skip)

# Submit handler: persist the boxes drawn on the current page, then advance.
def on_submit():
    """Store the widget's current boxes under the current page index, write the
    whole annotations mapping to `annotations_path`, and move to the next page."""
    # save annotations for current image
    annotations[str(w_progress.value)] = w_bbox.bboxes
    update_json_file(annotations, annotations_path)
    print(f"✓ Saved {len(w_bbox.bboxes)} annotations for page {w_progress.value}")
    # move on to the next file
    on_skip()


w_bbox.on_submit(on_submit)
# Output area that shows the current page's boxes as JSON.
w_out = widgets.Output()

def on_bbox_change(change):
    """Observer for `w_bbox.bboxes`: redraw the JSON preview and remember the
    new boxes under the current page index."""
    w_out.clear_output(wait=True)
    with w_out:
        latest_boxes = change["new"]
        print(json.dumps(latest_boxes, indent=4))
        pageAndBoxingBoxes[w_progress.value] = latest_boxes


w_bbox.observe(on_bbox_change, names=["bboxes"])

# Stack the label field, progress bar, annotation widget and JSON preview vertically.
w_container = widgets.VBox(
    [widgets.HBox([question_widget]), w_progress, w_bbox, w_out]
)
# Last expression of the cell, so the notebook renders the assembled UI.
w_container

✓ Loaded AI annotations for 2 pages
Total pages with annotations: ['0', '1']


VBox(children=(HBox(children=(Text(value='', description='Question:', disabled=True, placeholder=''),)), IntPr…