In [24]:
# Directory holding the per-page prediction JSON files; passed as `json_preds_dir`
# to resize_bboxes_for_images, which globs it for "*.json".
json_preds_dir = "/Volumes/MyDataDrive/thesis/code-2/src/weak-labels-algo/AD_BUY_AUTO_LABELS/Qwen2.5-72B-Instruct-per_page_votes_merged"
# Directory holding the original page images; each image's size is read from here
# to compute the bbox scale factors.
images_dir = "/Volumes/MyDataDrive/thesis/code-2/src/labelrix/ad-buy-form/test-images"


In [None]:
import os
import json
import glob
from typing import Dict, List, Optional

try:
	from PIL import Image
except ImportError as e:
	raise ImportError("Pillow is required: pip install pillow") from e


def _derive_image_filename_from_json(json_filename: str) -> str:
	"""Map a predictions JSON filename to its corresponding image filename.

	Example: votes_<doc>.pdf_page1.json -> <doc>_page1.png
	"""
	name = os.path.splitext(os.path.basename(json_filename))[0]
	if name.startswith("votes_"):
		name = name[len("votes_"):]
	name = name.replace(".pdf_", "_")
	return f"{name}.png"


def _scale_bbox(bbox: List[float], sx: float, sy: float, new_w: int, new_h: int) -> List[int]:
	"""Scale [x1, y1, x2, y2] with separate x/y factors and clamp to image bounds.

	Clamps to valid pixel indices [0..new_dim-1]. Uses floor for left/top and ceil for right/bottom
	to better preserve coverage when converting to integer coordinates.
	"""
	x1, y1, x2, y2 = bbox
	# Scale as floats first
	nx1 = x1 * sx
	ny1 = y1 * sy
	nx2 = x2 * sx
	ny2 = y2 * sy
	# Ensure proper ordering
	left, right = (nx1, nx2) if nx1 <= nx2 else (nx2, nx1)
	top, bottom = (ny1, ny2) if ny1 <= ny2 else (ny2, ny1)
	# Clamp to image bounds (inclusive indices up to new_dim-1)
	max_x = max(0, new_w - 1)
	max_y = max(0, new_h - 1)
	left = max(0.0, min(float(max_x), float(left)))
	right = max(0.0, min(float(max_x), float(right)))
	top = max(0.0, min(float(max_y), float(top)))
	bottom = max(0.0, min(float(max_y), float(bottom)))
	# Convert to ints
	from math import floor, ceil
	return [int(floor(left)), int(floor(top)), int(ceil(right)), int(ceil(bottom))]


def resize_bboxes_for_images(
	json_preds_dir: str,
	images_dir: str,
	new_width: int,
	new_height: int,
	output_json_dir: Optional[str] = None,
	output_images_dir: Optional[str] = None,
) -> Dict[str, List[dict]]:
	"""
	Resize bboxes in all JSON files in json_preds_dir to match a new image size.

	- Expects each JSON file to be a list of dicts with a "bbox": [x1, y1, x2, y2].
	- Matches images by converting the JSON filename to image filename:
	  votes_<doc>.pdf_pageN.json -> <doc>_pageN.png
	- Scales coordinates using the original image's size from images_dir.
	- Optionally writes resized JSON to output_json_dir and resized images to output_images_dir.

	Per-annotation handling:
	- Entries that are not dicts are dropped from the output.
	- Entries whose "bbox" is missing, empty, not a list, or not of length 4 are kept unchanged.
	- A bbox whose four values all lie in [0, 1] is treated as normalized and multiplied
	  by the target size.
	- A bbox with non-numeric values (bool counts as non-numeric), or whose largest value
	  is <= 3, is treated as a placeholder and kept unchanged.
	- Any other bbox is treated as pixel coordinates in the original image and scaled by
	  new_size / original_size per axis.

	JSON files with no matching image, or whose top-level value is not a list, are skipped
	and do not appear in the result. Note that a resized image may already have been saved
	for a JSON file that is then skipped for not being a list.

	WARNING: when output_json_dir is not given, each JSON file is overwritten in place.
	The scale factors are always derived from the original image in images_dir, so calling
	this function twice on the same directory rescales pixel-mode boxes twice.

	Args:
		json_preds_dir: directory containing the prediction "*.json" files.
		images_dir: directory containing the original images.
		new_width: target image width in pixels (must be > 0).
		new_height: target image height in pixels (must be > 0).
		output_json_dir: where to write the resized JSON; defaults to overwriting the input.
		output_images_dir: if given, resized copies of the images are saved here.

	Returns: dict mapping json_filename (basename) -> resized annotations (list of dicts)

	Raises:
		FileNotFoundError: if json_preds_dir or images_dir is not a directory.
		ValueError: if new_width or new_height is not positive.
	"""
	if not os.path.isdir(json_preds_dir):
		raise FileNotFoundError(f"json_preds_dir not found: {json_preds_dir}")
	if not os.path.isdir(images_dir):
		raise FileNotFoundError(f"images_dir not found: {images_dir}")
	# Guard before touching any file: a zero/negative target would collapse every box to
	# zeros and (by default) write that back over the original predictions.
	if new_width <= 0 or new_height <= 0:
		raise ValueError(f"new_width and new_height must be positive, got {new_width}x{new_height}")

	if output_json_dir:
		os.makedirs(output_json_dir, exist_ok=True)
	if output_images_dir:
		os.makedirs(output_images_dir, exist_ok=True)

	# Boxes whose largest coordinate is at or below this value are assumed to be
	# placeholders rather than pixel coordinates (heuristic threshold).
	placeholder_max = 3.0

	results: Dict[str, List[dict]] = {}

	# glob.escape keeps metacharacters ([, ], *, ?) in the directory name literal.
	json_paths = sorted(glob.glob(os.path.join(glob.escape(json_preds_dir), "*.json")))
	for json_path in json_paths:
		json_base = os.path.basename(json_path)
		image_name = _derive_image_filename_from_json(json_base)
		image_path = os.path.join(images_dir, image_name)

		if not os.path.isfile(image_path):
			# Skip if the corresponding image is missing
			continue

		with Image.open(image_path) as im:
			orig_w, orig_h = im.size
			if output_images_dir:
				resized_im = im.resize((new_width, new_height), resample=Image.Resampling.LANCZOS)
				resized_im.save(os.path.join(output_images_dir, image_name))

		sx = float(new_width) / float(orig_w)
		sy = float(new_height) / float(orig_h)

		with open(json_path, "r", encoding="utf-8") as f:
			annotations = json.load(f)

		if not isinstance(annotations, list):
			# Expect a list of predictions; skip malformed files
			continue

		resized_annotations: List[dict] = []
		for ann in annotations:
			if not isinstance(ann, dict):
				continue
			bbox = ann.get("bbox")
			if not bbox or not isinstance(bbox, list) or len(bbox) != 4:
				resized_annotations.append(ann)
				continue

			# Detect bbox mode: normalized [0..1], pixel, or placeholder/invalid.
			# bool is a subclass of int, so exclude it explicitly.
			is_numeric = all(
				isinstance(v, (int, float)) and not isinstance(v, bool) for v in bbox
			)
			if not is_numeric:
				# Non-numeric entries (None, strings, ...) cannot be scaled.
				mode = "invalid"
			else:
				min_v = float(min(bbox))
				max_v = float(max(bbox))
				if 0.0 <= min_v and max_v <= 1.0:
					mode = "normalized"
				elif max_v <= placeholder_max:
					mode = "invalid"  # likely placeholder, not pixels
				else:
					mode = "pixel"

			# Compute new bbox based on mode
			if mode == "normalized":
				# Convert fractions straight to target pixels, then only clamp/round.
				target_px = [
					bbox[0] * new_width,
					bbox[1] * new_height,
					bbox[2] * new_width,
					bbox[3] * new_height,
				]
				new_bbox = _scale_bbox(target_px, 1.0, 1.0, new_width, new_height)
			elif mode == "pixel":
				new_bbox = _scale_bbox(bbox, sx, sy, new_width, new_height)
			else:
				# Keep placeholder/invalid bbox as-is; cannot scale meaningfully
				new_bbox = bbox

			# Shallow copy so the loaded annotation dict is not mutated.
			new_ann = dict(ann)
			new_ann["bbox"] = new_bbox
			resized_annotations.append(new_ann)

		results[json_base] = resized_annotations

		# Write resized annotations back to the same file by default
		target_json_path = os.path.join(output_json_dir, json_base) if output_json_dir else json_path
		with open(target_json_path, "w", encoding="utf-8") as f_out:
			json.dump(resized_annotations, f_out, indent=2)

	return results

# Rescale every prediction JSON to the 840x840 target size.
# NOTE: no output_json_dir is passed, so the JSON files in json_preds_dir are overwritten
# in place; running this cell a second time rescales the already-rescaled pixel boxes.
out = resize_bboxes_for_images(
	json_preds_dir=json_preds_dir,
	images_dir=images_dir,
	new_width=840,
	new_height=840,
)


In [26]:
def resize_images_in_dir(
	images_dir: str,
	new_width: int,
	new_height: int,
	output_dir: str,
) -> Dict[str, Dict[str, int]]:
	"""
	Resize all images in images_dir to (new_width, new_height) and save into output_dir.

	Files are selected by extension, case-insensitively (so "scan.PNG" is picked up),
	and are saved under their original filename. The aspect ratio is not preserved:
	every image is stretched to exactly (new_width, new_height). Hidden files (leading
	dot) are not considered.

	A file that cannot be opened, resized or saved is skipped with a RuntimeWarning
	naming the file and the error; it does not appear in the result.

	Supported extensions: .png, .jpg, .jpeg, .tif, .tiff, .webp

	Args:
		images_dir: directory containing the source images.
		new_width: target width in pixels.
		new_height: target height in pixels.
		output_dir: directory to write resized images to (created if missing).

	Returns: dict mapping filename -> {"orig_w", "orig_h", "new_w", "new_h"}

	Raises:
		FileNotFoundError: if images_dir is not a directory.
	"""
	import warnings

	if not os.path.isdir(images_dir):
		raise FileNotFoundError(f"images_dir not found: {images_dir}")
	os.makedirs(output_dir, exist_ok=True)

	results: Dict[str, Dict[str, int]] = {}
	exts = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".webp")

	# List the directory once. glob.escape keeps metacharacters in the directory name
	# literal; glob patterns themselves are case-sensitive, so the extension is matched
	# manually on the lowercased path.
	candidates = sorted(glob.glob(os.path.join(glob.escape(images_dir), "*")))
	image_paths: List[str] = []
	for ext in exts:
		# Grouped by extension, sorted by path within each group.
		image_paths.extend(p for p in candidates if p.lower().endswith(ext))

	for image_path in image_paths:
		filename = os.path.basename(image_path)
		try:
			with Image.open(image_path) as im:
				orig_w, orig_h = im.size
				resized_im = im.resize((new_width, new_height), resample=Image.Resampling.LANCZOS)
				resized_im.save(os.path.join(output_dir, filename))
				results[filename] = {"orig_w": orig_w, "orig_h": orig_h, "new_w": new_width, "new_h": new_height}
		except Exception as exc:
			# Best-effort batch: keep going, but report the skip instead of hiding it.
			warnings.warn(f"Skipping {filename}: {exc!r}", RuntimeWarning, stacklevel=2)
			continue

	return results

# Resize every test image to 840x840. Reuses `images_dir` from the configuration cell
# instead of repeating the absolute path; the output directory is the source directory
# name with a "-resized-840" suffix (the same path that was previously hard-coded).
resize_images_in_dir(
    images_dir=images_dir,
    new_width=840,
    new_height=840,
    output_dir=f"{images_dir}-resized-840",
)

{'0a32ce11-7ed9-14ee-8856-6a1edfad9ff3_page2.png': {'orig_w': 612,
  'orig_h': 792,
  'new_w': 840,
  'new_h': 840},
 '456300-sept-17-23-2012-11953-13474707086771-_-pdf_page1.png': {'orig_w': 616,
  'orig_h': 790,
  'new_w': 840,
  'new_h': 840},
 'e038716a-f9a1-2ab5-e72b-eb2f00f26981_page1.png': {'orig_w': 792,
  'orig_h': 612,
  'new_w': 840,
  'new_h': 840},
 'e8d41204-64eb-9f4b-608e-5593933aca41_page1.png': {'orig_w': 612,
  'orig_h': 793,
  'new_w': 840,
  'new_h': 840},
 'e8d41204-64eb-9f4b-608e-5593933aca41_page4.png': {'orig_w': 612,
  'orig_h': 793,
  'new_w': 840,
  'new_h': 840},
 'e910d31f-0c5a-5041-1a4d-1f1b39606992_page1.png': {'orig_w': 792,
  'orig_h': 612,
  'new_w': 840,
  'new_h': 840},
 'eabf486e-2ff6-c060-68b7-fcf5363bde66_page1.png': {'orig_w': 612,
  'orig_h': 792,
  'new_w': 840,
  'new_h': 840},
 'eabf486e-2ff6-c060-68b7-fcf5363bde66_page2.png': {'orig_w': 612,
  'orig_h': 792,
  'new_w': 840,
  'new_h': 840},
 'eabf486e-2ff6-c060-68b7-fcf5363bde66_page3.png': 