In [223]:
import json
import xml.etree.ElementTree as ET
from PIL import Image
from pathlib import Path
from sem_meta import SEMMeta

In [224]:
def strip_ns_key(key):
    """
    Drop a leading XML namespace (the '{uri}' prefix) from a tag or key.

    Everything up to and including the last '}' is discarded; a key that
    contains no '}' is returned unchanged.
    E.g., '{http://ns.adobe.com/xap/1.0/}CreateDate' -> 'CreateDate'
    """
    if '}' not in key:
        return key
    # Keep only what follows the final closing brace.
    return key.rsplit('}', 1)[1]

In [225]:
def xml_to_dict(elem):
    """
    Turn an XML element into nested Python data.

    A childless element becomes its stripped text, or None when the text is
    missing or whitespace-only. An element with children becomes a dict keyed
    by namespace-free child tag; a tag that occurs more than once collects its
    values in a list, in document order. Element attributes and the text of
    elements that have children are not included.
    """
    if len(elem) == 0:
        stripped = (elem.text or "").strip()
        return stripped or None

    mapping = {}
    for node in elem:
        converted = xml_to_dict(node)
        name = strip_ns_key(node.tag)
        if name not in mapping:
            mapping[name] = converted
            continue
        # Repeated tag: promote the stored value to a list on the second hit.
        existing = mapping[name]
        if isinstance(existing, list):
            existing.append(converted)
        else:
            mapping[name] = [existing, converted]
    return mapping

In [226]:
def strip_ns(d):
    """
    Return a copy of ``d`` with XML namespaces removed from every dict key.

    Dicts and lists are walked recursively and rebuilt; any other value is
    returned as-is.
    """
    if isinstance(d, dict):
        cleaned = {}
        for name, item in d.items():
            bare = strip_ns_key(name)
            cleaned[bare] = strip_ns(item)
        return cleaned
    if isinstance(d, list):
        return list(map(strip_ns, d))
    return d

In [227]:
def _trim_padding(text):
    """Return ``text`` without leading/trailing whitespace or NUL characters."""
    start, end = 0, len(text)
    while start < end and (text[start] == "\x00" or text[start].isspace()):
        start += 1
    while end > start and (text[end - 1] == "\x00" or text[end - 1].isspace()):
        end -= 1
    return text[start:end]


def parse_value(value):
    """
    Parse a metadata value into a JSON-safe structure.

    Handling, in order:
      * None               -> None
      * tuple / list       -> list with every item parsed recursively
      * bytes              -> decoded as UTF-8 (undecodable bytes dropped),
                              then treated as text
      * text that is XML   -> {root_tag: xml_to_dict(root)}
      * text containing '=' -> {"plain": {key: value}} built from every line
                              that contains '='; lines without '=' are
                              dropped and a repeated key keeps its last value
      * anything else      -> returned unchanged if json.dumps accepts it,
                              otherwise None

    Surrounding whitespace and NUL characters are trimmed before the XML
    attempt and from each parsed key/value. Text that matches neither format
    is returned exactly as received.
    """
    if value is None:
        return None

    # Unpack tuples/lists
    if isinstance(value, (tuple, list)):
        return [parse_value(v) for v in value]

    # Decode bytes; errors="ignore" drops bad bytes rather than raising,
    # so no exception handling is needed here.
    if isinstance(value, bytes):
        value = value.decode("utf-8", errors="ignore")

    if isinstance(value, str):
        # Text taken from image tags may carry NUL terminators/padding or
        # stray whitespace (TIFF ASCII fields are NUL-terminated). Left in
        # place, a trailing NUL or whitespace before an XML declaration makes
        # the XML parse fail, after which the packet would be chopped into
        # bogus key=value pairs below.
        text = _trim_padding(value)

        # Try XML
        # NOTE(review): the text originates from image files; if those can be
        # untrusted, check the XML-vulnerabilities notes in the Python docs.
        try:
            root = ET.fromstring(text)
            return {strip_ns_key(root.tag): xml_to_dict(root)}
        except ET.ParseError:
            pass

        # Try key=value pairs
        if "=" in text:
            kv = {}
            for line in text.splitlines():
                if "=" in line:
                    key, val = line.split("=", 1)
                    kv[_trim_padding(key)] = _trim_padding(val)
            if kv:
                return {"plain": kv}

    # Ensure JSON-safe
    try:
        json.dumps(value)
        return value
    except Exception:
        return None


In [228]:
def convert_meta_to_json(meta):
    """
    Build a JSON-safe dict from a metadata mapping.

    Each key is converted with str(); each value is run through parse_value
    and then strip_ns. If converting a value raises, that entry is stored as
    None so one bad field does not abort the whole conversion.
    """
    converted = {}
    for raw_key, raw_value in meta.items():
        name = str(raw_key)
        try:
            cleaned = strip_ns(parse_value(raw_value))
        except Exception:
            # Best effort: keep the key, blank the value.
            cleaned = None
        converted[name] = cleaned
    return converted

In [229]:
# ===== MAIN LOOP =====
# For every *.tif directly inside input_folder, extract its metadata with
# SEMMeta and write it as <stem>.json into output_folder.
input_folder = Path("TestData")
output_folder = Path("TestData_json4")
output_folder.mkdir(exist_ok=True)

success_count = 0
failures = []

for tif_path in input_folder.glob("*.tif"):
    try:
        with Image.open(tif_path) as im:
            # Only `meta` is converted; `tags` is not used below.
            meta, tags = SEMMeta.ImageMetadata(im)
            json_meta = convert_meta_to_json(meta)

            json_path = output_folder / f"{tif_path.stem}.json"
            with json_path.open("w", encoding="utf-8") as f:
                json.dump(json_meta, f, indent=2)

            print(f"✅ Metadata saved to {json_path}")
            success_count += 1
    except Exception as e:
        # Best effort: report the failure and continue with the next image.
        print(f"❌ Failed to process {tif_path.name}: {e}")
        failures.append(tif_path.name)

# ===== SUMMARY =====
print("\n===== Summary =====")
print(f"✅ Succeeded: {success_count}")
print(f"❌ Failed: {len(failures)}")
if failures:
    print("Failed images: " + ", ".join(failures))

✅ Metadata saved to TestData_json4/tile_sand.json
✅ Metadata saved to TestData_json4/SEM Multi-Detector Image_TLD_1 - SliceImage - 006.json
✅ Metadata saved to TestData_json4/CBS_5kVBD_86pA.json
✅ Metadata saved to TestData_json4/KLE256_05spolirised (1).json
✅ Metadata saved to TestData_json4/SEM Multi-Detector Image_TLD_1 - SliceImage - 007.json
✅ Metadata saved to TestData_json4/SEM Multi-Detector Image_TLD_1 - SliceImage - 005.json
✅ Metadata saved to TestData_json4/SEM Multi-Detector Image_TLD_1 - SliceImage - 011.json
✅ Metadata saved to TestData_json4/Acquire HAADF.json
✅ Metadata saved to TestData_json4/PMal2-2Lw-Ti-law.json
✅ Metadata saved to TestData_json4/SEM Multi-Detector Image_TLD_1 - SliceImage - 010.json
✅ Metadata saved to TestData_json4/SEM Multi-Detector Image_TLD_1 - SliceImage - 004.json
✅ Metadata saved to TestData_json4/highcurrent_013.json
✅ Metadata saved to TestData_json4/Acquire HAADF4.json
✅ Metadata saved to TestData_json4/KLE256_05spolirised.json
✅ Metadat