In [None]:
import os
import re

def rename_files_to_frame_format(directory):
    """Rename ``bear_<N>.webp`` files in ``directory`` to ``frame_<NNNNN>.webp``.

    The numeric part is parsed as an integer and zero-padded to five digits, so
    ``bear_7.webp`` and ``bear_07.webp`` both map to ``frame_00007.webp``.
    Matching is case-insensitive; the new name always uses a lowercase
    ``.webp`` extension. One status line is printed per directory entry.

    Args:
        directory: Path of the folder whose entries are renamed in place.

    Returns:
        None.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``os.rename`` (for example
            when ``directory`` does not exist).
    """
    pattern = re.compile(r'bear_(\d+)\.webp$', re.IGNORECASE)

    # Sorted so that the outcome (in particular which file wins when two names
    # collide) does not depend on the arbitrary order of os.listdir.
    for filename in sorted(os.listdir(directory)):
        match = pattern.match(filename)
        if not match:
            print(f"⚠️ Skipping: {filename} (no match)")
            continue

        number = int(match.group(1))
        new_name = f"frame_{number:05d}.webp"
        src = os.path.join(directory, filename)
        dst = os.path.join(directory, new_name)

        # os.rename silently replaces an existing destination on POSIX, which
        # would destroy a file when two sources normalise to the same name.
        if os.path.lexists(dst):
            print(f"⚠️ Skipping: {filename} ({new_name} already exists)")
            continue

        os.rename(src, dst)
        print(f"✅ Renamed: {filename} → {new_name}")

# Normalise the depth-map file names in the Colab folder to frame_XXXXX.webp.
# NOTE: the rename happens in place, so re-running this cell only reports "Skipping" lines.
rename_files_to_frame_format("/content/depth_bear")


✅ Renamed: bear_19.webp → frame_00019.webp
✅ Renamed: bear_51.webp → frame_00051.webp
✅ Renamed: bear_37.webp → frame_00037.webp
✅ Renamed: bear_65.webp → frame_00065.webp
✅ Renamed: bear_75.webp → frame_00075.webp
✅ Renamed: bear_79.webp → frame_00079.webp
✅ Renamed: bear_33.webp → frame_00033.webp
✅ Renamed: bear_85.webp → frame_00085.webp
✅ Renamed: bear_23.webp → frame_00023.webp
✅ Renamed: bear_59.webp → frame_00059.webp
✅ Renamed: bear_01.webp → frame_00001.webp
✅ Renamed: bear_43.webp → frame_00043.webp
✅ Renamed: bear_57.webp → frame_00057.webp
✅ Renamed: bear_13.webp → frame_00013.webp
✅ Renamed: bear_03.webp → frame_00003.webp
✅ Renamed: bear_91.webp → frame_00091.webp
✅ Renamed: bear_73.webp → frame_00073.webp
✅ Renamed: bear_55.webp → frame_00055.webp
✅ Renamed: bear_83.webp → frame_00083.webp
✅ Renamed: bear_21.webp → frame_00021.webp
✅ Renamed: bear_47.webp → frame_00047.webp
✅ Renamed: bear_69.webp → frame_00069.webp
✅ Renamed: bear_35.webp → frame_00035.webp
✅ Renamed: 

In [None]:
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, DDIMScheduler, DDIMInverseScheduler
from PIL import Image
import numpy as np
import torchvision.transforms as T
import torch.nn.functional as F
import os
from pathlib import Path

class CustomControlNet:
    """Prompt-driven image editing that keeps the depth structure of the input.

    Workflow implemented by :meth:`edit_image`:

    1. Load the RGB image, resize it to 512x512 and encode it to VAE latents.
    2. Run the ControlNet pipeline with a ``DDIMInverseScheduler`` and an empty
       prompt to turn those latents into noise.
    3. Run the pipeline again with a ``DDIMScheduler`` and the target prompt,
       starting from that noise.

    Both passes are conditioned on the same disparity map.

    Note:
        The pipeline is loaded with its default components, so the safety
        checker stays active. Going by the diffusers warning, a flagged frame
        comes back as a black image, and it is saved like any other frame.
    """

    def __init__(self, base_model="CompVis/stable-diffusion-v1-4", controlnet_model="lllyasviel/sd-controlnet-depth"):
        """Load the ControlNet, the Stable Diffusion pipeline and both DDIM schedulers.

        Args:
            base_model: Hub id (or local path) of the Stable Diffusion
                checkpoint. Its ``scheduler`` subfolder also provides the
                configuration for both DDIM schedulers.
            controlnet_model: Hub id (or local path) of the ControlNet checkpoint.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load ControlNet model
        controlnet = ControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.float16)

        # Load the pipeline once and reuse it for inversion and generation;
        # only its scheduler is swapped between the two passes.
        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
            base_model,
            controlnet=controlnet,
            torch_dtype=torch.float16
        ).to(self.device)

        # Load schedulers
        self.ddim_inverser = DDIMInverseScheduler.from_pretrained(base_model, subfolder="scheduler")
        self.ddim_scheduler = DDIMScheduler.from_pretrained(base_model, subfolder="scheduler")

        # Save VAE reference for encoding
        self.vae = self.pipe.vae

    def _load_and_preprocess_depth_map(self, image_path, target_size=512):
        """Load a depth map and return it as a normalised 3-channel disparity tensor.

        Args:
            image_path: Path of the depth-map image (read as single-channel "L").
            target_size: Side length of the square output, in pixels.

        Returns:
            A float16 tensor of shape ``[1, 3, target_size, target_size]`` on
            ``self.device``, min-max normalised after the ``1 / depth`` inversion.
        """
        depth_image = Image.open(image_path).convert("L")
        depth_tensor = T.ToTensor()(depth_image).float().unsqueeze(0)  # [1, 1, H, W]

        # Convert to disparity.
        # NOTE(review): a pixel value of exactly 0 becomes 1 / epsilon = 1e6 and
        # then dominates the min-max normalisation below — confirm the depth
        # maps never contain pure-black pixels.
        epsilon = 1e-6
        disparity = 1.0 / (depth_tensor + epsilon)
        disparity -= disparity.min()
        disparity /= (disparity.max() + 1e-8)

        # Resize and repeat channels
        disparity = F.interpolate(disparity, size=(target_size, target_size), mode='bilinear', align_corners=False)
        disparity = disparity.repeat(1, 3, 1, 1)  # [1, 3, H, W]
        # Follow the device chosen in __init__ rather than a hard-coded "cuda",
        # so the conditioning tensor always lives where the pipeline does.
        disparity = disparity.to(self.device).to(torch.float16)

        return disparity

    def _preprocess_input_image(self, image_path):
        """Load an RGB image, resize it to 512x512 and return it as a tensor.

        Returns:
            A float16 tensor of shape ``[1, 3, 512, 512]`` on ``self.device``
            with values in ``[0, 1]``.
        """
        input_image = Image.open(image_path).convert("RGB").resize((512, 512))
        input_image = np.array(input_image).astype(np.float32) / 255.0
        input_tensor = torch.from_numpy(input_image).permute(2, 0, 1).unsqueeze(0)
        input_tensor = input_tensor.to(self.device).to(torch.float16)
        return input_tensor

    def _encode_latents(self, input_tensor):
        """Encode an image tensor in ``[0, 1]`` into scaled VAE latents.

        The input is mapped to ``[-1, 1]`` before encoding. The mean of the
        latent distribution is used, not a sample, so encoding is deterministic.

        Returns:
            The latent mean multiplied by 0.18215, detached from autograd.
        """
        # Inference only: without no_grad the latents would carry an autograd
        # graph and keep the encoder activations alive for the whole edit.
        with torch.no_grad():
            posterior = self.vae.encode(input_tensor * 2 - 1)["latent_dist"]
            # 0.18215 is the latent scaling factor of the Stable Diffusion v1 VAE.
            latents = posterior.mean * 0.18215
        return latents

    def edit_image(self, image_path, depth_map_path, prompt, num_inference_steps=20, guidance_scale=2):
        """Edit one image according to ``prompt`` while following its depth map.

        Args:
            image_path: Path of the RGB input image.
            depth_map_path: Path of the matching depth map.
            prompt: Text prompt used for the regeneration pass.
            num_inference_steps: Number of DDIM steps, shared by the inversion
                and the regeneration pass.
            guidance_scale: Classifier-free guidance scale for the regeneration
                pass. The inversion pass always uses 0.

        Returns:
            The first image of the regeneration pass, requested with
            ``output_type="pil"``.
        """
        # Step 1: Preprocess inputs
        input_tensor = self._preprocess_input_image(image_path)
        latents = self._encode_latents(input_tensor)
        disparity = self._load_and_preprocess_depth_map(depth_map_path)

        # Step 2: Invert image to noise using DDIMInverseScheduler
        self.pipe.scheduler = self.ddim_inverser
        init_noise = self.pipe(
            prompt="",
            num_inference_steps=num_inference_steps,
            latents=latents,
            image=disparity,
            return_dict=False,
            guidance_scale=0,
            output_type="latent"
        )[0]

        # Step 3: Use DDIMScheduler to regenerate image with prompt
        self.pipe.scheduler = self.ddim_scheduler
        edited_image = self.pipe(
            prompt=prompt,
            num_inference_steps=num_inference_steps,
            latents=init_noise,
            image=disparity,
            guidance_scale=guidance_scale,
            controlnet_conditioning_scale=1.0,
            eta=0,
            output_type="pil"
        ).images[0]

        return edited_image

    def edit_bear_images(self, input_dir="bear", depthmap_dir="depth_bear", output_dir="bear_edited", prompt="", **edit_kwargs):
        """Edit every ``.jpg`` in ``input_dir`` that has a same-named ``.webp`` depth map.

        Images and depth maps are paired by file stem. Results are written to
        ``output_dir`` as ``<stem>.png``. A failure on one frame is reported and
        does not stop the batch.

        Args:
            input_dir: Folder with the ``.jpg`` input frames.
            depthmap_dir: Folder with the ``.webp`` depth maps.
            output_dir: Folder for the edited frames; created if missing.
            prompt: Text prompt forwarded to :meth:`edit_image`.
            **edit_kwargs: Extra keyword arguments forwarded to
                :meth:`edit_image` (e.g. ``guidance_scale``).
        """
        input_dir = Path(input_dir)
        depthmap_dir = Path(depthmap_dir)
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # Collect input images and depth maps
        input_images = {f.stem: f for f in input_dir.glob("*.jpg")}
        depth_maps = {f.stem: f for f in depthmap_dir.glob("*.webp")}

        common_stems = input_images.keys() & depth_maps.keys()

        if not common_stems:
            print("⚠️ No matching pairs found between .jpg and .webp files.")
            return

        for stem in sorted(common_stems):
            image_path = input_images[stem]
            depth_path = depth_maps[stem]

            # Deliberately best-effort: report the failing frame and carry on.
            try:
                edited = self.edit_image(str(image_path), str(depth_path), prompt, **edit_kwargs)
                save_path = output_dir / f"{stem}.png"
                edited.save(save_path)
                print(f"✅ Saved: {save_path.name}")
            except Exception as e:
                print(f"❌ Error processing {stem}: {e}")

In [None]:
# Build the editor once and reuse it: the constructor loads the ControlNet and the
# Stable Diffusion pipeline (the first run downloads several GB of weights, see the output below).
editor = CustomControlNet()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/920 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

scheduler_config-checkpoint.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
# Batch-edit every frame that has both a .jpg in input_dir and a same-named .webp depth map.
# NOTE(review): the output folder name ("2.5_neg") matches the guidance_scale=2.5 /
# negative-prompt variant of CustomControlNet defined in a later cell, not the class
# defined above this cell — confirm which definition was live when this ran.
# NOTE(review): output_dir is relative while the zip cell reads /content/polar_bear_2.5_neg,
# so this presumably relies on the working directory being /content — TODO confirm.
editor.edit_bear_images(
    input_dir="/content/bear",
    depthmap_dir="/content/depth_bear",
    prompt="a polar bear ",
    output_dir="polar_bear_2.5_neg"
)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00001.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00003.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00005.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00007.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00009.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00011.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00013.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00015.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00017.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00019.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00021.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00023.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00025.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00027.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00029.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00031.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00033.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00035.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00037.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00039.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00041.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00043.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00045.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00047.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00049.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00051.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00053.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00055.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00057.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00059.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00061.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00063.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00065.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00067.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00069.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00071.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Potential NSFW content was detected in one or more images. A black image will be returned instead. Try again with a different prompt and/or seed.


✅ Saved: frame_00073.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00075.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00077.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00079.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00081.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00083.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00085.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00087.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00089.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00091.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00093.png


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

✅ Saved: frame_00095.png


In [None]:
import shutil
from google.colab import files

# Folder holding the edited frames, and the name of the archive to produce
folder_to_zip = "/content/polar_bear_2.5_neg"
zip_filename = "polar_bear_2.5_neg.zip"

# make_archive appends the ".zip" extension itself, so hand it the bare stem
archive_stem = zip_filename.replace('.zip', '')
shutil.make_archive(archive_stem, 'zip', folder_to_zip)

# Send the freshly written archive to the browser via the Colab helper
files.download(zip_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, DDIMScheduler, DDIMInverseScheduler
from PIL import Image
import numpy as np
import torchvision.transforms as T
import torch.nn.functional as F
from pathlib import Path

class CustomControlNet:
    """Prompt-driven image editing that keeps the depth structure of the input.

    Workflow implemented by :meth:`edit_image`:

    1. Load the RGB image at its original size and encode it to VAE latents.
    2. Run the ControlNet pipeline with a ``DDIMInverseScheduler`` and an empty
       prompt to turn those latents into noise.
    3. Run the pipeline again with a ``DDIMScheduler``, the target prompt and a
       negative prompt, starting from that noise.

    Both passes are conditioned on the same disparity map, which is resized to
    the input image size.

    Note:
        The pipeline is loaded with its default components, so the safety
        checker stays active; a flagged frame may come back as a black image
        and is saved like any other frame.
    """

    # Default negative prompt used by edit_image when the caller does not pass one.
    DEFAULT_NEGATIVE_PROMPT = "extra face, extra eyes, extra nose, face on tail, two faces, distorted anatomy, deformed body, unnatural symmetry, back face, tail looks like face"

    def __init__(self, base_model="CompVis/stable-diffusion-v1-4", controlnet_model="lllyasviel/sd-controlnet-depth"):
        """Load the ControlNet, the Stable Diffusion pipeline and both DDIM schedulers.

        Args:
            base_model: Hub id (or local path) of the Stable Diffusion
                checkpoint. Its ``scheduler`` subfolder also provides the
                configuration for both DDIM schedulers.
            controlnet_model: Hub id (or local path) of the ControlNet checkpoint.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load ControlNet model
        controlnet = ControlNetModel.from_pretrained(controlnet_model, torch_dtype=torch.float16)

        # Load pipeline; only its scheduler is swapped between inversion and generation.
        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
            base_model,
            controlnet=controlnet,
            torch_dtype=torch.float16
        ).to(self.device)

        # Load schedulers
        self.ddim_inverser = DDIMInverseScheduler.from_pretrained(base_model, subfolder="scheduler")
        self.ddim_scheduler = DDIMScheduler.from_pretrained(base_model, subfolder="scheduler")

        self.vae = self.pipe.vae

    def _load_and_preprocess_depth_map(self, depth_path, target_size):
        """Load a depth map, resize it, and return a normalised 3-channel disparity tensor.

        Args:
            depth_path: Path of the depth-map image (read as single-channel "L").
            target_size: ``(width, height)`` passed to PIL's ``resize``.

        Returns:
            A float16 tensor of shape ``[1, 3, height, width]`` on
            ``self.device``, min-max normalised after the ``1 / depth`` inversion.
        """
        depth_image = Image.open(depth_path).convert("L").resize(target_size, Image.BILINEAR)
        depth_tensor = T.ToTensor()(depth_image).float().unsqueeze(0)  # [1, 1, H, W]

        # NOTE(review): a pixel value of exactly 0 becomes 1 / epsilon = 1e6 and
        # then dominates the min-max normalisation below — confirm the depth
        # maps never contain pure-black pixels.
        epsilon = 1e-6
        disparity = 1.0 / (depth_tensor + epsilon)
        disparity -= disparity.min()
        disparity /= (disparity.max() + 1e-8)

        disparity = disparity.repeat(1, 3, 1, 1)  # [1, 3, H, W]
        disparity = disparity.to(self.device).to(torch.float16)

        return disparity

    def _preprocess_input_image(self, image_path):
        """Load an RGB image at its original resolution.

        Returns:
            A tuple ``(input_tensor, original_size)``: a float16 tensor of
            shape ``[1, 3, H, W]`` on ``self.device`` with values in ``[0, 1]``,
            and the PIL size ``(width, height)``.
        """
        # NOTE(review): the image is no longer resized; presumably the frame
        # dimensions must suit the VAE/UNet (e.g. multiples of 8) — TODO confirm.
        image = Image.open(image_path).convert("RGB")
        original_size = image.size  # (width, height)
        input_image = np.array(image).astype(np.float32) / 255.0
        input_tensor = torch.from_numpy(input_image).permute(2, 0, 1).unsqueeze(0).to(self.device).to(torch.float16)
        return input_tensor, original_size

    def _encode_latents(self, input_tensor):
        """Encode an image tensor in ``[0, 1]`` into scaled VAE latents.

        The input is mapped to ``[-1, 1]`` before encoding. The mean of the
        latent distribution is used, not a sample, so encoding is deterministic.

        Returns:
            The latent mean multiplied by 0.18215, detached from autograd.
        """
        # Inference only: without no_grad the latents would carry an autograd
        # graph and keep the encoder activations alive for the whole edit.
        with torch.no_grad():
            posterior = self.vae.encode(input_tensor * 2 - 1)["latent_dist"]
            # 0.18215 is the latent scaling factor of the Stable Diffusion v1 VAE.
            latents = posterior.mean * 0.18215
        return latents

    def edit_image(self, image_path, depth_map_path, prompt, negative_prompt=DEFAULT_NEGATIVE_PROMPT,
                   num_inference_steps=20, guidance_scale=2.5):
        """Edit one image according to ``prompt`` while following its depth map.

        Args:
            image_path: Path of the RGB input image.
            depth_map_path: Path of the matching depth map.
            prompt: Text prompt used for the regeneration pass.
            negative_prompt: Negative prompt for the regeneration pass;
                defaults to ``DEFAULT_NEGATIVE_PROMPT``.
            num_inference_steps: Number of DDIM steps, shared by the inversion
                and the regeneration pass.
            guidance_scale: Classifier-free guidance scale for the regeneration
                pass. The inversion pass always uses 0.

        Returns:
            The first image of the regeneration pass, requested with
            ``output_type="pil"``.
        """
        input_tensor, image_size = self._preprocess_input_image(image_path)
        latents = self._encode_latents(input_tensor)
        disparity = self._load_and_preprocess_depth_map(depth_map_path, target_size=image_size)

        # Step 1: Invert to noise
        self.pipe.scheduler = self.ddim_inverser
        init_noise = self.pipe(
            prompt="",
            num_inference_steps=num_inference_steps,
            latents=latents,
            image=disparity,
            return_dict=False,
            guidance_scale=0,
            output_type="latent"
        )[0]

        # Step 2: Regenerate with prompt
        self.pipe.scheduler = self.ddim_scheduler
        edited_image = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=num_inference_steps,
            latents=init_noise,
            image=disparity,
            guidance_scale=guidance_scale,
            controlnet_conditioning_scale=1.0,
            eta=0,
            output_type="pil"
        ).images[0]

        return edited_image

    def edit_bear_images(self, input_dir="bear", depthmap_dir="depth_bear", output_dir="bear_edited", prompt="", **edit_kwargs):
        """Edit every ``.jpg`` in ``input_dir`` that has a same-named ``.webp`` depth map.

        Images and depth maps are paired by file stem. Results are written to
        ``output_dir`` as ``<stem>.png``. A failure on one frame is reported and
        does not stop the batch.

        Args:
            input_dir: Folder with the ``.jpg`` input frames.
            depthmap_dir: Folder with the ``.webp`` depth maps.
            output_dir: Folder for the edited frames; created if missing.
            prompt: Text prompt forwarded to :meth:`edit_image`.
            **edit_kwargs: Extra keyword arguments forwarded to
                :meth:`edit_image` (e.g. ``negative_prompt``, ``guidance_scale``).
        """
        input_dir = Path(input_dir)
        depthmap_dir = Path(depthmap_dir)
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        input_images = {f.stem: f for f in input_dir.glob("*.jpg")}
        depth_maps = {f.stem: f for f in depthmap_dir.glob("*.webp")}

        common_stems = input_images.keys() & depth_maps.keys()

        if not common_stems:
            print("⚠️ No matching pairs found between .jpg and .webp files.")
            return

        for stem in sorted(common_stems):
            image_path = input_images[stem]
            depth_path = depth_maps[stem]

            # Deliberately best-effort: report the failing frame and carry on.
            try:
                edited = self.edit_image(str(image_path), str(depth_path), prompt, **edit_kwargs)
                save_path = output_dir / f"{stem}.png"
                edited.save(save_path)
                print(f"✅ Saved: {save_path.name}")
            except Exception as e:
                print(f"❌ Error processing {stem}: {e}")
