Setup

In [1]:
!pip -q install --upgrade diffusers==0.30.2 transformers accelerate safetensors opencv-python einops

import os, cv2, numpy as np, torch, hashlib
from PIL import Image
from typing import List, Tuple
from diffusers import (
    ControlNetModel,
    UniPCMultistepScheduler,
    AutoencoderKL,
    StableDiffusionControlNetImg2ImgPipeline,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8 else torch.float16
print('Device:', device, 'dtype:', dtype)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDevice: cuda dtype: torch.bfloat16


In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Loading Pipeline

In [12]:
# ControlNets (Canny + Tile)
canny_cn_id = "lllyasviel/control_v11p_sd15_canny"
tile_cn_id  = "lllyasviel/control_v11f1e_sd15_tile"
canny_cn = ControlNetModel.from_pretrained(canny_cn_id, torch_dtype=dtype)
tile_cn  = ControlNetModel.from_pretrained(tile_cn_id,  torch_dtype=dtype)

# SD1.5 + MultiControlNet in *img2img* mode (IMPORTANT!)
base_id = "runwayml/stable-diffusion-v1-5"
pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
    base_id,
    controlnet=[canny_cn, tile_cn],
    torch_dtype=dtype,
    safety_checker=None,
).to(device)

# --- LoRA: load once, then set a weight ---
LORA_PATH = "/content/drive/MyDrive/outputs/LoRA_v1/at-step00003500.safetensors"  # or "last.safetensors"

pipe.load_lora_weights(LORA_PATH, adapter_name="uavtex")  # adds a named adapter
pipe.set_adapters(["uavtex"], adapter_weights=[1.0])      # 0.4–1.0 is a good sweep


# Sharper VAE (reduces mushiness)
pipe.vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=dtype).to(device)

# Sampler + attention
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
try:
    pipe.enable_xformers_memory_efficient_attention(False)
    pipe.unet.set_default_attn_processor()  # SDPA on A100
except Exception as e:
    print("Attention setup:", e)

# ---- IP-Adapter (style/texture prior) ----
# Model repo: h94/IP-Adapter (hosts sd15 weights)
try:
    pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
    print("IP-Adapter loaded.")
except Exception as e:
    print("IP-Adapter load error (you can still run without it):", e)


An error occurred while trying to fetch lllyasviel/control_v11f1e_sd15_tile: lllyasviel/control_v11f1e_sd15_tile does not appear to have a file named diffusion_pytorch_model.safetensors.
Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead.


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.controlnet.pipeline_controlnet_img2img.StableDiffusionControlNetImg2ImgPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


Attention setup: Refer to https://github.com/facebookresearch/xformers for more information on how to install xformers
IP-Adapter loaded.


In [13]:
# load a stronger style adapter
try:
    pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-plus_sd15.bin")
    print("Loaded: ip-adapter-plus_sd15.bin")
except Exception as e:
    print("Fell back to basic IP-Adapter:", e)
    pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")

Loaded: ip-adapter-plus_sd15.bin


In [14]:
print("Active adapters:", getattr(pipe, "get_active_adapters", lambda: "n/a")())

Active adapters: ['uavtex']


Utils

In [15]:
def to_pil(img: np.ndarray) -> Image.Image:
    if img.dtype != np.uint8:
        img = np.clip(img, 0, 255).astype(np.uint8)
    return Image.fromarray(img)

def main_image(img_bgr: np.ndarray) -> Image.Image:
    rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    return to_pil(rgb)

def canny_map(img_bgr: np.ndarray, sigma=0.33) -> Image.Image:
    g = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    v = max(1, np.median(g))
    lo = int(max(0, (1.0 - sigma) * v))
    hi = int(min(255, (1.0 + sigma) * v))
    edges = cv2.Canny(g, lo, hi)
    if np.mean(edges) < 5:
        edges = cv2.Canny(g, 20, 60)  # fallback
    edges = cv2.cvtColor(edges, cv2.COLOR_GRAY2RGB)
    return to_pil(edges)

def split_into_tiles(img: np.ndarray, tile=1280, overlap=192):
    H, W = img.shape[:2]
    tiles = []
    step = max(1, tile - overlap)
    for y in range(0, H, step):
        for x in range(0, W, step):
            y2 = min(y + tile, H)
            x2 = min(x + tile, W)
            tiles.append((y, x, img[y:y2, x:x2].copy()))
    return tiles, (H, W)

def blend_tiles(tiles_with_pos, canvas_shape, overlap=192):
    H, W = canvas_shape
    out = np.zeros((H, W, 3), dtype=np.float32)
    weight = np.zeros((H, W, 1), dtype=np.float32)
    if not tiles_with_pos:
        return out.astype(np.uint8)
    tile_h, tile_w = tiles_with_pos[0][2].shape[:2]
    y = np.linspace(0, 1, overlap, endpoint=False)
    fade = np.sin((y*np.pi)/2.0)
    for (y0, x0, tile) in tiles_with_pos:
        h, w = tile.shape[:2]
        a = np.ones((h, w, 1), dtype=np.float32)
        if y0 > 0: a[:overlap, :, 0] *= fade[:, None]
        if x0 > 0: a[:, :overlap, 0] *= fade[None, :]
        out[y0:y0+h, x0:x0+w] += tile.astype(np.float32) * a
        weight[y0:y0+h, x0:x0+w] += a
    out = np.divide(out, np.maximum(weight, 1e-6))
    return np.clip(out, 0, 255).astype(np.uint8)

def seed_for_name(name: str, base=0):
    h = int(hashlib.md5(name.encode()).hexdigest(), 16) % (2**31-1)
    return (h + base) % (2**31-1)

# Keep Pass-A structure, inject Pass-B micro-detail only
def fuse_highfreq(base_bgr, detail_bgr, alpha=0.35):
    base = cv2.cvtColor(base_bgr, cv2.COLOR_BGR2LAB)
    det  = cv2.cvtColor(detail_bgr, cv2.COLOR_BGR2LAB)
    Lb, Ab, Bb = cv2.split(base); Ld, Ad, Bd = cv2.split(det)
    hp = cv2.addWeighted(Ld, 1.0, cv2.GaussianBlur(Ld, (0,0), 1.0), -1.0, 0)
    L = cv2.add(Lb, (alpha*hp).astype(Lb.dtype))
    out = cv2.cvtColor(cv2.merge([L, Ab, Bb]), cv2.COLOR_LAB2BGR)
    return out


Params

In [17]:
# === Paths (EDIT THESE) ===
input_dir  = "/content/drive/MyDrive/training_data/ControlNet_Canny_V0_test"         # your upsampled satellite tiles
output_dir = "/content/drive/MyDrive/outputs/sat2uav_ControlNet_CannyTile_IP_LoRA/V0.4"      # where to save results
os.makedirs(output_dir, exist_ok=True)

# Reference UAV nadir image (your Turkey photo)
ip_adapter_ref_path = "/content/drive/MyDrive/training_data/Turkey_UAV_eg/DJI_0856-scaled.jpg"  # put the image here or on Drive

# === Knobs ===
seed = 42
tile_size = 1280    # A100: 1280–1536 OK; reduce if VRAM is tight
overlap   = 192

# Pass A (fidelity, Canny strong; Tile off)
A_steps    = 20
A_cfg      = 3.5
A_strength = 0.20
A_canny    = 1.0
A_tile     = 0.0

# Pass B (texture, Canny + light Tile + IP-Adapter)
B_steps    = 20
B_cfg      = 3.0
B_strength = 0.7
B_canny    = 0.7
B_tile     = 0.15
IPA_scale  = 0.3     # how much to mimic the UAV reference style (0.3–0.8)
fuse_alpha = 0.35    # how much high-frequency we inject from Pass B

prompt_A = "(uavtex:1.3), nadir UAV aerial photo of urban scene in Turkey, realistic colours, natural lighting"
prompt_B = "(uavtex:1.4), nadir UAV aerial photo of urban scene in Turkey, realistic textures, natural lighting, high-resolution, fine details"
negative = "painting, cartoon, abstract, brush strokes, patterned stripes, mosaic, oversharpened halos, noise"

img_exts = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"}


# Load IP-Adapter reference (if provided)
ip_adapter_image = None
if ip_adapter_ref_path and os.path.exists(ip_adapter_ref_path):
    ip_adapter_image = Image.open(ip_adapter_ref_path).convert("RGB")
    try:
        pipe.set_ip_adapter_scale(IPA_scale)
    except Exception as e:
        print("Could not set IP-Adapter scale; continuing:", e)
    print("Using IP-Adapter reference:", ip_adapter_ref_path)
else:
    print("No IP-Adapter reference image found; continuing without it.")


Using IP-Adapter reference: /content/drive/MyDrive/training_data/Turkey_UAV_eg/DJI_0856-scaled.jpg


Two-pass inference Setup

In [18]:
def run_two_pass_for_array(bgr: np.ndarray, name: str):
    # ---- PASS A: Canny strong, Tile 0 ----
    genA = torch.Generator(device=device).manual_seed(seed_for_name(name, base=seed))
    imgA   = main_image(bgr)
    cannyA = canny_map(bgr)
    controls_A = [cannyA, imgA]        # [Canny, Tile] (Tile uses the RGB itself)
    outA = pipe(
        prompt=prompt_A,
        negative_prompt=negative,
        image=imgA,
        control_image=controls_A,
        num_inference_steps=A_steps,
        guidance_scale=A_cfg,
        generator=genA,
        strength=A_strength,
        controlnet_conditioning_scale=[A_canny, A_tile],
        ip_adapter_image=ip_adapter_image if ip_adapter_image is not None else None,
    ).images[0]
    outA_bgr = cv2.cvtColor(np.array(outA), cv2.COLOR_RGB2BGR)

    # boost micro-contrast only for the Tile control
    tmp = cv2.GaussianBlur(outA_bgr, (0,0), 0.8)
    tile_src = cv2.addWeighted(outA_bgr, 1.35, tmp, -0.35, 0)   # mild unsharp
    imgB   = main_image(tile_src)   # for Tile control + base img
    cannyB = canny_map(outA_bgr)    # keep canny from the “clean” A output

    # ---- PASS B: Canny strong + Tile light + (optional) IP-Adapter ----

    genB = torch.Generator(device=device).manual_seed(seed_for_name(name+"_B", base=seed))
    imgB   = main_image(outA_bgr)
    cannyB = canny_map(outA_bgr)
    controls_B = [cannyB, imgB]

    if ip_adapter_image is not None:
      pipe.set_ip_adapter_scale(IPA_scale)
    print("IPA scale:", IPA_scale, "ref:", ip_adapter_image is not None)

    pipe.set_adapters(["uavtex"], adapter_weights=[1.2])   # temporarily for B

    outB = pipe(
        prompt=prompt_B,
        negative_prompt=negative,
        image=imgB,
        control_image=controls_B,
        num_inference_steps=B_steps,
        guidance_scale=B_cfg,
        generator=genB,
        strength=B_strength,
        controlnet_conditioning_scale=[B_canny, B_tile],
        ip_adapter_image=ip_adapter_image if ip_adapter_image is not None else None,
    ).images[0]
    outB_bgr = cv2.cvtColor(np.array(outB), cv2.COLOR_RGB2BGR)

    # High-frequency fusion (inject texture, keep A’s structure)
    fused = fuse_highfreq(outA_bgr, outB_bgr, alpha=fuse_alpha)

    # Gentle finishing
    lab = cv2.cvtColor(fused, cv2.COLOR_BGR2LAB)
    L, A, B = cv2.split(lab)
    L = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)).apply(L)
    fused = cv2.cvtColor(cv2.merge([L,A,B]), cv2.COLOR_LAB2BGR)
    blur = cv2.GaussianBlur(fused, (0,0), 0.4)
    fused = cv2.addWeighted(fused, 1.06, blur, -0.06, 0)
    return fused

def run_single(path: str):
    name = os.path.splitext(os.path.basename(path))[0]
    bgr = cv2.imread(path, cv2.IMREAD_COLOR)
    if bgr is None:
        print("Skip (cannot read):", path); return
    H, W = bgr.shape[:2]
    if max(H, W) <= tile_size:
        out_bgr = run_two_pass_for_array(bgr, name)
    else:
        tiles, canvas_shape = split_into_tiles(bgr, tile=tile_size, overlap=overlap)
        out_tiles = []
        for (y, x, crop) in tiles:
            out_crop = run_two_pass_for_array(crop, f"{name}_{y}_{x}")
            out_tiles.append((y, x, out_crop))
        out_bgr = blend_tiles(out_tiles, canvas_shape, overlap=overlap)
    save_path = os.path.join(output_dir, f"{name}_uav.png")
    cv2.imwrite(save_path, out_bgr)
    print("Saved:", save_path)

def run_folder():
    files = [fn for fn in sorted(os.listdir(input_dir)) if os.path.splitext(fn.lower())[1] in img_exts]
    for fn in files:
        run_single(os.path.join(input_dir, fn))


Running inference

In [19]:
# Example: process just one file first
# run_single("/content/upsampled_tiles/your_tile.png")

# Then batch the folder
run_folder()

  0%|          | 0/4 [00:00<?, ?it/s]

IPA scale: 0.3 ref: True


  0%|          | 0/14 [00:00<?, ?it/s]

Saved: /content/drive/MyDrive/outputs/sat2uav_ControlNet_CannyTile_IP_LoRA/V0.4/015_8_8_8_out_uav.png


In [10]:
from diffusers import StableDiffusionPipeline
sd = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
sd.load_lora_weights(LORA_PATH, adapter_name="uavtex")

for w in [0.0, 0.7, 1.0, 1.3]:
    sd.set_adapters(["uavtex"], adapter_weights=[w])
    im = sd(
        "(uavtex:1.3), nadir UAV photo, red tile roofs, asphalt roads, grassy lawns, high detail",
        num_inference_steps=40, guidance_scale=7.5, generator=torch.Generator(device="cuda").manual_seed(42)
    ).images[0]
    im.save(f"/content/lora_probe_w{w:.1f}.png")


Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

config.json: 0.00B [00:00, ?B/s]

safety_checker/model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/40 [00:00<?, ?it/s]