In [None]:
import torch
from diffusers import StableDiffusionPipeline

def get_device():
    """Select the torch device to run on and announce the choice on stdout.

    Preference order is CUDA, then Apple MPS, then CPU.

    Returns:
        torch.device: ``cuda``, ``mps`` or ``cpu``.
    """
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        print(f"✅ CUDA device found: {gpu_name}")
        return torch.device("cuda")

    # getattr guards against torch builds that expose no ``torch.backends.mps``.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend and mps_backend.is_available() and mps_backend.is_built():
        print("✅ MPS device found. Using Metal acceleration.")
        return torch.device("mps")

    print("⚠️ No GPU acceleration found. Falling back to CPU.")
    return torch.device("cpu")

# Resolve the compute backend once; everything below runs on it.
DEVICE = get_device()
# To force CPU execution instead, use: DEVICE = torch.device("cpu")

# Build the Stable Diffusion pipeline from the local copy of the weights,
# drop the safety checker, enable attention slicing, then move it to DEVICE.
pipe = StableDiffusionPipeline.from_pretrained(
    "/kaggle/input/stablediffusion5669/tensorflow2/default/1/models/sd-v1-5",
    torch_dtype=torch.float32,
)
pipe.safety_checker = None
pipe.enable_attention_slicing()
pipe = pipe.to(DEVICE)
print(f"✅ Model loaded to {DEVICE}.")

# An empty (or whitespace-only) answer falls back to a stock prompt.
prompt = input("Enter your text prompt: ").strip() or "a futuristic city skyline at sunset"

# Sample one image and write the first result to disk.
out = pipe(prompt, num_inference_steps=50, guidance_scale=7.5)
out.images[0].save("sd_sample.png")
print(f"✅ Saved sd_sample.png using {DEVICE}.")


In [None]:
# !/usr/bin/env python3
# ==== env BEFORE importing torch ====
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:64")

import math, random, time
from pathlib import Path
import numpy as np
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.transforms import InterpolationMode

from transformers import CLIPTextModel, CLIPTokenizer, get_cosine_schedule_with_warmup
from diffusers import (
    AutoencoderKL,
    UNet2DConditionModel,
    ControlNetModel,
    DDPMScheduler,
)
from torch.optim import AdamW

# ----------------- TUNED CONFIG (aim: lower loss, fewer epochs) -----------------
# Input locations for the base Stable Diffusion weights and the training data,
# plus the (relative) folder that checkpoints and the final model are written to.
MODEL_DIR_SD = "/kaggle/input/stablediffusion5669/tensorflow2/default/1/models/sd-v1-5"
DATA_ROOT     = "/kaggle/input/canny-tuples/dataset"
OUTPUT_DIR    = "controlnet_scratch_out"

BATCH_SIZE  = 1
GRAD_ACCUM  = 8                         # micro-batches per optimizer step (effective batch = 8)
EPOCHS      = 5                        # max passes over the data; training ends at EPOCHS or MAX_STEPS, whichever comes first
MAX_STEPS   = 12000                     # optimizer-step budget; also the length of the cosine LR schedule
LR          = 1e-5
WEIGHT_DECAY= 1e-4                      # milder regularization (was 1e-2)
WARMUP_STEPS= 1000                      # LR warmup length, in optimizer steps
IMAGE_SIZE  = 256                       # images and hints are resized to IMAGE_SIZE x IMAGE_SIZE
SAVE_EVERY  = 500                       # checkpoint every N optimizer steps (falsy disables)
SEED        = 42
DROP_HINT_P = 0.0                       # per-batch probability of zeroing the hint; disabled until fidelity is good
OVERFIT_ONE = None                      # e.g., "00001.jpg" to overfit/debug; else None

# Mixed precision (autocast + GradScaler) is only switched on when CUDA is present.
DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"
USE_FP16 = bool(torch.cuda.is_available())
print(f"[STEP] DEVICE={DEVICE} FP16={USE_FP16}")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------------- SEEDING & CUDA BACKENDS -----------------
def seed_everything(s=SEED):
    """Seed the Python, NumPy and torch RNGs with ``s`` (all CUDA devices too, when available)."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(s)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(s)


seed_everything()

# Backend knobs: allow TF32 matmuls on CUDA and turn on cuDNN benchmark mode.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True

# ---------------- DATASET -----------------
class TripletDataset(Dataset):
    """
    Expects:
      root/images/*.jpg|png
      root/canny/*.jpg|png (same filenames)
      root/captions.txt -> "<filename>\t<caption>" per line
    """
    def __init__(self, root: str, image_size: int, overfit_one: str | None = None):
        root = Path(root)
        img_dir = root / "images"
        cn_dir  = root / "canny"
        if not img_dir.exists() or not cn_dir.exists():
            raise FileNotFoundError(f"Expected folders: {img_dir} and {cn_dir}")

        img_files = sorted([p for p in img_dir.iterdir() if p.suffix.lower() in (".png",".jpg",".jpeg")])
        can_files = sorted([p for p in cn_dir.iterdir()  if p.suffix.lower() in (".png",".jpg",".jpeg")])

        # --- robust caption mapping by filename ---
        capf = root / "captions.txt"
        caps_by_name = {}
        if capf.exists():
            with open(capf, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line: 
                        continue
                    if "\t" in line:
                        fn, cap = line.split("\t", 1)
                    else:
                        fn, cap = line, line
                    caps_by_name[fn.strip()] = cap.strip()

        # filter to overfit-one if requested
        if overfit_one is not None:
            img_files = [p for p in img_files if p.name == overfit_one]
            can_files = [p for p in can_files if p.name == overfit_one]
            if len(img_files) == 0:
                raise ValueError(f"OVERFIT_ONE='{overfit_one}' not found in images/")
            # replicate sample to stabilize loader stats
            img_files = img_files * 128
            can_files = can_files * 128

        assert len(img_files) == len(can_files), "image and canny counts differ"

        self.img_files   = img_files
        self.canny_files = can_files
        self.captions    = [caps_by_name.get(p.name, "") for p in self.img_files]

        assert len(self.captions) == len(self.img_files), "captions count mismatch"

        self.tf_img = transforms.Compose([
            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.LANCZOS),
            transforms.ToTensor(),  # [0,1]
        ])
        self.tf_hint = transforms.Compose([
            transforms.Resize((image_size, image_size), interpolation=InterpolationMode.NEAREST),
            transforms.ToTensor(),  # [0,1]
        ])

    def __len__(self): return len(self.img_files)

    def __getitem__(self, i):
        img = Image.open(self.img_files[i]).convert("RGB")
        cn  = Image.open(self.canny_files[i]).convert("L")
        img = self.tf_img(img)                 # [3,H,W]
        cn  = self.tf_hint(cn).repeat(3,1,1)   # [1,H,W] -> [3,H,W]
        return {"image": img, "canny": cn, "prompt": self.captions[i]}

# ---------------- LOAD SD COMPONENTS -----------------
def load_sd(sd_dir: str, device: str):
    """Load the frozen Stable Diffusion components stored under ``sd_dir``.

    The text encoder, VAE and UNet are moved to ``device``, switched to eval
    mode and have gradients disabled on every parameter.

    Args:
        sd_dir: model folder with ``tokenizer``, ``text_encoder``, ``vae`` and ``unet`` subfolders.
        device: target device for the three networks.

    Returns:
        tuple: ``(tokenizer, text_encoder, vae, unet)``.
    """
    base = Path(sd_dir)
    clip_tok = CLIPTokenizer.from_pretrained(str(base / "tokenizer"))
    clip_enc = CLIPTextModel.from_pretrained(str(base / "text_encoder"))
    autoenc = AutoencoderKL.from_pretrained(sd_dir, subfolder="vae")
    denoiser = UNet2DConditionModel.from_pretrained(sd_dir, subfolder="unet")

    for module in (autoenc, denoiser, clip_enc):
        module.to(device).eval()
        # Same effect as clearing requires_grad on each parameter by hand.
        module.requires_grad_(False)

    return clip_tok, clip_enc, autoenc, denoiser

# Frozen SD components; only the ControlNet created below is put in train mode.
tokenizer, text_encoder, vae, unet = load_sd(MODEL_DIR_SD, DEVICE)

# Trainable ControlNet built from the frozen UNet via ControlNetModel.from_unet
# (original note: paper-consistent zero-conv init preserved)
controlnet = ControlNetModel.from_unet(unet).to(DEVICE).train()

# ---- Memory savers: gradient checkpointing + xFormers (optional) ----
# Both are best-effort: any failure is reported and training continues without them.
try:
    controlnet.enable_gradient_checkpointing()
except Exception as e:
    print("[WARN] grad checkpointing not available:", e)

try:
    import xformers  # noqa:F401
    unet.enable_xformers_memory_efficient_attention()
    controlnet.enable_xformers_memory_efficient_attention()
    print("[OK] xFormers attention enabled.")
except Exception as e:
    print("[WARN] xFormers not enabled:", e)

# ---- SD1.5 scheduler params (scaled_linear) ----
# Used in the training loop to add noise to the latents at sampled timesteps.
noise_scheduler = DDPMScheduler(
    num_train_timesteps=1000,
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear"
)

# ---- DataLoader (drop_last keeps shapes stable) ----
# Loading happens in the main process (num_workers=0); batches are reshuffled every epoch.
ds  = TripletDataset(DATA_ROOT, IMAGE_SIZE, overfit_one=OVERFIT_ONE)
dl  = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True,
                 num_workers=0, pin_memory=True, drop_last=True)
print(f"[OK] dataset={len(ds)} batches/epoch={len(dl)} overfit_one={OVERFIT_ONE}")

# ---- Optimizer + Cosine schedule with warmup ----
def make_optimizer(params, lr):
    """Build the optimizer for ``params``: bitsandbytes AdamW8bit when usable, else torch AdamW.

    Both variants use betas=(0.9, 0.999), eps=1e-8 and the module-level
    WEIGHT_DECAY. Any exception while importing or constructing the 8-bit
    optimizer is reported and triggers the torch fallback.

    Args:
        params: parameters (or parameter groups) to optimize.
        lr: learning rate.

    Returns:
        The constructed optimizer instance.
    """
    adam_kwargs = dict(lr=lr, betas=(0.9,0.999), eps=1e-8, weight_decay=WEIGHT_DECAY)
    try:
        import bitsandbytes as bnb  # pip install bitsandbytes
        print("[OK] Using bitsandbytes AdamW8bit")
        return bnb.optim.AdamW8bit(params, **adam_kwargs)
    except Exception as err:
        print("[WARN] 8-bit optimizer not available, using torch AdamW:", err)
        return AdamW(params, **adam_kwargs)

# Optimizer over the ControlNet weights only, AMP grad scaler (a no-op unless
# USE_FP16) and the noise-prediction loss.
opt    = make_optimizer(controlnet.parameters(), LR)
scaler = torch.cuda.amp.GradScaler(enabled=USE_FP16)
mse    = nn.MSELoss()

# total updates = MAX_STEPS, schedule steps move each optimizer step (not every batch)
sched = get_cosine_schedule_with_warmup(
    opt, num_warmup_steps=WARMUP_STEPS, num_training_steps=MAX_STEPS if MAX_STEPS else 12000
)

# ---------------- TRAIN -----------------
global_step = 0     # optimizer updates performed so far
micro_step  = 0     # micro-batches processed so far, counted ACROSS epochs
accum_loss  = 0.0   # sum of the (already /GRAD_ACCUM) micro-batch losses in the current window
start = time.time()
for ep in range(EPOCHS):
    if MAX_STEPS and global_step >= MAX_STEPS: break
    pbar = tqdm(dl, desc=f"Epoch {ep+1}")
    for step, batch in enumerate(pbar):
        if MAX_STEPS and global_step >= MAX_STEPS: break

        img  = batch["image"].to(DEVICE)     # [0,1]
        hint = batch["canny"].to(DEVICE)     # [0,1]
        prm  = batch["prompt"]

        with torch.no_grad():
            # text embeddings (frozen)
            tok_out = tokenizer(prm, padding="max_length", truncation=True, max_length=77, return_tensors="pt")
            txt_ids = tok_out.input_ids.to(DEVICE)
            txt_emb = text_encoder(txt_ids).last_hidden_state

            # normalize to [-1,1] -> VAE -> latents
            img_n = img * 2.0 - 1.0
            lat   = vae.encode(img_n).latent_dist.sample() * 0.18215

        # noise + timesteps
        B  = lat.shape[0]
        ts = torch.randint(0, noise_scheduler.num_train_timesteps, (B,), device=DEVICE).long()
        n  = torch.randn_like(lat)
        nlat = noise_scheduler.add_noise(lat, n, ts)

        # The conditioning image stays in [0,1]. The diffusers ControlNet pipeline
        # feeds the control image without normalization, so rescaling it to [-1,1]
        # here would train on a different input range than inference uses.
        hint_n = hint
        # conditioning dropout (disabled for faster fidelity); zeros == "no edges"
        if DROP_HINT_P > 0.0 and random.random() < DROP_HINT_P:
            hint_n = torch.zeros_like(hint_n)

        with torch.cuda.amp.autocast(enabled=USE_FP16):
            cn_out = controlnet(
                sample=nlat, timestep=ts, encoder_hidden_states=txt_emb,
                controlnet_cond=hint_n, return_dict=True
            )
            pred = unet(
                nlat, ts, encoder_hidden_states=txt_emb,
                down_block_additional_residuals=cn_out.down_block_res_samples,
                mid_block_additional_residual=cn_out.mid_block_res_sample,
                return_dict=True
            ).sample
            # Divide so that the summed gradients of one window equal the mean-loss gradient.
            loss = mse(pred, n) / GRAD_ACCUM

        scaler.scale(loss).backward()
        accum_loss = accum_loss + loss.detach().float()
        micro_step += 1

        # Key the update on the global micro-batch counter rather than the per-epoch
        # index, so every optimizer step sees exactly GRAD_ACCUM micro-batches even
        # when the epoch length is not a multiple of GRAD_ACCUM.
        if micro_step % GRAD_ACCUM == 0:
            scaler.unscale_(opt)   # clip on true (unscaled) gradients
            torch.nn.utils.clip_grad_norm_(controlnet.parameters(), 1.0)
            scaler.step(opt); scaler.update()
            sched.step()
            opt.zero_grad(set_to_none=True)
            global_step += 1

            # Mean loss over the whole accumulation window, not just its last micro-batch.
            step_loss = float(accum_loss)
            accum_loss = 0.0

            pbar.set_postfix({"loss": f"{step_loss:.4f}", "step": global_step})
            if global_step % 10 == 0:
                print(f"[PROGRESS] step {global_step} | loss={step_loss:.4f} | lr={sched.get_last_lr()[0]:.2e}")

            if SAVE_EVERY and global_step % SAVE_EVERY == 0:
                ck = Path(OUTPUT_DIR) / f"ckpt_{global_step}"
                ck.mkdir(parents=True, exist_ok=True)
                # Saved from CPU, then moved back and returned to train mode.
                controlnet.cpu().save_pretrained(str(ck))
                controlnet.to(DEVICE).train()
                print(f"[SAVE] checkpoint -> {ck}")
                torch.cuda.empty_cache()  # help fragmentation after save

        if (step % 50) == 0:
            torch.cuda.empty_cache()

# final save
final_dir = Path(OUTPUT_DIR) / "controlnet_final"
final_dir.mkdir(parents=True, exist_ok=True)
controlnet.cpu().save_pretrained(str(final_dir))
print(f"[DONE] steps={global_step} time={(time.time()-start)/60:.1f} min | saved -> {final_dir}")


In [None]:
import shutil, os
from IPython.display import FileLink

# You can change this if you want to zip an earlier checkpoint
RUN_DIR = os.environ.get("CTRLNET_RUN_DIR", "/kaggle/working/controlnet_scratch_out")
ZIP_OUT = "/kaggle/working/controlnet_scratch_out.zip"

# Nothing to archive without the run folder.
if not os.path.exists(RUN_DIR):
    raise FileNotFoundError(f"[ERR] run folder not found: {RUN_DIR}")

# Remove old zip if exists. Best-effort: a missing file is fine, and any other
# failure is reported instead of being silently swallowed.
try:
    os.remove(ZIP_OUT)
except FileNotFoundError:
    pass
except OSError as e:
    print(f"[WARN] could not remove old archive {ZIP_OUT}: {e}")

# make_archive appends ".zip" itself, so strip only the trailing extension
# (str.replace would also strip a ".zip" occurring elsewhere in the path).
shutil.make_archive(ZIP_OUT.removesuffix(".zip"), "zip", RUN_DIR)
print(f"✅ Zipped -> {ZIP_OUT}")

# Kaggle download link
FileLink(ZIP_OUT)


In [None]:
#!/usr/bin/env python3
"""
StableDiffusionControlNetPipeline inference using trained ControlNet.
Tweaks: match train resolution, moderate CFG, enable xFormers if available.
"""

import os
from pathlib import Path
import torch
from PIL import Image
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import (
    AutoencoderKL,
    UNet2DConditionModel,
    ControlNetModel,
    StableDiffusionControlNetPipeline,
    DDIMScheduler,
)

# ---------- EDIT THESE ----------
# Base SD weights, the trained ControlNet folder to load, the text prompt,
# the control (canny) image to condition on, and the output file name.
MODEL_DIR_SD   = "/kaggle/input/stablediffusion5669/tensorflow2/default/1/models/sd-v1-5"
CHECKPOINT_DIR = "/kaggle/working/controlnet_scratch_out/controlnet_final"
PROMPT         = "eagle with mountain and some grass"     # or the exact caption for the filename you test
CONTROL_IMAGE  = "/kaggle/input/canny-tuples/dataset/canny/00001.jpg"
OUTPUT         = "controlnet_infer_out.png"

IMAGE_SIZE = 256            # must match training size for best fidelity
NUM_INFERENCE_STEPS = 30    # a touch higher for scratch nets
GUIDANCE_SCALE      = 4.0   # lower CFG lets control dominate
SEED = 42
# -------------------------------

# Half precision is used only on CUDA; CPU runs stay in float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype  = torch.float16 if device == "cuda" else torch.float32
torch.manual_seed(SEED)

# --- load SD v1.5 parts ---
tok = CLIPTokenizer.from_pretrained(str(Path(MODEL_DIR_SD) / "tokenizer"))
txt = CLIPTextModel.from_pretrained(str(Path(MODEL_DIR_SD) / "text_encoder"), torch_dtype=dtype).to(device)
vae = AutoencoderKL.from_pretrained(MODEL_DIR_SD, subfolder="vae",  torch_dtype=dtype).to(device)
unet= UNet2DConditionModel.from_pretrained(MODEL_DIR_SD, subfolder="unet", torch_dtype=dtype).to(device)

# --- load ControlNet ---
if not Path(CHECKPOINT_DIR).exists():
    raise FileNotFoundError(f"ControlNet checkpoint not found: {CHECKPOINT_DIR}")
ctrl = ControlNetModel.from_pretrained(CHECKPOINT_DIR, torch_dtype=dtype).to(device)

# --- build pipeline ---
# DDIMScheduler defaults to clip_sample=True, which clamps the predicted clean
# sample to [-1, 1]. Stable Diffusion latents are not confined to that range, so
# clipping degrades the output. The three extra arguments follow the DDIM settings
# used with SD v1.x (clip_sample=False, set_alpha_to_one=False, steps_offset=1).
scheduler = DDIMScheduler(
    beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
    clip_sample=False, set_alpha_to_one=False, steps_offset=1,
)
pipe = StableDiffusionControlNetPipeline(
    unet=unet, controlnet=ctrl, vae=vae, tokenizer=tok, text_encoder=txt,
    scheduler=scheduler, safety_checker=None, feature_extractor=None
).to(device)

# Memory savers; xFormers is optional and only attempted on CUDA.
pipe.enable_attention_slicing()
if device == "cuda":
    try:
        import xformers  # noqa: F401
        pipe.enable_xformers_memory_efficient_attention()
        print("[ATTN] xFormers enabled.")
    except Exception:
        print("[ATTN] xFormers not available, continuing.")
# --- control image ---
def load_control(img_path, size):
    """Open the control image as RGB and resize it to a ``size`` x ``size`` square.

    Nearest-neighbour resampling keeps edge pixels crisp; ``size`` should be the
    resolution used during training.

    Args:
        img_path: path of the control (edge map) image.
        size: output width and height in pixels.

    Returns:
        The resized RGB ``PIL.Image``.
    """
    target = (size, size)
    rgb = Image.open(img_path).convert("RGB")
    return rgb.resize(target, resample=Image.NEAREST)

# Control image resized to the training resolution.
control_img = load_control(CONTROL_IMAGE, IMAGE_SIZE)

# --- run ---
# A dedicated, seeded generator is passed to the pipeline call.
gen = torch.Generator(device=device).manual_seed(SEED)
# Autocast is only active on CUDA; on CPU the context is created disabled.
with torch.autocast(device_type="cuda", enabled=(device=="cuda")):
    out = pipe(
        PROMPT,
        image=control_img,
        num_inference_steps=NUM_INFERENCE_STEPS,
        guidance_scale=GUIDANCE_SCALE,
        generator=gen,
        negative_prompt=None,
    )

# Write the first generated image to OUTPUT.
out.images[0].save(OUTPUT)
print(f"✅ Saved -> {OUTPUT}")
