In [None]:
# Cell 1 — Install & login
!pip uninstall -y diffusers -q
!pip install --upgrade --pre diffusers accelerate peft safetensors transformers huggingface-hub datasets sentencepiece protobuf torchvision pillow tqdm

In [None]:
# Authenticate this session with the Hugging Face Hub.
import huggingface_hub

# Follow the link that is shown and paste your HF token.
huggingface_hub.notebook_login()

In [None]:
# Cell 2 — Download & patch SD3’s transformer folder
from huggingface_hub import snapshot_download
import json, os

transformer_dir = snapshot_download(
    repo_id="stabilityai/stable-diffusion-3-medium-diffusers",
    subfolder="transformer",
    use_auth_token=True
)

cfg_path = os.path.join(transformer_dir, "config.json")
cfg = json.load(open(cfg_path))
cfg.pop("num_attention_heads", None)   # remove broken field
with open(cfg_path, "w") as f:
    json.dump(cfg, f, indent=2)
print("✅ Patched config.json in", transformer_dir)

In [None]:
# Cell 3 — Load UNet and wrap in LoRA
import torch
from diffusers import UNet2DConditionModel
from peft import LoraConfig, get_peft_model

# 1) Load full SD3 UNet into CPU, then move to GPU
unet = UNet2DConditionModel.from_pretrained(
    transformer_dir,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=False,
    device_map=None,
    local_files_only=True
)
unet = unet.to("cuda")

# 2) Configure and attach LoRA
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["to_q","to_k","to_v"],  # only Q/K/V projections
    lora_dropout=0.05,
    bias="none"
)
unet_lora = get_peft_model(unet, lora_cfg)
print("✅ LoRA wrapped. LoRA params:", sum(p.numel() for p in unet_lora.parameters() if p.requires_grad))


In [None]:
# Cell 4 — Fine-tune LoRA on your dataset
import torch, torch.nn.functional as F
from accelerate import Accelerator
from diffusers import StableDiffusion3Pipeline
from torch.utils.data import DataLoader
from transformers import CLIPTokenizer
from datasets import load_dataset
from torchvision import transforms
from PIL import Image
import os

# ───── CONFIG ─────
MODEL_ID    = "stabilityai/stable-diffusion-3-medium-diffusers"
TRAIN_JSON  = "lora_dataset.json"
OUTPUT_DIR  = "lora_sd3_panel"
BATCH_SIZE  = 4
EPOCHS      = 3
LR          = 1e-4
WARMUP      = 0.03
MAX_LEN     = 64
SEED        = 42
# ───────────────────

torch.manual_seed(SEED)
accelerator = Accelerator(mixed_precision="fp16")
device = accelerator.device

# 1) Load pipeline pieces (VAE, scheduler, text encoders).
#    transformer=None: the denoiser is loaded (and LoRA-wrapped) separately, so
#    loading a second full copy here would only waste GPU memory.
#    token=True replaces the deprecated `use_auth_token=True`.
pipe = StableDiffusion3Pipeline.from_pretrained(
    MODEL_ID, transformer=None, torch_dtype=torch.float16, token=True
).to(device)
vae, scheduler, text_encoder = pipe.vae, pipe.scheduler, pipe.text_encoder

tokenizer = CLIPTokenizer.from_pretrained(
    MODEL_ID, subfolder="tokenizer", token=True
)
# Resize to 512×512, convert to a tensor and normalise each channel with
# mean 0.5 / std 0.5.
img_tf = transforms.Compose([
    transforms.Resize((512,512), transforms.InterpolationMode.BILINEAR),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3),
])

# 2) Dataset → DataLoader
ds = load_dataset("json", data_files=TRAIN_JSON)["train"]
def preprocess(ex):
    """Turn one dataset record into model-ready features.

    Reads the image at ``ex["image"]``, converts it to RGB and runs it through
    ``img_tf``; tokenizes ``ex["caption"]`` padded/truncated to ``MAX_LEN``.

    Args:
        ex: record with an ``"image"`` path and a ``"caption"`` string.

    Returns:
        The same record with ``"pixel_values"`` and ``"input_ids"`` added.
    """
    # The context manager closes the underlying file as soon as the pixels have
    # been copied by convert(); otherwise one handle per image stays open until
    # garbage collection.
    with Image.open(ex["image"]) as src:
        img = src.convert("RGB")
    ex["pixel_values"] = img_tf(img)
    ex["input_ids"] = tokenizer(
        ex["caption"],
        padding="max_length", truncation=True, max_length=MAX_LEN,
        return_tensors="pt"
    ).input_ids[0]  # drop the batch dimension added by return_tensors="pt"
    return ex

# Run `preprocess` over every record, dropping the raw path/text columns, then
# expose the two feature columns as torch tensors.
ds = ds.map(preprocess, remove_columns=["image", "caption"])
ds.set_format(type="torch", columns=["pixel_values", "input_ids"])

# Shuffled batches; the collate function stacks each feature across the
# samples of a batch into a single tensor.
loader = DataLoader(
    ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
    collate_fn=lambda samples: {
        key: torch.stack([sample[key] for sample in samples])
        for key in ("pixel_values", "input_ids")
    },
)



In [None]:
# 3) Optimizer & LR scheduler on LoRA params
# Only hand the optimizer the parameters that are actually trained; the frozen
# base weights never receive gradients.
trainable_params = [p for p in unet_lora.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=LR)

total_steps = len(loader) * EPOCHS
# WARMUP is treated as the fraction of training spent warming up (assumption
# based on its name and value — confirm). Previously it was used as LinearLR's
# start_factor with total_iters=total_steps, which ramps the LR over the whole
# run and only reaches LR on the final step.
warmup_steps = max(1, int(WARMUP * total_steps))
lr_scheduler = torch.optim.lr_scheduler.LinearLR(
    optimizer,
    start_factor=1.0 / warmup_steps,  # must lie in (0, 1]
    total_iters=warmup_steps,         # LR stays at LR once warm-up finishes
)

# 4) Prepare for fp16/multi-GPU
unet_lora, optimizer, loader, lr_scheduler = accelerator.prepare(
    unet_lora, optimizer, loader, lr_scheduler
)




In [None]:
# 5) Training loop (SD3 rectified-flow / flow-matching objective)
# Requirements: `unet_lora` wraps the SD3 transformer (SD3Transformer2DModel),
# `scheduler` is the pipeline's FlowMatchEulerDiscreteScheduler on which
# set_timesteps() has not been called, and `pipe` holds the SD3 text encoders.
for epoch in range(1, EPOCHS+1):
    unet_lora.train()
    running_loss = 0.0
    num_batches = 0
    for batch in loader:
        with torch.no_grad():
            # encode to latents — cast pixels to the VAE's dtype (fp16) first,
            # otherwise the fp32 input clashes with the half-precision weights
            pixels = batch["pixel_values"].to(device, dtype=vae.dtype)
            latents = vae.encode(pixels).latent_dist.sample()
            # SD3's VAE uses a shift as well as a scale
            latents = (latents - vae.config.shift_factor) * vae.config.scaling_factor

            # text → embeddings. The transformer needs the joint CLIP+T5
            # sequence embedding AND the pooled CLIP embedding, which
            # encode_prompt() builds from raw text. The batch only carries
            # CLIP token ids, so the captions are recovered by decoding them
            # (the result is the CLIP-normalised caption, truncated to MAX_LEN).
            captions = tokenizer.batch_decode(
                batch["input_ids"].tolist(), skip_special_tokens=True
            )
            prompt_embeds, _, pooled_embeds, _ = pipe.encode_prompt(
                prompt=captions, prompt_2=None, prompt_3=None,
                device=device, do_classifier_free_guidance=False,
            )

        # add noise: flow-matching schedulers have no add_noise(); interpolate
        # x_t = (1 - sigma) * x_0 + sigma * noise with a uniformly drawn step
        noise = torch.randn_like(latents)
        step_idx = torch.randint(
            0, scheduler.config.num_train_timesteps, (latents.shape[0],)
        )  # CPU indices: the scheduler's tables live on the CPU
        timesteps = scheduler.timesteps[step_idx].to(device)
        sigmas = scheduler.sigmas[step_idx].to(device=device, dtype=latents.dtype)
        sigmas = sigmas.view(-1, 1, 1, 1)
        noisy = (1.0 - sigmas) * latents + sigmas * noise

        # transformer forward
        model_pred = unet_lora(
            hidden_states=noisy,
            timestep=timesteps,
            encoder_hidden_states=prompt_embeds,
            pooled_projections=pooled_embeds,
            return_dict=False,
        )[0]
        # the model predicts the velocity (noise - clean latents), not the noise
        target = noise - latents
        loss = F.mse_loss(model_pred.float(), target.float())

        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        running_loss += loss.item()
        num_batches += 1

    # max(..., 1) avoids a ZeroDivisionError when the loader yields no batches
    print(f"Epoch {epoch}/{EPOCHS} — avg loss {running_loss/max(num_batches, 1):.4f}")


In [None]:

# 6) Save adapters
os.makedirs(OUTPUT_DIR, exist_ok=True)
# accelerator.prepare() may have wrapped the model (e.g. for multi-GPU), and
# such a wrapper has no save_pretrained(); unwrap it first. Wait for every
# process to finish training and write from the main process only.
accelerator.wait_for_everyone()
if accelerator.is_main_process:
    accelerator.unwrap_model(unet_lora).save_pretrained(OUTPUT_DIR)
    print("✅ LoRA adapters saved to", OUTPUT_DIR)