In [1]:
# prepare_lora_data.py

import os
import json
from tqdm import tqdm

# ─────────── CONFIG ───────────
SRC_DIR   = "./dataset/processed_dataset"
TOKEN     = "MyPanel"            # the special token your LoRA will learn
OUT_JSON  = "lora_dataset.json"  # output JSON for fine-tuning

CLASS_CAPTIONS = {
    "clean":              "a solar panel with no defects",
    "snow_covered":       "a solar panel covered in fresh snow",
    "dusty":              "a solar panel scattered with dust and dirt",
    "physical_damage":    "a solar panel with physical cracks and chips",
    "bird_drop":          "a solar panel with bird droppings",
    "electrical_damage":  "a solar panel showing electrical burn marks"
}
# ───────────────────────────────

records = []
for cls_folder, base_caption in CLASS_CAPTIONS.items():
    folder = os.path.join(SRC_DIR, cls_folder)
    if not os.path.isdir(folder):
        print(f"⚠️  Skipping missing folder: {folder}")
        continue

    for fn in tqdm(os.listdir(folder), desc=f"Processing {cls_folder}"):
        path = os.path.abspath(os.path.join(folder, fn))
        # guard: only images
        if not fn.lower().endswith((".jpg", ".jpeg", ".png")):
            continue
        caption = f"<{TOKEN}> {base_caption}"
        records.append({
            "image":   path,
            "caption": caption
        })

# write out
with open(OUT_JSON, "w") as f:
    json.dump(records, f, indent=2)

print(f"\n✅ Wrote {len(records)} records to {OUT_JSON}")


Processing clean: 100%|██████████| 191/191 [00:00<00:00, 187658.01it/s]
Processing snow_covered: 100%|██████████| 114/114 [00:00<00:00, 80159.37it/s]
Processing dusty: 100%|██████████| 182/182 [00:00<00:00, 204600.20it/s]
Processing physical_damage: 100%|██████████| 66/66 [00:00<00:00, 123306.93it/s]
Processing bird_drop: 100%|██████████| 201/201 [00:00<00:00, 214408.72it/s]
Processing electrical_damage: 100%|██████████| 90/90 [00:00<00:00, 162081.31it/s]


✅ Wrote 844 records to lora_dataset.json





In [3]:
# in a notebook cell, prefix with !
!pip install --upgrade pip
!pip install diffusers transformers accelerate peft safetensors huggingface-hub datasets sentencepiece protobuf torchvision pillow tqdm

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.2
    Uninstalling pip-24.2:
      Successfully uninstalled pip-24.2
Successfully installed pip-25.1.1
[0mCollecting diffusers
  Downloading diffusers-0.34.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
Collecting accelerate
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.16.0-py3-none-any.whl.metadata (14 kB)
Collecting safetensors
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting huggingface-hub
  Downloading huggingface_hub-0.33.4-py3-none

In [None]:
# 1) Install the HF hub helper (if you haven’t already)
!pip install --quiet huggingface_hub

# 2) Launch the notebook login flow
from huggingface_hub import notebook_login
notebook_login()
# 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

In [9]:
!pip install --upgrade --pre diffusers

[0m

In [5]:
import os
import torch
from datasets import load_dataset
from transformers import CLIPTokenizer
from diffusers import UNet2DConditionModel
from peft import LoraConfig, get_peft_model, TaskType
from torch.utils.data import DataLoader
from accelerate import Accelerator
from torchvision import transforms
from PIL import Image

# ───── CONFIG ─────
MODEL_ID      = "stabilityai/stable-diffusion-3-medium-diffusers"
TRAIN_JSON    = "lora_dataset.json"   # from prepare_lora_data.py
OUTPUT_DIR    = "lora_sd3_panel"
BATCH_SIZE    = 4
EPOCHS        = 3
LEARNING_RATE = 1e-4
LR_WARMUP     = 0.03
MAX_LEN       = 64
SEED          = 42
# ─────────────────

torch.manual_seed(SEED)
accelerator = Accelerator(mixed_precision="fp16")

# 1) Tokenizer & simple transforms
tokenizer = CLIPTokenizer.from_pretrained(MODEL_ID, subfolder="tokenizer")
img_tf    = transforms.Compose([
    transforms.Resize((512,512), transforms.InterpolationMode.BILINEAR),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3),
])

# 2) Load & preprocess JSON dataset
ds = load_dataset("json", data_files=TRAIN_JSON)["train"]

def prep(ex):
    img = Image.open(ex["image"]).convert("RGB")
    ex["pixel_values"] = img_tf(img)
    ids = tokenizer(
        ex["caption"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
        return_tensors="pt"
    )
    ex["input_ids"] = ids.input_ids[0]
    return ex

ds = ds.map(prep, remove_columns=["image","caption"])
ds.set_format(type="torch", columns=["pixel_values","input_ids"])

def collate_fn(batch):
    return {
        "pixel_values": torch.stack([b["pixel_values"] for b in batch]),
        "input_ids":    torch.stack([b["input_ids"]    for b in batch]),
    }

dataloader = DataLoader(
    ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=4,
    pin_memory=True
)

tokenizer_config.json:   0%|          | 0.00/705 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/588 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/844 [00:00<?, ? examples/s]

In [10]:
# 1) Fetch the SD3 UNet from the "transformer" subfolder (now works in dev diffusers)
unet = UNet2DConditionModel.from_pretrained(
    MODEL_ID,
    subfolder="transformer",        # loads config.json + safetensors here
    torch_dtype=torch.float16,
    use_auth_token=True
).to("cuda")

ValueError: At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19.

In [8]:
from diffusers import UNet2DConditionModel
import torch

MODEL_ID = "stabilityai/stable-diffusion-3-medium-diffusers"

# make sure you’ve already done notebook_login() or set use_auth_token
unet = UNet2DConditionModel.from_pretrained(
    MODEL_ID,
    subfolder="transformer",       # ← loads the config.json & safetensors in transformer/
    torch_dtype=torch.float16,
    use_auth_token=True
).to("cuda")

# test it
print(unet)
x = torch.randn(1, unet.in_channels, 64, 64, device="cuda")
out = unet(x, timestep=torch.tensor([10], device="cuda"), encoder_hidden_states=torch.randn(1, unet.config.cross_attention_dim, device="cuda"))
print("OK, UNet runs; output shape:", out.sample.shape)


ValueError: At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19.

In [None]:
# 4) wrap UNet in LoRA
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["to_q","to_k","to_v","to_out"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.IMAGE_TEXT_MATCH
)
unet_lora = get_peft_model(unet, lora_cfg)

# 5) optimizer + scheduler
optimizer = torch.optim.AdamW(unet_lora.parameters(), lr=LR)
total_steps = len(loader) * EPOCHS
scheduler   = torch.optim.lr_scheduler.LinearLR(
    optimizer, start_factor=LR_WARMUP, total_iters=total_steps
)

# 6) prep everything
unet_lora, optimizer, loader, scheduler = accelerator.prepare(
    unet_lora, optimizer, loader, scheduler
)

In [None]:
# generate_sd3_lora_images.py
import os
import torch
from diffusers import StableDiffusion3Pipeline

# ───────────── CONFIG ─────────────
MODEL_ID    = "stabilityai/stable-diffusion-3-medium-diffusers"
LORA_DIR    = "./lora_sd3_panel"    # from train_sd3_lora.py
OUTPUT_ROOT = "./outs_sd3_lora"

# prompts per class
PROMPTS = {
    "snow":  "<MyPanel> covered in fresh snow, photorealistic lighting",
    "dusty": "<MyPanel> scattered with dust and dirt, high detail",
    "crack": "<MyPanel> with fine micro-cracks and chipped cells",
    "cover": "<MyPanel> partially covered by debris, realistic"
}

# how many to generate per class
NUM_IMAGES = {
    "snow":  100,
    "dusty": 100,
    "crack": 100,
    "cover": 100
}

NEG_PROMPT = "lowres, bad anatomy, text"
STEPS      = 30
GUIDANCE   = 1.5
SEED       = 42
# ────────────────────────────────────

def main():
    torch.manual_seed(SEED)
    pipe = StableDiffusion3Pipeline.from_pretrained(
        MODEL_ID, torch_dtype=torch.float16
    ).to("cuda")
    pipe.enable_xformers_memory_efficient_attention()

    # load LoRA adapters into UNet
    pipe.unet.load_attn_procs(LORA_DIR)

    os.makedirs(OUTPUT_ROOT, exist_ok=True)
    for cls, prompt in PROMPTS.items():
        out_dir = os.path.join(OUTPUT_ROOT, cls)
        os.makedirs(out_dir, exist_ok=True)
        for i in range(NUM_IMAGES.get(cls, 0)):
            img = pipe(
                prompt=prompt,
                negative_prompt=NEG_PROMPT,
                num_inference_steps=STEPS,
                guidance_scale=GUIDANCE
            ).images[0]
            fname = f"{cls}_{i:04d}.png"
            img.save(os.path.join(out_dir, fname))
            print(f"✓ Saved {cls}/{fname}")

if __name__ == "__main__":
    main()
