In [None]:
# Install (run once per fresh runtime)
!pip -q install diffusers datasets transformers accelerate scipy ftfy

import os
import torch
import datasets
import diffusers
import huggingface_hub

from datasets import load_dataset
from torchvision import transforms
from diffusers import DDPMScheduler, UNet2DModel
from torch.nn import functional as F
from google.colab import drive

# Device: use the GPU when PyTorch can see one, otherwise fall back to CPU.
# `device` is reused by every later step in this cell.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Faster GPU math: allow TF32 for CUDA matmuls and cuDNN ops and turn on
# cuDNN benchmark mode. These are global backend flags, set once here.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True

# Suppress warnings: restrict each library's logger to error-level messages.
datasets.logging.set_verbosity_error()
diffusers.logging.set_verbosity_error()
huggingface_hub.logging.set_verbosity_error()

# Mount Drive for persistent saving
drive.mount("/content/drive")

# Single checkpoint file on Drive; it is overwritten on every save below.
drive_ckpt = "/content/drive/MyDrive/a5-animal_latest.pth"

# Dataset (keep small for speed, increase later if you have time)
dataset = load_dataset("ldgravy/Medieval-Bestiary", split="train")
dataset = dataset.select(range(512))  # first 512 examples only; faster than 1024
print("Dataset size:", len(dataset))

# Preprocess: resize to image_size x image_size, random horizontal flip as
# augmentation, convert to a tensor, then normalize each channel with
# mean 0.5 / std 0.5 (i.e. [0, 1] -> [-1, 1]).
image_size = 64
preprocess = transforms.Compose(
    [
        transforms.Resize((image_size, image_size)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ]
)

def transform_batch(examples):
    """Add a ``pixel_values`` list to a batch of examples.

    Every entry of ``examples["image"]`` is converted to RGB and passed through
    the module-level ``preprocess`` pipeline; the results are stored, in order,
    under ``examples["pixel_values"]``. The same (mutated) batch dict is returned.
    """
    pixel_values = []
    for img in examples["image"]:
        rgb = img.convert("RGB")
        pixel_values.append(preprocess(rgb))
    examples["pixel_values"] = pixel_values
    return examples


# Register the function so it is applied whenever examples are accessed.
dataset.set_transform(transform_batch)

class TensorDataset(torch.utils.data.Dataset):
    """Map-style dataset exposing only the ``pixel_values`` entry of each example.

    Wraps an indexable dataset whose items are dicts, so a DataLoader receives
    the ``pixel_values`` value directly instead of the whole example dict.
    """

    def __init__(self, hf_dataset):
        # Keep a reference to the wrapped dataset; nothing is copied.
        self.hf_dataset = hf_dataset

    def __getitem__(self, idx):
        example = self.hf_dataset[idx]
        return example["pixel_values"]

    def __len__(self):
        return len(self.hf_dataset)

# Wrap the HF dataset so the loader yields the preprocessed tensors directly.
tensor_dataset = TensorDataset(dataset)

# DataLoader tuning
batch_size = 64 if device == "cuda" else 16  # bump if stable, lower if OOM
train_dataloader = torch.utils.data.DataLoader(
    tensor_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=(device == "cuda"),  # only pin host memory when training on GPU
    persistent_workers=True,  # keep the worker processes alive between epochs
)

# Model: unconditional UNet for 3-channel image_size x image_size images.
# The sampling/resume cells rebuild the model with these exact arguments so
# the saved state_dict can be loaded; keep them in sync.
model = UNet2DModel(
    sample_size=image_size,
    in_channels=3,
    out_channels=3,
    layers_per_block=2,
    block_out_channels=(128, 256, 256, 512),
    down_block_types=("DownBlock2D","DownBlock2D","AttnDownBlock2D","AttnDownBlock2D"),
    up_block_types=("AttnUpBlock2D","AttnUpBlock2D","UpBlock2D","UpBlock2D"),
).to(device)

# Resume if checkpoint exists.
# NOTE: the checkpoint written by this notebook holds only the model weights
# (no epoch counter, no optimizer state), so start_epoch stays 0 and a resumed
# run still trains for the full num_epochs with a fresh optimizer.
start_epoch = 0
if os.path.exists(drive_ckpt):
    # weights_only=True limits unpickling to tensors and plain containers, so a
    # tampered or corrupted file cannot execute arbitrary code while loading.
    state_dict = torch.load(drive_ckpt, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    print("Resumed from:", drive_ckpt)
else:
    print("No checkpoint found, training from scratch.")

# Noise schedule used to corrupt images during training, and the optimizer.
noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Mixed precision (GPU only) - big speedup
use_amp = (device == "cuda")
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

# Train (AMP enabled if on GPU)
num_epochs = 15

for epoch in range(start_epoch, num_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        clean_images = batch.to(device, non_blocking=True)
        # Target noise, same shape as the image batch.
        noise = torch.randn_like(clean_images)
        # One random timestep in [0, num_train_timesteps) per image in the batch.
        timesteps = torch.randint(
            0,
            noise_scheduler.config.num_train_timesteps,
            (clean_images.size(0),),
            device=device,
        ).long()

        # Forward diffusion: noise each clean image at its sampled timestep.
        noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)

        optimizer.zero_grad(set_to_none=True)

        # The model predicts the added noise; the loss is the MSE against it.
        with torch.cuda.amp.autocast(enabled=use_amp):
            noise_pred = model(noisy_images, timesteps).sample
            loss = F.mse_loss(noise_pred, noise)

        # Backward/step go through the GradScaler (disabled when not on GPU).
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # With 512 images and a batch size of 64 (or 16 on CPU) an epoch has
        # fewer than 100 steps, so this only prints at step 0.
        if step % 100 == 0:
            print(f"Epoch {epoch} Step {step} Loss {loss.item():.4f}")

    # Save every epoch to Drive (weights only; overwrites the previous file)
    torch.save(model.state_dict(), drive_ckpt)
    print("Saved to Drive:", drive_ckpt)

print("Done. Final checkpoint:", drive_ckpt)


Extra training (needed based on test renders)

In [None]:
# Continue training more epochs
# NOTE: this cell depends on state left behind by the training cell above:
# model, optimizer, noise_scheduler, train_dataloader, device, F and drive_ckpt
# must already exist in the kernel. It does not run on a fresh kernel.

extra_epochs = 50

for e in range(extra_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        clean_images = batch.to(device, non_blocking=True)
        # Target noise, same shape as the image batch.
        noise = torch.randn_like(clean_images)
        # One random timestep in [0, num_train_timesteps) per image in the batch.
        timesteps = torch.randint(
            0,
            noise_scheduler.config.num_train_timesteps,
            (clean_images.size(0),),
            device=device,
        ).long()

        # Forward diffusion: noise each clean image at its sampled timestep.
        noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)

        optimizer.zero_grad(set_to_none=True)

        # Unlike the first training loop, this one uses neither autocast nor
        # the GradScaler: forward and backward run without mixed precision.
        noise_pred = model(noisy_images, timesteps).sample
        loss = F.mse_loss(noise_pred, noise)

        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            print(f"Extra epoch {e} Step {step} Loss {loss.item():.4f}")

    # Overwrite the Drive checkpoint (weights only) after every extra epoch.
    torch.save(model.state_dict(), drive_ckpt)
    print("Saved to Drive:", drive_ckpt)

print("Extra training done.")


Extra epoch 0 Step 0 Loss 0.0397
Saved to Drive: /content/drive/MyDrive/a5-animal_latest.pth
Extra epoch 1 Step 0 Loss 0.0412
Saved to Drive: /content/drive/MyDrive/a5-animal_latest.pth
Extra epoch 2 Step 0 Loss 0.0649
Saved to Drive: /content/drive/MyDrive/a5-animal_latest.pth
Extra epoch 3 Step 0 Loss 0.0521
Saved to Drive: /content/drive/MyDrive/a5-animal_latest.pth
Extra epoch 4 Step 0 Loss 0.0354
Saved to Drive: /content/drive/MyDrive/a5-animal_latest.pth
Extra epoch 5 Step 0 Loss 0.0696
Saved to Drive: /content/drive/MyDrive/a5-animal_latest.pth
Extra epoch 6 Step 0 Loss 0.0580
Saved to Drive: /content/drive/MyDrive/a5-animal_latest.pth
Extra epoch 7 Step 0 Loss 0.0470
Saved to Drive: /content/drive/MyDrive/a5-animal_latest.pth
Extra epoch 8 Step 0 Loss 0.0373
Saved to Drive: /content/drive/MyDrive/a5-animal_latest.pth
Extra epoch 9 Step 0 Loss 0.0684
Saved to Drive: /content/drive/MyDrive/a5-animal_latest.pth
Extra epoch 10 Step 0 Loss 0.0617
Saved to Drive: /content/drive/MyDri

In [None]:
# Continue training from Drive checkpoint with MORE updates (no restart-from-scratch)

import os
import torch
from google.colab import drive
from datasets import load_dataset
from torchvision import transforms
from diffusers import DDPMScheduler, UNet2DModel
from torch.nn import functional as F

# Device: use the GPU when PyTorch can see one, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Drive: this cell only continues training, so the checkpoint must already exist.
drive.mount("/content/drive")
drive_ckpt = "/content/drive/MyDrive/a5-animal_latest.pth"
assert os.path.exists(drive_ckpt), "Checkpoint not found on Drive"

# Dataset (use more than 512)
dataset = load_dataset("ldgravy/Medieval-Bestiary", split="train")
dataset = dataset.select(range(1024))  # you can also use the full dataset (remove this line)
print("Dataset size:", len(dataset))

# Preprocess (same as before): resize, random horizontal flip, to tensor,
# then normalize each channel with mean 0.5 / std 0.5 ([0, 1] -> [-1, 1]).
image_size = 64
preprocess = transforms.Compose(
    [
        transforms.Resize((image_size, image_size)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ]
)

def transform_batch(examples):
    """Attach preprocessed tensors to a batch under ``pixel_values``.

    Each item of ``examples["image"]`` is converted to RGB and run through the
    module-level ``preprocess`` pipeline, one image at a time and in order.
    Returns the same batch dict with the new key added.
    """
    rgb_images = (img.convert("RGB") for img in examples["image"])
    examples["pixel_values"] = [preprocess(rgb) for rgb in rgb_images]
    return examples


# Apply the transform whenever examples are read from the dataset.
dataset.set_transform(transform_batch)

class TensorDataset(torch.utils.data.Dataset):
    """Thin adapter returning ``example["pixel_values"]`` for each index.

    Length and indexing are delegated to the wrapped dataset.
    """

    def __init__(self, hf_dataset):
        # Reference to the wrapped, dict-yielding dataset.
        self.hf_dataset = hf_dataset

    def __len__(self):
        wrapped = self.hf_dataset
        return len(wrapped)

    def __getitem__(self, idx):
        record = self.hf_dataset[idx]
        return record["pixel_values"]

# Wrap the HF dataset so the loader yields the preprocessed tensors directly.
tensor_dataset = TensorDataset(dataset)

# Smaller batch size = more optimizer updates per epoch (key)
# (1024 images / 16 per batch = 64 updates per pass over the data)
batch_size = 16
train_dataloader = torch.utils.data.DataLoader(
    tensor_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=(device == "cuda"),  # only pin host memory when training on GPU
)

# Model (same architecture as the run that wrote the checkpoint, so the saved
# state_dict keys and shapes match)
model = UNet2DModel(
    sample_size=image_size,
    in_channels=3,
    out_channels=3,
    layers_per_block=2,
    block_out_channels=(128, 256, 256, 512),
    down_block_types=("DownBlock2D","DownBlock2D","AttnDownBlock2D","AttnDownBlock2D"),
    up_block_types=("AttnUpBlock2D","AttnUpBlock2D","UpBlock2D","UpBlock2D"),
).to(device)

# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered or corrupted file cannot execute arbitrary code while loading.
state_dict = torch.load(drive_ckpt, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
print("Loaded checkpoint:", drive_ckpt)

# Noise schedule for training and a fresh optimizer (the checkpoint holds only
# model weights, so no optimizer state is restored).
noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Train by number of updates
target_updates = 5000     # increase to 10000 if needed
save_every = 500          # write a checkpoint every this many updates

# Counts optimizer updates performed by THIS cell (always starts at 0).
global_step = 0
model.train()

# Outer loop restarts the dataloader until enough updates have been made.
while global_step < target_updates:
    for batch in train_dataloader:
        clean_images = batch.to(device, non_blocking=True)
        # Target noise, same shape as the image batch.
        noise = torch.randn_like(clean_images)
        # One random timestep in [0, num_train_timesteps) per image in the batch.
        timesteps = torch.randint(
            0, noise_scheduler.config.num_train_timesteps,
            (clean_images.size(0),),
            device=device
        ).long()

        # Forward diffusion: noise each clean image at its sampled timestep.
        noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)

        optimizer.zero_grad(set_to_none=True)
        # The model predicts the added noise; the loss is the MSE against it.
        noise_pred = model(noisy_images, timesteps).sample
        loss = F.mse_loss(noise_pred, noise)

        loss.backward()
        optimizer.step()

        global_step += 1

        if global_step % 100 == 0:
            print("update", global_step, "loss", float(loss.item()))

        # Periodic save; overwrites the single checkpoint file on Drive.
        if global_step % save_every == 0:
            torch.save(model.state_dict(), drive_ckpt)
            print("Saved:", drive_ckpt)

        # Stop mid-epoch once the target is reached (the while test ends the outer loop).
        if global_step >= target_updates:
            break

# Final save after the last update.
torch.save(model.state_dict(), drive_ckpt)
print("Done. Saved:", drive_ckpt)


In [None]:
# Continue training from Drive checkpoint with many more optimizer updates

import os
import torch
from google.colab import drive
from datasets import load_dataset
from torchvision import transforms
from diffusers import DDPMScheduler, UNet2DModel
from torch.nn import functional as F

# Device: use the GPU when PyTorch can see one, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Drive: this cell only continues training, so the checkpoint must already exist.
drive.mount("/content/drive")
drive_ckpt = "/content/drive/MyDrive/a5-animal_latest.pth"
assert os.path.exists(drive_ckpt), "Checkpoint not found: " + drive_ckpt

# Use more data
dataset = load_dataset("ldgravy/Medieval-Bestiary", split="train")
dataset = dataset.select(range(1024))  # first 1024 examples
print("Dataset size:", len(dataset))

# Preprocess: resize, convert to tensor, normalize to [-1, 1].
# NOTE: unlike the earlier training cells there is no RandomHorizontalFlip here.
image_size = 64
preprocess = transforms.Compose(
    [
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ]
)

def transform_batch(examples):
    """Populate ``examples["pixel_values"]`` from ``examples["image"]``.

    Images are converted to RGB and passed through the module-level
    ``preprocess`` pipeline in their original order. The batch dict is
    modified in place and returned.
    """
    def _to_tensor(img):
        return preprocess(img.convert("RGB"))

    examples["pixel_values"] = list(map(_to_tensor, examples["image"]))
    return examples


# Apply the transform whenever examples are read from the dataset.
dataset.set_transform(transform_batch)

class TensorDataset(torch.utils.data.Dataset):
    """Dataset view that yields just the ``pixel_values`` field of each example."""

    def __init__(self, hf_dataset):
        # The wrapped dataset is stored by reference and indexed lazily.
        self.hf_dataset = hf_dataset

    def __getitem__(self, idx):
        item = self.hf_dataset[idx]
        pixel_values = item["pixel_values"]
        return pixel_values

    def __len__(self):
        return len(self.hf_dataset)

# Wrap the HF dataset so the loader yields the preprocessed tensors directly.
tensor_dataset = TensorDataset(dataset)

#  smaller batch size -> more steps -> more learning
# (1024 images / 16 per batch = 64 updates per pass over the data)
batch_size = 16
train_dataloader = torch.utils.data.DataLoader(
    tensor_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=(device == "cuda"),  # only pin host memory when training on GPU
)

# Model (must match the architecture that wrote the checkpoint, so the saved
# state_dict keys and shapes line up)
model = UNet2DModel(
    sample_size=image_size,
    in_channels=3,
    out_channels=3,
    layers_per_block=2,
    block_out_channels=(128, 256, 256, 512),
    down_block_types=("DownBlock2D","DownBlock2D","AttnDownBlock2D","AttnDownBlock2D"),
    up_block_types=("AttnUpBlock2D","AttnUpBlock2D","UpBlock2D","UpBlock2D"),
).to(device)

# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered or corrupted file cannot execute arbitrary code while loading.
state_dict = torch.load(drive_ckpt, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
print("Loaded checkpoint:", drive_ckpt)

# Noise schedule for training and a fresh optimizer (the checkpoint holds only
# model weights, so no optimizer state is restored).
noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Train by updates
extra_updates = 10000  # number of optimizer updates to run in this cell
save_every = 500       # write a checkpoint every this many updates

# Counts optimizer updates performed by THIS cell (always starts at 0).
global_step = 0
model.train()

# Outer loop restarts the dataloader until enough updates have been made.
while global_step < extra_updates:
    for batch in train_dataloader:
        clean_images = batch.to(device, non_blocking=True)
        # Target noise, same shape as the image batch.
        noise = torch.randn_like(clean_images)
        # One random timestep in [0, num_train_timesteps) per image in the batch.
        timesteps = torch.randint(
            0, noise_scheduler.config.num_train_timesteps,
            (clean_images.size(0),),
            device=device
        ).long()

        # Forward diffusion: noise each clean image at its sampled timestep.
        noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)

        optimizer.zero_grad(set_to_none=True)
        # The model predicts the added noise; the loss is the MSE against it.
        noise_pred = model(noisy_images, timesteps).sample
        loss = F.mse_loss(noise_pred, noise)

        loss.backward()
        optimizer.step()

        global_step += 1

        if global_step % 100 == 0:
            print("update", global_step, "loss", float(loss.item()))

        # Periodic save; overwrites the single checkpoint file on Drive.
        if global_step % save_every == 0:
            torch.save(model.state_dict(), drive_ckpt)
            print("Saved:", drive_ckpt)

        # Stop mid-epoch once the target is reached (the while test ends the outer loop).
        if global_step >= extra_updates:
            break

# Final save after the last update.
torch.save(model.state_dict(), drive_ckpt)
print("Done. Saved:", drive_ckpt)


Updated cell to save images and zip them: switched to CPU due to rate limits

In [None]:
# Generate 50 images from checkpoint and zip them

import os, shutil, torch
from PIL import Image
from google.colab import drive
from diffusers import UNet2DModel, DDPMPipeline, DDIMScheduler

# --- device ---
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# --- drive + ckpt ---
drive.mount("/content/drive")
ckpt = "/content/drive/MyDrive/a5-animal_latest.pth"
assert os.path.exists(ckpt), f"Missing checkpoint: {ckpt}"

# --- model (same architecture as in training, so the state_dict matches) ---
image_size = 64
model = UNet2DModel(
    sample_size=image_size,
    in_channels=3,
    out_channels=3,
    layers_per_block=2,
    block_out_channels=(128, 256, 256, 512),
    down_block_types=("DownBlock2D","DownBlock2D","AttnDownBlock2D","AttnDownBlock2D"),
    up_block_types=("AttnUpBlock2D","AttnUpBlock2D","UpBlock2D","UpBlock2D"),
).to(device)

# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered or corrupted file cannot execute arbitrary code while loading.
state_dict = torch.load(ckpt, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()
print("Loaded:", ckpt)

# --- pipeline: DDIM sampling (faster/cleaner than pure DDPM sampling) ---
# The DDIM scheduler is plugged into DDPMPipeline, which drives the sampling loop.
scheduler = DDIMScheduler(num_train_timesteps=1000)
pipe = DDPMPipeline(unet=model, scheduler=scheduler).to(device)

# Local (non-Drive) output folder for the individual PNGs.
out_dir = "/content/animals_out"
os.makedirs(out_dir, exist_ok=True)

num_images = 50  # total images to generate
batch = 10       # images per pipeline call
steps = 200 if device=="cuda" else 50  # fewer inference steps on CPU

# Generate in chunks of `batch` until num_images files have been written.
idx = 0
while idx < num_images:
    cur = min(batch, num_images - idx)  # size of this chunk (last one may be smaller)
    # One generator per image, seeded 1000 + global image index, so each image
    # has its own fixed seed.
    gens = [torch.Generator(device=device).manual_seed(1000 + idx + i) for i in range(cur)]
    imgs = pipe(batch_size=cur, num_inference_steps=steps, generator=gens).images
    for i, im in enumerate(imgs):
        # File names use the zero-padded global index: animal_000.png, animal_001.png, ...
        im.save(os.path.join(out_dir, f"animal_{idx+i:03d}.png"))
    idx += cur
    print("saved", idx, "/", num_images)

# zip + copy to drive
# make_archive is given the base name "/content/animals_out" and format "zip",
# and returns the path of the archive it created from out_dir.
zip_path = shutil.make_archive("/content/animals_out", "zip", out_dir)
drive_dest = "/content/drive/MyDrive/animals_out.zip"
# Copy the archive into the mounted Drive so it outlives the runtime.
shutil.copy2(zip_path, drive_dest)

print("ZIP saved to:", drive_dest)


# Cells below are unused, as I switched to running on CPU

In [None]:
import torch
# Quick check of whether PyTorch can see a CUDA device in this runtime.
print(f"cuda available: {torch.cuda.is_available()}")


An example of why I added extra training

In [None]:
# Cell: Test image generation (fixed seed, DDIM sampler, loads from Drive)

import torch
import matplotlib.pyplot as plt
from google.colab import drive
from diffusers import DDPMPipeline, DDIMScheduler, UNet2DModel

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# Drive
drive.mount("/content/drive")
ckpt = "/content/drive/MyDrive/a5-animal_latest.pth"

# Model (same architecture as in training, so the state_dict matches)
image_size = 64
model = UNet2DModel(
    sample_size=image_size,
    in_channels=3,
    out_channels=3,
    layers_per_block=2,
    block_out_channels=(128, 256, 256, 512),
    down_block_types=("DownBlock2D", "DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
    up_block_types=("AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D", "UpBlock2D"),
).to(device)

# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered or corrupted file cannot execute arbitrary code while loading.
state_dict = torch.load(ckpt, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()

# DDIM scheduler for sampling, plugged into DDPMPipeline which runs the loop.
scheduler = DDIMScheduler(num_train_timesteps=1000)
pipe = DDPMPipeline(unet=model, scheduler=scheduler).to(device)

# Fixed seed so results are comparable run to run
g = torch.Generator(device=device).manual_seed(123)

# Generate images: one batch of 8 with 100 inference steps.
images = pipe(batch_size=8, num_inference_steps=100, generator=g).images

# Show grid: a single row of 8 panels (must match batch_size above).
plt.figure(figsize=(12, 3))
for i, img in enumerate(images):
    plt.subplot(1, 8, i + 1)  # subplot indices are 1-based
    plt.imshow(img)
    plt.axis("off")
plt.show()


In [None]:
import os, time
# Report whether the Drive checkpoint exists and, if so, its size and
# last-modified time. The size/mtime calls raise FileNotFoundError on a missing
# file, so they are only made once existence has been confirmed.
ckpt = "/content/drive/MyDrive/a5-animal_latest.pth"
ckpt_exists = os.path.exists(ckpt)
print("exists:", ckpt_exists)
if ckpt_exists:
    print("size:", os.path.getsize(ckpt), "bytes")
    print("modified:", time.ctime(os.path.getmtime(ckpt)))
else:
    print("Checkpoint not found; is Drive mounted?")


In [None]:
import torch
import matplotlib.pyplot as plt
from diffusers import DDPMScheduler

# NOTE: this cell relies on `dataset`, `model` and `device` left in the kernel
# by an earlier cell.

# 1. Take one real image from the dataset (already transformed) and add a
#    leading batch dimension.
sample = dataset[0]["pixel_values"].unsqueeze(0).to(device)

# 2. Create a fresh scheduler just for this check (no set_timesteps here)
check_scheduler = DDPMScheduler(num_train_timesteps=1000)

# 3. Timestep to test at; kept as a device tensor for add_noise and the model.
t = torch.tensor([500], device=device, dtype=torch.long)

# 4. Add noise to the real image
noise = torch.randn_like(sample)
noisy = check_scheduler.add_noise(sample, noise, t)

with torch.no_grad():
    # 5. Predict noise
    noise_pred = model(noisy, t).sample
    # 6. One reverse step. The scheduler's `timestep` argument is documented as
    #    an int, so hand it a plain Python int rather than a device tensor.
    denoised = check_scheduler.step(noise_pred, int(t.item()), noisy).prev_sample

def denorm(x):
    """Turn the first image of a normalized batch into a displayable array.

    Takes element 0 of ``x``, moves it to CPU, maps values from [-1, 1] to
    [0, 1], clamps to that range, and returns it as a NumPy array with the
    first axis moved last (axes permuted ``(1, 2, 0)``).
    """
    first = x[0].detach().cpu()
    rescaled = first * 0.5 + 0.5  # [-1, 1] -> [0, 1]
    channels_last = rescaled.clamp(0, 1).permute(1, 2, 0)
    return channels_last.numpy()

# Convert the three tensors to displayable arrays.
orig_img = denorm(sample)
noisy_img = denorm(noisy)
denoised_img = denorm(denoised)

# Show original, noised and one-step-denoised images side by side.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
panels = [
    (orig_img, "Original"),
    (noisy_img, "Noisy (t=500)"),
    (denoised_img, "One-step denoised"),
]
for ax, (panel_img, panel_title) in zip(axes, panels):
    ax.imshow(panel_img)
    ax.set_title(panel_title)
    ax.axis("off")
plt.show()


In [None]:
# Report how many examples the current `dataset` holds.
print("Length of the dataset:", len(dataset))

# Visualize some images from the dataset
import matplotlib.pyplot as plt

def visualize_images(dataset, num_images=5):
    """Show the first ``num_images`` preprocessed images of ``dataset`` in one row.

    Args:
        dataset: Indexable dataset whose items provide a ``"pixel_values"``
            tensor; it is permuted ``(1, 2, 0)`` for display and assumed to be
            normalized to [-1, 1].
        num_images: Number of images (taken from index 0 upwards) to display.
    """
    # squeeze=False always returns a 2-D array of Axes, so num_images=1 works
    # too (otherwise plt.subplots returns a bare Axes that cannot be indexed).
    fig, axes = plt.subplots(1, num_images, figsize=(15, 3), squeeze=False)
    for i, ax in enumerate(axes[0]):
        # Get an image from the dataset
        image = dataset[i]["pixel_values"]  # Access the preprocessed tensor
        # Convert tensor to numpy and denormalize
        image = image.permute(1, 2, 0).cpu().numpy()  # Change from (C, H, W) to (H, W, C)
        image = (image * 0.5) + 0.5  # Denormalize from [-1, 1] to [0, 1]
        ax.imshow(image)
        ax.axis("off")
    plt.show()

# Visualize the first 8 images
visualize_images(dataset, num_images=8)

In [None]:
import torch
import matplotlib.pyplot as plt
from google.colab import drive
from diffusers import DDPMScheduler, UNet2DModel

# Device: use the GPU when PyTorch can see one, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

drive.mount("/content/drive")
ckpt_path = "/content/drive/MyDrive/a5-animal_latest.pth"

# Build the model with the same architecture used in training so the saved
# state_dict matches.
model = UNet2DModel(
    sample_size=64,
    in_channels=3,
    out_channels=3,
    layers_per_block=2,
    block_out_channels=(128, 256, 256, 512),
    down_block_types=("DownBlock2D","DownBlock2D","AttnDownBlock2D","AttnDownBlock2D"),
    up_block_types=("AttnUpBlock2D","AttnUpBlock2D","UpBlock2D","UpBlock2D"),
).to(device)

# Load the saved model weights.
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered or corrupted file cannot execute arbitrary code while loading.
state_dict = torch.load(ckpt_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.eval()

# Set up the noise scheduler: trained with 1000 timesteps, sampled with 100.
noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
noise_scheduler.set_timesteps(100)

@torch.no_grad()
def generate_images(num_images=1):
    """Sample ``num_images`` 3x64x64 images with a manual reverse-diffusion loop.

    Starts from Gaussian noise on ``device`` and, for every timestep of the
    module-level ``noise_scheduler``, predicts the noise with ``model`` and
    takes one scheduler step. Returns a CPU tensor of shape
    ``(num_images, 3, 64, 64)`` rescaled from [-1, 1] to [0, 1] and clamped.
    """
    samples = torch.randn((num_images, 3, 64, 64), device=device)
    for timestep in noise_scheduler.timesteps:
        scaled_input = noise_scheduler.scale_model_input(samples, timestep)
        predicted_noise = model(scaled_input, timestep).sample
        step_output = noise_scheduler.step(predicted_noise, timestep, samples)
        samples = step_output.prev_sample

    samples = samples.detach().cpu()
    return torch.clamp(samples * 0.5 + 0.5, 0, 1)

# Generate a small batch and keep it in `generated_images`; the image-saving
# cell further down reuses both `num_images` and `generated_images`.
num_images = 5
generated_images = generate_images(num_images)

# One row with a panel per generated image.
fig, axes = plt.subplots(1, num_images, figsize=(15, 3))
for i, img in enumerate(generated_images):
    axes[i].imshow(img.permute(1, 2, 0))  # (C, H, W) -> (H, W, C) for imshow
    axes[i].axis("off")
plt.show()


In [None]:
import os
from matplotlib import pyplot as plt

# NOTE: uses `num_images` and `generated_images` defined by an earlier cell.

# Create a directory to save the images
os.makedirs("images", exist_ok=True)

# Plot and save the generated images
fig, axes = plt.subplots(1, num_images, figsize=(15, 3))
for i, img in enumerate(generated_images):
    img = img.permute(1, 2, 0)  # Change from (C, H, W) to (H, W, C)
    axes[i].imshow(img)
    axes[i].axis("off")

    # Save each image with a zero-padded two-digit number
    # (images/animal_00.png, images/animal_01.png, ...)
    filename = f"images/animal_{i:02d}.png"
    plt.imsave(filename, img.numpy())

# Save the entire grid of images (must happen before plt.show())
plt.savefig("images/animal_grid.png")
plt.show()

In [None]:
# !rm -rf images/

# Zip the contents of the images/ directory into animals.zip
# (make_archive appends the ".zip" extension to the base name "animals").
import shutil
shutil.make_archive("animals", "zip", "images")

In [None]:
# Mount Google Drive and copy the archive there so it outlives the runtime.
from google.colab import drive
drive.mount("/content/drive")
import shutil
# Expects animals.zip in the current working directory.
shutil.copy2("animals.zip", "/content/drive/MyDrive/animals.zip")
