In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import json
from diffusers import StableDiffusionPipeline, DDPMScheduler
from diffusers.optimization import get_scheduler
from transformers import CLIPTokenizer
import os
import gc

# Paths to your data
image_folder = "/content/drive/MyDrive/interior-Design"
metadata_file = "/content/drive/MyDrive/captions_metadata_updated_paths.json"

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load metadata
with open(metadata_file, "r") as f:
    metadata = json.load(f)

# Dataset class
class InteriorDesignDataset(Dataset):
    def __init__(self, image_folder, metadata, tokenizer, resolution=512):
        self.image_folder = image_folder
        self.metadata = metadata
        self.tokenizer = tokenizer
        self.resolution = resolution
        self.transform = transforms.Compose([
            transforms.Resize((resolution, resolution)),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])  # Normalize to [-1, 1]
        ])

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        entry = self.metadata[idx]
        image_path = os.path.join(self.image_folder, entry["image_path"])
        caption = entry["caption"]

        # Load and preprocess the image
        try:
            image = Image.open(image_path).convert("RGB")
            image = self.transform(image)
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            return None

        # Tokenize caption
        text_inputs = self.tokenizer(
            caption,
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        return {
            "pixel_values": image,
            "input_ids": text_inputs.input_ids.squeeze()
        }

# Tokenizer
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch16")

# Create dataset and dataloader
dataset = InteriorDesignDataset(image_folder, metadata, tokenizer)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Load Stable Diffusion pipeline
pipeline = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,  # Use float16 precision
    scheduler=DDPMScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")
).to(device)

# Enable memory-efficient attention
pipeline.enable_xformers_memory_efficient_attention()

# Freeze everything except the U-Net
pipeline.text_encoder.requires_grad_(False)
pipeline.vae.requires_grad_(False)

# Set U-Net to training mode
pipeline.unet.train()

# Optimizer
optimizer = torch.optim.AdamW(pipeline.unet.parameters(), lr=1e-6)

# Scheduler
scheduler = get_scheduler("linear", optimizer, num_warmup_steps=100, num_training_steps=len(dataloader) * 5)

# Training loop with AMP
scaler = torch.cuda.amp.GradScaler()
gradient_accumulation_steps = 4
num_epochs = 5

for epoch in range(num_epochs):
    print(f"Starting epoch {epoch + 1}/{num_epochs}")
    for step, batch in enumerate(dataloader):
        try:
            if batch is None:
                continue

            pixel_values = batch["pixel_values"].to(device, dtype=torch.float16)
            input_ids = batch["input_ids"].to(device)

            latents = pipeline.vae.encode(pixel_values).latent_dist.sample()
            latents = latents * pipeline.vae.config.scaling_factor

            noise_factor = 0.1 + (0.9 * step / len(dataloader))  # Gradual noise scaling
            noise = torch.randn_like(latents) * noise_factor
            timesteps = torch.randint(0, pipeline.scheduler.config.num_train_timesteps, (1,), device=device).long()
            noisy_latents = pipeline.scheduler.add_noise(latents, noise, timesteps)

            encoder_hidden_states = pipeline.text_encoder(input_ids)[0]

            with torch.cuda.amp.autocast():
                noise_pred = pipeline.unet(noisy_latents, timesteps, encoder_hidden_states).sample
                loss = torch.nn.functional.mse_loss(noise_pred, noise)
                regularization = 0.01 * torch.sum(latents ** 2)
                loss = loss + regularization

            if torch.isnan(loss) or torch.isinf(loss):
                print(f"Skipping step {step} due to NaN/inf loss")
                optimizer.zero_grad()
                continue

            scaler.scale(loss).backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            if step % 1 == 0:  # Log every step
                print(f"Epoch {epoch + 1}, Step {step}, Loss: {loss.item()}")

        except Exception as e:
            print(f"Error at Step {step}: {e}")

        gc.collect()
        torch.cuda.empty_cache()

    pipeline.save_pretrained(f"/content/drive/MyDrive/finetuned_models/epoch_{epoch + 1}")

print("Fine-tuning complete.")


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Starting epoch 1/5
Epoch 1, Step 0, Loss: 132.57196044921875
Epoch 1, Step 1, Loss: 121.55836486816406
Epoch 1, Step 2, Loss: 120.37560272216797
Error at Step 3: Attempting to unscale FP16 gradients.
Epoch 1, Step 4, Loss: 108.87981414794922
Epoch 1, Step 5, Loss: 113.4191665649414
Epoch 1, Step 6, Loss: 115.22425842285156
Error at Step 7: Attempting to unscale FP16 gradients.
Epoch 1, Step 8, Loss: 87.11736297607422
Epoch 1, Step 9, Loss: 101.39501190185547
Epoch 1, Step 10, Loss: 143.4488525390625
Error at Step 11: Attempting to unscale FP16 gradients.
Epoch 1, Step 12, Loss: 106.17118072509766
Epoch 1, Step 13, Loss: 119.50501251220703
Epoch 1, Step 14, Loss: 113.45347595214844
Error at Step 15: Attempting to unscale FP16 gradients.
Epoch 1, Step 16, Loss: 248.5630645751953
Epoch 1, Step 17, Loss: 104.7156753540039
Epoch 1, Step 18, Loss: 159.15570068359375
Error at Step 19: Attempting to unscale FP16 gradients.
Epoch 1, Step 20, Loss: 120.43010711669922
Epoch 1, Step 21, Loss: 96.9

In [None]:
#!pip install -U xformers --index-url https://download.pytorch.org/whl/cu124

In [4]:
from diffusers import StableDiffusionPipeline
from PIL import Image
from torchvision import transforms
import torch

# Load fine-tuned model
fine_tuned_model_path = "/content/drive/MyDrive/finetuned_models/epoch_5"
pipeline = StableDiffusionPipeline.from_pretrained(
    fine_tuned_model_path,
    torch_dtype=torch.float16
).to("cuda")

# Preprocess input image
input_image_path = "/content/drive/MyDrive/images.jpeg"
image = Image.open(input_image_path).convert("RGB")
image_transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])
input_tensor = image_transform(image).unsqueeze(0).to("cuda")

# Generate output
prompt = "A beautiful industrial-style bathroom with modern fixtures"
with torch.no_grad():
    generated_images = pipeline(prompt=prompt, guidance_scale=7.5, num_inference_steps=50)
    output_image = generated_images.images[0]

# Save output
output_image.save("/content/drive/MyDrive/generated_image1.jpg")
print("Generated image saved!")


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Generated image saved!
