In [None]:
# Cell 2: Import libraries and setup
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import numpy as np
from diffusers import StableDiffusionPipeline, UNet2DConditionModel, DDPMScheduler
from transformers import CLIPTextModel, CLIPTokenizer
from accelerate import Accelerator
from tqdm import tqdm
from peft import LoraConfig, get_peft_model, TaskType
import logging
# Logging: a logger named after this module, with INFO-level root configuration.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Report whether CUDA is usable and, if so, the name of the device at index 0.
cuda_ready = torch.cuda.is_available()
print(f"GPU available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")



In [None]:

# Cell 3: Configuration (Modify these parameters as needed)
class Config:
    """Paths and hyperparameters for the LoRA fine-tuning run (edit as needed)."""

    # Model and data locations
    model_name = "segmind/tiny-sd"
    images_dir = "dataset/images"  # Your images folder
    captions_csv = "dataset/captions.csv"  # Your captions file
    output_dir = "./fine_tuned_tiny_sd"

    # Training schedule
    resolution = 512
    # Defined exactly once: this class used to assign batch_size twice
    # (2, then 1) and the later assignment silently won. The effective value
    # (1) is kept.
    batch_size = 1
    # Gradients are accumulated over this many batches before each optimizer
    # update, i.e. 4 samples per update per process with batch_size = 1.
    gradient_accumulation_steps = 4
    num_epochs = 15
    learning_rate = 5e-5
    mixed_precision = "no"

    # LoRA adapter size
    lora_rank = 16
    lora_alpha = 32

config = Config()

In [None]:
# Cell 4: Dataset class
class CustomDataset(Dataset):
    """Image/caption pairs read from an image folder and a captions CSV.

    The CSV must provide a ``filename`` column (joined onto ``images_dir``)
    and a ``captions`` column. Header names are matched after stripping
    surrounding whitespace, so a header written as ``filename, captions``
    (which pandas parses as ``' captions'``) is accepted as well.

    Args:
        images_dir: Directory that the CSV filenames are relative to.
        captions_csv: Path to the captions CSV described above.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            max_length=..., truncation=..., return_tensors="pt")`` whose
            result exposes ``input_ids`` and ``attention_mask``.
        size: Side length, in pixels, that every image is resized to.

    Raises:
        KeyError: If the CSV lacks the ``filename`` or ``captions`` column.
    """

    FILENAME_COLUMN = "filename"
    CAPTION_COLUMN = "captions"
    # Fixed token sequence length requested from the tokenizer.
    # NOTE(review): 77 presumably matches the CLIP text encoder context
    # length - confirm if a different tokenizer is used.
    MAX_TOKENS = 77

    def __init__(self, images_dir, captions_csv, tokenizer, size=512):
        self.images_dir = images_dir
        self.size = size
        self.tokenizer = tokenizer

        # Load captions (the raw frame is kept on the instance, unmodified)
        self.captions_df, self.image_files, self.captions = self._load_captions(captions_csv)

        print(f"Loaded {len(self.image_files)} images")

        # Image preprocessing: square resize (aspect ratio is not preserved),
        # conversion to a tensor, then (x - 0.5) / 0.5 normalisation.
        self.image_transforms = transforms.Compose([
            transforms.Resize((size, size)),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5])
        ])

    @classmethod
    def _load_captions(cls, captions_csv):
        """Read the CSV and return ``(dataframe, image_files, captions)``.

        Columns are located by whitespace-stripped header name. Filenames
        that are strings are stripped; captions are converted to stripped
        strings, with missing values mapped to ``""`` rather than ``"nan"``.
        """
        captions_df = pd.read_csv(captions_csv)

        # Map stripped header -> actual header so the frame itself is untouched.
        by_stripped_name = {str(col).strip(): col for col in captions_df.columns}
        missing = [
            name for name in (cls.FILENAME_COLUMN, cls.CAPTION_COLUMN)
            if name not in by_stripped_name
        ]
        if missing:
            raise KeyError(
                f"{captions_csv} is missing column(s) {missing}; "
                f"found {list(captions_df.columns)}"
            )

        filename_col = by_stripped_name[cls.FILENAME_COLUMN]
        caption_col = by_stripped_name[cls.CAPTION_COLUMN]

        image_files = [
            name.strip() if isinstance(name, str) else name
            for name in captions_df[filename_col].tolist()
        ]
        captions = [
            "" if pd.isna(text) else str(text).strip()
            for text in captions_df[caption_col].tolist()
        ]
        return captions_df, image_files, captions

    def __len__(self):
        """Number of (image, caption) rows listed in the CSV."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return the preprocessed image and tokenized caption for row ``idx``.

        Returns:
            dict with ``pixel_values`` (image tensor), ``input_ids`` and
            ``attention_mask`` (tokenizer outputs with the leading batch
            dimension removed).
        """
        # Load image
        image_path = os.path.join(self.images_dir, self.image_files[idx])
        try:
            image = Image.open(image_path).convert('RGB')
            image = self.image_transforms(image)
        except Exception as e:
            # Best-effort fallback kept from the original design: a sample
            # whose image cannot be loaded is reported and replaced by an
            # all-zero tensor so iteration can continue.
            print(f"Error loading image {image_path}: {e}")
            image = torch.zeros(3, self.size, self.size)

        # Tokenize caption
        caption = str(self.captions[idx])
        text_inputs = self.tokenizer(
            caption,
            padding="max_length",
            max_length=self.MAX_TOKENS,
            truncation=True,
            return_tensors="pt"
        )

        # squeeze(0) removes only the batch dimension added by return_tensors="pt".
        return {
            "pixel_values": image,
            "input_ids": text_inputs.input_ids.squeeze(0),
            "attention_mask": text_inputs.attention_mask.squeeze(0)
        }

def collate_fn(batch):
    """Stack per-sample tensors into batch tensors.

    Args:
        batch: Sequence of dicts, each holding ``pixel_values``,
            ``input_ids`` and ``attention_mask`` tensors.

    Returns:
        dict with the same three keys, each stacked along a new leading
        dimension.
    """
    fields = ("pixel_values", "input_ids", "attention_mask")
    return {
        field: torch.stack([sample[field] for sample in batch])
        for field in fields
    }


In [None]:
# Cell 5: Load the pretrained pipeline and wrap its UNet with LoRA adapters
print("Loading model...")
pipe = StableDiffusionPipeline.from_pretrained(
    config.model_name, torch_dtype=torch.float32  # float32 to avoid dtype issues
)

# Pull the individual components out of the pipeline, each kept in float32.
tokenizer = pipe.tokenizer
vae = pipe.vae.to(torch.float32)
text_encoder = pipe.text_encoder.to(torch.float32)
unet = pipe.unet.to(torch.float32)
noise_scheduler = DDPMScheduler.from_pretrained(config.model_name, subfolder="scheduler")

# Neither the text encoder nor the VAE is trained.
text_encoder.requires_grad_(False)
vae.requires_grad_(False)

# LoRA is applied to UNet modules matching the listed names; no task_type is
# passed because it is not needed for diffusion models.
lora_config = LoraConfig(
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],
    r=config.lora_rank,
    lora_alpha=config.lora_alpha,
    lora_dropout=0.1,
)
unet = get_peft_model(unet, lora_config)
unet.train()

print("LoRA setup complete!")

In [None]:
# Cell 6: Create dataset and dataloader
print("Creating dataset...")

# A missing captions file can never work, so fail immediately with a clear
# message instead of printing a warning and crashing inside pandas.
if not os.path.exists(config.captions_csv):
    raise FileNotFoundError(f"{config.captions_csv} file not found!")
if not os.path.exists(config.images_dir):
    print(f"Warning: {config.images_dir} directory not found!")

dataset = CustomDataset(
    images_dir=config.images_dir,
    captions_csv=config.captions_csv,
    tokenizer=tokenizer,
    size=config.resolution
)

# CustomDataset.__getitem__ substitutes an all-zero image when a file cannot
# be loaded, so missing files would otherwise be trained on silently.
# Surface them here, before any training time is spent.
missing_images = [
    name for name in dataset.image_files
    if not os.path.isfile(os.path.join(config.images_dir, str(name)))
]
if len(dataset) == 0:
    raise ValueError(f"{config.captions_csv} lists no images; nothing to train on.")
if len(missing_images) == len(dataset):
    raise FileNotFoundError(
        f"None of the {len(dataset)} images listed in {config.captions_csv} "
        f"exist under {config.images_dir}."
    )
if missing_images:
    print(
        f"Warning: {len(missing_images)} of {len(dataset)} listed images are missing "
        f"(e.g. {missing_images[:3]}); they will be loaded as blank images."
    )

dataloader = DataLoader(
    dataset,
    batch_size=config.batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=2  # Reduced for Colab
)

print(f"Dataset created with {len(dataset)} samples")

In [None]:
# Cell 7: Setup training
# The accelerator handles device placement, gradient accumulation and
# (optionally) mixed precision according to the config.
accelerator = Accelerator(
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    mixed_precision=config.mixed_precision
)

# Hand the optimizer only the parameters that will actually receive
# gradients, instead of every UNet weight including frozen ones.
# NOTE(review): after get_peft_model these are expected to be the LoRA
# adapter weights only - confirm against the PEFT version in use.
trainable_params = [param for param in unet.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(
    trainable_params,
    lr=config.learning_rate,
    weight_decay=0.01
)

# Prepare everything with accelerator
unet, optimizer, dataloader = accelerator.prepare(unet, optimizer, dataloader)

# The frozen models are not passed through prepare(), so move them to the
# accelerator's device explicitly.
vae = vae.to(accelerator.device)
text_encoder = text_encoder.to(accelerator.device)

print("Training setup complete!")


In [None]:
# Cell 8: Training loop
# For each batch: encode images to VAE latents, add noise at a random
# timestep, and train the UNet to predict the scheduler's regression target
# conditioned on the caption embedding.
print("Starting training...")
global_step = 0  # number of optimizer updates (not micro-batches)
os.makedirs(config.output_dir, exist_ok=True)

# The regression target depends on how the scheduler parameterises the model
# output; check once up front so an unsupported setting fails before training.
prediction_type = noise_scheduler.config.prediction_type
if prediction_type not in ("epsilon", "v_prediction"):
    raise ValueError(f"Unsupported scheduler prediction_type: {prediction_type}")

for epoch in range(config.num_epochs):
    unet.train()
    epoch_loss = 0.0

    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{config.num_epochs}")

    for batch_idx, batch in enumerate(progress_bar):
        with accelerator.accumulate(unet):
            # Convert images to latent space (the VAE is frozen)
            with torch.no_grad():
                latents = vae.encode(batch["pixel_values"]).latent_dist.sample()
                latents = latents * vae.config.scaling_factor

            # Add noise to latents at one random timestep per sample
            noise = torch.randn_like(latents)
            timesteps = torch.randint(
                0, noise_scheduler.config.num_train_timesteps,
                (latents.shape[0],), device=latents.device
            ).long()

            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

            # Get text embeddings (the text encoder is frozen)
            with torch.no_grad():
                encoder_hidden_states = text_encoder(batch["input_ids"])[0]

            # Pick the target matching the scheduler's parameterisation
            # rather than always regressing onto the raw noise.
            if prediction_type == "epsilon":
                target = noise
            else:  # "v_prediction"
                target = noise_scheduler.get_velocity(latents, noise, timesteps)

            # Predict the target from the noisy latents
            model_pred = unet(
                noisy_latents,
                timesteps,
                encoder_hidden_states
            ).sample

            # Calculate loss
            loss = torch.nn.functional.mse_loss(model_pred, target)

            # Backward pass; inside accumulate() the prepared optimizer only
            # applies an update on gradient-synchronisation steps.
            accelerator.backward(loss)
            optimizer.step()
            optimizer.zero_grad()

            step_loss = loss.item()
            epoch_loss += step_loss
            # Count real optimizer updates, not accumulation micro-batches.
            if accelerator.sync_gradients:
                global_step += 1

            # Update progress bar
            progress_bar.set_postfix({
                'loss': f'{step_loss:.4f}',
                'avg_loss': f'{epoch_loss/(batch_idx+1):.4f}'
            })

    # max(..., 1) avoids a ZeroDivisionError if the dataloader yields no batches.
    avg_loss = epoch_loss / max(len(dataloader), 1)
    print(f"Epoch {epoch+1} completed. Average loss: {avg_loss:.4f}")

    # Save checkpoint every epoch
    if accelerator.is_main_process:
        save_path = os.path.join(config.output_dir, f"checkpoint-epoch-{epoch+1}")
        os.makedirs(save_path, exist_ok=True)

        # Save LoRA weights
        accelerator.unwrap_model(unet).save_pretrained(save_path)
        print(f"Checkpoint saved to {save_path}")

print("Training completed!")

In [None]:
# Cell 9: Save final model
# Writes two artifacts into <output_dir>/final_model: the LoRA adapter on its
# own, then the full pipeline with the adapter merged into the UNet.
print("Saving final model...")
if accelerator.is_main_process:
    final_save_path = os.path.join(config.output_dir, "final_model")
    os.makedirs(final_save_path, exist_ok=True)

    # Save LoRA weights
    # Keep this before the merge below.
    # NOTE(review): merge_and_unload presumably removes the adapter layers
    # from the wrapped model, so saving afterwards would lose them, and
    # re-running this cell may not behave the same - confirm against PEFT docs.
    accelerator.unwrap_model(unet).save_pretrained(final_save_path)

    # Merge LoRA weights and save full pipeline
    print("Merging LoRA weights...")
    unet_merged = accelerator.unwrap_model(unet).merge_and_unload()
    # NOTE(review): only the UNet is cast to float16 here, while the pipeline
    # and its other components were loaded in float32 earlier in the
    # notebook. Confirm that a mixed-dtype checkpoint is intended, and that
    # casting the module in memory is acceptable if training is resumed.
    pipe.unet = unet_merged.to(torch.float16)
    pipe.save_pretrained(final_save_path)

    print(f"Final model saved to {final_save_path}")

In [None]:
# Cell 10: Test the fine-tuned model
def test_model(prompt="a beautiful landscape", save_name="test_output.png",
               num_inference_steps=50, guidance_scale=7.5,
               height=512, width=512, seed=None):
    """Generate one image with the saved fine-tuned pipeline and save it.

    The pipeline is loaded from ``<config.output_dir>/final_model`` on every
    call and moved to CUDA when available, otherwise CPU.

    Args:
        prompt: Text prompt to generate from.
        save_name: Path the generated image is written to.
        num_inference_steps: Number of denoising steps passed to the pipeline.
        guidance_scale: Guidance scale passed to the pipeline.
        height: Output image height in pixels.
        width: Output image width in pixels.
        seed: Optional integer seed; when given, a seeded ``torch.Generator``
            is passed to the pipeline so the output is reproducible. ``None``
            (the default) keeps the pipeline's default random behaviour.

    Returns:
        The first image produced by the pipeline.
    """
    print(f"Generating image for prompt: '{prompt}'")

    # Load the fine-tuned model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    test_pipe = StableDiffusionPipeline.from_pretrained(
        os.path.join(config.output_dir, "final_model"),
        torch_dtype=torch.float32
    )
    test_pipe = test_pipe.to(device)

    # Only build a generator when a seed was requested.
    generator = None
    if seed is not None:
        generator = torch.Generator(device=device).manual_seed(seed)

    # Generate image
    with torch.no_grad():
        image = test_pipe(
            prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            height=height,
            width=width,
            generator=generator
        ).images[0]

    image.save(save_name)
    print(f"Test image saved as {save_name}")
    return image

# Smoke-test generation with a sample prompt; only meaningful once training
# has finished and the final model has been saved.
test_image = test_model(prompt="Turab Hussain Usmani")

In [None]:

# Cell 11: Download the trained model (Optional)
# The download code below is wrapped in a triple-quoted string, so it is an
# inert expression and does not run. Remove the two quote lines to enable it.
# It zips config.output_dir and triggers a download of the archive.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime - confirm before enabling elsewhere.
"""
from google.colab import files
import shutil

# Create a zip of the final model
shutil.make_archive('fine_tuned_tiny_sd', 'zip', config.output_dir)
files.download('fine_tuned_tiny_sd.zip')
"""

# NOTE(review): this message reads like a pre-run banner although it is
# printed by the last cell - confirm the wording is intended.
print("Setup complete! Run the cells in order to fine-tune your model.")