In [1]:
# Download the caption archive (Google Drive file id) into the working directory.
!gdown 0B0ywwgffWnLLZW9uVHNjb2JmNlE

# Download the CUB-200-2011 images.
# The URL is quoted so the shell never treats '?' as a glob character, and -O
# writes straight to the final file name instead of renaming afterwards.
!wget -O CUB_200_2011.tgz "https://data.caltech.edu/records/65de6-vp158/files/CUB_200_2011.tgz?download=1"

# -p creates parent directories and does not fail when they already exist,
# so the cell can be re-run on a kernel restart.
!mkdir -p raw_texts/cvpr2016_cub images

!tar -xzf CUB_200_2011.tgz -C images/
!tar -xzf cvpr2016_cub.tar.gz -C raw_texts/cvpr2016_cub/

Downloading...
From (original): https://drive.google.com/uc?id=0B0ywwgffWnLLZW9uVHNjb2JmNlE
From (redirected): https://drive.google.com/uc?id=0B0ywwgffWnLLZW9uVHNjb2JmNlE&confirm=t&uuid=e9fb23e0-8c9e-4ffc-8be6-432fbf680609
To: /kaggle/working/cvpr2016_cub.tar.gz
100%|█████████████████████████████████████████| 860M/860M [00:08<00:00, 103MB/s]
--2025-06-21 18:44:09--  https://data.caltech.edu/records/65de6-vp158/files/CUB_200_2011.tgz?download=1
Resolving data.caltech.edu (data.caltech.edu)... 35.155.11.48
Connecting to data.caltech.edu (data.caltech.edu)|35.155.11.48|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://s3.us-west-2.amazonaws.com/caltechdata/96/97/8384-3670-482e-a3dd-97ac171e8a10/data?response-content-type=application%2Foctet-stream&response-content-disposition=attachment%3B%20filename%3DCUB_200_2011.tgz&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARCVIVNNAP7NNDVEA%2F20250621%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250621T1

In [2]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was executed with.
%pip install openai-clip==1.0.1

Collecting openai-clip
  Downloading openai-clip-1.0.1.tar.gz (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from openai-clip)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: openai-clip
  Building wheel for openai-clip (setup.py) ... [?25l[?25hdone
  Created wheel for openai-clip: filename=openai_clip-1.0.1-py3-none-any.whl size=1368605 sha256=d58565d2bdcc3d09d855b87dbb55d0af8ab93e8eea2c06419d2826a7ab1aa573
  Stored in directory: /root/.cache/pip/wheels/0d/17/90/042948fd2e2a87f1dcf6db6d438cad015c49db0c53d1d9c7dc
Successfully built openai-clip
Installing collected packages: ftfy, openai-clip


In [3]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torchvision.io import read_image
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torchvision.utils as vutils
import clip
import random
from torch.nn.utils import spectral_norm, clip_grad_norm_
from torch.amp import autocast, GradScaler
import numpy as np
from PIL import Image
from torch.optim.swa_utils import AveragedModel

class Config:
    """
    Single namespace holding every path, hyperparameter and schedule setting.
    The values are tuned for a 16GB GPU and a training session of about 10 hours.
    """
    # ---- Dataset locations ----
    IMG_ROOT = "/kaggle/working/images/CUB_200_2011/images"
    TEXT_ROOT = "/kaggle/working/raw_texts/cvpr2016_cub/text_c10"

    # ---- Hardware and pretrained backbone ----
    # Prefer the GPU when one is visible, otherwise fall back to the CPU.
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    CLIP_MODEL = "ViT-B/32"

    # ---- Optimisation ----
    # Batch size chosen so the 128x128 model fits in ~16GB of VRAM.
    BATCH_SIZE = 128
    # Number of passes over the dataset (raised for the higher resolution).
    EPOCHS = 600
    # Two-Time-Scale Update Rule: the critic learns faster than the generator.
    LR_G = 1e-4
    LR_D = 4e-4
    # Adam betas shared by the generator and discriminator optimizers.
    BETA1 = 0.0
    BETA2 = 0.9

    # ---- Network shape ----
    IMG_SIZE = 128      # side length of generated / training images
    Z_DIM = 100         # dimensionality of the latent noise vector
    IMG_CHANNELS = 3
    G_FEATURES = 128    # base channel width of the generator
    D_FEATURES = 64     # base channel width of the discriminator

    # ---- Loss weights and regularisation ----
    LAMBDA_GP = 10                  # weight of the gradient penalty term
    # Kept small so the contrastive term does not swamp the adversarial one.
    LAMBDA_CONTRASTIVE = 0.2
    CONTRASTIVE_TEMP = 0.07
    CONTRASTIVE_WARMUP_EPOCHS = 20  # epochs over which the contrastive weight ramps up
    GRAD_CLIP = 5.0                 # max gradient norm for the generator
    EMA_DECAY = 0.999               # decay of the generator's exponential moving average

    # ---- Schedule and checkpointing ----
    D_STEPS_PER_G_STEP = 1
    SAVE_FREQ = 20                  # epochs between sample / checkpoint dumps


class Text2ImageDataset(Dataset):
    """
    Image/caption dataset built by pairing every image under ``img_root`` with
    the ``.txt`` file of the same stem at the mirrored location under ``text_root``.

    Each item is ``(image_tensor, caption)``; the caption is one randomly chosen
    non-empty line of the text file.

    Args:
        img_root: Directory that is walked recursively for .jpg/.jpeg/.png files.
        text_root: Directory holding the caption files in the same sub-folder layout.
        transform: Optional callable applied to the uint8 CxHxW tensor returned by
            ``read_image``. Defaults to resize + random crop to ``Config.IMG_SIZE``,
            random horizontal flip and normalisation to [-1, 1].
        tokenizer: Optional tokenizer; stored on the instance (defaults to
            ``clip.tokenize`` with truncation). It is not called inside this class.

    Raises:
        FileNotFoundError: If ``img_root`` or ``text_root`` does not exist.
    """

    # How many replacement samples __getitem__ may try before giving up.
    MAX_RETRIES = 10

    def __init__(self, img_root, text_root, transform=None, tokenizer=None):
        self.tokenizer = tokenizer or (lambda t: clip.tokenize(t, truncate=True))

        # RandomHorizontalFlip provides light data augmentation.
        self.transform = transform or transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize(Config.IMG_SIZE),
            transforms.RandomCrop(Config.IMG_SIZE),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

        # Fail with a catchable, descriptive error instead of exit(): there is
        # no placeholder-data fallback, and exit() would tear down the kernel.
        missing = [path for path in (img_root, text_root) if not os.path.exists(path)]
        if missing:
            raise FileNotFoundError(f"Data directory not found: {', '.join(missing)}")

        self.pairs = []
        for root, _, files in os.walk(img_root):
            for file in files:
                if file.lower().endswith((".jpg", ".jpeg", ".png")):
                    img_path = os.path.join(root, file)
                    txt_filename = os.path.splitext(file)[0] + ".txt"
                    rel_path_dir = os.path.relpath(root, img_root)
                    text_path = os.path.join(text_root, rel_path_dir, txt_filename)

                    # Images without a caption file are silently left out.
                    if os.path.exists(text_path):
                        self.pairs.append((img_path, text_path))

        # os.walk order depends on the filesystem; sorting makes the
        # index -> sample mapping stable across runs and machines.
        self.pairs.sort()

        print(f"Loaded {len(self.pairs)} image-text pairs.")

    def __len__(self):
        return len(self.pairs)

    def _load_pair(self, img_path, text_path):
        """Read one image and one random caption; raises on any unreadable input."""
        image = read_image(img_path)
        with open(text_path, 'r', encoding='utf-8') as f:
            captions = [line.strip() for line in f if line.strip()]
        if not captions:
            raise ValueError(f"no captions found in {text_path}")
        caption = random.choice(captions)

        # Bring every image to 3 channels before the RGB normalisation.
        channels = image.shape[0]
        if channels in (1, 2):      # grayscale, optionally with alpha
            image = image[:1].repeat(3, 1, 1)
        elif channels == 4:         # RGB with alpha
            image = image[:3]

        return self.transform(image), caption

    def __getitem__(self, idx):
        """
        Return ``(image, caption)`` for ``idx``. If the item cannot be loaded a
        warning is printed and a random other item is tried, at most
        ``MAX_RETRIES`` times.

        Raises:
            IndexError: If ``idx`` is out of range.
            RuntimeError: If every attempt failed.
        """
        last_error = None
        for _ in range(self.MAX_RETRIES + 1):
            img_path, text_path = self.pairs[idx]
            try:
                return self._load_pair(img_path, text_path)
            except Exception as e:
                last_error = e
                print(f"[Warning] Skipping corrupted item: {img_path} | Reason: {e}")
                # Substitute a random item for the broken one.
                idx = random.randint(0, len(self) - 1)

        # A bounded loop replaces unbounded recursion, so a fully broken
        # dataset surfaces as a clear error rather than a RecursionError.
        raise RuntimeError(
            f"Failed to load a valid sample after {self.MAX_RETRIES + 1} attempts"
        ) from last_error

class CLIPImageEncoder(nn.Module):
    """
    Frozen wrapper around the visual branch of a CLIP model.

    ``forward`` takes images scaled to [-1, 1], maps them to [0, 1], resizes
    them to 224x224 (bicubic, antialiased), applies the mean/std normalisation
    configured below and returns the output of ``clip_model.visual``.
    """

    def __init__(self, clip_model):
        super().__init__()
        self.model = clip_model.visual

        resize = transforms.Resize(
            (224, 224),
            interpolation=transforms.InterpolationMode.BICUBIC,
            antialias=True,
        )
        normalize = transforms.Normalize(
            mean=(0.48145466, 0.4578275, 0.40821073),
            std=(0.26862954, 0.26130258, 0.27577711),
        )
        self.preprocess = transforms.Compose([resize, normalize])

        # The encoder is only used as a fixed feature extractor.
        for weight in self.model.parameters():
            weight.requires_grad = False

    def forward(self, x):
        unit_range = (x + 1) / 2  # [-1, 1] -> [0, 1]
        prepared = self.preprocess(unit_range)
        return self.model(prepared.float())

class CLIPTextEncoder(nn.Module):
    """
    Frozen wrapper that turns raw caption strings into L2-normalised CLIP
    text embeddings.
    """

    def __init__(self, clip_model):
        super().__init__()
        self.model = clip_model
        # No CLIP weight is ever trained through this wrapper.
        for weight in self.model.parameters():
            weight.requires_grad = False

    def forward(self, text):
        # Tokenise (over-long captions are truncated) and move to the training device.
        token_ids = clip.tokenize(text, truncate=True)
        features = self.model.encode_text(token_ids.to(Config.DEVICE))
        # Scale every embedding to unit length along the feature axis.
        return features / features.norm(dim=-1, keepdim=True)

class ConditionalNorm(nn.Module):
    """
    Batch normalisation whose per-channel scale and shift are predicted from a
    conditioning embedding by two spectrally-normalised linear layers.

    Args:
        feat_size: Number of feature-map channels.
        embed_size: Size of the conditioning vector.
    """

    def __init__(self, feat_size, embed_size):
        super().__init__()
        # affine=False: scale/shift come from the embedding, not from BN itself.
        self.norm = nn.BatchNorm2d(feat_size, affine=False)
        self.gamma = spectral_norm(nn.Linear(embed_size, feat_size))
        self.beta = spectral_norm(nn.Linear(embed_size, feat_size))

    def forward(self, x, emb):
        normalized = self.norm(x)
        # (B, C) -> (B, C, 1, 1) so the modulation broadcasts over H and W.
        scale = self.gamma(emb).unsqueeze(-1).unsqueeze(-1)
        shift = self.beta(emb).unsqueeze(-1).unsqueeze(-1)
        # "1 + scale" keeps the layer close to plain normalisation when scale is ~0.
        return normalized * (1 + scale) + shift

class UpBlock(nn.Module):
    """
    Generator stage: 2x nearest-neighbour upsampling, a spectrally-normalised
    3x3 convolution, conditional normalisation on the embedding, then ReLU.
    """

    def __init__(self, in_channels, out_channels, embed_dim):
        super().__init__()
        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
        conv3x3 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.conv = spectral_norm(conv3x3)
        self.cond_norm = ConditionalNorm(out_channels, embed_dim)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x, emb):
        features = self.conv(self.upsample(x))
        return self.relu(self.cond_norm(features, emb))

class Generator(nn.Module):
    """
    Text-conditioned generator.

    The noise vector is projected to a 4x4 feature map and passed through five
    upsampling stages (4 -> 8 -> 16 -> 32 -> 64 -> 128), each modulated by the
    projected text embedding, before a final conv + tanh produces the image.

    Args:
        embed_dim: Size of the incoming text embedding.
        projected_embed_dim: Size of the conditioning vector fed to every stage.
    """

    def __init__(self, embed_dim=512, projected_embed_dim=128):
        super().__init__()
        base = Config.G_FEATURES

        self.text_projection = nn.Sequential(
            spectral_norm(nn.Linear(embed_dim, projected_embed_dim)),
            nn.ReLU(),
        )
        self.noise_projection = nn.Sequential(
            spectral_norm(nn.Linear(Config.Z_DIM, 4 * 4 * base * 8)),
            nn.ReLU(),
        )

        # Channel width entering/leaving each of the five upsampling stages.
        widths = [base * 8, base * 8, base * 4, base * 2, base, base // 2]
        self.main = nn.ModuleList([
            UpBlock(c_in, c_out, projected_embed_dim)
            for c_in, c_out in zip(widths[:-1], widths[1:])
        ])

        self.to_rgb = nn.Sequential(
            spectral_norm(
                nn.Conv2d(base // 2, Config.IMG_CHANNELS, kernel_size=3, stride=1, padding=1, bias=False)
            ),
            nn.Tanh(),
        )

    def forward(self, embed_vector, z):
        condition = self.text_projection(embed_vector)
        # Flat projection -> (B, 8 * G_FEATURES, 4, 4) seed feature map.
        features = self.noise_projection(z).view(-1, Config.G_FEATURES * 8, 4, 4)
        for stage in self.main:
            features = stage(features, condition)
        return self.to_rgb(features)

class DownBlock(nn.Module):
    """
    Discriminator stage that halves the spatial size: a spectrally-normalised
    4x4 stride-2 convolution, optional instance normalisation, LeakyReLU(0.2).

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the convolution.
        norm: When truthy, insert ``InstanceNorm2d`` after the convolution.
    """

    def __init__(self, in_channels, out_channels, norm=True):
        super().__init__()
        strided_conv = spectral_norm(
            nn.Conv2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1, bias=False)
        )
        optional_norm = [nn.InstanceNorm2d(out_channels)] if norm else []
        self.block = nn.Sequential(
            strided_conv,
            *optional_norm,
            nn.LeakyReLU(0.2, inplace=True),
        )

    def forward(self, x):
        return self.block(x)

class Discriminator(nn.Module):
    """
    Text-conditioned critic.

    Five stride-2 stages reduce a 128x128 image to a 4x4 feature map. The
    projected text embedding is tiled over that map, fused with a 1x1
    convolution, and a final 4x4 convolution yields one score per sample.

    Args:
        embed_dim: Size of the incoming text embedding.
        projected_embed_dim: Size of the text vector concatenated to the image features.
    """

    def __init__(self, embed_dim=512, projected_embed_dim=128):
        super().__init__()
        base = Config.D_FEATURES

        self.conv_blocks = nn.Sequential(
            DownBlock(Config.IMG_CHANNELS, base, norm=False),  # 128 -> 64
            DownBlock(base, base * 2),                         # 64  -> 32
            DownBlock(base * 2, base * 4),                     # 32  -> 16
            DownBlock(base * 4, base * 8),                     # 16  -> 8
            DownBlock(base * 8, base * 8),                     # 8   -> 4
        )
        self.text_projection = nn.Sequential(
            spectral_norm(nn.Linear(embed_dim, projected_embed_dim)),
            nn.LeakyReLU(0.2, inplace=True),
        )
        fused_in = base * 8 + projected_embed_dim
        self.joint_conv = nn.Sequential(
            spectral_norm(nn.Conv2d(fused_in, base * 8, kernel_size=1, bias=False)),
            nn.LeakyReLU(0.2, inplace=True),
        )
        self.final_conv = nn.Conv2d(base * 8, 1, kernel_size=4, stride=1, padding=0)

    def forward(self, img, embed):
        image_features = self.conv_blocks(img)
        height, width = image_features.size(2), image_features.size(3)

        # (B, P) -> (B, P, 1, 1) -> tiled over the spatial grid of the image features.
        text_vector = self.text_projection(embed)
        text_map = text_vector.view(text_vector.size(0), -1, 1, 1).expand(-1, -1, height, width)

        fused = self.joint_conv(torch.cat([image_features, text_map], dim=1))
        # Flatten to (B, n_scores).
        return self.final_conv(fused).view(fused.size(0), -1)

def gradient_penalty(critic, real_images, fake_images, text_embeddings, device):
    """
    WGAN-GP style penalty: mean squared deviation from 1 of the critic's
    gradient norm, evaluated at random interpolations of real and fake images.

    Args:
        critic: Callable ``critic(images, text_embeddings)`` returning scores.
        real_images: Batch of real images.
        fake_images: Batch of generated images, same shape as ``real_images``.
        text_embeddings: Conditioning passed through to ``critic`` unchanged.
        device: Device on which the interpolation coefficients are sampled.

    Returns:
        Scalar tensor that remains differentiable w.r.t. the critic's
        parameters (``create_graph=True``).
    """
    batch = real_images.size(0)

    # One mixing coefficient per sample, shared across channels and pixels.
    alpha = torch.rand(batch, 1, 1, 1, device=device).expand_as(real_images)
    blended = alpha * real_images + (1 - alpha) * fake_images
    blended.requires_grad_(True)

    scores = critic(blended, text_embeddings)
    (grads,) = torch.autograd.grad(
        outputs=scores,
        inputs=blended,
        grad_outputs=torch.ones_like(scores),
        create_graph=True,
        retain_graph=True,
        only_inputs=True,
    )

    # L2 norm of each sample's flattened input-gradient.
    per_sample_norm = grads.view(batch, -1).norm(2, dim=1)
    return (per_sample_norm - 1).pow(2).mean()

class CLIPContrastiveLoss(nn.Module):
    """
    Symmetric image/text contrastive loss with a learnable logit scale.

    Row ``i`` of ``image_features`` is treated as the match of row ``i`` of
    ``text_features``; every other row in the batch acts as a negative.

    Args:
        temperature: Initial softmax temperature; the scale is stored as
            ``log(1 / temperature)`` in the ``logit_scale`` parameter.
    """

    def __init__(self, temperature=0.07):
        super().__init__()
        initial_log_scale = np.log(1 / temperature)
        self.logit_scale = nn.Parameter(torch.ones([]) * initial_log_scale)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, image_features, text_features):
        # Unit-normalise both sides so the dot product is a cosine similarity.
        img_unit = image_features / image_features.norm(dim=-1, keepdim=True)
        txt_unit = text_features / text_features.norm(dim=-1, keepdim=True)

        scale = self.logit_scale.exp()
        image_to_text = (scale * img_unit) @ txt_unit.t()
        text_to_image = image_to_text.t()

        # The correct pairing lies on the diagonal of the similarity matrix.
        targets = torch.arange(len(image_to_text), device=image_to_text.device)

        image_loss = self.loss_fn(image_to_text, targets)
        text_loss = self.loss_fn(text_to_image, targets)
        return (image_loss + text_loss) / 2

# --- Initialization ---
print("Initializing models and optimizers...")

# Seed every RNG used below (Python, NumPy, PyTorch) so weight initialisation,
# the fixed evaluation noise and caption sampling are repeatable across runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

clip_model, _ = clip.load(Config.CLIP_MODEL, device=Config.DEVICE)
clip_model.eval()

# Both wrappers share the same underlying CLIP weights and freeze them.
clip_text_encoder = CLIPTextEncoder(clip_model).to(Config.DEVICE)
clip_image_encoder = CLIPImageEncoder(clip_model).to(Config.DEVICE)

G = Generator().to(Config.DEVICE)
D = Discriminator().to(Config.DEVICE)


# Initialize EMA model for generator.
# The 'decay' argument is not supported in some PyTorch versions, so the
# averaging rule is supplied explicitly through avg_fn.
def ema_avg_fn(averaged_model_parameter, model_parameter, num_averaged):
    """Exponential moving average update; ``num_averaged`` is unused by this rule."""
    return Config.EMA_DECAY * averaged_model_parameter + (1 - Config.EMA_DECAY) * model_parameter


ema_G = AveragedModel(G, avg_fn=ema_avg_fn)

optimizerD = optim.Adam(D.parameters(), lr=Config.LR_D, betas=(Config.BETA1, Config.BETA2))
optimizerG = optim.Adam(G.parameters(), lr=Config.LR_G, betas=(Config.BETA1, Config.BETA2))
contrastive_loss_fn = CLIPContrastiveLoss(temperature=Config.CONTRASTIVE_TEMP).to(Config.DEVICE)
optimizerCL = optim.Adam(contrastive_loss_fn.parameters(), lr=1e-4) # Slower LR for temperature

dataset = Text2ImageDataset(img_root=Config.IMG_ROOT, text_root=Config.TEXT_ROOT)

# Use a reasonable number of workers.
# os.cpu_count() may return None when the count cannot be determined; fall
# back to 1 so the arithmetic below never sees None.
cpu_count = os.cpu_count() or 1
if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
    num_workers = min(cpu_count, Config.BATCH_SIZE, 8)
else:
    num_workers = cpu_count // 2
dataloader = DataLoader(
    dataset,
    batch_size=Config.BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers,
    # Page-locked host memory speeds up host-to-GPU copies; pointless on CPU.
    pin_memory=(Config.DEVICE.type == "cuda"),
)


scaler = GradScaler()

# Fixed prompts used to render comparable sample grids during training:
# each prompt is repeated so one grid row shows several noise draws for it.
SAMPLES_PER_PROMPT = 8
fixed_prompts = [
    "a photo of a red bird with a short beak",
    "a blue bird with a long thin beak",
    "this is a small, brown and white bird with a short, pointed beak",
    "a large black bird with a bright yellow crest on its head",
    "a photograph of a green and yellow parrot sitting on a branch",
    "a beautiful bird with a vibrant plumage of blue and orange",
    "a water bird with long slender legs and a curved beak",
    "a small yellow finch with black stripes on its wings",
]
fixed_captions = [prompt for prompt in fixed_prompts for _ in range(SAMPLES_PER_PROMPT)]

# One noise vector per caption, so the two can never drift out of sync.
fixed_noise = torch.randn(len(fixed_captions), Config.Z_DIM, device=Config.DEVICE)

with torch.no_grad():
    fixed_text_embed = clip_text_encoder(fixed_captions).float()

# --- Training Loop ---
print("Starting training...")

# torch.autocast expects the bare device type ("cuda" / "cpu"). str(device)
# only matches that when the device has no index; "cuda:0" would be rejected.
amp_device_type = Config.DEVICE.type
# Honour the configured critic/generator update ratio. At least one critic
# step always runs so errD is defined for the progress bar.
d_steps = max(1, Config.D_STEPS_PER_G_STEP)

for epoch in range(Config.EPOCHS):
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{Config.EPOCHS}")

    # Linear warmup of the contrastive loss weight (0 during the first epoch).
    if epoch < Config.CONTRASTIVE_WARMUP_EPOCHS:
        lambda_contrastive = (epoch / Config.CONTRASTIVE_WARMUP_EPOCHS) * Config.LAMBDA_CONTRASTIVE
    else:
        lambda_contrastive = Config.LAMBDA_CONTRASTIVE

    for real_images, captions in progress_bar:
        real_images = real_images.to(Config.DEVICE)
        b_size = real_images.size(0)

        # CLIP is frozen, so the caption embeddings never need a graph.
        with torch.no_grad():
            text_embeddings = clip_text_encoder(captions).float()

        # --- Train Discriminator ---
        D.train()
        # Every critic step reuses the current real batch with fresh noise.
        for _ in range(d_steps):
            optimizerD.zero_grad()

            with torch.no_grad():
                noise = torch.randn(b_size, Config.Z_DIM, device=Config.DEVICE)
                fake_images = G(text_embeddings, noise).detach()

            # Run D forward passes with autocast for memory savings
            with autocast(device_type=amp_device_type, dtype=torch.float16):
                real_output = D(real_images, text_embeddings)
                fake_output = D(fake_images, text_embeddings)
                loss_d_real = -torch.mean(real_output)
                loss_d_fake = torch.mean(fake_output)

            # Calculate GP in FP32 (outside autocast) for stability
            gp = gradient_penalty(D, real_images.float(), fake_images.float(), text_embeddings, Config.DEVICE)

            # Wasserstein critic loss plus gradient penalty.
            errD = loss_d_real + loss_d_fake + Config.LAMBDA_GP * gp

            scaler.scale(errD).backward()
            scaler.step(optimizerD)
            scaler.update()

        # --- Train Generator + Contrastive loss ---
        G.train()
        optimizerG.zero_grad()
        optimizerCL.zero_grad()

        with autocast(device_type=amp_device_type, dtype=torch.float16):
            noise = torch.randn(b_size, Config.Z_DIM, device=Config.DEVICE)
            fake_images_for_g = G(text_embeddings, noise)
            adv_output = D(fake_images_for_g, text_embeddings)
            adv_loss = -torch.mean(adv_output)

            # CLIP contrastive loss between generated images and their captions
            fake_image_embeds = clip_image_encoder(fake_images_for_g)
            contrastive_loss = contrastive_loss_fn(fake_image_embeds, text_embeddings)

            total_g_loss = adv_loss + lambda_contrastive * contrastive_loss

        scaler.scale(total_g_loss).backward()

        # Gradient clipping must see unscaled gradients.
        scaler.unscale_(optimizerG)
        clip_grad_norm_(G.parameters(), Config.GRAD_CLIP)

        scaler.step(optimizerG)
        scaler.step(optimizerCL) # Update temperature param of contrastive loss
        scaler.update()

        # Fold the freshly updated generator weights into the EMA copy.
        ema_G.update_parameters(G)

        progress_bar.set_postfix(
            Loss_D=f"{errD.item():.4f}",
            Loss_G_Adv=f"{adv_loss.item():.4f}",
            Loss_G_Clip=f"{contrastive_loss.item():.4f}",
            Lambda_Clip=f"{lambda_contrastive:.3f}"
        )

    # --- Save samples and models ---
    if (epoch + 1) % Config.SAVE_FREQ == 0 or epoch == Config.EPOCHS - 1:
        # Use the EMA model for evaluation/saving samples
        ema_G.eval()
        with torch.no_grad():
            # AveragedModel forwards the call to the wrapped generator copy.
            with autocast(device_type=amp_device_type, dtype=torch.float16):
                fixed_fake = ema_G(fixed_text_embed, fixed_noise)
            # Cast back to FP32 before the CPU-side grid building and
            # normalisation, so image writing never runs in half precision.
            fixed_fake = fixed_fake.detach().float().cpu()
            vutils.save_image(fixed_fake, f"sample_ema_epoch_{epoch+1}.png", normalize=True, nrow=8)

        # Save models
        torch.save(G.state_dict(), f"G_epoch_{epoch+1}.pth")
        torch.save(D.state_dict(), f"D_epoch_{epoch+1}.pth")
        torch.save(ema_G.state_dict(), f"ema_G_epoch_{epoch+1}.pth")

print("\nTraining finished.")
torch.save(G.state_dict(), "G_final.pth")
torch.save(D.state_dict(), "D_final.pth")
torch.save(ema_G.state_dict(), "ema_G_final.pth")

Initializing models and optimizers...


100%|████████████████████████████████████████| 338M/338M [00:02<00:00, 157MiB/s]


Loaded 11788 image-text pairs.
Starting training...


Epoch 1/600: 100%|██████████| 92/92 [02:24<00:00,  1.57s/it, Lambda_Clip=0.000, Loss_D=-36.0534, Loss_G_Adv=36.4375, Loss_G_Clip=4.8632]
Epoch 2/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.010, Loss_D=-10.1469, Loss_G_Adv=-5.6289, Loss_G_Clip=4.8700] 
Epoch 3/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.020, Loss_D=0.6173, Loss_G_Adv=-5.5312, Loss_G_Clip=4.8917]  
Epoch 4/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.030, Loss_D=-4.7137, Loss_G_Adv=4.8125, Loss_G_Clip=4.8865] 
Epoch 5/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.040, Loss_D=-4.9379, Loss_G_Adv=-4.2344, Loss_G_Clip=4.8739]
Epoch 6/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.050, Loss_D=-3.4315, Loss_G_Adv=1.4834, Loss_G_Clip=4.8833] 
Epoch 7/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.060, Loss_D=-3.9839, Loss_G_Adv=1.0410, Loss_G_Clip=4.8771] 
Epoch 8/600: 100%|██████████| 92/92 [02:22<0

Epoch 119/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-4.5264, Loss_G_Adv=-19.1094, Loss_G_Clip=4.8620]
Epoch 120/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-5.6829, Loss_G_Adv=-16.4062, Loss_G_Clip=4.8521]
Epoch 121/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-7.2800, Loss_G_Adv=-14.2656, Loss_G_Clip=4.8566]
Epoch 122/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-6.2175, Loss_G_Adv=-17.5938, Loss_G_Clip=4.8481]
Epoch 123/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-6.0781, Loss_G_Adv=-17.1719, Loss_G_Clip=4.8623]
Epoch 124/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-5.0892, Loss_G_Adv=-21.1250, Loss_G_Clip=4.8511]
Epoch 125/600: 100%|██████████| 92/92 [02:23<00:00,  1.56s/it, Lambda_Clip=0.200, Loss_D=-6.6880, Loss_G_Adv=-19.6719, Loss_G_Clip=4.8627]
Epoch 126/600: 100%|███████

Epoch 235/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-8.7035, Loss_G_Adv=-144.0000, Loss_G_Clip=4.8583]
Epoch 236/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-5.7102, Loss_G_Adv=-135.6250, Loss_G_Clip=4.8547]
Epoch 237/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-6.9208, Loss_G_Adv=-143.0000, Loss_G_Clip=4.8524]
Epoch 238/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-7.1722, Loss_G_Adv=-147.7500, Loss_G_Clip=4.8578]
Epoch 239/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-6.3123, Loss_G_Adv=-139.1250, Loss_G_Clip=4.8511]
Epoch 240/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-6.7049, Loss_G_Adv=-148.6250, Loss_G_Clip=4.8507]
Epoch 241/600: 100%|██████████| 92/92 [02:22<00:00,  1.55s/it, Lambda_Clip=0.200, Loss_D=-5.9884, Loss_G_Adv=-148.1250, Loss_G_Clip=4.8557]
Epoch 242/600: 100%|