## <b>Vanilla GAN</b>

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt

# -------------------------------
# Parameters
# -------------------------------
IMG_SIZE = 64  # smaller for fast GAN training
BATCH_SIZE = 32
LATENT_DIM = 100
EPOCHS = 2000
SAVE_INTERVAL = 200

train_dir = r"C:\Users\ASUS\.cache\kagglehub\datasets\samuelcortinhas\cats-and-dogs-image-classification\versions\4\train\dogs"

# -------------------------------
# Load dog images
# -------------------------------
def _scale_to_tanh_range(img):
    """Map one image from the [0, 255] pixel range to [-1, 1]."""
    return img / 127.5 - 1.0


# `rescale` is a single multiplicative factor, so `rescale=1./127.5 - 1`
# evaluates to roughly -0.992 and merely negates the pixels. The shift to
# [-1, 1] (the range of the generator's tanh output) needs a function.
datagen = ImageDataGenerator(preprocessing_function=_scale_to_tanh_range)
dog_gen = datagen.flow_from_directory(
    os.path.dirname(train_dir),
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    classes=['dogs'],
    class_mode=None,  # yield image batches only, no labels
    shuffle=True
)

# -------------------------------
# Build Generator
# -------------------------------
def build_generator():
    """Return the generator: latent vector -> 64x64x3 image with tanh output.

    A dense projection is reshaped to an 8x8x128 feature map and then
    upsampled three times (8 -> 16 -> 32 -> 64) by stride-2 transposed
    convolutions, with batch normalization before each of them.
    """
    stack = [
        layers.Dense(8*8*128, input_dim=LATENT_DIM),
        layers.Reshape((8, 8, 128)),
        layers.BatchNormalization(),
        layers.Conv2DTranspose(128, kernel_size=4, strides=2, padding='same', activation='relu'),
        layers.BatchNormalization(),
        layers.Conv2DTranspose(64, kernel_size=4, strides=2, padding='same', activation='relu'),
        layers.BatchNormalization(),
        layers.Conv2DTranspose(3, kernel_size=4, strides=2, padding='same', activation='tanh'),
    ]
    return models.Sequential(stack)

# -------------------------------
# Build Discriminator
# -------------------------------
def build_discriminator():
    """Return the discriminator: image -> probability that the image is real.

    Two stride-2 convolutions with LeakyReLU(0.2) are followed by a
    flattened single-unit sigmoid output.
    """
    stack = [
        layers.Conv2D(64, kernel_size=4, strides=2, padding='same', input_shape=(IMG_SIZE, IMG_SIZE, 3)),
        layers.LeakyReLU(0.2),
        layers.Conv2D(128, kernel_size=4, strides=2, padding='same'),
        layers.LeakyReLU(0.2),
        layers.Flatten(),
        layers.Dense(1, activation='sigmoid'),
    ]
    return models.Sequential(stack)

# -------------------------------
# GAN Setup
# -------------------------------
# Standalone discriminator, compiled on its own so it can be trained directly
# with train_on_batch on real and generated images.
discriminator = build_discriminator()
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

generator = build_generator()

# Stacked model: latent vector -> generator -> discriminator score.
z = layers.Input(shape=(LATENT_DIM,))
img = generator(z)
# The flag is flipped only after the discriminator was compiled above.
# NOTE(review): this presumably relies on Keras capturing the trainable state
# at compile time, so that only the generator is updated through `gan` while
# `discriminator` itself keeps training — confirm for the Keras version in use.
discriminator.trainable = False
valid = discriminator(img)

gan = models.Model(z, valid)
gan.compile(optimizer='adam', loss='binary_crossentropy')

# -------------------------------
# Training Loop
# -------------------------------
def save_images(epoch, generator, examples=4):
    """Sample `examples` images from `generator` and display them in a grid.

    Despite its name the function only shows the figure; nothing is written
    to disk.

    Args:
        epoch: Training iteration, shown in the figure title.
        generator: Model mapping (n, LATENT_DIM) noise to images in [-1, 1].
        examples: Number of images to draw. The grid is sized to fit, so
            values other than 4 work too.
    """
    if examples < 1:
        return

    noise = np.random.normal(0, 1, (examples, LATENT_DIM))
    gen_imgs = generator.predict(noise, verbose=0)
    # tanh output lives in [-1, 1]; imshow expects floats in [0, 1].
    gen_imgs = np.clip(0.5 * gen_imgs + 0.5, 0.0, 1.0)

    # Near-square grid large enough for every sample (2x2 for the default 4).
    cols = int(np.ceil(np.sqrt(examples)))
    rows = int(np.ceil(examples / cols))

    plt.figure(figsize=(2 * cols, 2 * rows))
    for i in range(examples):
        plt.subplot(rows, cols, i + 1)
        plt.imshow(gen_imgs[i])
        plt.axis('off')
    plt.suptitle(f"Epoch {epoch}")
    plt.show()

# Each "epoch" here is a single training iteration on one batch.
for epoch in range(EPOCHS):
    # ---------------------
    # Train Discriminator
    # ---------------------
    imgs = next(dog_gen)
    # The last batch of a pass over the directory can hold fewer than
    # BATCH_SIZE images, so noise and labels are sized from the real batch.
    batch_n = imgs.shape[0]
    noise = np.random.normal(0, 1, (batch_n, LATENT_DIM))
    gen_imgs = generator.predict(noise, verbose=0)

    real_y = np.ones((batch_n, 1))
    fake_y = np.zeros((batch_n, 1))

    d_loss_real = discriminator.train_on_batch(imgs, real_y)
    d_loss_fake = discriminator.train_on_batch(gen_imgs, fake_y)
    # Each result is [loss, accuracy]; average the real and fake halves.
    d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

    # ---------------------
    # Train Generator
    # ---------------------
    # The generator is rewarded when the discriminator labels fakes as real.
    noise = np.random.normal(0, 1, (batch_n, LATENT_DIM))
    valid_y = np.ones((batch_n, 1))
    g_loss = gan.train_on_batch(noise, valid_y)

    # ---------------------
    # Print & Save
    # ---------------------
    if epoch % 100 == 0:
        print(f"{epoch} [D loss: {d_loss[0]:.4f}] [G loss: {g_loss:.4f}]")
    if epoch % SAVE_INTERVAL == 0:
        save_images(epoch, generator)

## <b>CNN Autoencoder (AE)</b>

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt

# -------------------------------
# Parameters
# -------------------------------
IMG_SIZE = 128   # images are resized to IMG_SIZE x IMG_SIZE
BATCH_SIZE = 16
EPOCHS = 50

train_dir = r"C:\Users\ASUS\.cache\kagglehub\datasets\samuelcortinhas\cats-and-dogs-image-classification\versions\4\train\dogs"

# -------------------------------
# Load dog images
# -------------------------------
# Pixels are scaled to [0, 1]. The generator is pointed at the parent folder
# and restricted to the 'dogs' sub-directory; it yields images without labels.
datagen = ImageDataGenerator(rescale=1.0 / 255)
dog_gen = datagen.flow_from_directory(
    directory=os.path.dirname(train_dir),
    classes=['dogs'],
    class_mode=None,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# -------------------------------
# Autoencoder Model
# -------------------------------
input_img = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3))

# Encoder: three conv + 2x2 max-pool stages, halving the resolution each time.
x = input_img
for n_filters in (64, 128, 256):
    x = layers.Conv2D(n_filters, (3,3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((2,2), padding='same')(x)
encoded = x

# Decoder: mirror of the encoder, doubling the resolution at every stage.
for n_filters in (256, 128, 64):
    x = layers.Conv2D(n_filters, (3,3), activation='relu', padding='same')(x)
    x = layers.UpSampling2D((2,2))(x)
# Sigmoid keeps the reconstruction in [0, 1].
decoded = layers.Conv2D(3, (3,3), activation='sigmoid', padding='same')(x)

autoencoder = models.Model(inputs=input_img, outputs=decoded)
autoencoder.compile(loss='mse', optimizer='adam')
autoencoder.summary()

# -------------------------------
# Train
# -------------------------------
def _as_reconstruction_pairs(image_batches):
    """Yield (input, target) pairs in which the target is the input itself."""
    while True:
        batch = next(image_batches)
        yield batch, batch


# dog_gen yields image batches only (class_mode=None), but the MSE loss needs
# a target for every input, so each batch is paired with itself. A plain
# Python generator has no length, hence the explicit steps_per_epoch.
autoencoder.fit(
    _as_reconstruction_pairs(dog_gen),
    steps_per_epoch=len(dog_gen),
    epochs=EPOCHS,
)

# -------------------------------
# Generate & Show Reconstructed Images
# -------------------------------
def show_reconstructed(gen, n=5):
    """Plot originals (top row) against their reconstructions (bottom row).

    Args:
        gen: Iterator yielding image batches, or (inputs, targets) tuples, in
            which case only the inputs are used.
        n: Maximum number of image pairs to show. It is capped at the size of
            the batch drawn, since the final batch of a pass can be small.
    """
    imgs = next(gen)
    if isinstance(imgs, tuple):
        imgs = imgs[0]

    n = min(n, len(imgs))
    if n < 1:
        return

    reconstructed = autoencoder.predict(imgs, verbose=0)

    plt.figure(figsize=(10,4))
    for i in range(n):
        # Original
        plt.subplot(2, n, i + 1)
        plt.imshow(imgs[i])
        plt.axis('off')
        # Reconstructed
        plt.subplot(2, n, i + 1 + n)
        plt.imshow(reconstructed[i])
        plt.axis('off')
    plt.show()

show_reconstructed(dog_gen)


## <b>Variational Autoencoder (VAE)</b>

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt

# -------------------------------
# Parameters
# -------------------------------
IMG_SIZE = 64     # kept small so training is faster
BATCH_SIZE = 16
LATENT_DIM = 128  # size of the latent vector z
EPOCHS = 50

train_dir = r"C:\Users\ASUS\.cache\kagglehub\datasets\samuelcortinhas\cats-and-dogs-image-classification\versions\4\train\dogs"

# -------------------------------
# Load dog images
# -------------------------------
# Pixels are scaled to [0, 1], matching the decoder's sigmoid output; the
# generator yields image batches without labels.
datagen = ImageDataGenerator(rescale=1.0 / 255)
dog_gen = datagen.flow_from_directory(
    directory=os.path.dirname(train_dir),
    classes=['dogs'],
    class_mode=None,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# -------------------------------
# Sampling Layer
# -------------------------------
class Sampling(layers.Layer):
    """Draw z = mean + sigma * eps with eps ~ N(0, I) (reparameterization)."""

    def call(self, inputs):
        """Return one latent sample per row of `inputs = (z_mean, z_log_var)`."""
        mean, log_var = inputs
        noise = tf.random.normal(shape=tf.shape(mean))
        # exp(0.5 * log-variance) is the standard deviation.
        std = tf.exp(0.5 * log_var)
        return mean + std * noise

# -------------------------------
# Encoder
# -------------------------------
encoder_inputs = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3))

# Three conv + max-pool stages, each halving the spatial resolution.
x = encoder_inputs
for n_filters in (32, 64, 128):
    x = layers.Conv2D(n_filters, 3, activation='relu', padding='same')(x)
    x = layers.MaxPooling2D(2, padding='same')(x)

x = layers.Flatten()(x)
x = layers.Dense(256, activation='relu')(x)

# Two heads parameterize the approximate posterior; z is a sample from it.
z_mean = layers.Dense(LATENT_DIM)(x)
z_log_var = layers.Dense(LATENT_DIM)(x)
z = Sampling()([z_mean, z_log_var])

encoder = models.Model(inputs=encoder_inputs, outputs=[z_mean, z_log_var, z], name="encoder")
encoder.summary()

# -------------------------------
# Decoder
# -------------------------------
latent_inputs = layers.Input(shape=(LATENT_DIM,))

# Project the latent vector to an 8x8x128 feature map ...
x = layers.Dense(8*8*128, activation='relu')(latent_inputs)
x = layers.Reshape((8,8,128))(x)
# ... and upsample it three times with stride-2 transposed convolutions.
for n_filters in (128, 64, 32):
    x = layers.Conv2DTranspose(n_filters, 3, strides=2, padding='same', activation='relu')(x)
# Sigmoid keeps the generated pixels in [0, 1].
decoder_outputs = layers.Conv2D(3, 3, activation='sigmoid', padding='same')(x)

decoder = models.Model(inputs=latent_inputs, outputs=decoder_outputs, name="decoder")
decoder.summary()

# -------------------------------
# VAE Model
# -------------------------------
class VAE(models.Model):
    """Variational autoencoder trained with a custom `train_step`.

    Args:
        encoder: Model mapping images to `[z_mean, z_log_var, z]`.
        decoder: Model mapping a latent vector `z` back to an image.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Running means so the progress bar reports epoch averages rather
        # than the value of the most recent batch.
        self.total_loss_tracker = tf.keras.metrics.Mean(name="loss")
        self.reconstruction_loss_tracker = tf.keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = tf.keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """Trackers that Keras resets at the start of every epoch."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def compile(self, optimizer, **kwargs):
        """Register `optimizer`; the loss is computed inside `train_step`."""
        # Hand the optimizer to Keras instead of assigning the attribute
        # directly, so the framework tracks it properly.
        super().compile(optimizer=optimizer, **kwargs)

    def train_step(self, data):
        """Run one optimization step and return the running loss averages."""
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            # Both terms are summed per sample and then averaged over the
            # batch. Averaging the reconstruction error over every pixel
            # while averaging KL over latent dimensions would put the two
            # terms on unrelated scales.
            recon_loss = tf.reduce_mean(
                tf.reduce_sum(tf.square(data - reconstruction), axis=(1, 2, 3))
            )
            # KL divergence between N(z_mean, exp(z_log_var)) and N(0, I)
            kl_per_dim = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_per_dim, axis=1))
            total_loss = recon_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(recon_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

# Wrap encoder and decoder in the trainable VAE; Adam uses its defaults.
vae = VAE(encoder=encoder, decoder=decoder)
vae.compile(tf.keras.optimizers.Adam())

# -------------------------------
# Train VAE
# -------------------------------
# dog_gen yields image batches only; the losses are computed in train_step.
vae.fit(x=dog_gen, epochs=EPOCHS)

# -------------------------------
# Generate New Dog Images
# -------------------------------
def generate_dogs(n=5):
    """Decode `n` latent vectors drawn from N(0, I) and show them in a row."""
    latent_samples = np.random.normal(0,1,(n,LATENT_DIM))
    decoded = decoder.predict(latent_samples)

    plt.figure(figsize=(10,2))
    for position, picture in enumerate(decoded, start=1):
        plt.subplot(1, n, position)
        plt.imshow(picture)
        plt.axis('off')
    plt.show()

generate_dogs(n=5)


## <b>Autoregressive (iGPT / Image GPT)</b>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt

# -------------------------------
# Parameters
# -------------------------------
IMG_SIZE = 32  # small for autoregressive training
CHANNELS = 3
BATCH_SIZE = 16
EPOCHS = 20
SEQ_LEN = IMG_SIZE * IMG_SIZE * CHANNELS  # flatten image to sequence
EMBED_DIM = 128
NUM_HEADS = 4
NUM_LAYERS = 4

train_dir = r"C:\Users\ASUS\.cache\kagglehub\datasets\samuelcortinhas\cats-and-dogs-image-classification\versions\4\train\dogs"

# -------------------------------
# Load dog images
# -------------------------------
# This cell's import list omits `os`, which the loader below needs; import it
# here so the cell also runs on a fresh kernel.
import os

datagen = ImageDataGenerator(rescale=1./255)  # pixels scaled to [0, 1]
dog_gen = datagen.flow_from_directory(
    os.path.dirname(train_dir),
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    classes=['dogs'],
    class_mode=None,  # yield image batches only, no labels
    shuffle=True
)

# -------------------------------
# Prepare sequences
# -------------------------------
def preprocess(imgs):
    """Flatten a batch of images into one row of SEQ_LEN values per image."""
    n_images = imgs.shape[0]
    flat_shape = (n_images, SEQ_LEN)
    return imgs.reshape(flat_shape)

# -------------------------------
# Transformer Block
# -------------------------------
def transformer_block(x, embed_dim, num_heads):
    """Apply one post-norm transformer decoder block to `x`.

    Args:
        x: Tensor of token embeddings whose last dimension is `embed_dim`.
        embed_dim: Width of the residual stream.
        num_heads: Number of attention heads.

    Returns:
        Tensor with the same shape as `x`.
    """
    # Causal mask: position i may only attend to positions <= i. Without it
    # every position can see the very pixel it is trained to predict, and
    # the model is no longer autoregressive.
    attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(
        x, x, use_causal_mask=True
    )
    x = layers.LayerNormalization()(x + attn_output)
    # Position-wise feed-forward network with a 4x hidden expansion.
    ff = layers.Dense(embed_dim*4, activation='relu')(x)
    ff = layers.Dense(embed_dim)(ff)
    x = layers.LayerNormalization()(x + ff)
    return x

# -------------------------------
# Autoregressive Image GPT Model
# -------------------------------
class PixelEmbedding(layers.Layer):
    """Embed pixel intensities in [0, 1] together with their sequence position.

    Args:
        seq_len: Number of positions the layer supports.
        vocab_size: Number of discrete intensity levels.
        embed_dim: Size of each embedding vector.
    """

    def __init__(self, seq_len, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=seq_len, output_dim=embed_dim)

    def call(self, pixels):
        # Round rather than truncate: k/255 * 255 can land just below k in
        # floating point, and a plain int cast would turn it into k - 1.
        levels = tf.round(pixels * float(self.vocab_size - 1))
        tokens = tf.cast(tf.clip_by_value(levels, 0.0, float(self.vocab_size - 1)), tf.int32)
        positions = tf.range(start=0, limit=tf.shape(tokens)[-1], delta=1)
        # Position vectors have shape (seq, dim) and broadcast over the batch.
        return self.token_emb(tokens) + self.pos_emb(positions)


# Teacher forcing feeds pixels 0..SEQ_LEN-2 and predicts pixels 1..SEQ_LEN-1,
# so the model consumes SEQ_LEN - 1 positions, not SEQ_LEN.
inputs = layers.Input(shape=(SEQ_LEN - 1,))
x = PixelEmbedding(SEQ_LEN - 1, 256, EMBED_DIM)(inputs)
for _ in range(NUM_LAYERS):
    x = transformer_block(x, EMBED_DIM, NUM_HEADS)
outputs = layers.Dense(256, activation='softmax')(x)  # predict next pixel intensity
model = models.Model(inputs, outputs)
# Sparse cross-entropy expects integer class ids (0..255) as targets.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()

# -------------------------------
# Custom Data Generator for GPT
# -------------------------------
class GPTData(tf.keras.utils.Sequence):
    """Turn image batches into (input sequence, next-pixel class) pairs.

    Args:
        generator: Indexable batch source; `generator[idx]` must return an
            array of images scaled to [0, 1].
    """

    def __init__(self, generator, **kwargs):
        super().__init__(**kwargs)
        self.generator = generator

    def __len__(self):
        """Number of batches per epoch, taken from the wrapped generator."""
        return len(self.generator)

    def __getitem__(self, idx):
        """Return `(x, y)` for batch `idx`.

        `x` holds pixels 0..SEQ_LEN-2 as floats; `y` holds pixels
        1..SEQ_LEN-1 as integer intensity classes in 0..255.
        """
        batch = self.generator[idx]
        seq = preprocess(batch)
        x = seq[:, :-1]  # input sequence
        # The loss is sparse categorical cross-entropy over 256 classes, so
        # targets must be integer class ids. Float targets in [0, 1] would
        # almost all collapse to class 0.
        y = np.rint(seq[:, 1:] * 255.0).astype("int32")  # next-pixel prediction
        return x, y

    def on_epoch_end(self):
        """Forward the epoch-end hook to the wrapped generator."""
        self.generator.on_epoch_end()

# Adapter that serves (input, target) sequence pairs built from dog_gen.
gpt_gen = GPTData(generator=dog_gen)

# -------------------------------
# Train
# -------------------------------
model.fit(x=gpt_gen, epochs=EPOCHS)

# -------------------------------
# Generate new image
# -------------------------------
def generate_image(sample=False):
    """Generate one image pixel by pixel and display it.

    Args:
        sample: If True, draw each pixel from the predicted distribution;
            if False (default), take the most likely intensity.
    """
    # Full-length buffer: pixel 0 is the fixed seed (0.0) and the remaining
    # SEQ_LEN - 1 pixels are filled in one at a time.
    pixels = np.zeros((1, SEQ_LEN), dtype="float32")
    # Feed only as many positions as the model was built to accept.
    context_len = model.input_shape[1]

    for i in range(SEQ_LEN - 1):
        preds = model.predict(pixels[:, :context_len], verbose=0)
        probs = preds[0, i]  # distribution over pixel i + 1 given pixels <= i
        if sample:
            probs = probs.astype("float64")
            probs /= probs.sum()  # np.random.choice needs an exact sum of 1
            next_pixel = np.random.choice(len(probs), p=probs)
        else:
            next_pixel = np.argmax(probs)
        # Write to the position that was predicted; writing to i would
        # overwrite the input that produced this prediction.
        pixels[0, i + 1] = next_pixel / 255.0

    img = pixels.reshape((IMG_SIZE, IMG_SIZE, CHANNELS))
    plt.imshow(img)
    plt.axis('off')
    plt.show()

generate_image()


## <b>Diffusion Models</b>

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt

# -------------------------------
# Parameters
# -------------------------------
IMG_SIZE = 32
CHANNELS = 3
BATCH_SIZE = 16
EPOCHS = 50
TIMESTEPS = 100  # number of diffusion steps
TRAIN_DIR = r"C:\Users\ASUS\.cache\kagglehub\datasets\samuelcortinhas\cats-and-dogs-image-classification\versions\4\train\dogs"

# -------------------------------
# Load dog images
# -------------------------------
# Pixels are scaled to [0, 1]; the generator yields image batches only.
datagen = ImageDataGenerator(rescale=1.0 / 255)
dog_gen = datagen.flow_from_directory(
    directory=os.path.dirname(TRAIN_DIR),
    classes=['dogs'],
    class_mode=None,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# -------------------------------
# Simple UNet-like model for denoising
# -------------------------------
def get_unet(img_shape=(IMG_SIZE, IMG_SIZE, CHANNELS)):
    """Build a small conv denoiser that predicts the noise in a noisy image.

    Args:
        img_shape: (height, width, channels) of the noisy input image.

    Returns:
        Model taking `[noisy_image, timestep]` and returning a tensor shaped
        like the image that holds the predicted noise.
    """
    inputs = layers.Input(img_shape)
    t_input = layers.Input(shape=(1,))  # timestep input

    # The noise level differs per timestep, so the network must be told t.
    # It is normalized to (0, 1], embedded, and added to the feature maps.
    t_emb = layers.Rescaling(1.0 / TIMESTEPS)(t_input)
    t_emb = layers.Dense(128, activation='relu')(t_emb)
    t_emb = layers.Dense(128)(t_emb)
    t_emb = layers.Reshape((1, 1, 128))(t_emb)

    x = layers.Conv2D(64, 3, activation='relu', padding='same')(inputs)
    x = layers.Conv2D(128, 3, activation='relu', padding='same')(x)
    x = layers.Add()([x, t_emb])  # broadcast over height and width
    x = layers.MaxPooling2D()(x)
    x = layers.Conv2D(128, 3, activation='relu', padding='same')(x)
    x = layers.UpSampling2D()(x)
    x = layers.Conv2D(64, 3, activation='relu', padding='same')(x)
    # Linear output: the target is zero-mean Gaussian noise, which a sigmoid
    # (range 0..1) cannot represent.
    outputs = layers.Conv2D(CHANNELS, 3, padding='same')(x)
    return models.Model([inputs, t_input], outputs)

# Denoiser trained with mean squared error against the injected noise.
model = get_unet()
model.compile(loss='mse', optimizer='adam')
model.summary()

# -------------------------------
# Noise schedule
# -------------------------------
def add_noise(x, t):
    """Blend `x` with Gaussian noise for timestep `t`.

    The signal fraction falls linearly from 1 at t=0 to 0 at t=TIMESTEPS.

    Returns:
        Tuple `(noisy_x, noise)`, where `noise` is the sample that was mixed in.
    """
    eps = tf.random.normal(shape=tf.shape(x))
    keep = 1 - t / TIMESTEPS
    noisy = tf.sqrt(keep) * x + tf.sqrt(1 - keep) * eps
    return noisy, eps

# -------------------------------
# Training loop
# -------------------------------
# The directory iterator cycles forever, so `for batch in dog_gen` would never
# finish the first epoch. One epoch is therefore one pass of len(dog_gen) batches.
steps_per_epoch = len(dog_gen)
for epoch in range(EPOCHS):
    loss = float("nan")  # reported as-is if the dataset yields no batches
    for _ in range(steps_per_epoch):
        batch = next(dog_gen)
        # One random timestep per batch; the model learns to recover the noise.
        t = np.random.randint(1, TIMESTEPS+1)
        noisy_imgs, noise = add_noise(batch, t)
        loss = model.train_on_batch([noisy_imgs, np.full((len(batch),1), t)], noise)
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {loss}")

# -------------------------------
# Sampling / Generation
# -------------------------------
def generate_image():
    """Sample one image by deterministic reverse diffusion and display it.

    Starting from Gaussian noise, each step estimates the clean image from
    the predicted noise and re-noises that estimate to the previous, less
    noisy timestep, using the same linear schedule as training.
    """
    img = tf.random.normal((1, IMG_SIZE, IMG_SIZE, CHANNELS))
    # Start at TIMESTEPS - 1: at t = TIMESTEPS the signal fraction is exactly
    # 0, and dividing by its square root would turn the image into NaN/inf.
    for t in reversed(range(1, TIMESTEPS)):
        alpha_t = 1.0 - t / TIMESTEPS
        alpha_prev = 1.0 - (t - 1) / TIMESTEPS
        pred_noise = model.predict([img, np.array([[t]])], verbose=0)

        # Estimate of the clean image implied by the predicted noise,
        # clipped to the [0, 1] range of the training data.
        x0 = (img - (1.0 - alpha_t) ** 0.5 * pred_noise) / alpha_t ** 0.5
        x0 = tf.clip_by_value(x0, 0.0, 1.0)

        # Move to step t - 1 by re-noising the estimate with the same noise.
        noise_scale = max(1.0 - alpha_prev, 0.0) ** 0.5
        img = alpha_prev ** 0.5 * x0 + noise_scale * pred_noise

    plt.imshow(np.clip(np.asarray(img[0]), 0, 1))
    plt.axis('off')
    plt.show()

generate_image()


## <b>Latent Diffusion Models (LDMs)</b>

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt

# -------------------------------
# Parameters
# -------------------------------
IMG_SIZE = 32
LATENT_DIM = 16  # deliberately small latent vector
CHANNELS = 3
BATCH_SIZE = 16
EPOCHS = 30
TIMESTEPS = 50   # number of diffusion steps
TRAIN_DIR = r"C:\Users\ASUS\.cache\kagglehub\datasets\samuelcortinhas\cats-and-dogs-image-classification\versions\4\train\dogs"

# -------------------------------
# Load dog images
# -------------------------------
# Pixels are scaled to [0, 1]; the generator yields image batches only.
datagen = ImageDataGenerator(rescale=1.0 / 255)
dog_gen = datagen.flow_from_directory(
    directory=os.path.dirname(TRAIN_DIR),
    classes=['dogs'],
    class_mode=None,
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# -------------------------------
# Encoder: Pixel -> Latent
# -------------------------------
def build_encoder():
    """Return the encoder: image -> LATENT_DIM-dimensional latent vector."""
    image_in = layers.Input(shape=(IMG_SIZE, IMG_SIZE, CHANNELS))
    features = image_in
    # Two conv + max-pool stages, each halving the resolution.
    for n_filters in (64, 128):
        features = layers.Conv2D(n_filters, 3, activation='relu', padding='same')(features)
        features = layers.MaxPooling2D()(features)
    flat = layers.Flatten()(features)
    latent = layers.Dense(LATENT_DIM)(flat)
    return models.Model(image_in, latent, name='encoder')

# -------------------------------
# Decoder: Latent -> Pixel
# -------------------------------
def build_decoder():
    """Return the decoder: latent vector -> image with pixels in [0, 1]."""
    latent_in = layers.Input(shape=(LATENT_DIM,))
    features = layers.Dense(8*8*128, activation='relu')(latent_in)
    features = layers.Reshape((8,8,128))(features)
    # Two stride-2 transposed convolutions: 8 -> 16 -> 32 pixels per side.
    for n_filters in (128, 64):
        features = layers.Conv2DTranspose(n_filters, 3, strides=2, padding='same', activation='relu')(features)
    image_out = layers.Conv2D(CHANNELS, 3, activation='sigmoid', padding='same')(features)
    return models.Model(latent_in, image_out, name='decoder')

encoder = build_encoder()
decoder = build_decoder()

# The latent space is only meaningful once encoder and decoder have been
# trained to reconstruct images. With random weights the diffusion model
# would learn latents that decode to noise, so train them as an autoencoder.
autoencoder = models.Model(encoder.input, decoder(encoder.output), name='autoencoder')
autoencoder.compile(optimizer='adam', loss='mse')
for ae_epoch in range(EPOCHS):
    ae_loss = float("nan")
    # dog_gen cycles forever, so one epoch is bounded to a single pass.
    for _ in range(len(dog_gen)):
        ae_batch = next(dog_gen)
        ae_loss = autoencoder.train_on_batch(ae_batch, ae_batch)
    print(f"Autoencoder epoch {ae_epoch+1}/{EPOCHS} - Loss: {ae_loss}")

# -------------------------------
# UNet-like Denoiser in latent space
# -------------------------------
def get_latent_unet(latent_dim):
    """Build an MLP that predicts the noise contained in a noisy latent.

    Args:
        latent_dim: Size of the latent vector.

    Returns:
        Model taking `[noisy_latent, timestep]` and returning the predicted
        noise, a vector of size `latent_dim`.
    """
    latent_input = layers.Input(shape=(latent_dim,))
    t_input = layers.Input(shape=(1,))
    # The noise level depends on t, so the normalized timestep is fed to the
    # network next to the latent instead of being left unconnected.
    t_scaled = layers.Rescaling(1.0 / TIMESTEPS)(t_input)
    x = layers.Concatenate()([latent_input, t_scaled])
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dense(latent_dim)(x)  # predict noise
    return models.Model([latent_input, t_input], x, name='latent_unet')

# Latent-space denoiser, trained with MSE against the injected noise.
latent_unet = get_latent_unet(latent_dim=LATENT_DIM)
latent_unet.compile(loss='mse', optimizer='adam')

# -------------------------------
# Diffusion helper
# -------------------------------
def add_noise(latent, t):
    """Blend `latent` with Gaussian noise for timestep `t`.

    The signal fraction falls linearly from 1 at t=0 to 0 at t=TIMESTEPS.

    Returns:
        Tuple `(noisy_latent, noise)`, where `noise` is the sample mixed in.
    """
    eps = tf.random.normal(shape=tf.shape(latent))
    keep = 1 - t / TIMESTEPS
    noisy = tf.sqrt(keep) * latent + tf.sqrt(1-keep) * eps
    return noisy, eps

# -------------------------------
# Training loop
# -------------------------------
# The directory iterator cycles forever, so `for batch in dog_gen` would never
# finish the first epoch. One epoch is therefore one pass of len(dog_gen) batches.
steps_per_epoch = len(dog_gen)
for epoch in range(EPOCHS):
    loss = float("nan")  # reported as-is if the dataset yields no batches
    for _ in range(steps_per_epoch):
        batch = next(dog_gen)
        # Diffusion runs on encoder latents, not on pixels.
        latent = encoder.predict(batch, verbose=0)
        t = np.random.randint(1, TIMESTEPS+1)
        noisy_latent, noise = add_noise(latent, t)
        loss = latent_unet.train_on_batch([noisy_latent, np.full((len(batch),1), t)], noise)
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {loss}")

# -------------------------------
# Generate new dog image
# -------------------------------
def generate_image():
    """Sample a latent by reverse diffusion, decode it, and show the image.

    Each step estimates the clean latent from the predicted noise and
    re-noises that estimate to the previous, less noisy timestep, using the
    same linear schedule as training.
    """
    latent = tf.random.normal((1, LATENT_DIM))
    # Start at TIMESTEPS - 1: at t = TIMESTEPS the signal fraction is exactly
    # 0, and dividing by its square root would turn the latent into NaN/inf.
    for t in reversed(range(1, TIMESTEPS)):
        alpha_t = 1.0 - t / TIMESTEPS
        alpha_prev = 1.0 - (t - 1) / TIMESTEPS
        pred_noise = latent_unet.predict([latent, np.array([[t]])], verbose=0)

        # Clean-latent estimate implied by the predicted noise.
        z0 = (latent - (1.0 - alpha_t) ** 0.5 * pred_noise) / alpha_t ** 0.5

        # Move to step t - 1 by re-noising the estimate with the same noise.
        noise_scale = max(1.0 - alpha_prev, 0.0) ** 0.5
        latent = alpha_prev ** 0.5 * z0 + noise_scale * pred_noise

    img = decoder.predict(np.asarray(latent), verbose=0)
    plt.imshow(np.clip(img[0],0,1))
    plt.axis('off')
    plt.show()

generate_image()


## <b>GAN (DCGAN / cGAN / StyleGAN / CycleGAN)</b>

In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt

# -------------------------------
# Parameters
# -------------------------------
IMG_SIZE = 64
CHANNELS = 3
BATCH_SIZE = 32
EPOCHS = 2000
NOISE_DIM = 100
TRAIN_DIR = r"C:\Users\ASUS\.cache\kagglehub\datasets\samuelcortinhas\cats-and-dogs-image-classification\versions\4\train\dogs"

# -------------------------------
# Load dog images
# -------------------------------
def _scale_to_tanh_range(img):
    """Map one image from the [0, 255] pixel range to [-1, 1]."""
    return img / 127.5 - 1.0


# `rescale` is a single multiplicative factor, so `rescale=1./127.5 - 1`
# evaluates to roughly -0.992 and merely negates the pixels. The shift to
# [-1, 1] (the range of the generator's tanh output) needs a function.
datagen = ImageDataGenerator(preprocessing_function=_scale_to_tanh_range)
dog_gen = datagen.flow_from_directory(
    os.path.dirname(TRAIN_DIR),
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    classes=['dogs'],
    class_mode=None,  # yield image batches only, no labels
    shuffle=True
)

# -------------------------------
# Generator
# -------------------------------
def build_generator():
    """Return the DCGAN generator: noise vector -> image with tanh output.

    The noise is projected to an 8x8x256 feature map and upsampled three
    times by stride-2 transposed convolutions.
    """
    stack = [
        layers.Dense(8*8*256, use_bias=False, input_shape=(NOISE_DIM,)),
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Reshape((8,8,256)),
        layers.Conv2DTranspose(128, 5, strides=2, padding='same', activation='relu'),
        layers.Conv2DTranspose(64, 5, strides=2, padding='same', activation='relu'),
        layers.Conv2DTranspose(CHANNELS, 5, strides=2, padding='same', activation='tanh'),
    ]
    return models.Sequential(stack)

# -------------------------------
# Discriminator
# -------------------------------
def build_discriminator():
    """Return the DCGAN discriminator: image -> one unbounded logit.

    No sigmoid is applied to the output; the final Dense layer is linear.
    """
    stack = [
        layers.Conv2D(64, 5, strides=2, padding='same', input_shape=(IMG_SIZE,IMG_SIZE,CHANNELS)),
        layers.LeakyReLU(),
        layers.Dropout(0.3),
        layers.Conv2D(128, 5, strides=2, padding='same'),
        layers.LeakyReLU(),
        layers.Dropout(0.3),
        layers.Flatten(),
        layers.Dense(1),
    ]
    return models.Sequential(stack)

generator, discriminator = build_generator(), build_discriminator()

# -------------------------------
# Optimizers and loss
# -------------------------------
# from_logits=True: the discriminator emits raw logits, not probabilities.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# Separate Adam instances so each network keeps its own optimizer state.
gen_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
disc_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# -------------------------------
# Training step
# -------------------------------
@tf.function
def train_step(images):
    """Run one simultaneous generator/discriminator update.

    Args:
        images: Batch of real images.

    Returns:
        Tuple `(gen_loss, disc_loss)` of scalar tensors.
    """
    # Generate as many fakes as there are real images; the last batch of a
    # pass over the directory can be smaller than BATCH_SIZE.
    batch_size = tf.shape(images)[0]
    noise = tf.random.normal(tf.stack([batch_size, NOISE_DIM]))

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator(noise, training=True)

        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)

        # Generator: wants fakes classified as real (label 1).
        gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)
        # Discriminator: real -> 1, fake -> 0, averaged over both halves.
        disc_loss = (cross_entropy(tf.ones_like(real_output), real_output) +
                     cross_entropy(tf.zeros_like(fake_output), fake_output)) * 0.5

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    gen_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
    return gen_loss, disc_loss

# -------------------------------
# Training loop
# -------------------------------
# Each "epoch" is a single update on the next batch from the iterator.
for epoch in range(EPOCHS):
    gen_loss, disc_loss = train_step(next(dog_gen))

    if (epoch+1) % 100 != 0:
        continue

    print(f"Epoch {epoch+1}, Gen Loss: {gen_loss:.4f}, Disc Loss: {disc_loss:.4f}")
    # Show one sample, mapped from the tanh range [-1, 1] back to [0, 1].
    noise = tf.random.normal([1, NOISE_DIM])
    img = generator(noise, training=False)
    plt.imshow((img[0]+1)/2)
    plt.axis('off')
    plt.show()

## <b>Text-to-Image cGAN (from scratch)</b>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import matplotlib.pyplot as plt
import os

# -------------------------------
# Parameters
# -------------------------------
IMG_SIZE = 64
CHANNELS = 3
BATCH_SIZE = 32
EPOCHS = 2000
NOISE_DIM = 100
TRAIN_DIR = r"C:\Users\ASUS\.cache\kagglehub\datasets\samuelcortinhas\cats-and-dogs-image-classification\versions\4\train\dogs"

# -------------------------------
# Load dog images
# -------------------------------
def _scale_to_tanh_range(img):
    """Map one image from the [0, 255] pixel range to [-1, 1]."""
    return img / 127.5 - 1.0


# `rescale` is a single multiplicative factor, so `rescale=1./127.5 - 1`
# evaluates to roughly -0.992 and merely negates the pixels. The shift to
# [-1, 1] (the range of the generator's tanh output) needs a function.
datagen = ImageDataGenerator(preprocessing_function=_scale_to_tanh_range)
dog_gen = datagen.flow_from_directory(
    os.path.dirname(TRAIN_DIR),
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    classes=['dogs'],
    class_mode=None,  # yield image batches only, no labels
    shuffle=True
)

# -------------------------------
# Generator (CNN-based)
# -------------------------------
def build_generator():
    """Return the CNN generator: noise vector -> image with tanh output.

    The noise is projected to an 8x8x256 feature map and upsampled three
    times by stride-2 transposed convolutions.
    """
    stack = [
        layers.Dense(8*8*256, use_bias=False, input_shape=(NOISE_DIM,)),
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Reshape((8,8,256)),
        layers.Conv2DTranspose(128, 5, strides=2, padding='same', activation='relu'),
        layers.Conv2DTranspose(64, 5, strides=2, padding='same', activation='relu'),
        layers.Conv2DTranspose(CHANNELS, 5, strides=2, padding='same', activation='tanh'),
    ]
    return models.Sequential(stack)

# -------------------------------
# Vision Transformer (ViT) Discriminator
# -------------------------------
class _AddPositionEmbedding(layers.Layer):
    """Add a learned embedding of each token's index to the token sequence."""

    def __init__(self, num_tokens, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_tokens = num_tokens
        self.pos_emb = layers.Embedding(input_dim=num_tokens, output_dim=embed_dim)

    def call(self, tokens):
        positions = tf.range(start=0, limit=self.num_tokens, delta=1)
        # (num_tokens, embed_dim) broadcasts over the batch dimension.
        return tokens + self.pos_emb(positions)


def build_vit_discriminator(img_size=IMG_SIZE, channels=CHANNELS, patch_size=8, embed_dim=64, num_heads=4, mlp_dim=128):
    """Build a small Vision-Transformer discriminator that outputs one logit.

    Args:
        img_size: Height and width of the square input image.
        channels: Number of image channels.
        patch_size: Side length of the non-overlapping patches.
        embed_dim: Size of each patch embedding.
        num_heads: Number of attention heads per encoder block.
        mlp_dim: Hidden width of each block's MLP.

    Returns:
        Model mapping an image batch to one unbounded logit per image.
    """
    input_img = layers.Input(shape=(img_size,img_size,channels))
    # Create patches: a strided convolution embeds each patch_size x patch_size tile.
    patches = layers.Conv2D(embed_dim, patch_size, strides=patch_size, padding='valid')(input_img)
    # Flatten patches into a token sequence
    patches = layers.Reshape((-1, embed_dim))(patches)
    # Add positional embeddings. This is done inside a layer so the embedding
    # table is a weight of the model. Calling an Embedding on a concrete
    # tf.range while building the graph yields a fixed constant that the
    # optimizer never updates.
    num_patches = patches.shape[1]
    x = _AddPositionEmbedding(num_patches, embed_dim)(patches)
    # Transformer encoder (pre-norm residual blocks)
    for _ in range(2):
        # Layer norm
        x1 = layers.LayerNormalization(epsilon=1e-6)(x)
        # Multi-head attention
        attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(x1,x1)
        x = layers.Add()([x, attn_output])
        # MLP
        x1 = layers.LayerNormalization(epsilon=1e-6)(x)
        x1 = layers.Dense(mlp_dim, activation='relu')(x1)
        x1 = layers.Dense(embed_dim)(x1)
        x = layers.Add()([x, x1])
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    x = layers.Flatten()(x)
    out = layers.Dense(1)(x)
    return models.Model(input_img, out)

generator, discriminator = build_generator(), build_vit_discriminator()

# -------------------------------
# Optimizers and loss
# -------------------------------
# from_logits=True: the discriminator emits raw logits, not probabilities.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# Separate Adam instances so each network keeps its own optimizer state.
gen_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
disc_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# -------------------------------
# Training step
# -------------------------------
@tf.function
def train_step(images):
    """Run one simultaneous generator/discriminator update.

    Args:
        images: Batch of real images.

    Returns:
        Tuple `(gen_loss, disc_loss)` of scalar tensors.
    """
    # Generate as many fakes as there are real images; the last batch of a
    # pass over the directory can be smaller than BATCH_SIZE.
    batch_size = tf.shape(images)[0]
    noise = tf.random.normal(tf.stack([batch_size, NOISE_DIM]))

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator(noise, training=True)

        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)

        # Generator: wants fakes classified as real (label 1).
        gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)
        # Discriminator: real -> 1, fake -> 0, averaged over both halves.
        disc_loss = (cross_entropy(tf.ones_like(real_output), real_output) +
                     cross_entropy(tf.zeros_like(fake_output), fake_output)) * 0.5

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    gen_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
    return gen_loss, disc_loss

# -------------------------------
# Training loop
# -------------------------------
# Each "epoch" is a single update on the next batch from the iterator.
for epoch in range(EPOCHS):
    gen_loss, disc_loss = train_step(next(dog_gen))

    if (epoch+1) % 100 != 0:
        continue

    print(f"Epoch {epoch+1}, Gen Loss: {gen_loss:.4f}, Disc Loss: {disc_loss:.4f}")
    # Show one sample, mapped from the tanh range [-1, 1] back to [0, 1].
    noise = tf.random.normal([1, NOISE_DIM])
    img = generator(noise, training=False)
    plt.imshow((img[0]+1)/2)
    plt.axis('off')
    plt.show()

## <b>CNN + ViT</b>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import matplotlib.pyplot as plt
import os

# -------------------------------
# Parameters
# -------------------------------
IMG_SIZE = 64
CHANNELS = 3
BATCH_SIZE = 32
EPOCHS = 2000
NOISE_DIM = 100
TRAIN_DIR = r"C:\Users\ASUS\.cache\kagglehub\datasets\samuelcortinhas\cats-and-dogs-image-classification\versions\4\train\dogs"

# -------------------------------
# Load dog images
# -------------------------------
def _scale_to_tanh_range(img):
    """Map one image from the [0, 255] pixel range to [-1, 1]."""
    return img / 127.5 - 1.0


# `rescale` is a single multiplicative factor, so `rescale=1./127.5 - 1`
# evaluates to roughly -0.992 and merely negates the pixels. The shift to
# [-1, 1] (the range of the generator's tanh output) needs a function.
datagen = ImageDataGenerator(preprocessing_function=_scale_to_tanh_range)
dog_gen = datagen.flow_from_directory(
    os.path.dirname(TRAIN_DIR),
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    classes=['dogs'],
    class_mode=None,  # yield image batches only, no labels
    shuffle=True
)

# -------------------------------
# Generator (CNN-based)
# -------------------------------
def build_generator():
    """Return the CNN generator: noise vector -> image with tanh output.

    The noise is projected to an 8x8x256 feature map and upsampled three
    times by stride-2 transposed convolutions.
    """
    stack = [
        layers.Dense(8*8*256, use_bias=False, input_shape=(NOISE_DIM,)),
        layers.BatchNormalization(),
        layers.LeakyReLU(),
        layers.Reshape((8,8,256)),
        layers.Conv2DTranspose(128, 5, strides=2, padding='same', activation='relu'),
        layers.Conv2DTranspose(64, 5, strides=2, padding='same', activation='relu'),
        layers.Conv2DTranspose(CHANNELS, 5, strides=2, padding='same', activation='tanh'),
    ]
    return models.Sequential(stack)

# -------------------------------
# Discriminator (CNN + ViT hybrid)
# -------------------------------
class _AddPositionEmbedding(layers.Layer):
    """Add a learned positional embedding to a (batch, tokens, dim) sequence.

    Wrapping the Embedding in a layer keeps its weights tracked (and therefore
    trained) by the enclosing model; calling an Embedding directly on a
    concrete `tf.range` tensor while building a functional model does not.
    """

    def __init__(self, num_tokens, token_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_tokens = num_tokens
        self.token_dim = token_dim
        self.pos_emb = layers.Embedding(input_dim=num_tokens, output_dim=token_dim)

    def call(self, tokens):
        positions = tf.range(start=0, limit=self.num_tokens, delta=1)
        # (num_tokens, token_dim) broadcasts over the batch dimension.
        return tokens + self.pos_emb(positions)

    def get_config(self):
        config = super().get_config()
        config.update({"num_tokens": self.num_tokens, "token_dim": self.token_dim})
        return config


def build_discriminator(img_size=IMG_SIZE, channels=CHANNELS, patch_size=8, embed_dim=64, num_heads=4, mlp_dim=128):
    """Build the hybrid CNN + ViT discriminator.

    A small CNN downsamples the image by a factor of 4, the feature map is cut
    into non-overlapping patches, and two pre-norm transformer encoder blocks
    process the patch sequence before a single-logit head.

    Args:
        img_size: Height/width of the (square) input image in pixels.
        channels: Number of input channels.
        patch_size: Patch side length in *image* pixels. Must divide
            `img_size` and be a multiple of 4 (the CNN downsampling factor).
        embed_dim: Number of channels produced by the CNN feature extractor.
        num_heads: Number of attention heads per transformer block.
        mlp_dim: Hidden width of the transformer MLP.

    Returns:
        A Keras Model mapping an image batch to one raw logit per image
        (no sigmoid), to be used with a `from_logits=True` loss.

    Raises:
        ValueError: If `patch_size` is incompatible with `img_size` or with
            the CNN downsampling factor.
    """
    cnn_stride = 4  # two stride-2 convolutions below
    if img_size % patch_size != 0 or patch_size % cnn_stride != 0:
        raise ValueError(
            f"patch_size={patch_size} must divide img_size={img_size} "
            f"and be a multiple of {cnn_stride}"
        )

    input_img = layers.Input(shape=(img_size,img_size,channels))

    # CNN feature extractor: img_size -> img_size/2 -> img_size/4
    x = layers.Conv2D(64, 5, strides=2, padding='same', activation='relu')(input_img)
    x = layers.Conv2D(128, 5, strides=2, padding='same', activation='relu')(x)
    x = layers.Conv2D(embed_dim, 3, strides=1, padding='same', activation='relu')(x)

    # Patch geometry measured on the downsampled feature map. The previous
    # code sized patches as if the feature map still had img_size resolution,
    # so the Reshape target did not match the tensor and the model could not
    # be built.
    grid = img_size // patch_size      # patches per side
    cell = patch_size // cnn_stride    # feature-map cells per patch side
    num_patches = grid * grid
    patch_dim = embed_dim * cell * cell

    # Split into (grid, cell, grid, cell, C), bring the two grid axes together,
    # then flatten each patch so every token covers a contiguous square region.
    x = layers.Reshape((grid, cell, grid, cell, embed_dim))(x)
    x = layers.Permute((1, 3, 2, 4, 5))(x)
    x = layers.Reshape((num_patches, patch_dim))(x)

    # Learned positional embedding (weights tracked by the model)
    x = _AddPositionEmbedding(num_patches, patch_dim)(x)

    # Transformer encoder
    for _ in range(2):
        # Self-attention sub-block with residual connection
        x1 = layers.LayerNormalization(epsilon=1e-6)(x)
        attn_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=patch_dim)(x1, x1)
        x = layers.Add()([x, attn_output])
        # MLP sub-block with residual connection
        x1 = layers.LayerNormalization(epsilon=1e-6)(x)
        x1 = layers.Dense(mlp_dim, activation='relu')(x1)
        x1 = layers.Dense(patch_dim)(x1)
        x = layers.Add()([x, x1])

    x = layers.LayerNormalization(epsilon=1e-6)(x)
    x = layers.Flatten()(x)
    out = layers.Dense(1)(x)

    return models.Model(input_img, out)

# Instantiate both networks (generator first, then discriminator).
generator, discriminator = build_generator(), build_discriminator()

# -------------------------------
# Optimizers and loss
# -------------------------------
# from_logits=True: the loss applies the sigmoid itself, so it expects raw
# discriminator scores rather than probabilities.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# Separate Adam instances so each network keeps its own optimizer state.
gen_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
disc_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# -------------------------------
# Training step
# -------------------------------
@tf.function
def train_step(images):
    """Run one simultaneous generator/discriminator update.

    Args:
        images: Batch of real images; the leading dimension is the batch size.

    Returns:
        Tuple `(gen_loss, disc_loss)` of scalar loss tensors for this step.
    """
    # Match the fake batch to the real one: the data iterator yields a smaller
    # final batch, and a hard-coded BATCH_SIZE would make the discriminator
    # see unequal numbers of real and fake samples on that step.
    batch_size = tf.shape(images)[0]
    noise = tf.random.normal([batch_size, NOISE_DIM])

    # Two tapes so each network's gradients come from its own loss.
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator(noise, training=True)

        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)

        # Generator wants fakes classified as real (label 1).
        gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)
        # Discriminator: real -> 1, fake -> 0, averaged over the two terms.
        disc_loss = (cross_entropy(tf.ones_like(real_output), real_output) +
                     cross_entropy(tf.zeros_like(fake_output), fake_output)) * 0.5

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    gen_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
    return gen_loss, disc_loss

# -------------------------------
# Training loop
# -------------------------------
for epoch in range(EPOCHS):
    # Each "epoch" here is a single optimisation step on one batch pulled
    # from the image iterator.
    batch = next(iter(dog_gen))
    gen_loss, disc_loss = train_step(batch)

    # Only report every 100th step.
    if (epoch+1) % 100 != 0:
        continue

    print(f"Epoch {epoch+1}, Gen Loss: {gen_loss:.4f}, Disc Loss: {disc_loss:.4f}")
    # Draw one sample from the generator in inference mode
    noise = tf.random.normal([1, NOISE_DIM])
    img = generator(noise, training=False)
    # Shift the tanh output from [-1, 1] to [0, 1] for display
    plt.imshow((img[0]+1)/2)
    plt.axis('off')
    plt.show()

## <b>Fine-tuning pretrained model</b>

In [None]:
# Install dependencies first (run once)
# pip install diffusers transformers accelerate torch

import torch
from diffusers import StableDiffusionPipeline
import matplotlib.pyplot as plt

# -------------------------------
# Load pre-trained Stable Diffusion model
# -------------------------------
model_id = "runwayml/stable-diffusion-v1-5"  # lightweight, free-to-use version
# Choose the precision from the device: half precision is only appropriate on
# GPU. Loading float16 weights and then falling back to CPU either fails on
# unsupported half-precision ops or runs extremely slowly.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if device == "cuda" else torch.float32
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype)
pipe = pipe.to(device)

# -------------------------------
# Text prompt -> Image
# -------------------------------
prompt = "A cute dog playing in the garden, photorealistic"
image = pipe(prompt, guidance_scale=7.5).images[0]

# Display the generated image
plt.imshow(image)
plt.axis("off")
plt.show()

# Save the image
image.save("generated_dog.png")

## <b>Text-to-Image Fine-Tuned Pretrained (Stable Diffusion)</b>

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import matplotlib.pyplot as plt
import os

# -------------------------------
# Parameters
# -------------------------------
IMG_SIZE = 64
CHANNELS = 3
BATCH_SIZE = 16
EPOCHS = 1000
NOISE_DIM = 100
MAX_SEQ_LEN = 10
EMBED_DIM = 50
TRAIN_DIR = r"C:\Users\ASUS\.cache\kagglehub\datasets\samuelcortinhas\cats-and-dogs-image-classification\versions\4\train\dogs"

# -------------------------------
# Load dog images
# -------------------------------
# `rescale` is a single multiplier, so `rescale=1./127.5 - 1` evaluated to
# roughly -0.992 and produced pixel values in about [-253, 0]. The generator
# ends in tanh, so real images must live in [-1, 1]: apply the affine map
# x / 127.5 - 1 per image instead.
datagen = ImageDataGenerator(preprocessing_function=lambda img: img / 127.5 - 1.0)
dog_gen = datagen.flow_from_directory(
    os.path.dirname(TRAIN_DIR),
    target_size=(IMG_SIZE, IMG_SIZE),
    batch_size=BATCH_SIZE,
    classes=['dogs'],
    class_mode=None,
    shuffle=True
)

# -------------------------------
# Text prompts (for small example)
# -------------------------------
# Normally, you would have one prompt per image
texts = ["dog"] * 1000  # simple repetitive label
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# Pad every token sequence on the right to MAX_SEQ_LEN.
text_data = pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')
# +1 because Tokenizer word indices start at 1; index 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1

# -------------------------------
# Generator (CNN + Text Embedding)
# -------------------------------
def build_generator():
    """Build the text-conditioned generator.

    Inputs are a NOISE_DIM noise vector and a MAX_SEQ_LEN token-id sequence.
    The tokens are embedded and summarised by an LSTM, concatenated with the
    noise, projected to an 8x8x256 map and upsampled by three stride-2
    transposed convolutions to a 64x64xCHANNELS tanh image.
    """
    noise_input = layers.Input(shape=(NOISE_DIM,))
    text_input = layers.Input(shape=(MAX_SEQ_LEN,))

    # Text branch: token ids -> embeddings -> single 128-d summary vector
    token_embedding = layers.Embedding(vocab_size, EMBED_DIM, input_length=MAX_SEQ_LEN)
    text_features = layers.LSTM(128)(token_embedding(text_input))

    # Fuse the condition with the noise, then project to the seed feature map
    h = layers.Concatenate()([noise_input, text_features])
    h = layers.Dense(8*8*256, use_bias=False)(h)
    h = layers.BatchNormalization()(h)
    h = layers.LeakyReLU()(h)
    h = layers.Reshape((8,8,256))(h)

    # Upsampling stack: 8 -> 16 -> 32 -> 64
    for filters in (128, 64):
        h = layers.Conv2DTranspose(filters, 5, strides=2, padding='same', activation='relu')(h)
    image = layers.Conv2DTranspose(CHANNELS, 5, strides=2, padding='same', activation='tanh')(h)

    return models.Model([noise_input, text_input], image)

# -------------------------------
# Discriminator (CNN + conditional)
# -------------------------------
def build_discriminator():
    """Build the text-conditioned discriminator.

    Inputs are an IMG_SIZE x IMG_SIZE x CHANNELS image and a MAX_SEQ_LEN
    token-id sequence. The image is downsampled by two stride-2 convolutions;
    the text is embedded, summarised by an LSTM, and tiled over the resulting
    feature map so both can be concatenated channel-wise.

    Returns:
        A Keras Model mapping `[image, text]` to one raw logit per sample
        (no sigmoid), to be used with a `from_logits=True` loss.
    """
    img_input = layers.Input(shape=(IMG_SIZE, IMG_SIZE, CHANNELS))
    text_input = layers.Input(shape=(MAX_SEQ_LEN,))

    # Text embedding -> one 128-d vector per sample
    x_text = layers.Embedding(vocab_size, EMBED_DIM, input_length=MAX_SEQ_LEN)(text_input)
    x_text = layers.LSTM(128)(x_text)

    # CNN on image
    x_img = layers.Conv2D(64, 5, strides=2, padding='same', activation='relu')(img_input)
    x_img = layers.Conv2D(128, 5, strides=2, padding='same', activation='relu')(x_img)

    # Tile the text vector over the *feature map* grid. The previous code tiled
    # it to IMG_SIZE x IMG_SIZE, which does not match the downsampled image
    # features and made the channel-wise Concatenate fail at build time.
    feat_h, feat_w = x_img.shape[1], x_img.shape[2]
    x_text = layers.RepeatVector(feat_h * feat_w)(x_text)
    x_text = layers.Reshape((feat_h, feat_w, 128))(x_text)

    # Concatenate image + text features
    x = layers.Concatenate()([x_img, x_text])
    x = layers.Flatten()(x)
    x = layers.Dense(1)(x)

    return models.Model([img_input, text_input], x)

# -------------------------------
# Build models
# -------------------------------
# Instantiate both networks (generator first, then discriminator).
generator, discriminator = build_generator(), build_discriminator()

# -------------------------------
# Optimizers & loss
# -------------------------------
# from_logits=True: the loss applies the sigmoid itself, so it expects raw
# discriminator scores rather than probabilities.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# Separate Adam instances so each network keeps its own optimizer state.
gen_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
disc_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

# -------------------------------
# Training step
# -------------------------------
@tf.function
def train_step(images, text_seq):
    """Run one simultaneous update of the text-conditioned GAN.

    Args:
        images: Batch of real images; the leading dimension is the batch size.
        text_seq: Token-id sequences conditioning the batch. Must contain at
            least as many rows as `images`; extra rows are ignored.

    Returns:
        Tuple `(gen_loss, disc_loss)` of scalar loss tensors for this step.
    """
    # The data iterator yields a smaller final batch. Size the noise and the
    # text condition from the real batch so every multi-input model call
    # receives tensors with the same leading dimension.
    batch_size = tf.shape(images)[0]
    text_seq = text_seq[:batch_size]
    noise = tf.random.normal([batch_size, NOISE_DIM])

    # Two tapes so each network's gradients come from its own loss.
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator([noise, text_seq], training=True)

        real_output = discriminator([images, text_seq], training=True)
        fake_output = discriminator([generated_images, text_seq], training=True)

        # Generator wants fakes classified as real (label 1).
        gen_loss = cross_entropy(tf.ones_like(fake_output), fake_output)
        # Discriminator: real -> 1, fake -> 0, averaged over the two terms.
        disc_loss = (cross_entropy(tf.ones_like(real_output), real_output) +
                     cross_entropy(tf.zeros_like(fake_output), fake_output)) * 0.5

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    gen_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
    return gen_loss, disc_loss

# -------------------------------
# Training loop
# -------------------------------
for epoch in range(EPOCHS):
    # Each "epoch" here is a single optimisation step on one batch pulled
    # from the image iterator.
    batch_idx, batch = next(enumerate(dog_gen))
    text_batch = text_data[:BATCH_SIZE]  # use same text for simplicity
    gen_loss, disc_loss = train_step(batch, text_batch)

    # Only report every 100th step.
    if (epoch+1) % 100 != 0:
        continue

    print(f"Epoch {epoch+1}, Gen Loss: {gen_loss:.4f}, Disc Loss: {disc_loss:.4f}")
    # Draw one sample conditioned on the first prompt, in inference mode
    noise = tf.random.normal([1, NOISE_DIM])
    sample_img = generator([noise, text_data[:1]], training=False)
    # Shift the tanh output from [-1, 1] to [0, 1] for display
    plt.imshow((sample_img[0]+1)/2)
    plt.axis('off')
    plt.show()

In [None]:
# Install dependencies if not already installed:
# pip install diffusers transformers accelerate datasets safetensors torch torchvision

import torch
# NOTE: `StableDiffusionTrainer` was dropped from this import. diffusers does
# not export that name, so the import raised ImportError before anything else
# in the cell could run, and nothing below uses it.
from diffusers import StableDiffusionPipeline
from diffusers import UNet2DConditionModel, AutoencoderKL, PNDMScheduler
from transformers import CLIPTextModel, CLIPTokenizer
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os

# -------------------------------
# Parameters
# -------------------------------
DATA_DIR = r"C:\Users\ASUS\.cache\kagglehub\datasets\samuelcortinhas\cats-and-dogs-image-classification\versions\4\train\dogs"
OUTPUT_DIR = "./fine_tuned_dogs"
BATCH_SIZE = 2
EPOCHS = 5
IMG_SIZE = 256
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
PROMPT = "A cute dog"  # text prompt for conditioning

# -------------------------------
# Custom dataset
# -------------------------------
class DogDataset(Dataset):
    """Image-folder dataset that pairs every image with the global PROMPT.

    Args:
        img_dir: Directory scanned (non-recursively) for image files.
        transform: Optional callable applied to each loaded RGB PIL image.

    Each item is a dict `{"image": <transformed image>, "prompt": PROMPT}`.
    """

    # Matched case-insensitively so files such as "IMG_1.JPG" are not dropped.
    _EXTENSIONS = (".jpg", ".jpeg", ".png")

    def __init__(self, img_dir, transform=None):
        self.img_dir = img_dir
        # Sorted so that index -> file is reproducible; os.listdir returns
        # entries in arbitrary order.
        self.images = sorted(
            os.path.join(img_dir, f)
            for f in os.listdir(img_dir)
            if f.lower().endswith(self._EXTENSIONS)
        )
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        # convert() returns an independent, fully loaded image, so the file
        # handle can be released right away instead of lingering until GC.
        with Image.open(self.images[idx]) as raw:
            img = raw.convert("RGB")
        if self.transform:
            img = self.transform(img)
        return {"image": img, "prompt": PROMPT}

# -------------------------------
# Transform & DataLoader
# -------------------------------
from torchvision import transforms

# Preprocessing applied to every image, in order: resize to a square of side
# IMG_SIZE, convert to a tensor, then normalise each of the three channels
# with mean 0.5 and std 0.5.
_preprocess_steps = [
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
]
transform = transforms.Compose(_preprocess_steps)

# Shuffled mini-batches over the dog image folder
dataset = DogDataset(DATA_DIR, transform=transform)
dataloader = DataLoader(dataset, shuffle=True, batch_size=BATCH_SIZE)

# -------------------------------
# Load pre-trained Stable Diffusion components
# -------------------------------
# The Stable Diffusion repository is a *pipeline* repo: each component lives in
# its own sub-folder, so loading a single model from it requires `subfolder=`.
# Without it, from_pretrained looks for a model config at the repo root and
# fails.
vae = AutoencoderKL.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="vae").to(DEVICE)
unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet").to(DEVICE)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(DEVICE)
scheduler = PNDMScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler")

# Only the UNet is fine-tuned; freeze the other networks so no gradients are
# computed or stored for them.
vae.requires_grad_(False)
text_encoder.requires_grad_(False)
vae.eval()
text_encoder.eval()

# -------------------------------
# Training loop (fine-tuning UNet)
# -------------------------------
optimizer = torch.optim.Adam(unet.parameters(), lr=1e-5)
loss_fn = torch.nn.MSELoss()

unet.train()
loss = None  # stays None if the dataloader yields no batches

for epoch in range(EPOCHS):
    for batch in dataloader:
        images = batch["image"].to(DEVICE)
        prompts = batch["prompt"]
        text_inputs = tokenizer(
            prompts,
            padding="max_length",
            max_length=tokenizer.model_max_length,
            truncation=True,  # guard against prompts longer than the CLIP context
            return_tensors="pt",
        ).to(DEVICE)

        # The text encoder and VAE only provide inputs to the UNet, so no
        # gradients are needed through them.
        with torch.no_grad():
            # Input ids only (no attention mask), matching how the Stable
            # Diffusion pipeline conditions the UNet.
            text_embeddings = text_encoder(text_inputs.input_ids).last_hidden_state
            # The UNet operates on VAE latents, not on RGB pixels.
            latents = vae.encode(images).latent_dist.sample() * vae.config.scaling_factor

        # Diffusion training objective: noise the latents at a random timestep
        # using the scheduler's noise schedule and train the UNet to predict
        # the noise that was added.
        noise = torch.randn_like(latents)
        timesteps = torch.randint(
            0,
            scheduler.config.num_train_timesteps,
            (latents.shape[0],),
            device=latents.device,
        ).long()
        noisy_latents = scheduler.add_noise(latents, noise, timesteps)
        noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states=text_embeddings).sample
        loss = loss_fn(noise_pred, noise)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reports the loss of the last batch of the epoch.
    if loss is not None:
        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss.item():.4f}")

# -------------------------------
# Save fine-tuned model
# -------------------------------
os.makedirs(OUTPUT_DIR, exist_ok=True)
unet.eval()

# Assemble a complete pipeline around the in-memory components and save *that*.
# Saving only unet/vae/text_encoder/tokenizer leaves the folder without a
# model_index.json and scheduler, so StableDiffusionPipeline.from_pretrained
# could not load it back. The pipeline save still writes those four
# sub-folders, plus the remaining parts taken from the base checkpoint.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    unet=unet,
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
)
pipe.save_pretrained(OUTPUT_DIR)

# -------------------------------
# Generate a sample image
# -------------------------------
# Reuse the pipeline in the precision the components were trained in; forcing
# float16 would break on CPU-only machines.
pipe = pipe.to(DEVICE)
image = pipe("A cute dog playing", guidance_scale=7.5).images[0]
image.save("fine_tuned_dog.png")
image.show()

| **Model**                                                  | **Architecture / Type**            | **Strengths**                                                             | **Weaknesses**                                                      | **Use Case**                                                 |
| ---------------------------------------------------------- | ---------------------------------- | ------------------------------------------------------------------------- | ------------------------------------------------------------------- | ------------------------------------------------------------ |
| **CNN Autoencoder (AE)**                                   | CNN encoder + CNN decoder          | Simple, easy to implement, reconstructs images well                       | Limited generative capability, blurry outputs, no text conditioning | Image reconstruction, compression, feature learning          |
| **Variational Autoencoder (VAE)**                          | CNN + probabilistic latent space   | Generates new images from latent space, smooth latent interpolation       | Often blurry images, less detailed                                  | Unconditional image generation, latent space exploration     |
| **Autoregressive (iGPT / Image GPT)**                      | Transformer on pixels              | Captures global dependencies, generates high-fidelity images              | Very computationally expensive, slow sampling                       | Small-scale generative experiments, pixel-wise modeling      |
| **Diffusion Models**                                       | Noise-to-image iterative denoising | High-quality images, stable training                                      | Slow generation, heavy computation                                  | Unconditional / conditional image synthesis                  |
| **Latent Diffusion Models (LDMs)**                         | Diffusion in latent space          | Faster and lighter than pixel-space diffusion                             | Requires pre-trained VAE, less direct control                       | High-res generation, text-to-image synthesis                 |
| **GAN (DCGAN / cGAN / StyleGAN / CycleGAN)**               | CNN-based adversarial training     | Sharp, realistic images; conditional GANs allow text / class conditioning | Training instability, mode collapse                                 | Unconditional / conditional image generation, style transfer |
| **Text-to-Image cGAN (from scratch)**                      | CNN generator + LSTM text encoder  | Lightweight, trainable on small datasets, custom prompts                  | Low-quality images on small datasets, limited text understanding    | Educational, small text-to-image experiments                 |
| **Text-to-Image Fine-Tuned Pretrained (Stable Diffusion)** | Pretrained UNet + CLIP + VAE       | High-quality, realistic images, text-conditioned, fast convergence        | Requires GPU, fine-tuning still computationally expensive           | Personalized text-to-image generation (e.g., dog images)     |