<a href="https://colab.research.google.com/github/usshaa/BK_BIRLA_DL/blob/main/08_DL/04_VAE_Implementation_with_Dog_vs_Cat_Images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### VAE Implementation with Kaggle Dog vs Cat Images

In [1]:
!curl -L -o dog-and-cat-classification-dataset.zip \
  https://www.kaggle.com/api/v1/datasets/download/bhavikjikadara/dog-and-cat-classification-dataset

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  775M  100  775M    0     0  69.7M      0  0:00:11  0:00:11 --:--:--  125M


In [2]:
!unzip -q dog-and-cat-classification-dataset.zip -d ./data

replace ./data/PetImages/Cat/0.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: no
replace ./data/PetImages/Cat/1.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [3]:
import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
import cv2
import os

In [4]:
IMG_SIZE = 64
DATA_DIR = "./data/PetImages"

In [5]:
LATENT_DIM = 64   # Latent vector size
IMG_SHAPE = (IMG_SIZE, IMG_SIZE, 3)

In [6]:
cats = []
dogs = []

In [7]:
for img in os.listdir(os.path.join(DATA_DIR, "Cat"))[:5000]:  # limit for speed
    try:
        img_array = cv2.imread(os.path.join(DATA_DIR, "Cat", img))
        img_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
        cats.append(img_array)
    except:
        pass

In [8]:
for img in os.listdir(os.path.join(DATA_DIR, "Dog"))[:5000]:
    try:
        img_array = cv2.imread(os.path.join(DATA_DIR, "Dog", img))
        img_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
        dogs.append(img_array)
    except:
        pass

In [9]:
X = np.array(cats + dogs) / 255.0  # Normalize to [0,1]
print("Dataset shape:", X.shape)

Dataset shape: (10000, 64, 64, 3)


In [10]:
# Train-test split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

In [11]:
# Sampling layer for reparameterization trick
class Sampling(layers.Layer):
    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

In [12]:
# Encoder
def build_encoder():
    encoder_inputs = layers.Input(shape=IMG_SHAPE)
    x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same")(encoder_inputs)
    x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same")(x)
    x = layers.Flatten()(x)
    x = layers.Dense(128, activation="relu")(x)
    z_mean = layers.Dense(LATENT_DIM, name="z_mean")(x)
    z_log_var = layers.Dense(LATENT_DIM, name="z_log_var")(x)
    z = Sampling()([z_mean, z_log_var])
    return models.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")

In [13]:
# Decoder
def build_decoder():
    latent_inputs = layers.Input(shape=(LATENT_DIM,))
    x = layers.Dense(16 * 16 * 64, activation="relu")(latent_inputs)  # adjust for IMG_SIZE=64
    x = layers.Reshape((16, 16, 64))(x)
    x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same")(x)
    x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same")(x)
    decoder_outputs = layers.Conv2DTranspose(3, 3, activation="sigmoid", padding="same")(x)
    return models.Model(latent_inputs, decoder_outputs, name="decoder")

In [14]:
# VAE class
class VAE(tf.keras.models.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.optimizer = tf.keras.optimizers.Adam()

    def call(self, inputs):
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstruction = self.decoder(z)
        return reconstruction

    def train_step(self, data):
        if isinstance(data, tuple):
            data = data[0]
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            reconstruction_loss = tf.reduce_mean(tf.reduce_sum(tf.keras.losses.mse(data, reconstruction), axis=(1, 2)))
            kl_loss = -0.5 * tf.reduce_mean(tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1))
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {"loss": total_loss, "reconstruction_loss": reconstruction_loss, "kl_loss": kl_loss}

In [15]:
# Build VAE
encoder = build_encoder()
decoder = build_decoder()
vae = VAE(encoder, decoder)
vae.compile(optimizer=tf.keras.optimizers.Adam(), run_eagerly=True)

In [None]:
# Train VAE
vae.fit(X_train, epochs=15, batch_size=128)

Epoch 1/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 2s/step - kl_loss: 1.6023 - loss: 268.9652 - reconstruction_loss: 267.3629
Epoch 2/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 2s/step - kl_loss: 4.0613 - loss: 243.1698 - reconstruction_loss: 239.1086
Epoch 3/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 2s/step - kl_loss: 6.7463 - loss: 223.9952 - reconstruction_loss: 217.2489
Epoch 4/15
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 2s/step - kl_loss: 8.2493 - loss: 217.7059 - reconstruction_loss: 209.4566
Epoch 5/15
[1m58/63[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m8s[0m 2s/step - kl_loss: 8.1451 - loss: 215.7073 - reconstruction_loss: 207.5623

In [None]:
# Reconstruct images
z_mean, z_log_var, z = encoder.predict(X_test[:10])
reconstructed = decoder.predict(z)

In [None]:
# Plot comparison
n = 10
plt.figure(figsize=(20, 4))
for i in range(n):
    # Original
    ax = plt.subplot(2, n, i + 1)
    plt.imshow(X_test[i])
    plt.title("Original")
    plt.axis("off")

    # VAE Reconstruction
    ax = plt.subplot(2, n, i + 1 + n)
    plt.imshow(reconstructed[i])
    plt.title("VAE Recon")
    plt.axis("off")
plt.show()

### Key Difference

* **Autoencoder (AE)**: Learns **direct mapping** → Compress → Reconstruct.
* **Variational Autoencoder (VAE)**: Learns a **distribution in latent space** → Can generate new samples, not just reconstruct.