In [None]:
# Install KaggleHub and other dependencies
!pip install kagglehub tensorflow numpy matplotlib nltk h5py


In [None]:
import kagglehub

# Fetch the "wenewone/cub2002011" dataset through kagglehub and keep the value
# it returns, which is reported below as the location of the dataset files.
path = kagglehub.dataset_download("wenewone/cub2002011")
print("Path to dataset files:", path)


In [None]:
import os

# Reuse the location returned by kagglehub.dataset_download() in the previous
# cell instead of a machine-specific absolute path, so the notebook also runs
# for other users / operating systems / dataset versions.
dataset_path = path
image_dir = os.path.join(dataset_path, "CUB_200_2011", "images")
labels_file = os.path.join(dataset_path, "CUB_200_2011", "classes.txt")

print("Images directory exists:", os.path.exists(image_dir))
print("Class labels file exists:", os.path.exists(labels_file))

# Summarise the directory tree: one line per directory with its file count.
# (Listing each file individually would print one line per image.)
for root, _dirs, files in os.walk(dataset_path):
    print(f"Directory: {root} ({len(files)} files)")


In [None]:
def load_class_labels(labels_file):
    """Read a ``classes.txt``-style file into an id -> name dictionary.

    Every non-blank line is expected to look like ``<class_id> <class_name>``.
    The id is zero-padded to three characters (``"1"`` -> ``"001"``) and used
    as the key; the rest of the line is stored as the value.

    Args:
        labels_file: Path of the text file to read.

    Returns:
        dict: Zero-padded class id (str) -> class name (str). A line that
        holds only an id maps to an empty string.
    """
    class_labels = {}
    with open(labels_file, "r", encoding="utf-8") as f:
        for line in f:
            # Split once on whitespace: first token is the id, the remainder
            # (if any) is the name.
            parts = line.split(maxsplit=1)
            if not parts:
                # Skip blank / whitespace-only lines so they do not create a
                # spurious "000" -> "" entry.
                continue
            class_id = parts[0].zfill(3)
            class_name = parts[1].strip() if len(parts) > 1 else ""
            class_labels[class_id] = class_name
    return class_labels

# Build the class-id -> class-name lookup from `labels_file` and report how
# many entries were read.
class_labels = load_class_labels(labels_file)
print(f"Loaded {len(class_labels)} class labels.")


In [None]:
def map_images_to_classes(image_dir, extensions=(".jpg", ".jpeg", ".png", ".bmp", ".gif")):
    """Pair every image under ``image_dir`` with the name of its class folder.

    ``image_dir`` is expected to contain one sub-directory per class, named
    ``<id>.<class_name>`` (for example ``001.Black_footed_Albatross``). The
    part after the first dot is used as the caption of each image inside.

    Entries that do not fit this layout are skipped instead of raising or
    ending up in the result: plain files at the top level, folders without an
    ``<id>.`` prefix (including hidden folders), hidden files, and files whose
    extension is not listed in ``extensions``.

    Args:
        image_dir: Directory holding the per-class folders.
        extensions: File-name suffixes (compared case-insensitively) that
            count as images.

    Returns:
        tuple[list[str], list[str]]: ``(image_paths, captions)`` of equal
        length, ordered by folder name and then by file name.
    """
    suffixes = tuple(ext.lower() for ext in extensions)
    image_paths = []
    captions = []
    # os.listdir() returns entries in arbitrary order; sort for a reproducible result.
    for folder_name in sorted(os.listdir(image_dir)):
        folder_path = os.path.join(image_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue
        class_id, dot, class_name = folder_name.partition(".")
        if not (class_id and dot and class_name):
            # Not an "<id>.<class_name>" folder.
            continue
        for image_file in sorted(os.listdir(folder_path)):
            if image_file.startswith(".") or not image_file.lower().endswith(suffixes):
                # Skip hidden files and non-image files such as thumbnail caches.
                continue
            image_paths.append(os.path.join(folder_path, image_file))
            captions.append(class_name)
    return image_paths, captions

# Collect the image paths under `image_dir` together with the class name of
# the folder each one lives in (used as its caption).
image_paths, captions = map_images_to_classes(image_dir)
print(f"Mapped {len(image_paths)} images to class labels.")


In [None]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np

def load_images_and_labels(image_paths, captions, img_size=(64, 64)):
    """Load images from disk as arrays scaled to the range [-1, 1].

    The generator in this notebook ends in a ``tanh`` activation and
    ``generate_images`` maps its output back with ``(x + 1) / 2``, so the real
    images shown to the discriminator must use the same [-1, 1] range as the
    generated ones.

    Args:
        image_paths: Iterable of image file paths.
        captions: Iterable of captions, aligned with ``image_paths``.
        img_size: ``(height, width)`` every image is resized to.

    Returns:
        tuple: ``(images, labels)`` where ``images`` is an array of shape
        ``(n, height, width, 3)`` with values in [-1, 1] and ``labels`` is the
        list of captions in the same order.
    """
    images = []
    labels = []
    for img_path, caption in zip(image_paths, captions):
        img = load_img(img_path, target_size=img_size)
        # Pixel values 0..255 -> -1..1 (matches the generator's tanh output).
        img_array = img_to_array(img) / 127.5 - 1.0
        images.append(img_array)
        labels.append(caption)
    if not images:
        # Keep a well-formed 4-D array even when there is nothing to load.
        return np.empty((0,) + tuple(img_size) + (3,), dtype="float32"), labels
    return np.array(images), labels

# Decode every mapped image into one array (default 64x64 target size); the
# whole dataset is held in memory from here on.
images, labels = load_images_and_labels(image_paths, captions)
print(f"Loaded {len(images)} images and their corresponding labels.")


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Turn each caption into a sequence of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(labels)
tokenized_labels = tokenizer.texts_to_sequences(labels)

# Pad every sequence to the length of the longest caption so they stack into
# one 2-D array.
max_length = max(map(len, tokenized_labels))
padded_labels = pad_sequences(tokenized_labels, maxlen=max_length)

# Word ids start at 1, so one extra slot is added for index 0 (the padding value).
vocab_size = 1 + len(tokenizer.word_index)
print("Vocabulary size:", vocab_size)
print("Max sequence length:", max_length)


In [None]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Reshape, Concatenate, Conv2DTranspose
from tensorflow.keras import Model

def build_generator(vocab_size, max_length):
    """Assemble the text-conditioned image generator.

    Args:
        vocab_size: Number of rows of the caption embedding table.
        max_length: Length of the (padded) caption id sequences.

    Returns:
        A Keras ``Model`` taking ``[noise, caption_ids]`` (noise has 100
        values per sample) and producing a 64x64x3 image with ``tanh``
        activation.
    """
    # Caption branch: word ids -> 256-d embeddings -> one 256-d LSTM summary vector.
    caption_ids = Input(shape=(max_length,))
    embedded_caption = Embedding(vocab_size, 256)(caption_ids)
    caption_vector = LSTM(256)(embedded_caption)

    # Latent branch, joined with the caption summary.
    latent = Input(shape=(100,))
    seed = Concatenate()([latent, caption_vector])

    # Project to an 8x8x256 feature map ...
    feature_map = Dense(8 * 8 * 256, activation='relu')(seed)
    feature_map = Reshape((8, 8, 256))(feature_map)
    # ... then double the resolution three times: 8 -> 16 -> 32 -> 64.
    for filters in (128, 64):
        feature_map = Conv2DTranspose(
            filters, (4, 4), strides=(2, 2), padding='same', activation='relu'
        )(feature_map)
    rgb_image = Conv2DTranspose(
        3, (4, 4), strides=(2, 2), padding='same', activation='tanh'
    )(feature_map)

    return Model([latent, caption_ids], rgb_image)

# Instantiate the generator for the caption vocabulary size and sequence
# length computed from the tokenised labels, then print its layer summary.
generator = build_generator(vocab_size, max_length)
generator.summary()


In [None]:
from tensorflow.keras.layers import Conv2D, LeakyReLU, Flatten

def build_discriminator(vocab_size, max_length):
    """Assemble the text-conditioned discriminator.

    Args:
        vocab_size: Number of rows of the caption embedding table.
        max_length: Length of the (padded) caption id sequences.

    Returns:
        An uncompiled Keras ``Model`` taking ``[image, caption_ids]`` (image is
        64x64x3) and producing one sigmoid score per sample.
    """
    # Caption branch: word ids -> 256-d embeddings -> one 256-d LSTM summary vector.
    caption_ids = Input(shape=(max_length,))
    embedded_caption = Embedding(vocab_size, 256)(caption_ids)
    caption_vector = LSTM(256)(embedded_caption)

    # Image branch: two stride-2 convolutions (64 -> 32 -> 16 pixels), each
    # followed by a LeakyReLU, then flattened.
    image = Input(shape=(64, 64, 3))
    features = image
    for filters in (64, 128):
        features = Conv2D(filters, (4, 4), strides=(2, 2), padding='same')(features)
        features = LeakyReLU(alpha=0.2)(features)
    features = Flatten()(features)

    # Judge the image features together with the caption summary.
    joint = Concatenate()([features, caption_vector])
    hidden = Dense(256, activation='relu')(joint)
    score = Dense(1, activation='sigmoid')(hidden)

    return Model([image, caption_ids], score)

# The discriminator is compiled on its own here, before build_cgan() sets its
# `trainable` flag to False, because train() updates it directly through
# discriminator.train_on_batch().
# NOTE(review): this relies on the compiled discriminator still updating its
# weights after that flag is flipped — confirm for the installed Keras version.
discriminator = build_discriminator(vocab_size, max_length)
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
discriminator.summary()


In [None]:
def build_cgan(generator, discriminator):
    """Chain generator and discriminator into the combined model.

    The discriminator's ``trainable`` flag is set to ``False`` on the object
    that is passed in (a side effect visible to the caller) before the
    combined model is built and compiled.

    Note: the caption input length is read from the module-level
    ``max_length``; the noise input has 100 values per sample.

    Args:
        generator: Model mapping ``[noise, caption_ids]`` to an image.
        discriminator: Model mapping ``[image, caption_ids]`` to a score.

    Returns:
        A compiled Keras ``Model`` mapping ``[noise, caption_ids]`` to the
        discriminator's score for the generated image.
    """
    discriminator.trainable = False

    latent = Input(shape=(100,))
    caption_ids = Input(shape=(max_length,))
    conditioned_inputs = [latent, caption_ids]

    # Generate an image, then score it against the same caption.
    fake_image = generator(conditioned_inputs)
    score = discriminator([fake_image, caption_ids])

    combined = Model(conditioned_inputs, score)
    combined.compile(optimizer='adam', loss='binary_crossentropy')
    return combined

# Stack the generator and the discriminator (whose `trainable` flag
# build_cgan() sets to False) into the combined model that train() uses for
# the generator step.
cgan = build_cgan(generator, discriminator)
cgan.summary()


In [None]:
def train(generator, discriminator, cgan, images, labels, epochs=200, batch_size=32, noise_dim=100):
    """Alternate discriminator and generator updates on random mini-batches.

    Each "epoch" here is a single iteration: one random batch of real images
    (sampled with replacement) is used for a discriminator step on real data,
    a discriminator step on generated data, and a generator step through the
    combined model.

    Args:
        generator: Model with ``predict([noise, captions])``.
        discriminator: Compiled model with ``train_on_batch([images, captions], y)``.
        cgan: Compiled combined model with ``train_on_batch([noise, captions], y)``.
        images: Array-like of real images, first axis is the sample axis.
        labels: Array-like of encoded captions aligned with ``images``.
        epochs: Number of iterations to run.
        batch_size: Samples per batch.
        noise_dim: Length of the noise vector fed to the generator.

    Raises:
        ValueError: If ``images`` is empty or ``images`` and ``labels`` have
            different lengths.
    """
    # Index arrays are used below, which plain Python lists do not support.
    images = np.asarray(images)
    labels = np.asarray(labels)
    if len(images) == 0:
        raise ValueError("train() needs at least one image")
    if len(images) != len(labels):
        raise ValueError(
            f"images and labels must have the same length, got {len(images)} and {len(labels)}"
        )

    # Targets: 1 for "real", 0 for "fake".
    real = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))

    for epoch in range(epochs):
        idx = np.random.randint(0, images.shape[0], batch_size)
        real_images = images[idx]
        real_labels = labels[idx]

        # Discriminator step on real (image, caption) pairs.
        d_loss_real = discriminator.train_on_batch([real_images, real_labels], real)

        # Discriminator step on images generated for the same captions.
        # verbose=0 keeps predict() from printing a progress bar every iteration.
        noise = np.random.normal(0, 1, (batch_size, noise_dim))
        fake_images = generator.predict([noise, real_labels], verbose=0)
        d_loss_fake = discriminator.train_on_batch([fake_images, real_labels], fake)

        # Generator step: the combined model is trained towards the "real" target.
        noise = np.random.normal(0, 1, (batch_size, noise_dim))
        g_loss = cgan.train_on_batch([noise, real_labels], real)

        if epoch % 100 == 0:
            print(f"{epoch}/{epochs}, D Loss Real: {d_loss_real}, D Loss Fake: {d_loss_fake}, G Loss: {g_loss}")

# Run 200 training iterations (one random batch each) on the loaded images and
# their padded caption sequences.
train(generator, discriminator, cgan, images, padded_labels, epochs=200)


In [None]:
import matplotlib.pyplot as plt

def generate_images(generator, tokenizer, text_description):
    """Generate one image for a text description.

    The description is converted to word ids with ``tokenizer``, padded to the
    module-level ``max_length``, and fed to ``generator`` together with a
    fresh 100-value noise vector.

    Args:
        generator: Model with ``predict([noise, caption_ids])``.
        tokenizer: Object with ``texts_to_sequences(list_of_str)``.
        text_description: The caption to condition on.

    Returns:
        The first image of the predicted batch, shifted from the generator's
        [-1, 1] (tanh) range to [0, 1] via ``(x + 1) / 2``.
    """
    token_ids = tokenizer.texts_to_sequences([text_description])
    padded_ids = pad_sequences(token_ids, maxlen=max_length)

    latent = np.random.normal(0, 1, (1, 100))
    batch = generator.predict([latent, padded_ids])

    first_image = batch[0]
    return (first_image + 1) / 2.0

# Description to condition the generator on.
text = "blue bird with yellow beak"
generated_image = generate_images(generator, tokenizer, text)

# Show the result on its own axes, titled with the prompt and without ticks.
fig, ax = plt.subplots()
ax.imshow(generated_image)
ax.set_title(text)
ax.axis("off")
plt.show()
