In [18]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.models import Sequential
import cv2
import matplotlib.pyplot as pt
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Dropout, Conv2DTranspose, Embedding, Concatenate, Conv2D
from tensorflow.keras.layers import BatchNormalization, LeakyReLU, ReLU

In [2]:
def load_data(image_folder, caption_file):
    image_paths = []
    captions = []
    imgs = []
    
    with open(caption_file, 'r') as file:
        lines = file.readlines()
        for line in lines:
            # Split by tab character and ensure there are exactly two parts
            parts = line.strip().split('\t')
            if len(parts) == 2:
                image_name, caption = parts
                img_path = os.path.join(image_folder, image_name)
                if os.path.exists(img_path):
                    image_paths.append(img_path)
                    captions.append(caption)
                    
                    # Load image and resize to 64x64
                    img = Image.open(img_path).resize((64, 64))
                    imgs.append(np.array(img))
            else:
                # Handle or log lines that don't have exactly two parts
                print(f"Skipping invalid line: {line.strip()}")
    
    return np.array(imgs), captions

In [3]:

# Folder and caption file paths (adjust these)
image_folder = r'D:\\archive\\Images'  # Folder containing images
caption_file = r"D:\\archive\\captions.txt"  # Captions file

# Load the data
imgs, captions = load_data(image_folder, caption_file)

Skipping invalid line: image,caption
Skipping invalid line: 1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set of stairs in an entry way .
Skipping invalid line: 1000268201_693b08cb0e.jpg,A girl going into a wooden building .
Skipping invalid line: 1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
Skipping invalid line: 1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playhouse .
Skipping invalid line: 1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a wooden cabin .
Skipping invalid line: 1001773457_577c3a7d70.jpg,A black dog and a spotted dog are fighting
Skipping invalid line: 1001773457_577c3a7d70.jpg,A black dog and a tri-colored dog playing with each other on the road .
Skipping invalid line: 1001773457_577c3a7d70.jpg,A black dog and a white dog with brown spots are staring at each other in the street .
Skipping invalid line: 1001773457_577c3a7d70.jpg,Two dogs of different breeds looking at each othe

In [4]:
# Dataset Paths
image_folder = r'D:\archive\Images'  # Folder containing images
caption_file = r"D:\archive\captions.txt"  # Text file with image-caption pairs

# Read captions (Flickr8k dataset usually has captions per image, split by tab)
def load_data(image_folder, caption_file):
    image_paths = []
    captions = []
    imgs = []

    with open(caption_file, 'r') as f:
        lines = f.readlines()
        for line in lines:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                img_name, caption = parts
                img_path = os.path.join(image_folder, img_name)
                if os.path.exists(img_path):
                    image_paths.append(img_path)
                    captions.append(caption)

                    # Load and resize images to 64x64 pixels
                    img = Image.open(img_path).resize((64, 64))
                    imgs.append(np.array(img))

    imgs = np.array(imgs)
    return imgs, image_paths, captions

imgs, image_paths, captions = load_data(image_folder, caption_file)

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_captions(captions, max_length=15):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(captions)
    sequences = tokenizer.texts_to_sequences(captions)
    captions_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
    vocab_size = len(tokenizer.word_index) + 1
    return captions_pad, vocab_size, tokenizer

captions_pad, vocab_size, tokenizer = preprocess_captions(captions)


In [5]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Example: Use GloVe pre-trained embeddings (simplified for this demo)
def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Load GloVe embeddings
glove_embeddings = load_glove_embeddings(r"D:\archive1\glove.6B.100d.txt")  # Ensure to download GloVe beforehand

# Tokenize and embed the text descriptions
def text_to_embedding(text, max_length=30):
    words = word_tokenize(text.lower())[:max_length]
    embedding = np.zeros((max_length, 100))  # 100d embedding for GloVe
    for i, word in enumerate(words):
        if word in glove_embeddings:
            embedding[i] = glove_embeddings[word]
    return embedding

# Convert captions to embeddings
captions_embeddings = np.array([text_to_embedding(c) for c in captions])


[nltk_data] Downloading package punkt to C:\Users\B
[nltk_data]     Vasundhara\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, Reshape, Embedding, LSTM, Concatenate, Conv2D, Conv2DTranspose, BatchNormalization, LeakyReLU, Input
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from PIL import Image

def build_generator(latent_dim=100, text_embedding_dim=100, img_shape=(64, 64, 3)):
    # Text input (embedding of the text description)
    text_input = Input(shape=(30, 100))  # 30 words max, 100d for GloVe

    # Flatten text input
    x = Flatten()(text_input)

    # Combine with random noise
    noise_input = Input(shape=(latent_dim,))
    model_input = Concatenate()([x, noise_input])

    # Dense layers to upsample to image shape
    x = Dense(128 * 16 * 16)(model_input)
    x = Reshape((16, 16, 128))(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(alpha=0.2)(x)

    # Upsample to 64x64 image
    x = Conv2DTranspose(128, kernel_size=4, strides=2, padding='same')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = Conv2DTranspose(64, kernel_size=4, strides=2, padding='same')(x)
    x = BatchNormalization()(x)
    x = LeakyReLU(alpha=0.2)(x)

    # Output image
    img_output = Conv2DTranspose(3, kernel_size=4, strides=2, padding='same', activation='tanh')(x)

    generator = Model([text_input, noise_input], img_output)
    return generator

generator = build_generator()
generator.summary()


Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_23 (InputLayer)       [(None, 30, 100)]            0         []                            
                                                                                                  
 flatten_10 (Flatten)        (None, 3000)                 0         ['input_23[0][0]']            
                                                                                                  
 input_24 (InputLayer)       [(None, 100)]                0         []                            
                                                                                                  
 concatenate_7 (Concatenate  (None, 3100)                 0         ['flatten_10[0][0]',          
 )                                                                   'input_24[0][0]']      

In [22]:
def build_discriminator(img_shape=(64, 64, 3), text_embedding_dim=100):
    img_input = Input(shape=img_shape)
    text_input = Input(shape=(30, 100))  # 30 words max, 100d for GloVe

    # Process image
    x = Conv2D(64, kernel_size=4, strides=2, padding='same')(img_input)
    x = LeakyReLU(alpha=0.2)(x)
    x = Conv2D(128, kernel_size=4, strides=2, padding='same')(x)
    x = LeakyReLU(alpha=0.2)(x)
    x = Flatten()(x)

    # Process text input
    y = Flatten()(text_input)

    # Concatenate image and text features
    combined = Concatenate()([x, y])

    # Dense layer to classify real or fake
    z = Dense(1, activation='sigmoid')(combined)

    discriminator = Model([img_input, text_input], z)
    return discriminator

discriminator = build_discriminator()
discriminator.summary()


Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_25 (InputLayer)       [(None, 64, 64, 3)]          0         []                            
                                                                                                  
 conv2d_4 (Conv2D)           (None, 32, 32, 64)           3136      ['input_25[0][0]']            
                                                                                                  
 leaky_re_lu_10 (LeakyReLU)  (None, 32, 32, 64)           0         ['conv2d_4[0][0]']            
                                                                                                  
 conv2d_5 (Conv2D)           (None, 16, 16, 128)          131200    ['leaky_re_lu_10[0][0]']      
                                                                                            

In [None]:
def build_cgan(generator, discriminator):
    # Discriminator is set as non-trainable in the combined model
    discriminator.trainable = False

    # Generator input (noise + text)
    noise_input = Input(shape=(100,))
    text_input = Input(shape=(30, 100))  # Adjust if necessary for text embedding shape

    # Generate the image
    generated_image = generator([text_input, noise_input])  # Output shape: (None, 128, 128, 3)

    # Process the text input
    text_embedding = Dense(128 * 128 * 3)(Flatten()(text_input))  # Flatten and project text input to 128x128x3
    text_embedding = Reshape((128, 128, 3))(text_embedding)  # Reshape to match generated_image

    # Combine the generated image and text embedding (e.g., by adding)
    combined_input = tf.keras.layers.Add()([generated_image, text_embedding])  # Shape: (None, 128, 128, 3)

    # Pass the combined input to the discriminator
    validity = discriminator(combined_input)

    # Define the combined model
    cgan = Model([text_input, noise_input], validity)
    return cgan

# Build and summarize the CGAN
cgan = build_cgan(generator, discriminator)
cgan.summary()


In [12]:
def build_generator():
    noise_input = layers.Input(shape=(100,))
    text_input = layers.Input(shape=(100,))
    combined = layers.Concatenate()([noise_input, text_input])

    model = Sequential()
    model.add(layers.Dense(8 * 8 * 128, activation="relu"))
    model.add(layers.Conv2DTranspose(3, 4, strides=4, activation="tanh"))

    return model

In [13]:
def build_discriminator():
    image_input = layers.Input(shape=(100,))
    text_input = layers.Input(shape=(100,))
    combined = layers.Concatenate()([image_input, text_input])

    model = Sequential()
    model.add(layers.Dense(128, activation="relu"))
    model.add(layers.Dense(1, activation="sigmoid"))
    return model

In [14]:
generator = build_generator()
discriminator = build_discriminator()

In [None]:
generator.summary()

In [35]:
discriminator.summary()

In [None]:
fig = pt.figure(figsize=(10,10))

for i in range(25):
    pt.subplot(5,5,i+1)
    pt.imshow(imgs[i])