In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Dropout, add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from nltk.translate.bleu_score import sentence_bleu
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input

# Parameters
vocab_size = 10000  # This should match the actual vocab size in your tokenizer
max_length = 34  # Longest caption length for padding (should be derived from dataset captions)
# Example captions dictionary (replace this with actual data loading from your dataset)
captions_dict = {
    "img1.jpeg": ["A dog running in the field.", "A dog is playing outside."],
    "img2.jpeg": ["A girl on a swing.", "A child enjoying a swing in the park."],
    # Add more image-caption pairs here...
}

# Create train_data as a list of (image_path, caption) pairs
train_data = []
for img, captions in captions_dict.items():
    for caption in captions:
        train_data.append((f"{img}", caption))

# Initialize captions list from captions_dict
captions_list = [caption for captions in captions_dict.values() for caption in captions]

# Initialize the tokenizer
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(captions_list)  # captions_list should contain all captions

# Function to extract image features
def extract_image_features(model, image_path):
    image = load_img(image_path, target_size=(224, 224))
    image = img_to_array(image)
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)
    features = model.predict(image, verbose=0)
    return features

# Load pre-trained VGG16 model and remove the final layer
cnn_model = VGG16(weights="imagenet")
cnn_model = Model(inputs=cnn_model.input, outputs=cnn_model.layers[-2].output)

# Function to create input-output sequence pairs
def create_sequences(tokenizer, max_length, desc, photo_features, vocab_size):
    X1, X2, y = [], [], []
    desc_seq = tokenizer.texts_to_sequences([desc])[0]
    for i in range(1, len(desc_seq)):
        in_seq, out_seq = desc_seq[:i], desc_seq[i]
        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        X1.append(photo_features)
        X2.append(in_seq)
        y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

# Define the combined CNN-LSTM model
def define_model(vocab_size, max_length):
    # Image feature branch
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation="relu")(fe1)

    # Sequence feature branch
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    # Decoder (merge features from CNN and LSTM)
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation="relu")(decoder1)
    outputs = Dense(vocab_size, activation="softmax")(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss="categorical_crossentropy", optimizer="adam")
    return model

model = define_model(vocab_size, max_length)

# Training data preparation
X1, X2, y = [], [], []
for img_path, caption in train_data:  # train_data should be a list of (image_path, caption) pairs
    photo_features = extract_image_features(cnn_model, img_path)
    in_img, in_seq, out_word = create_sequences(tokenizer, max_length, caption, photo_features, vocab_size)
    X1.append(in_img)
    X2.append(in_seq)
    y.append(out_word)

X1, X2, y = np.vstack(X1), np.vstack(X2), np.vstack(y)
model.fit([X1, X2], y, epochs=20, verbose=2)

# Caption generation function
def generate_caption(model, tokenizer, photo_features, max_length):
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo_features, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat, None)
        if word is None:
            break
        in_text += " " + word
        if word == "endseq":
            break
    return in_text
test_images=['test1.jpeg','test2.jpeg']
test_captions=["A dog running in the field.","A child enjoying a swing in the park."]
# Evaluate model on BLEU score
for img_path in test_images:  # test_images should be a list of paths to test images
    reference = [caption.split() for caption in test_captions[img_path]]  # List of true captions
    photo_features = extract_image_features(cnn_model, img_path)
    generated = generate_caption(model, tokenizer, photo_features, max_length).split()
    bleu_score = sentence_bleu(reference, generated)
    print(f"Image: {img_path}, BLEU Score: {bleu_score}")


Epoch 1/20


ValueError: Input 0 of layer "functional_1" is incompatible with the layer: expected shape=(None, 4096), found shape=(None, 1, 4096)

In [11]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Dropout, add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
import os

# Parameters
vocab_size = 10000  # Adjust based on your dataset
max_length = 35     # Set to a reasonable max length for captions

# Initialize VGG16 model for feature extraction
def extract_image_features(image_folder, model):
    features = {}
    for img_name in os.listdir(image_folder):
        img_path = os.path.join(image_folder, img_name)
        image = load_img(img_path, target_size=(224, 224))
        image = img_to_array(image)
        image = np.expand_dims(image, axis=0)
        image = tf.keras.applications.vgg16.preprocess_input(image)
        feature = model.predict(image, verbose=0)
        features[img_name] = feature.flatten()
    return features

# Load the pre-trained VGG16 model without top layers
cnn_model = VGG16(weights="imagenet", include_top=False, pooling="avg")

# Extract image features
image_folder = r"C:\Users\muni karthik\Desktop\SEM7\DLCA\MODEL ANS"  # Replace with the path to your image folder
image_features = extract_image_features(image_folder, cnn_model)

# Example captions dataset (load from file or define manually)
captions_dict = {
    "img1.jpeg": ["A dog running in the field.", "A dog is playing outside."],
    "img2.jpeg": ["A girl on a swing.", "A child enjoying a swing in the park."],
    # Add more image-caption pairs here...
}

# Prepare captions data
captions_list = list(captions_dict.values())

# Tokenize captions
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(captions_list)

# Prepare training sequences
def create_sequences(tokenizer, max_length, desc, photo_features, vocab_size):
    X1, X2, y = [], [], []
    seq = tokenizer.texts_to_sequences([desc])[0]
    for i in range(1, len(seq)):
        in_seq, out_seq = seq[:i], seq[i]
        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        X1.append(photo_features)
        X2.append(in_seq)
        y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

# Prepare data for training
X1, X2, y = [], [], []
for img_name, caption in captions_dict.items():
    if img_name in image_features:
        in_img, in_seq, out_word = create_sequences(
            tokenizer, max_length, caption, image_features[img_name], vocab_size
        )
        X1.append(in_img)
        X2.append(in_seq)
        y.append(out_word)

X1, X2, y = np.vstack(X1), np.vstack(X2), np.vstack(y)

# Model definition
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(4096,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation="relu")(fe1)

    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation="relu")(decoder1)
    outputs = Dense(vocab_size, activation="softmax")(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss="categorical_crossentropy", optimizer="adam")
    return model

# Initialize model
model = define_model(vocab_size, max_length)

# Train model
model.fit([X1, X2], y, epochs=20, verbose=2)

# Caption generation
def generate_caption(model, tokenizer, photo_features, max_length):
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo_features, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat, None)
        if word is None:
            break
        in_text += " " + word
        if word == "endseq":
            break
    return in_text

# Test with a new image
test_image = "path_to_test_image.jpg"  # Replace with the path to a test image
test_features = extract_image_features("path_to_images", cnn_model)[test_image]
print("Generated Caption:", generate_caption(model, tokenizer, test_features.reshape(1, -1), max_length))


UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x0000021C6DF126B0>