In [1]:
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

In [2]:
def build_cnn_encoder():
    """Create the image encoder from an ImageNet-pretrained InceptionV3.

    The full pretrained network is loaded and then re-wired so that the
    returned model stops at the second-to-last layer, i.e. the top layer
    is dropped.

    Returns:
        A Keras ``Model`` that shares InceptionV3's input and emits the
        output of its penultimate layer.
    """
    pretrained = InceptionV3(weights='imagenet')
    # layers[-2] is the layer just before the top layer being removed.
    penultimate_output = pretrained.layers[-2].output
    return Model(inputs=pretrained.input, outputs=penultimate_output)

# Build the encoder once at notebook level; the captioning demo cell reuses it.
cnn_model = build_cnn_encoder()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5
96112376/96112376 ━━━━━━━━━━━━━━━━━━━━ 20s 0us/step


In [3]:
def build_decoder(vocab_size, max_length, feature_dim=2048, embedding_dim=256,
                  units=256, dropout_rate=0.5):
    """Build the caption decoder that predicts the next word of a caption.

    The model has two inputs: an image feature vector and a padded sequence
    of word indices. Each is projected to ``units`` dimensions, the two are
    summed element-wise, and a softmax over the vocabulary is produced.

    Args:
        vocab_size: Number of entries in the vocabulary; sets both the
            embedding table size and the width of the softmax output.
        max_length: Length of the (padded) input word-index sequence.
        feature_dim: Length of the image feature vector. Defaults to 2048.
        embedding_dim: Width of the word embedding. Defaults to 256.
        units: Width of the image projection, the LSTM state and the hidden
            dense layer. The image and text branches must share this width
            because they are merged with an element-wise ``add``.
        dropout_rate: Dropout rate applied to both branches. Defaults to 0.5.

    Returns:
        An uncompiled Keras ``Model`` taking ``[image_input, seq_input]`` and
        returning a ``(batch, vocab_size)`` probability distribution.
    """
    # Image branch: regularise, then project the features to `units`.
    image_input = Input(shape=(feature_dim,))
    image_features = Dropout(dropout_rate)(image_input)
    image_features = Dense(units, activation='relu')(image_features)

    # Text branch: index 0 is treated as padding and masked (mask_zero=True).
    seq_input = Input(shape=(max_length,))
    seq_features = Embedding(vocab_size, embedding_dim, mask_zero=True)(seq_input)
    seq_features = Dropout(dropout_rate)(seq_features)
    seq_features = LSTM(units)(seq_features)

    # Merge both branches by element-wise sum, then classify the next word.
    decoder = add([image_features, seq_features])
    decoder = Dense(units, activation='relu')(decoder)
    output = Dense(vocab_size, activation='softmax')(decoder)

    return Model(inputs=[image_input, seq_input], outputs=output)

# Decoder hyperparameters; later cells reuse them (vocab_size for the
# Tokenizer, max_length for caption generation).
vocab_size = 5000  # Size of the vocabulary
max_length = 20    # Maximum length of captions
# NOTE(review): the decoder is only constructed here; no compile/fit call or
# weight loading is visible in this notebook - confirm it is trained elsewhere.
decoder_model = build_decoder(vocab_size, max_length)

In [4]:
def extract_image_features(image_path, model, target_size=(299, 299), verbose=0):
    """Load one image from disk and encode it with ``model``.

    Args:
        image_path: Path of the image file to load.
        model: Encoder exposing ``predict``; it receives a batch holding the
            single preprocessed image.
        target_size: ``(height, width)`` the image is resized to on load.
            Defaults to ``(299, 299)``.
        verbose: Forwarded to ``model.predict``. Defaults to 0 so that
            encoding many images does not print one progress bar per image.

    Returns:
        The first (and only) row of the prediction, i.e. the feature array
        for this image without the batch dimension.
    """
    img = tf.keras.utils.load_img(image_path, target_size=target_size)
    # Add the leading batch axis expected by predict().
    batch = np.expand_dims(tf.keras.utils.img_to_array(img), axis=0)
    # Apply the input scaling that InceptionV3 expects.
    batch = tf.keras.applications.inception_v3.preprocess_input(batch)
    features = model.predict(batch, verbose=verbose)
    return features[0]

In [12]:
import numpy as np

def _sample_next_index(probabilities, temperature):
    """Choose the next token index from a softmax output with temperature scaling."""
    # float64 keeps the renormalised vector inside np.random.choice's
    # sum-to-one tolerance (float32 model outputs can fail that check).
    logits = np.log(np.asarray(probabilities, dtype=np.float64) + 1e-8)
    if logits.size > 1:
        # Index 0 is the padding slot produced by pad_sequences and is never
        # assigned to a word, so it must not be emitted as a token.
        logits[0] = -np.inf
    if temperature == 0:
        # Limit of temperature -> 0: deterministic greedy decoding.
        return int(np.argmax(logits))
    scaled = logits / temperature
    # Shift by the maximum so exp() cannot underflow every entry to zero,
    # which would turn the normalisation into 0/0 = NaN.
    scaled -= np.max(scaled)
    weights = np.exp(scaled)
    return int(np.random.choice(weights.size, p=weights / weights.sum()))


def generate_caption(image_features, tokenizer, max_length, temperature=1.0, model=None):
    """Generate a caption word by word by sampling from the decoder.

    Args:
        image_features: Feature vector of the image; it is reshaped to a
            batch of one before being passed to the decoder.
        tokenizer: Object providing ``texts_to_sequences`` and ``index_word``.
        max_length: Maximum number of words to generate and the length the
            input sequence is padded to.
        temperature: Sampling temperature. Values below 1 sharpen the
            distribution, values above 1 flatten it, and 0 selects the most
            probable word at every step.
        model: Decoder exposing ``predict``. Defaults to the notebook-level
            ``decoder_model`` when omitted.

    Returns:
        The generated text, still prefixed with the ``'startseq'`` seed token.
        Indices missing from ``tokenizer.index_word`` appear as ``"unknown"``.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")
    if model is None:
        model = decoder_model

    # Loop-invariant: shape the features as a batch of one only once.
    features = np.asarray(image_features).reshape(1, -1)

    in_text = 'startseq'
    for _ in range(max_length):
        # Tokenize and pad the text generated so far.
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)

        # Predict the probability distribution over the next word.
        yhat = model.predict([features, sequence], verbose=0)[0]
        next_word_index = _sample_next_index(yhat, temperature)

        # Convert index to word and handle unknowns.
        word = tokenizer.index_word.get(next_word_index, "unknown")

        # "endseq" marks the end of the caption.
        if word == 'endseq':
            break

        in_text += ' ' + word

    return in_text


In [13]:
# NOTE(review): the tokenizer is created but no fit_on_texts call is visible in
# this notebook, so its vocabulary looks empty here - confirm it is fitted on
# the training captions before captions are generated.
tokenizer = Tokenizer(num_words=vocab_size)

In [14]:
# End-to-end demo: encode one image with the CNN, then sample a caption for it.
image_path = 'church.jpg'  # Replace with the path to an image
image_features = extract_image_features(image_path, cnn_model)
caption = generate_caption(image_features, tokenizer, max_length)
# The returned string still starts with the 'startseq' seed token.
print("Generated Caption:", caption)

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 89ms/step
Generated Caption: startseq unknown unknown unknown unknown unknown unknown unknown unknown unknown unknown unknown unknown unknown unknown unknown unknown unknown unknown unknown unknown
