In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding
from tensorflow.keras.models import Model

# Image encoder: ImageNet-pretrained VGG16 with the classifier head removed and
# the final convolutional feature map globally average-pooled.
image_model = VGG16(
    include_top=False,
    pooling='avg',
    weights='imagenet',
)

def load_image(image_path):
    """Return the VGG16 feature vector for one image file as a flat array.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, passed through the VGG16 input preprocessing and then through
    the module-level ``image_model``. The prediction is flattened to 1-D.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A 1-D array holding ``image_model``'s output for this image.
    """
    keras_image = tf.keras.preprocessing.image
    pixels = keras_image.img_to_array(
        keras_image.load_img(image_path, target_size=(224, 224))
    )
    # predict() works on batches, so wrap the single image in a batch of one.
    batch = tf.keras.applications.vgg16.preprocess_input(
        tf.expand_dims(pixels, axis=0)
    )
    features = image_model.predict(batch)
    return features.reshape(-1)

# Hyper-parameters shared by the model, the training cell and the decoder.
max_length = 20        # number of token positions fed to the text branch
vocab_size = 10000     # size of the softmax / embedding vocabulary
embedding_dim = 256    # width of the learned word embeddings

# The feature extractor is VGG16 with include_top=False and pooling='avg', which
# emits one value per channel of the last conv block: 512, not the 4096 of the
# fully-connected "fc2" layer. The image input must match what load_image returns.
image_feature_dim = 512

# Image branch: project the pooled CNN features into the 256-d merge space.
inputs1 = Input(shape=(image_feature_dim,))
fe1 = Dense(256, activation='relu')(inputs1)

# Text branch: embed the caption prefix (index 0 is padding and is masked out)
# and summarise it with an LSTM.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = LSTM(256)(se1)

# Merge both branches by element-wise addition and predict a distribution over
# the vocabulary for the next word.
decoder1 = tf.keras.layers.add([fe1, se2])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)
caption_model = Model(inputs=[inputs1, inputs2], outputs=outputs)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 0us/step


In [3]:
# categorical_crossentropy expects one-hot targets with one row per sample and
# one column per vocabulary entry, matching the model's softmax output.
caption_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
images = [...]    # image file paths, aligned one-to-one with `captions`
captions = [...]  # caption strings

SEED = 42            # fixes the train/validation split across re-runs
VAL_FRACTION = 0.2   # share of captions held out for validation
batch_size = 32
epochs = 10

assert len(images) == len(captions), "each caption needs exactly one image path"

# Stack the per-image vectors into one (num_images, feature_dim) array; a plain
# Python list of arrays is not a valid single model input.
image_features = np.stack([load_image(image_path) for image_path in images])

# Decoding is seeded with '<start>' and stops on '<end>', so every training
# caption must carry both markers. Captions that already have them are kept.
marked_captions = [
    caption if caption.startswith('<start>') else f'<start> {caption} <end>'
    for caption in captions
]

# The Tokenizer's default `filters` strip '<' and '>', which would turn the
# markers into the ordinary words 'start'/'end'. Use the default punctuation
# set minus those two characters so the markers survive as vocabulary entries.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    oov_token='<unk>',
    filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n',
)
tokenizer.fit_on_texts(marked_captions)
sequences = tokenizer.texts_to_sequences(marked_captions)

# Whole captions padded to max_length. Not fed to the model directly: training
# uses the caption prefixes built below.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences, maxlen=max_length, padding='post')


def build_prefix_pairs(feature_rows, caption_sequences):
    """Expand captions into (image, prefix) -> next-word training samples.

    The model predicts a single word from an image and the words so far, so a
    caption with k tokens yields k - 1 samples: tokens[:i] -> tokens[i].

    Args:
        feature_rows: 2-D array with one image feature vector per caption.
        caption_sequences: list of integer token lists, aligned with
            ``feature_rows``.

    Returns:
        Tuple ``(features, prefixes, targets)``: the image vector repeated for
        every sample, the prefixes post-padded to ``max_length``, and one-hot
        next-word targets with ``vocab_size`` columns.
    """
    sample_features, sample_prefixes, next_words = [], [], []
    for feature_row, token_ids in zip(feature_rows, caption_sequences):
        for cut in range(1, len(token_ids)):
            sample_features.append(feature_row)
            sample_prefixes.append(token_ids[:cut])
            next_words.append(token_ids[cut])

    prefixes = tf.keras.preprocessing.sequence.pad_sequences(
        sample_prefixes, maxlen=max_length, padding='post')

    # One-hot targets to match the categorical_crossentropy loss. This costs
    # len(next_words) * vocab_size floats; for a large corpus switch to integer
    # targets with a sparse loss, or feed batches from a generator.
    targets = np.zeros((len(next_words), vocab_size), dtype='float32')
    targets[np.arange(len(next_words)), next_words] = 1.0
    return np.array(sample_features), prefixes, targets


# Split per caption *before* expanding into prefixes, so that prefixes of one
# caption never end up on both sides of the split.
order = np.random.default_rng(SEED).permutation(len(sequences))
n_val = max(1, int(round(len(order) * VAL_FRACTION)))
val_idx, train_idx = order[:n_val], order[n_val:]

train_images, train_sequences, train_targets = build_prefix_pairs(
    image_features[train_idx], [sequences[i] for i in train_idx])
val_images, val_sequences, val_targets = build_prefix_pairs(
    image_features[val_idx], [sequences[i] for i in val_idx])

history = caption_model.fit(
    [train_images, train_sequences], train_targets,
    batch_size=batch_size, epochs=epochs,
    validation_data=([val_images, val_sequences], val_targets))

In [2]:
def generate_caption(image_path):
    """Greedily decode a caption for the image at ``image_path``.

    Starting from the '<start>' marker, the function repeatedly asks
    ``caption_model`` for the most probable next word given the image features
    and the words produced so far. Decoding stops when the end marker (or the
    padding index) is predicted, or after ``max_length`` steps.

    Relies on module-level ``load_image``, ``caption_model``, ``max_length``
    and the fitted ``tokenizer``.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The generated words joined by single spaces, without the start/end
        markers. May be an empty string.
    """
    # load_image returns a flat vector; predict() needs a leading batch axis so
    # the image input lines up with the (1, max_length) sequence input.
    image_features = load_image(image_path).reshape(1, -1)

    def encode(text):
        # Use the tokenizer's own text -> id path so markers are normalised
        # exactly as the training captions were.
        return tokenizer.texts_to_sequences([text])[0]

    # Ids that mean "end of caption". The out-of-vocabulary id is excluded so a
    # vocabulary without an end marker does not make '<unk>' stop decoding.
    oov_id = tokenizer.word_index.get(tokenizer.oov_token)
    end_ids = {token_id for token_id in encode('<end>') if token_id != oov_id}

    token_ids = encode('<start>')
    words = []
    for _ in range(max_length):
        # 'post' padding to match how the training sequences are padded.
        sequence = tf.keras.preprocessing.sequence.pad_sequences(
            [token_ids], maxlen=max_length, padding='post')
        yhat = caption_model.predict([image_features, sequence], verbose=0)
        # yhat is (1, vocab_size): take the best index of the single row rather
        # than reducing over the batch axis.
        next_id = int(yhat[0].argmax())
        word = tokenizer.index_word.get(next_id)
        # Index 0 is padding and has no word; treat it like the end marker.
        if next_id in end_ids or word is None:
            break
        words.append(word)
        token_ids.append(next_id)

    # Only real words were collected, so nothing is trimmed here and no word is
    # lost when decoding stops at max_length without an end marker.
    return ' '.join(words)