# 04 Inference Demo

Generate a caption for a new image and convert it to speech.

In [None]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gtts import gTTS
import IPython.display as ipd
import pickle

### Load Model and Tokenizer

In [None]:
# Caption decoder network restored from its saved HDF5 file.
model = load_model('../models/decoder/caption_model.h5')

# NOTE: unpickling can execute arbitrary code, so only load tokenizer
# files that come from a source you trust.
with open('../data/Flickr8k_text/tokenizer.pkl', mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Sequence length used for padding during decoding. Computed earlier in the
# project; presumably it must match the length the model was trained with —
# TODO confirm against the training notebook.
max_length = 37

### Load Image and Extract Features

In [None]:
# Lazily-built VGG16 feature extractor, shared by every extract_features call.
_FEATURE_EXTRACTOR = None


def _get_feature_extractor():
    """Return the truncated VGG16 network, building it on first use only."""
    global _FEATURE_EXTRACTOR
    if _FEATURE_EXTRACTOR is None:
        base_model = VGG16(weights='imagenet')
        # Drop the final classification layer: the output of the
        # second-to-last layer is used as the image embedding.
        _FEATURE_EXTRACTOR = Model(
            inputs=base_model.inputs,
            outputs=base_model.layers[-2].output,
        )
    return _FEATURE_EXTRACTOR


def extract_features(image_path):
    """Encode the image at *image_path* with a truncated VGG16 network.

    The image is resized to 224x224, converted to an array, given a leading
    batch axis and normalised with VGG16's ``preprocess_input``. It is then
    passed through VGG16 with its last layer removed.

    The network is constructed once and reused on later calls, because
    instantiating VGG16 and loading the ImageNet weights for every image is
    expensive.

    Args:
        image_path: Path of the image file to encode.

    Returns:
        The prediction of the truncated network for the single-image batch
        (for the stock Keras VGG16 this should be the ``fc2`` activations,
        shape ``(1, 4096)`` — confirm if the architecture is changed).
    """
    image = load_img(image_path, target_size=(224, 224))
    image = img_to_array(image)
    # The model expects a batch, so add a leading axis of size one.
    image = np.expand_dims(image, axis=0)
    image = preprocess_input(image)

    return _get_feature_extractor().predict(image, verbose=0)

### Caption Generation Function

In [None]:
def idx_to_word(integer, tokenizer):
    """Look up the word for a vocabulary index.

    Args:
        integer: Index to look up in ``tokenizer.index_word``.
        tokenizer: Object exposing an ``index_word`` mapping.

    Returns:
        The word stored under *integer*, or ``None`` when the index is not
        in the mapping.
    """
    vocabulary = tokenizer.index_word
    return vocabulary.get(integer, None)

def generate_caption(model, tokenizer, image_features, max_length):
    """Greedily decode a caption for one image.

    Starting from the ``'startseq'`` marker, the text so far is tokenised and
    padded to *max_length*, the model is asked for the next-word scores, and
    the highest-scoring word is appended. Decoding stops after *max_length*
    steps, when ``'endseq'`` is produced, or when the predicted index has no
    word in the tokenizer.

    Args:
        model: Caption model; called as ``predict([image_features, sequence])``.
        tokenizer: Tokenizer providing ``texts_to_sequences`` and ``index_word``.
        image_features: Image encoding passed as the model's first input.
        max_length: Padding length and maximum number of decoding steps.

    Returns:
        The decoded text as a space-separated string. It begins with
        ``'startseq'`` and ends with ``'endseq'`` if that marker was emitted.
    """
    words = ['startseq']
    for _step in range(max_length):
        # Re-encode the whole partial caption on every step.
        encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded = pad_sequences([encoded], maxlen=max_length)
        scores = model.predict([image_features, padded], verbose=0)
        next_word = idx_to_word(np.argmax(scores), tokenizer)
        if next_word is None:
            # Predicted index has no vocabulary entry: stop decoding.
            break
        words.append(next_word)
        if next_word == 'endseq':
            break
    return ' '.join(words)

### Test with an Image

In [None]:
# Path of the image to caption.
image_path = '../data/Flickr8k_text/sample.jpg'  # Replace with your own image
# Encode the image with the VGG16-based extract_features helper.
features = extract_features(image_path)
# Decode a caption from the image features. The returned string still
# contains the 'startseq' marker (and 'endseq' when the decoder emitted it).
caption = generate_caption(model, tokenizer, features, max_length)
print('Generated Caption:', caption)

### Convert Caption to Speech

In [None]:
# The generated caption carries the decoder's 'startseq'/'endseq' markers.
# Remove them so they are not read aloud as if they were words.
spoken_caption = ' '.join(
    word for word in caption.split() if word not in ('startseq', 'endseq')
)
if not spoken_caption:
    # Fail with a clear message when nothing is left to speak.
    raise ValueError('Generated caption is empty; nothing to convert to speech.')

# Synthesize English speech, save it as an MP3 and show an audio player.
tts = gTTS(spoken_caption, lang='en')
tts.save('caption_audio.mp3')
ipd.Audio('caption_audio.mp3')