In [1]:
from pickle import load
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.applications.vgg16 import VGG16
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

import cv2
import load_data as ld
import generate_model as gen
import argparse

# extract VGG16 features from a single photo
def extract_features(filename, model=None):
    """Run one image file through VGG16 and return the penultimate-layer output.

    Args:
        filename: path of the image to load; it is resized to 224x224.
        model: optional, already-built feature extractor exposing
            ``predict``. When omitted a fresh VGG16 is built on every call,
            which is slow; pass a prebuilt model to process many images.

    Returns:
        Whatever ``model.predict`` returns for a batch holding this one image.
    """
    if model is None:
        base = VGG16()
        # Take the second-to-last layer's output directly instead of
        # `layers.pop()` + `layers[-1]`: on Keras versions where `layers`
        # is a read-only view the pop is a silent no-op and the classifier
        # output would be returned instead of the feature vector.
        model = Model(inputs=base.inputs, outputs=base.layers[-2].output)
    # load the photo at the input size used by this pipeline
    image = load_img(filename, target_size=(224, 224))
    # convert the image pixels to a numpy array
    image = img_to_array(image)
    # add a leading batch axis of size 1
    image = image.reshape((1,) + image.shape)
    # prepare the image for the VGG model
    image = preprocess_input(image)
    # get features
    return model.predict(image, verbose=0)

# generate a description for an image
def generate_desc(model, tokenizer, photo, index_word, max_length, beam_size=5):
  """Beam-search a caption for one photo.

  Args:
    model: object whose ``predict([photo, sequence], verbose=0)[0]`` yields
      a score per vocabulary index for the next word.
    tokenizer: object providing ``texts_to_sequences``.
    photo: image features, passed to ``model.predict`` unchanged.
    index_word: mapping from vocabulary index to word; indices without an
      entry are skipped.
    max_length: maximum number of expansion steps, also the padded
      sequence length fed to the model.
    beam_size: number of candidates kept after every step.

  Returns:
    A list of ``[sentence, score]`` pairs ordered best first, where
    ``score`` is the sum of the log of each chosen word's predicted value.
    Sentences start with 'startseq' and end with 'endseq' when the search
    finished before ``max_length``. The list is never empty: if no
    candidate can be expanded the last valid beams are returned.
  """
  # every beam is [sentence so far, cumulative log score]
  captions = [['startseq', 0.0]]
  # iterate over the whole length of the sequence
  for _ in range(max_length):
    # nothing is left to expand once every beam has produced the end token
    if all(sentence.split()[-1] == 'endseq' for sentence, _score in captions):
      break
    all_caps = []
    # expand each current candidate
    for cap in captions:
      sentence, score = cap
      # finished captions are carried over unchanged
      if sentence.split()[-1] == 'endseq':
        all_caps.append(cap)
        continue
      # integer encode input sequence
      sequence = tokenizer.texts_to_sequences([sentence])[0]
      # pad input
      sequence = pad_sequences([sequence], maxlen=max_length)
      # predict next words
      y_pred = model.predict([photo, sequence], verbose=0)[0]
      # indices of the beam_size highest-scoring next words
      yhats = np.argsort(y_pred)[-beam_size:]

      for j in yhats:
        # map integer to word
        word = index_word.get(j)
        # skip indices that have no word (e.g. the padding index)
        if word is None:
          continue
        # extend the caption and accumulate the log score
        all_caps.append([sentence + ' ' + word, score + np.log(y_pred[j])])

    # No index could be mapped to a word: keep the beams we already have
    # rather than returning an empty list that callers would index into.
    if not all_caps:
      break

    # order all candidates by score and keep the best beam_size
    ordered = sorted(all_caps, key=lambda tup: tup[1], reverse=True)
    captions = ordered[:beam_size]

  return captions

# evaluate the skill of the model
def evaluate_model(model, descriptions, photos, tokenizer, index_word, max_length):
  """Caption every photo and print corpus BLEU-1 to BLEU-4.

  Args:
    model, tokenizer, index_word, max_length: forwarded to ``generate_desc``.
    descriptions: mapping from photo key to a list of reference captions
      (strings); each is split on whitespace to form a reference.
    photos: mapping from the same keys to the photo input for the model.

  The best beam from ``generate_desc`` is used as the hypothesis for each
  key. Scores are printed; nothing is returned.
  """
  actual, predicted = [], []
  # step over the whole set
  for key, desc_list in descriptions.items():
    # best beam is first in the list returned by generate_desc
    best = generate_desc(model, tokenizer, photos[key], index_word, max_length)[0]
    actual.append([d.split() for d in desc_list])
    predicted.append(best[0].split())
  # n-gram weights for each reported score; every row sums to 1
  # (BLEU-3 previously used 0.3 three times, which sums to 0.9).
  third = 1.0 / 3
  print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
  print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
  print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(third, third, third, 0)))
  print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

def eval_test_set(model, descriptions, photos, tokenizer, index_word, max_length):
  """Caption every photo and return references and predictions, sorted together.

  Args:
    model, tokenizer, index_word, max_length: forwarded to ``generate_desc``.
    descriptions: mapping from photo key to a list of reference captions
      (strings); each is split on whitespace.
    photos: mapping from the same keys to the photo input for the model.

  Returns:
    ``(actual, predicted)``: two equally long lists ordered by the predicted
    token list, where ``actual[i]`` holds the tokenised references that
    belong to ``predicted[i]``.

  NOTE(review): the previous version sorted ``predicted`` on its own,
  overwrote ``actual`` with elements of ``predicted`` and returned nothing.
  Sorting the pairs by prediction and returning them is the presumed
  intent; confirm against the intended caller.
  """
  pairs = []
  # step over the whole set
  for key, desc_list in descriptions.items():
    # best beam is first in the list returned by generate_desc
    best = generate_desc(model, tokenizer, photos[key], index_word, max_length)[0]
    references = [d.split() for d in desc_list]
    pairs.append((best[0].split(), references))
  # sort jointly so each prediction stays attached to its own references
  pairs.sort(key=lambda pair: pair[0])
  predicted = [hypothesis for hypothesis, _ in pairs]
  actual = [references for _, references in pairs]
  return actual, predicted



Using TensorFlow backend.


In [2]:
# Directory holding the trained artefacts; change this one line to relocate them.
MODEL_DIR = '/media/thaovt6/DATA/build_data/Image-Captioning/models'

# load the tokenizer and the index -> word lookup
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# `with` closes each file handle as soon as it has been read.
with open(MODEL_DIR + '/tokenizer.pkl', 'rb') as handle:
    tokenizer = load(handle)
with open(MODEL_DIR + '/index_word.pkl', 'rb') as handle:
    index_word = load(handle)
# pre-define the max sequence length (from training)
max_length = 34

filename = MODEL_DIR + '/model_weight.h5'
model = load_model(filename)



W0702 02:16:42.775471 139980607653696 deprecation_wrapper.py:119] From /home/thaovt6/yes/envs/py3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:529: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0702 02:16:42.809324 139980607653696 deprecation_wrapper.py:119] From /home/thaovt6/yes/envs/py3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0702 02:16:42.811180 139980607653696 deprecation_wrapper.py:119] From /home/thaovt6/yes/envs/py3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:136: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0702 02:16:42.823193 139980607653696 deprecation.py:506] From /home/thaovt6/yes/envs/py3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3721: calling dropout (from tensorflow.python.ops.nn_

In [3]:
def caption(imgpath):
    """Return the best generated caption for the image at ``imgpath``.

    Uses the notebook-level ``model``, ``tokenizer``, ``index_word`` and
    ``max_length``. The 'startseq' / 'endseq' markers are removed from the
    result only when they are actually present, so a caption that reached
    ``max_length`` without an end token keeps its final word.

    Args:
        imgpath: path of the image to describe.

    Returns:
        The caption as a single space-separated string ('' when no caption
        was produced).
    """
    # the argument is used as given (it used to be overwritten by a fixed path)
    photo = extract_features(imgpath)
    # generate description; beams come back ordered best first
    captions = generate_desc(model, tokenizer, photo, index_word, max_length)
    if not captions:
        return ''
    words = captions[0][0].split()
    if words and words[0] == 'startseq':
        words = words[1:]
    if words and words[-1] == 'endseq':
        words = words[:-1]
    return ' '.join(words)

W0702 02:16:50.943610 139980607653696 deprecation_wrapper.py:119] From /home/thaovt6/yes/envs/py3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4255: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.



In [4]:
# Call the function so the cell displays the caption string; a bare `caption`
# would show the function object (or rely on a stale kernel variable).
caption("/home/thaovt6/Downloads/index.jpeg")

'the blonde boy is smiling'