In [1]:
import math
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
import nltk
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [2]:
import tensorflow.python.platform
from keras.preprocessing import sequence
from collections import Counter

import keras
from keras.applications.vgg16 import preprocess_input

Using TensorFlow backend.


In [17]:
# Directory searched for the latest caption-model checkpoint (tf.train.latest_checkpoint in test()).
model_path = './models/tensorflow-baseline/'
# Serialized VGG16 GraphDef that the setup cell parses and imports into the default graph.
vgg_path = './data/vgg16-20160129.tfmodel'
# .npy file of pre-computed image features, memory-mapped by get_data().
feature_path = './data/feats.npy'
# Tab-separated "<image>\t<caption>" annotation file read by get_data().
annotation_path = './data/results_20130124.token'
# Only referenced by the commented-out tf.train.import_meta_graph call in test().
meta_file = 'model.meta'

In [4]:
# Directory whose entries get_data() lists to obtain image file names.
image_path = '../Project/data/flickr30k_images/'
# NLTK BLEU smoothing helper; test() passes its method6 to corpus_bleu.
chencherry = nltk.translate.bleu_score.SmoothingFunction()

In [5]:
def get_data(image_path, annotation_path, feature_path,start_index,end_index):
    """Load file names, features and captions for a 1-based, inclusive image range.

    The annotation file and the feature array are both indexed with five
    consecutive rows per image, so image ``i`` (1-based) owns rows
    ``(i-1)*5 .. i*5-1``.

    Args:
        image_path: directory containing the image files.
        annotation_path: tab-separated file with ``image`` and ``caption`` columns
            and no header row.
        feature_path: ``.npy`` file holding the feature rows; it is opened
            memory-mapped and read-only.
        start_index: first image of the range (1-based).
        end_index: last image of the range (1-based, inclusive).

    Returns:
        A tuple ``(img_file_name, features, captions)``: the list of file names
        for the range, the feature rows for the range, and a NumPy array of
        caption strings (five per image).

    Raises:
        ValueError: if ``start_index`` is below 1 or ``end_index`` is below
            ``start_index``.
    """
    if start_index < 1 or end_index < start_index:
        raise ValueError(
            'Expected 1 <= start_index <= end_index, got start_index=%r, end_index=%r'
            % (start_index, end_index))

    captions_per_image = 5
    first_row = (start_index - 1) * captions_per_image
    n_images = end_index - start_index + 1

    # os.listdir() returns entries in arbitrary order, so sort to make the
    # selection reproducible.
    # NOTE(review): this assumes the annotation file lists images in sorted
    # file-name order - confirm against the data set.
    img_file_name = sorted(os.listdir(image_path))[start_index - 1:end_index]

    annotations = pd.read_table(annotation_path, sep='\t', header=None, names=['image', 'caption'],
                                skiprows=first_row, nrows=n_images * captions_per_image)

    # mmap_mode 'r' avoids loading the whole feature file just to take a slice.
    features = np.load(feature_path, 'r')[first_row:end_index * captions_per_image]
    return img_file_name, features, annotations['caption'].values


In [6]:
# Width of the word-embedding vectors in Caption_Generator.
dim_embed = 256
# Number of units in the caption LSTM.
dim_hidden = 256
# Width of the image feature vector fed to the generator's image placeholder.
dim_in = 4096
# Batch size handed to Caption_Generator; generation runs one image at a time.
batch_size = 1
# learning_rate, momentum and n_epochs are not referenced by any cell visible
# here; presumably carried over from the training notebook - TODO confirm.
learning_rate = 0.001
momentum = 0.9
n_epochs = 25

In [7]:
class Caption_Generator():
    """LSTM caption decoder conditioned on a pre-computed image feature vector.

    The constructor only declares the TensorFlow variables; `build_generator`
    wires them into a greedy-decoding inference graph.

    NOTE(review): the same LSTM cell is fed the image embedding (``dim_hidden``
    wide) on the first step and word embeddings (``dim_embed`` wide) afterwards,
    so the two sizes presumably have to be equal - confirm before changing either.
    """

    def __init__(self, dim_in, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, init_b=None):
        """Declare the model variables.

        Args:
            dim_in: width of the incoming image feature vector.
            dim_embed: width of the word embeddings.
            dim_hidden: number of LSTM units.
            batch_size: stored on the instance as ``self.batch_size``.
            n_lstm_steps: stored on the instance as ``self.n_lstm_steps``.
            n_words: vocabulary size.
            init_b: optional initial value for the output (word encoding) bias;
                zeros are used when it is None.
        """
        self.dim_in = dim_in
        self.dim_embed = dim_embed
        self.dim_hidden = dim_hidden
        self.batch_size = batch_size
        self.n_lstm_steps = n_lstm_steps
        self.n_words = n_words

        # Variable names are kept exactly as they are: a Saver matches
        # checkpoint entries to variables by name.

        # word embeddings (placed on the CPU)
        with tf.device("/cpu:0"):
            self.word_embedding = tf.Variable(tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='word_embedding')

        self.embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='embedding_bias')

        # the LSTM itself
        self.lstm = tf.contrib.rnn.BasicLSTMCell(dim_hidden)

        # projection of the image feature into the LSTM input space
        self.img_embedding = tf.Variable(tf.random_uniform([dim_in, dim_hidden], -0.1, 0.1), name='img_embedding')
        self.img_embedding_bias = tf.Variable(tf.zeros([dim_hidden]), name='img_embedding_bias')

        # projection from an LSTM output to vocabulary logits
        self.word_encoding = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='word_encoding')

        # optional initial value for the logits bias
        if init_b is not None:
            self.word_encoding_bias = tf.Variable(init_b, name='word_encoding_bias')
        else:
            self.word_encoding_bias = tf.Variable(tf.zeros([n_words]), name='word_encoding_bias')


    def build_generator(self, maxlen, batchsize=1):
        """Build the greedy-decoding graph.

        The image placeholder, the initial LSTM state and the start tokens are
        all sized with ``batchsize`` so that they always agree with each other.

        Args:
            maxlen: number of words to generate per caption.
            batchsize: number of images decoded per run (default 1).

        Returns:
            A tuple ``(img, all_words)``: the ``[batchsize, dim_in]`` float32
            placeholder for image features, and a list of ``maxlen`` tensors,
            each holding the arg-max word index chosen at that step.
        """
        img = tf.placeholder(tf.float32, [batchsize, self.dim_in])
        image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias
        state = self.lstm.zero_state(batchsize,dtype=tf.float32)

        # holds the word-index tensor produced at every decoding step
        all_words = []
        with tf.variable_scope("RNN"):
            # First step: there is no previous word, so the image embedding is
            # the LSTM input; only the resulting state is needed.
            _, state = self.lstm(image_embedding, state)

            # Every caption starts from the start token, which has index 0.
            start_tokens = tf.zeros([batchsize], dtype=tf.int32)
            previous_word = tf.nn.embedding_lookup(self.word_embedding, start_tokens) + self.embedding_bias

            for i in range(maxlen):
                # reuse the LSTM weights created by the first call above
                tf.get_variable_scope().reuse_variables()

                out, state = self.lstm(previous_word, state)

                # vocabulary logits for this step and the greedy (arg-max) choice
                logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias
                best_word = tf.argmax(logit, 1)

                with tf.device("/cpu:0"):
                    # embedding of the chosen word becomes the next LSTM input
                    previous_word = tf.nn.embedding_lookup(self.word_embedding, best_word)

                previous_word += self.embedding_bias

                all_words.append(best_word)

        return img, all_words

In [8]:
# Build the inference graph: import the VGG16 graph, load the vocabulary and
# create the caption generator. Requires the vocabulary file to exist.
if not os.path.exists('data/ixtoword.npy'):
    print ('You must run 1. O\'reilly Training.ipynb first.')
else:
    # Resetting the default graph while a session is still active is undefined
    # behaviour in TensorFlow, so close the session left over from a previous
    # run of this cell first.
    _previous_session = globals().get('sess')
    if _previous_session is not None:
        _previous_session.close()

    tf.reset_default_graph()
    with open(vgg_path,'rb') as f:
        fileContent = f.read()
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(fileContent)

    # Placeholder mapped onto the imported graph's "images" input.
    images = tf.placeholder("float32", [1, 224, 224, 3])
    tf.import_graph_def(graph_def, input_map={"images":images})

    ixtoword = np.load('data/ixtoword.npy').tolist()
    n_words = len(ixtoword)
    maxlen=15
    graph = tf.get_default_graph()
    sess = tf.InteractiveSession(graph=graph)

    # Keyword arguments: the constructor order is (dim_in, dim_embed, dim_hidden),
    # and passing dim_hidden/dim_embed positionally is easy to swap.
    caption_generator = Caption_Generator(dim_in=dim_in, dim_embed=dim_embed, dim_hidden=dim_hidden,
                                          batch_size=batch_size, n_lstm_steps=maxlen+2, n_words=n_words)

    image, generated_words = caption_generator.build_generator(maxlen=maxlen)

In [9]:
def read_image(img_path):
    """Load an image file as a one-image batch ready for the VGG16 input.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis and passed through the VGG16 `preprocess_input` function.
    """
    pil_image = keras.preprocessing.image.load_img(img_path, target_size=(224, 224))
    pixels = keras.preprocessing.image.img_to_array(pil_image)
    batch = np.expand_dims(pixels, axis=0)
    return preprocess_input(batch)

In [28]:
def _truncate_after_period(words):
    """Return `words` up to and including the first '.', or all of them if there is none."""
    if '.' in words:
        return words[:words.index('.') + 1]
    return list(words)


def test(sess,generated_words,ixtoword, image_path, start=7001, end=8000, disp_flag=False, sanity_check=False):
    """Caption a range of images with naive greedy search and print corpus BLEU-4..1.

    Reads the module-level names `annotation_path`, `feature_path`, `model_path`,
    `image` (the generator's feature placeholder) and `chencherry`.

    Args:
        sess: TensorFlow session used to restore the checkpoint and run the generator.
        generated_words: list of word-index tensors returned by `build_generator`.
        ixtoword: mapping from word index to word string.
        image_path: directory with the image files (only read when displaying).
        start: first image of the evaluated range (1-based), default 7001.
        end: last image of the evaluated range (inclusive), default 8000.
        disp_flag: when True, show every image with its generated and reference captions.
        sanity_check: when True, skip the checkpoint and run with freshly
            initialised variables instead.

    Raises:
        ValueError: if no checkpoint is found under `model_path`.
    """
    img_file_name, feat, captions = get_data(image_path, annotation_path, feature_path,start,end)

    # Features use the same five-rows-per-image layout as the captions;
    # keep one row per image.
    feat = feat[::5]

    # NOTE(review): Saver() without var_list covers every saveable variable in
    # the default graph and raises "At least two variables have the same name"
    # when that graph holds duplicates. Presumably this happens when another
    # graph (e.g. a training meta graph with Adam slots) was imported into the
    # same graph earlier in the kernel session - TODO confirm; rebuilding the
    # graph in a fresh kernel before calling test() should avoid it.
    saver = tf.train.Saver()
    if not sanity_check:
        saved_path=tf.train.latest_checkpoint(model_path)
        if saved_path is None:
            raise ValueError('No checkpoint found in %r' % (model_path,))
        saver.restore(sess, saved_path)
    else:
        sess.run(tf.global_variables_initializer())

    hypothesis,references = [],[]
    print(len(feat))
    for index,f in enumerate(feat):
        # One image per run: feed a single feature row as a batch of one.
        generated_word_index = sess.run(generated_words, feed_dict={image:np.reshape(f,(1,-1))})
        generated_word_index = np.hstack(generated_word_index)

        # Cut the caption after its first full stop; a caption without one is
        # kept whole rather than being reduced to its first word.
        output_words = _truncate_after_period([ixtoword[x] for x in generated_word_index])

        image_captions = captions[index*5:index*5+5]
        hypothesis.append(output_words)
        references.append([c.split() for c in image_captions])

        # Display the image
        if disp_flag:
            img=mpimg.imread(os.path.join(image_path,img_file_name[index]))
            plt.imshow(img)
            plt.show()
            print('Generated Caption:', ' '.join(output_words))
            print('Expected captions:', image_captions)

    # BLEU-n uses uniform weights over the 1..n-gram precisions. The weights
    # must be a sequence: `(1)` is just the integer 1, `(1.0,)` is a tuple.
    for n in (4, 3, 2, 1):
        weights = tuple(1.0 / n for _ in range(n))
        validation_score = nltk.translate.bleu_score.corpus_bleu(
            references, hypothesis, weights=weights, smoothing_function=chencherry.method6)
        print("Validation BLEU%d Score: " % n, validation_score)

In [29]:
# Run the BLEU evaluation with the session, generated-word tensors and vocabulary created in the setup cell.
test(sess,generated_words,ixtoword, image_path)

ValueError: At least two variables have the same name: word_encoding/Adam