In [2]:
import math
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
import nltk
#import cv2
#import skimage
import pickle as pkl
import tensorflow.python.platform
from keras.preprocessing import sequence
from collections import Counter

Using TensorFlow backend.


In [3]:
# Directory where train() writes its per-epoch checkpoints and from which
# test() / train(continue_training=True, transfer=False) restore the latest one.
model_path = './models/tensorflow-baseline/'
# Directory searched for a checkpoint when train() is called with
# continue_training=True and transfer=True.
model_path_transfer = './models/tf_final'
# .npy file holding the image feature array loaded by get_data().
feature_path = './data/feats.npy'
# Tab-separated annotation file read by get_data() as (image, caption) columns.
annotation_path = './data/results_20130124.token'
# Shared NLTK smoothing helper; train() passes chencherry.method7 to corpus_bleu.
chencherry = nltk.translate.bleu_score.SmoothingFunction()

In [4]:
def get_data(annotation_path, feature_path):
    """Load the image features together with their captions.

    Args:
        annotation_path: path to a headerless, tab-separated file whose two
            columns are read as ``image`` and ``caption``.
        feature_path: path to a ``.npy`` array; it is opened read-only and
            memory-mapped rather than read eagerly.

    Returns:
        A ``(features, captions)`` tuple: the memory-mapped feature array and
        a numpy array with the values of the ``caption`` column.
    """
    caption_table = pd.read_table(
        annotation_path,
        sep='\t',
        header=None,
        names=['image', 'caption'],
    )
    features = np.load(feature_path, mmap_mode='r')
    return features, caption_table['caption'].values

In [5]:
def preProBuildWordVocab(sentence_iterator, word_count_threshold=30): # function from Andre Karpathy's NeuralTalk
    """Build the caption vocabulary and an output-bias initialisation vector.

    Args:
        sentence_iterator: iterable of caption strings. Each caption is
            lower-cased and split on single spaces.
        word_count_threshold: minimum number of occurrences a token needs to
            be kept in the vocabulary.

    Returns:
        ``(wordtoix, ixtoword, bias_init_vector)`` where
        * ``wordtoix`` maps ``'#START#'`` to 0 and every kept token to 1..N,
        * ``ixtoword`` maps 0 to ``'.'`` and 1..N back to the kept tokens,
        * ``bias_init_vector`` is a float32 array of log relative frequencies,
          shifted so that its largest entry is 0.
    """
    print('preprocessing %d word vocab' % (word_count_threshold, ))

    # Tally every token across all captions while counting the captions.
    word_counts = {}
    nsents = 0
    for nsents, sentence in enumerate(sentence_iterator, start=1):
        for token in sentence.lower().split(' '):
            word_counts[token] = word_counts.get(token, 0) + 1

    vocab = [token for token, count in word_counts.items() if count >= word_count_threshold]

    # Index 0 is reserved: '#START#' on the way in, '.' on the way out.
    wordtoix = {'#START#': 0}
    ixtoword = {0: '.'}
    for position, token in enumerate(vocab, start=1):
        wordtoix[token] = position
        ixtoword[position] = token

    # The '.' count is replaced by the number of captions before the
    # frequencies are turned into log-probabilities.
    word_counts['.'] = nsents
    frequencies = np.array([float(word_counts[ixtoword[i]]) for i in ixtoword])
    log_probs = np.log(frequencies / np.sum(frequencies))
    bias_init_vector = log_probs - np.max(log_probs)
    print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

    return wordtoix, ixtoword, bias_init_vector.astype(np.float32)

In [6]:
class Caption_Generator():
    """LSTM captioning model.

    An image feature vector is linearly projected and fed to the LSTM as its
    first input; the following inputs are word embeddings. Each LSTM output is
    projected to vocabulary logits through ``word_encoding``.
    """

    def __init__(self, dim_in, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, init_b):
        """Create the trainable variables of the model.

        Args:
            dim_in: length of one image feature vector.
            dim_embed: size of a word embedding (and of the projected image,
                since both are fed to the same LSTM input).
            dim_hidden: number of LSTM units.
            batch_size: fixed batch size used by ``build_model``.
            n_lstm_steps: number of unrolled LSTM steps in ``build_model``.
            n_words: vocabulary size.
            init_b: initial value of the output bias (one entry per word).
        """
        self.dim_in = dim_in
        self.dim_embed = dim_embed
        self.dim_hidden = dim_hidden
        self.batch_size = batch_size
        self.n_lstm_steps = n_lstm_steps
        self.n_words = n_words

        # declare the variables to be used for our word embeddings
        with tf.device("/cpu:0"):
            self.word_embedding = tf.Variable(tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='word_embedding')

        self.embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='embedding_bias')

        # declare the LSTM itself
        self.lstm = tf.contrib.rnn.core_rnn_cell.BasicLSTMCell(dim_hidden)

        # declare the variables that project the image feature into the word embedding space.
        # The projection must be dim_embed wide: the projected image and the word embeddings
        # are fed to the same LSTM, whose input weights are shared across steps.
        self.img_embedding = tf.Variable(tf.random_uniform([dim_in, dim_embed], -0.1, 0.1), name='img_embedding')
        self.img_embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='img_embedding_bias')

        # declare the variables to go from an LSTM output to a word encoding output
        self.word_encoding = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='word_encoding')
        # initialize this bias variable from the preProBuildWordVocab output
        self.word_encoding_bias = tf.Variable(init_b, name='word_encoding_bias')

    def build_model(self):
        """Build the training graph.

        Returns:
            ``(total_loss, img, caption_placeholder, mask)``: the masked
            cross-entropy summed over steps 1..n_lstm_steps-1 and divided by
            the sum of ``mask[:, 1:]``, followed by the three placeholders the
            caller has to feed (image features, caption word indices, 0/1 mask).
        """
        # placeholders for the extracted image feature vectors, the caption, and the mask
        # (the mask marks the valid positions of each caption with 0/1 values)
        img = tf.placeholder(tf.float32, [self.batch_size, self.dim_in])
        caption_placeholder = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
        mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])

        # the projected image is the very first LSTM input
        image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias

        # setting initial state of our LSTM
        state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)

        total_loss = 0.0
        with tf.variable_scope("RNN"):
            for i in range(self.n_lstm_steps):
                if i == 0:
                    # first step: the embedded image is the input
                    current_embedding = image_embedding
                else:
                    # later steps: feed the embedding of the (i-1)th caption word
                    with tf.device("/cpu:0"):
                        current_embedding = tf.nn.embedding_lookup(self.word_embedding, caption_placeholder[:, i-1]) + self.embedding_bias
                    # share the LSTM weights created on the first step
                    tf.get_variable_scope().reuse_variables()

                out, state = self.lstm(current_embedding, state)

                if i > 0:
                    # classify the next word; the sparse op takes the integer labels directly,
                    # so no explicit one-hot matrix is needed
                    logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias
                    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=caption_placeholder[:, i], logits=logit)
                    # positions past the end of a caption contribute nothing
                    total_loss += tf.reduce_sum(xentropy * mask[:, i])

            total_loss = total_loss / tf.reduce_sum(mask[:, 1:])
            return total_loss, img, caption_placeholder, mask

    def build_generator(self, maxlen, batchsize=1):
        """Build the greedy decoding graph.

        The LSTM variables are reused from the "RNN" scope, so ``build_model``
        has to be called first on the same graph.

        Args:
            maxlen: number of words to generate.
            batchsize: number of images decoded per run.

        Returns:
            ``(img, all_words)``: the image placeholder and a list of
            ``maxlen`` tensors holding the arg-max word index of each step.
        """
        # same setup as `build_model`
        img = tf.placeholder(tf.float32, [batchsize, self.dim_in])
        image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias
        state = self.lstm.zero_state(batchsize, dtype=tf.float32)

        # list holding the word-index tensor of every generated position
        all_words = []
        with tf.variable_scope("RNN"):
            tf.get_variable_scope().reuse_variables()
            # first step: no previous word, feed the image embedding; only the state is kept
            _, state = self.lstm(image_embedding, state)
            # one start token (index 0) per image, so the input batch matches the LSTM state
            start_tokens = tf.zeros([batchsize], dtype=tf.int32)
            previous_word = tf.nn.embedding_lookup(self.word_embedding, start_tokens) + self.embedding_bias
            for _step in range(maxlen):
                out, state = self.lstm(previous_word, state)

                # pick the highest-scoring word for this position
                logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias
                best_word = tf.argmax(logit, 1)

                with tf.device("/cpu:0"):
                    # its embedding is the input of the next step
                    previous_word = tf.nn.embedding_lookup(self.word_embedding, best_word)

                previous_word += self.embedding_bias

                all_words.append(best_word)

        return img, all_words

In [7]:
def test(sess,image,generated_words,ixtoword,idx=0,sanity_check=False): # Naive greedy search
    """Caption one image of the dataset and print the generated words.

    Args:
        sess: TensorFlow session used to restore/initialise and run the graph.
        image: image placeholder of the generator graph.
        generated_words: list of word-index tensors of the generator graph.
        ixtoword: mapping from word index to word.
        idx: row of the feature array to caption.
        sanity_check: when True, skip checkpoint loading and run with freshly
            initialised variables (only checks that the graph executes).

    Raises:
        ValueError: if no checkpoint is found in ``model_path`` and
            ``sanity_check`` is False.
    """
    feats, _ = get_data(annotation_path, feature_path)
    feat = np.array([feats[idx]])

    if sanity_check:
        # run the initializer on the session we were given, not on an implicit default one
        sess.run(tf.global_variables_initializer())
    else:
        saver = tf.train.Saver()
        saved_path = tf.train.latest_checkpoint(model_path)
        if saved_path is None:
            raise ValueError('No checkpoint found in %s' % (model_path, ))
        saver.restore(sess, saved_path)

    generated_word_index = sess.run(generated_words, feed_dict={image: feat})
    generated_word_index = np.hstack(generated_word_index)

    generated_sentence = [ixtoword[x] for x in generated_word_index]
    print(generated_sentence)

In [10]:
### Parameters ###
# Module-level hyper-parameters read by train().
dim_embed = 256   # word-embedding size handed to Caption_Generator
dim_hidden = 256  # LSTM size handed to Caption_Generator
dim_in = 4096     # length of one image feature vector (presumably matches the rows of feats.npy - confirm)
batch_size = 128  # captions per training step
momentum = 0.9    # NOTE(review): not referenced anywhere in the visible code (train() uses AdamOptimizer)
n_epochs = 100    # number of passes over the training split

def _encode_caption_batch(batch_captions, wordtoix, maxlen):
    """Convert caption strings into the padded index matrix and the loss mask fed to the model."""
    # lower-case, drop each caption's last token and every out-of-vocabulary word
    caption_ids = [[wordtoix[word] for word in cap.lower().split(' ')[:-1] if word in wordtoix]
                   for cap in batch_captions]

    caption_matrix = sequence.pad_sequences(caption_ids, padding='post', maxlen=maxlen+1)
    # prepend the start token (index 0) as the first column
    caption_matrix = np.hstack([np.full((len(caption_matrix), 1), 0), caption_matrix])

    # the mask covers the start token, every word and one trailing 0 per caption
    n_valid = (caption_matrix != 0).sum(axis=1) + 2
    mask_matrix = np.arange(caption_matrix.shape[1])[None, :] < n_valid[:, None]

    return caption_matrix.astype(np.int32), mask_matrix.astype(np.float32)


def _report_validation_bleu(sess, validation_image, generated_words, valid_data, valid_captions, ixtoword):
    """Greedily caption every validation image and print corpus BLEU-4/3/2/1."""
    hypothesis, references = [], []
    for ind, v_image in enumerate(valid_data):
        generated_word_index = sess.run(generated_words, feed_dict={validation_image: np.reshape(v_image, (1, -1))})
        output_words = [ixtoword[x] for x in np.hstack(generated_word_index)]
        # cut after the first '.'; keep the whole output when none was generated
        if '.' in output_words:
            output_words = output_words[:output_words.index('.') + 1]
        hypothesis.append(output_words)
        # valid_data holds every 5th feature row, so image `ind` owns validation captions
        # ind*5 .. ind*5+4; lower-cased because the vocabulary is lower-cased
        references.append([c.lower().split() for c in valid_captions[ind*5:ind*5+5]])

    bleu_variants = (
        ('BLEU4', [0.25, 0.25, 0.25, 0.25]),
        ('BLEU3', [0.3333, 0.3333, 0.3333]),
        ('BLEU2', [0.5, 0.5]),
        ('BLEU1', [1]),
    )
    for label, weights in bleu_variants:
        validation_score = nltk.translate.bleu_score.corpus_bleu(
            references, hypothesis, weights=weights, smoothing_function=chencherry.method7)
        print("Validation {} Score: ".format(label), validation_score)


def train(learning_rate=0.001, continue_training=False, transfer=True):
    """Train the caption generator, checkpointing and reporting BLEU after every epoch.

    Args:
        learning_rate: initial Adam learning rate; it is multiplied by 0.95
            once per epoch worth of steps.
        continue_training: restore the latest checkpoint before training.
        transfer: with ``continue_training``, restore from
            ``model_path_transfer`` instead of ``model_path``.

    Raises:
        ValueError: if ``continue_training`` is set and no checkpoint exists
            in the selected directory.
    """
    tf.reset_default_graph()

    feats, captions = get_data(annotation_path, feature_path)
    wordtoix, ixtoword, init_b = preProBuildWordVocab(captions)

    # first 6000*5 rows train, next 1000*5 rows validate, the rest is unused here
    train_data, valid_data, _ = np.split(feats, [6000*5, 7000*5])
    train_captions, valid_captions, _ = np.split(captions, [6000*5, 7000*5])

    # keep one feature row per group of 5 validation rows
    valid_data = valid_data[::5]

    np.save('data/ixtoword', ixtoword)

    index = np.arange(len(train_data)).astype(int)
    np.random.shuffle(index)

    sess = tf.InteractiveSession()
    n_words = len(wordtoix)
    maxlen = np.max([len(cap.split(' ')) for cap in train_captions])

    # argument order follows the constructor: (dim_in, dim_embed, dim_hidden, ...)
    caption_generator = Caption_Generator(dim_in, dim_embed, dim_hidden, batch_size, maxlen+2, n_words, init_b)

    loss, image, sentence, mask = caption_generator.build_model()
    # build the decoding graph once; rebuilding it every epoch keeps adding ops to the graph
    validation_image, generated_words = caption_generator.build_generator(15)

    # NOTE: the Saver is created before global_step and the optimizer, so it only tracks the
    # model variables that exist at this point; left in this position so that checkpoints
    # written by earlier runs still restore.
    saver = tf.train.Saver(max_to_keep=100)
    global_step = tf.Variable(0, trainable=False)
    steps_per_epoch = len(index) // batch_size
    decayed_learning_rate = tf.train.exponential_decay(learning_rate, global_step,
                                                       steps_per_epoch, 0.95)
    # global_step must be handed to minimize(), otherwise it never advances and the rate never decays
    train_op = tf.train.AdamOptimizer(decayed_learning_rate).minimize(loss, global_step=global_step)
    sess.run(tf.global_variables_initializer())

    if continue_training:
        checkpoint_dir = model_path_transfer if transfer else model_path
        checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
        if checkpoint is None:
            raise ValueError('No checkpoint found in %s' % (checkpoint_dir, ))
        saver.restore(sess, checkpoint)

    if not os.path.isdir(model_path):
        os.makedirs(model_path)

    for epoch in range(n_epochs):
        # full batches only: the placeholders have a fixed batch dimension
        for start in range(0, len(index) - batch_size + 1, batch_size):
            batch_index = index[start:start + batch_size]

            current_feats = train_data[batch_index]
            current_caption_matrix, current_mask_matrix = _encode_caption_batch(
                train_captions[batch_index], wordtoix, maxlen)

            _, loss_value = sess.run([train_op, loss], feed_dict={
                image: current_feats.astype(np.float32),
                sentence: current_caption_matrix,
                mask: current_mask_matrix
                })

            print("Current Cost: ", loss_value, "\t Epoch {}/{}".format(epoch, n_epochs), "\t Iter {}/{}".format(start,len(train_data)))

        print("Saving the model from epoch: ", epoch)
        saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)

        # Perform validation
        _report_validation_bleu(sess, validation_image, generated_words, valid_data, valid_captions, ixtoword)
    

In [11]:
try:
    # Alternative runs, kept for reference:
    #train(learning_rate=.001, continue_training=False, transfer=False)  # train from scratch
    #train(learning_rate=.001, continue_training=True, transfer=True)    # continue training from pretrained weights @epoch500
    # Active run: train from previously saved weights.
    train(learning_rate=.001, continue_training=True, transfer=False)
except KeyboardInterrupt:
    # Interrupting the kernel stops training without a traceback.
    print('Exiting Training')

preprocessing 30 word vocab
preprocessed words 20326 -> 2942
INFO:tensorflow:Restoring parameters from ./models/tensorflow-baseline/model-6
Current Cost:  0.727067 	 Epoch 0/100 	 Iter 0/30000
Current Cost:  0.722145 	 Epoch 0/100 	 Iter 128/30000
Current Cost:  0.755244 	 Epoch 0/100 	 Iter 256/30000
Current Cost:  0.783488 	 Epoch 0/100 	 Iter 384/30000
Current Cost:  0.784208 	 Epoch 0/100 	 Iter 512/30000
Current Cost:  0.763874 	 Epoch 0/100 	 Iter 640/30000
Current Cost:  0.762458 	 Epoch 0/100 	 Iter 768/30000
Current Cost:  0.840972 	 Epoch 0/100 	 Iter 896/30000
Current Cost:  0.801165 	 Epoch 0/100 	 Iter 1024/30000
Current Cost:  0.829922 	 Epoch 0/100 	 Iter 1152/30000
Current Cost:  0.813943 	 Epoch 0/100 	 Iter 1280/30000
Current Cost:  0.793355 	 Epoch 0/100 	 Iter 1408/30000
Current Cost:  0.810972 	 Epoch 0/100 	 Iter 1536/30000
Current Cost:  0.761901 	 Epoch 0/100 	 Iter 1664/30000
Current Cost:  0.821304 	 Epoch 0/100 	 Iter 1792/30000
Current Cost:  0.796532 	 Epoc

Current Cost:  0.868412 	 Epoch 0/100 	 Iter 18432/30000
Current Cost:  0.854922 	 Epoch 0/100 	 Iter 18560/30000
Current Cost:  0.799928 	 Epoch 0/100 	 Iter 18688/30000
Current Cost:  0.856694 	 Epoch 0/100 	 Iter 18816/30000
Current Cost:  0.838222 	 Epoch 0/100 	 Iter 18944/30000
Current Cost:  0.791212 	 Epoch 0/100 	 Iter 19072/30000
Current Cost:  0.805417 	 Epoch 0/100 	 Iter 19200/30000
Current Cost:  0.858243 	 Epoch 0/100 	 Iter 19328/30000
Current Cost:  0.839005 	 Epoch 0/100 	 Iter 19456/30000
Current Cost:  0.812398 	 Epoch 0/100 	 Iter 19584/30000
Current Cost:  0.81451 	 Epoch 0/100 	 Iter 19712/30000
Current Cost:  0.808213 	 Epoch 0/100 	 Iter 19840/30000
Current Cost:  0.833304 	 Epoch 0/100 	 Iter 19968/30000
Current Cost:  0.865739 	 Epoch 0/100 	 Iter 20096/30000
Current Cost:  0.82897 	 Epoch 0/100 	 Iter 20224/30000
Current Cost:  0.86591 	 Epoch 0/100 	 Iter 20352/30000
Current Cost:  0.803352 	 Epoch 0/100 	 Iter 20480/30000
Current Cost:  0.837465 	 Epoch 0/

Current Cost:  0.797012 	 Epoch 1/100 	 Iter 6656/30000
Current Cost:  0.804829 	 Epoch 1/100 	 Iter 6784/30000
Current Cost:  0.795392 	 Epoch 1/100 	 Iter 6912/30000
Current Cost:  0.782557 	 Epoch 1/100 	 Iter 7040/30000
Current Cost:  0.759423 	 Epoch 1/100 	 Iter 7168/30000
Current Cost:  0.771486 	 Epoch 1/100 	 Iter 7296/30000
Current Cost:  0.84759 	 Epoch 1/100 	 Iter 7424/30000
Current Cost:  0.821498 	 Epoch 1/100 	 Iter 7552/30000
Current Cost:  0.739943 	 Epoch 1/100 	 Iter 7680/30000
Current Cost:  0.77736 	 Epoch 1/100 	 Iter 7808/30000
Current Cost:  0.806166 	 Epoch 1/100 	 Iter 7936/30000
Current Cost:  0.772063 	 Epoch 1/100 	 Iter 8064/30000
Current Cost:  0.811981 	 Epoch 1/100 	 Iter 8192/30000
Current Cost:  0.820722 	 Epoch 1/100 	 Iter 8320/30000
Current Cost:  0.776796 	 Epoch 1/100 	 Iter 8448/30000
Current Cost:  0.811356 	 Epoch 1/100 	 Iter 8576/30000
Current Cost:  0.840678 	 Epoch 1/100 	 Iter 8704/30000
Current Cost:  0.840587 	 Epoch 1/100 	 Iter 8832/

Current Cost:  0.792738 	 Epoch 1/100 	 Iter 25216/30000
Current Cost:  0.823628 	 Epoch 1/100 	 Iter 25344/30000
Current Cost:  0.857241 	 Epoch 1/100 	 Iter 25472/30000
Current Cost:  0.84273 	 Epoch 1/100 	 Iter 25600/30000
Current Cost:  0.832155 	 Epoch 1/100 	 Iter 25728/30000
Current Cost:  0.834451 	 Epoch 1/100 	 Iter 25856/30000
Current Cost:  0.821943 	 Epoch 1/100 	 Iter 25984/30000
Current Cost:  0.842737 	 Epoch 1/100 	 Iter 26112/30000
Current Cost:  0.810811 	 Epoch 1/100 	 Iter 26240/30000
Current Cost:  0.827542 	 Epoch 1/100 	 Iter 26368/30000
Current Cost:  0.857004 	 Epoch 1/100 	 Iter 26496/30000
Current Cost:  0.896021 	 Epoch 1/100 	 Iter 26624/30000
Current Cost:  0.894904 	 Epoch 1/100 	 Iter 26752/30000
Current Cost:  0.802163 	 Epoch 1/100 	 Iter 26880/30000
Current Cost:  0.85267 	 Epoch 1/100 	 Iter 27008/30000
Current Cost:  0.857298 	 Epoch 1/100 	 Iter 27136/30000
Current Cost:  0.826125 	 Epoch 1/100 	 Iter 27264/30000
Current Cost:  0.875412 	 Epoch 1