In [24]:
import math
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
import nltk
#import cv2
#import skimage
import pickle as pkl
import tensorflow.python.platform
from keras.preprocessing import sequence
from collections import Counter

In [25]:
# Checkpoint directory: train() saves a checkpoint here after every epoch, and
# train()/test() restore the latest checkpoint found in it.
model_path = './models/tensorflow'
# Alternative checkpoint directory, restored by train() when both
# continue_training and transfer are true.
model_path_transfer = './models/tf_final'
# .npy file holding the image feature vectors loaded by get_data().
feature_path = './data/feats.npy'
# Tab-separated annotation file (image column, caption column) read by get_data().
annotation_path = './data/results_20130124.token'
# Shared NLTK smoothing-function object; train() passes its method7 to corpus_bleu.
chencherry = nltk.translate.bleu_score.SmoothingFunction()

In [26]:
def get_data(annotation_path, feature_path):
    """Load the image features and the caption strings.

    Args:
        annotation_path: tab-separated file without a header row; the first
            column is read as 'image', the second as 'caption'.
        feature_path: .npy file with the feature array.

    Returns:
        (features, captions): the feature array opened as a read-only memory
        map, and a numpy array with the values of the 'caption' column.
    """
    caption_table = pd.read_table(
        annotation_path,
        sep='\t',
        header=None,
        names=['image', 'caption'],
    )
    caption_values = caption_table['caption'].values
    # mmap_mode='r' keeps the feature array on disk and pages it in on access
    features = np.load(feature_path, mmap_mode='r')
    return features, caption_values

In [27]:
def preProBuildWordVocab(sentence_iterator, word_count_threshold=30): # function from Andre Karpathy's NeuralTalk
    """Build the word <-> index vocabularies and an output-bias initialiser.

    Every sentence is lower-cased, hyphens become spaces, and the result is
    split on single spaces. Words seen at least `word_count_threshold` times
    receive indices 1..N in order of first appearance. Index 0 is special:
    `wordtoix` maps '#START#' to it, while `ixtoword` maps it back to '.'.

    Args:
        sentence_iterator: iterable of caption strings.
        word_count_threshold: minimum number of occurrences for a word to be kept.

    Returns:
        (wordtoix, ixtoword, bias_init_vector): the two dicts described above and
        a float32 vector, one entry per `ixtoword` index, holding the log relative
        frequency of each word shifted so that its maximum is 0.
    """
    print('preprocessing %d word vocab' % (word_count_threshold, ))

    # tally every token while counting how many sentences were seen
    word_counts = Counter()
    nsents = 0
    for sent in sentence_iterator:
        nsents += 1
        word_counts.update(sent.lower().replace('-', ' ').split(' '))

    vocab = [word for word, count in word_counts.items() if count >= word_count_threshold]

    ixtoword = {0: '.'}
    wordtoix = {'#START#': 0}
    for ix, word in enumerate(vocab, start=1):
        wordtoix[word] = ix
        ixtoword[ix] = word

    # the count for '.' is replaced by the number of sentences before the
    # frequencies are turned into the bias vector
    word_counts['.'] = nsents
    frequencies = np.array([1.0 * word_counts[ixtoword[i]] for i in ixtoword])
    frequencies /= np.sum(frequencies)
    log_frequencies = np.log(frequencies)
    bias_init_vector = log_frequencies - np.max(log_frequencies)
    print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

    return wordtoix, ixtoword, bias_init_vector.astype(np.float32)

In [28]:
class Caption_Generator():
    """LSTM captioning graph builder (TensorFlow 1.x graph mode).

    The image feature vector is projected into the word-embedding space and fed
    to the LSTM as its first input; afterwards the LSTM is fed word embeddings
    and its output is projected to vocabulary logits.

    `build_model` creates the training graph (and the LSTM variables inside the
    "RNN" variable scope); `build_generator` creates a greedy decoding graph
    that reuses those variables, so `build_model` has to be called first.
    """

    def __init__(self, dim_in, dim_hidden, dim_embed, batch_size, n_lstm_steps, n_words, init_b, glove_embedding):
        """Create the model variables.

        Args:
            dim_in: length of one image feature vector.
            dim_hidden: number of LSTM units.
            dim_embed: width of the word embeddings (and of the projected image).
            batch_size: fixed batch size of the training placeholders.
            n_lstm_steps: number of unrolled LSTM steps in the training graph.
            n_words: vocabulary size (number of output classes).
            init_b: initial value for the output bias `word_encoding_bias`.
            glove_embedding: initial word-embedding matrix; it is converted to
                float32 and stored as a tf.Variable.
        """
        self.dim_in = dim_in
        self.dim_embed = dim_embed
        self.dim_hidden = dim_hidden
        self.batch_size = batch_size
        self.n_lstm_steps = n_lstm_steps
        self.n_words = n_words

        # word-embedding table, initialised from the supplied matrix and pinned to the CPU
        with tf.device("/cpu:0"):
            self.glove_embedding = tf.Variable(tf.convert_to_tensor(glove_embedding, np.float32), name='glove_embedding')

        self.embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='embedding_bias')

        # the LSTM cell shared by the training and the generation graph
        self.lstm = tf.contrib.rnn.core_rnn_cell.BasicLSTMCell(dim_hidden)

        # projection of the image feature vector into the word-embedding space
        self.img_embedding = tf.Variable(tf.random_uniform([dim_in, dim_embed], -0.1, 0.1), name='img_embedding')
        self.img_embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='img_embedding_bias')

        # projection of an LSTM output to vocabulary logits
        self.word_encoding = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='word_encoding')
        # this bias starts from the caller-supplied vector (see preProBuildWordVocab)
        self.word_encoding_bias = tf.Variable(init_b, name='word_encoding_bias')

    def build_model(self):
        """Build the training graph.

        Returns:
            (total_loss, img, caption_placeholder, mask) where
            - total_loss: masked cross-entropy summed over steps 1..n_lstm_steps-1
              and divided by the sum of mask[:, 1:];
            - img: float32 placeholder [batch_size, dim_in];
            - caption_placeholder: int32 placeholder [batch_size, n_lstm_steps]
              of word ids;
            - mask: float32 placeholder [batch_size, n_lstm_steps] weighting the
              loss of each step (0 switches a position off).
        """
        img = tf.placeholder(tf.float32, [self.batch_size, self.dim_in])
        caption_placeholder = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
        mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])

        # the projected image is the LSTM input of step 0
        image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias

        state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)
        total_loss = 0.0
        with tf.variable_scope("RNN"):
            for i in range(self.n_lstm_steps):
                if i > 0:
                    # later steps are fed the embedding of the (i-1)th caption word
                    with tf.device("/cpu:0"):
                        current_embedding = tf.nn.embedding_lookup(self.glove_embedding, caption_placeholder[:,i-1]) + self.embedding_bias
                    # the LSTM variables were created at step 0; share them from now on
                    tf.get_variable_scope().reuse_variables()
                else:
                    current_embedding = image_embedding

                out, state = self.lstm(current_embedding, state)

                if i > 0:
                    # classify the i-th caption word from the LSTM output; the sparse
                    # op takes the integer labels directly, so no one-hot matrix is built
                    logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias
                    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=caption_placeholder[:, i], logits=logit)
                    xentropy = xentropy * mask[:,i]

                    total_loss += tf.reduce_sum(xentropy)

            total_loss = total_loss / tf.reduce_sum(mask[:,1:])
            return total_loss, img,  caption_placeholder, mask

    def build_generator(self, maxlen, batchsize=1):
        """Build a greedy (arg-max) decoding graph that reuses the trained variables.

        Args:
            maxlen: number of words to generate.
            batchsize: number of images decoded per run.

        Returns:
            (img, all_words): the float32 image placeholder [batchsize, dim_in] and
            a list of `maxlen` tensors, each holding the chosen word id per image.
        """
        img = tf.placeholder(tf.float32, [batchsize, self.dim_in])
        image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias
        state = self.lstm.zero_state(batchsize,dtype=tf.float32)

        all_words = []
        with tf.variable_scope("RNN"):
            tf.get_variable_scope().reuse_variables()
            # step 0: only the image is fed in; its output is not used for prediction
            _, state = self.lstm(image_embedding, state)
            # one start token (id 0) per image, so the input's batch dimension
            # matches the LSTM state for any batchsize
            start_tokens = tf.zeros([batchsize], dtype=tf.int32)
            previous_word = tf.nn.embedding_lookup(self.glove_embedding, start_tokens) + self.embedding_bias
            for i in range(maxlen):
                tf.get_variable_scope().reuse_variables()
                out, state = self.lstm(previous_word, state)

                # pick the highest-scoring word for this step
                logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias
                best_word = tf.argmax(logit, 1)

                # its embedding becomes the input of the next step
                with tf.device("/cpu:0"):
                    previous_word = tf.nn.embedding_lookup(self.glove_embedding, best_word)

                previous_word += self.embedding_bias

                all_words.append(best_word)

        return img, all_words

In [29]:
def load_glove_model():
    """Convert the GloVe text vectors to word2vec format and load them.

    The conversion is rerun on every call: it reads
    './glove.6B/glove.6B.100d.txt' and writes 'glove.6B.100d.txt.word2vec'
    into the current working directory, which is then loaded.

    Returns:
        The gensim KeyedVectors object loaded from the converted file.
    """
    # gensim is only needed here, so it is imported locally
    from gensim.models import KeyedVectors
    from gensim.scripts.glove2word2vec import glove2word2vec

    glove_input_file = './glove.6B/glove.6B.100d.txt'
    word2vec_output_file = 'glove.6B.100d.txt.word2vec'

    glove2word2vec(glove_input_file, word2vec_output_file)
    # load the Stanford GloVe model from the converted (text, non-binary) file
    return KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [30]:
def test(sess,image,generated_words,ixtoword,idx=0): # Naive greedy search
    """Caption one image from the feature file and print the generated words.

    Args:
        sess: session in which the generation graph is run.
        image: image placeholder of the generation graph.
        generated_words: word-id tensors of the generation graph.
        ixtoword: mapping from word id to word.
        idx: row of the feature array to caption.
    """
    feats, _ = get_data(annotation_path, feature_path)
    single_feat = np.array([feats[idx]])  # keep a leading batch dimension of 1

    saver = tf.train.Saver()
    # Flip to True to run with freshly initialised weights instead of a checkpoint.
    sanity_check = False
    if sanity_check:
        tf.global_variables_initializer().run()
    else:
        checkpoint = tf.train.latest_checkpoint(model_path)
        saver.restore(sess, checkpoint)

    word_ids = sess.run(generated_words, feed_dict={image: single_feat})
    word_ids = np.hstack(word_ids)

    print([ixtoword[word_id] for word_id in word_ids])

In [39]:
### Parameters ###
dim_embed = 100     # word-embedding width; matches the 100-d GloVe file read by load_glove_model()
dim_hidden = 256    # number of LSTM units passed to Caption_Generator
dim_in = 4096       # length of one image feature vector (width of the image placeholder)
batch_size = 128    # examples per training step
momentum = 0.9      # NOTE(review): not referenced in the visible code (train() uses Adam) -- confirm before removing
n_epochs = 100      # number of passes train() makes over the training split

def _encode_caption_batch(batch_captions, wordtoix, maxlen):
    """Convert caption strings into a padded word-id matrix and its 0/1 loss mask."""
    # the last space-separated token of every caption is dropped; unknown words are skipped
    caption_ids = [
        [wordtoix[word] for word in cap.lower().replace('-', ' ').split(' ')[:-1] if word in wordtoix]
        for cap in batch_captions
    ]
    caption_matrix = sequence.pad_sequences(caption_ids, padding='post', maxlen=maxlen+1)
    # prepend a column of 0s (the '#START#' id)
    caption_matrix = np.hstack([np.full((len(caption_matrix), 1), 0), caption_matrix])

    # per row, switch on the first (number of non-zero ids + 2) positions
    mask_matrix = np.zeros(caption_matrix.shape)
    lengths = (caption_matrix != 0).sum(axis=1) + 2
    for row, length in zip(mask_matrix, lengths):
        row[:length] = 1
    return caption_matrix, mask_matrix


def _trim_after_period(words):
    """Cut a generated word list right after its first '.', or keep it whole if there is none."""
    if '.' in words:
        return words[:words.index('.') + 1]
    return words


def train(learning_rate=0.001, continue_training=False, transfer=True):
    """Train the caption model; after every epoch save a checkpoint and print validation BLEU.

    Args:
        learning_rate: initial Adam learning rate. It is decayed exponentially
            (factor 0.95 per `len(index) // batch_size` optimizer steps).
        continue_training: when true, restore weights from the latest checkpoint
            before training.
        transfer: only used with `continue_training`; restores from
            `model_path_transfer` when true and from `model_path` when false.

    Side effects: writes 'data/ixtoword.npy', writes checkpoints into
    `model_path`, and prints progress and BLEU-1..4 scores.
    """
    tf.reset_default_graph()

    feats, captions = get_data(annotation_path, feature_path)
    wordtoix, ixtoword, init_b = preProBuildWordVocab(captions)

    # rows [0, 7000*5) are the training split, [7000*5, 8000*5) the validation split
    captions_per_image = 5
    split_points = [7000*captions_per_image, 8000*captions_per_image]
    train_data, valid_data, _ = np.split(feats, split_points)
    train_captions, valid_captions, _ = np.split(captions, split_points)
    del feats
    # keep one feature row per group of 5 consecutive validation rows
    valid_data = valid_data[::captions_per_image]

    np.save('data/ixtoword', ixtoword)

    index = np.arange(len(train_data)).astype(int)
    np.random.shuffle(index)

    sess = tf.InteractiveSession()
    try:
        n_words = len(wordtoix)
        maxlen = np.max([len(cap.split(' ')) for cap in train_captions])

        # embedding matrix: GloVe vector where available, zeros otherwise
        # (the '#START#' row always stays zero); missing words are printed
        glove_embeddings = np.zeros((n_words, dim_embed))
        glove_model = load_glove_model()
        for word, ix in wordtoix.items():
            if word == '#START#':
                continue
            if word in glove_model:
                glove_embeddings[ix] = np.reshape(glove_model[word], (1, dim_embed))
            else:
                print(word)

        print('Building Model...')
        caption_generator = Caption_Generator(dim_in, dim_hidden, dim_embed, batch_size, maxlen+2, n_words, init_b, glove_embeddings)

        loss, image, sentence, mask = caption_generator.build_model()
        # Build the validation generator once. It reuses the model variables, so
        # rebuilding it every epoch would only keep adding ops to the graph.
        validation_image, generated_words = caption_generator.build_generator(15)

        # The saver is created before global_step and the optimizer slots exist, so
        # checkpoints hold the model variables only (and stay loadable as before).
        saver = tf.train.Saver(max_to_keep=2)
        global_step = tf.Variable(0, trainable=False)
        decayed_learning_rate = tf.train.exponential_decay(learning_rate, global_step,
                                                           len(index) // batch_size, 0.95)
        # global_step must be handed to minimize(), otherwise it is never
        # incremented and the learning rate never decays
        train_op = tf.train.AdamOptimizer(decayed_learning_rate).minimize(loss, global_step=global_step)
        tf.global_variables_initializer().run()

        if continue_training:
            checkpoint_dir = model_path_transfer if transfer else model_path
            saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))

        # (label, n-gram weights) for the BLEU variants reported after each epoch
        bleu_variants = [
            (4, [0.25, 0.25, 0.25, 0.25]),
            (3, [0.3333, 0.3333, 0.3333]),
            (2, [0.5, 0.5]),
            (1, [1]),
        ]

        for epoch in range(n_epochs):
            # full batches only: the placeholders have a fixed batch dimension
            for start in range(0, len(index) - batch_size + 1, batch_size):
                batch_index = index[start:start + batch_size]

                current_feats = train_data[batch_index]
                current_caption_matrix, current_mask_matrix = _encode_caption_batch(
                    train_captions[batch_index], wordtoix, maxlen)

                _, loss_value = sess.run([train_op, loss], feed_dict={
                    image: current_feats.astype(np.float32),
                    sentence: current_caption_matrix.astype(np.int32),
                    mask: current_mask_matrix.astype(np.float32)
                    })

                print("Current Cost: ", loss_value, "\t Epoch {}/{}".format(epoch, n_epochs), "\t Iter {}/{}".format(start,len(train_data)))

            print("Saving the model from epoch: ", epoch)
            saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)

            # Validation: greedy-decode every validation image and score it against
            # that image's own 5 reference captions from the validation split
            hypothesis, references = [], []
            for ind, v_image in enumerate(valid_data):
                generated_word_index = sess.run(
                    generated_words,
                    feed_dict={validation_image: np.reshape(v_image, (1, dim_in))})
                generated_word_index = np.hstack(generated_word_index)
                output_words = _trim_after_period([ixtoword[x] for x in generated_word_index])

                first_ref = ind * captions_per_image
                reference_words = [
                    ref.lower().replace('-', ' ').split()
                    for ref in valid_captions[first_ref:first_ref + captions_per_image]
                ]
                hypothesis.append(output_words)
                references.append(reference_words)

            for order, weights in bleu_variants:
                validation_score = nltk.translate.bleu_score.corpus_bleu(
                    references, hypothesis,
                    weights=weights,
                    smoothing_function=chencherry.method7)
                print("Validation BLEU{} Score: ".format(order), validation_score)
    finally:
        # release the session so a later train() call can reset the default graph safely
        sess.close()
    

In [40]:
# Start training. A keyboard/kernel interrupt is caught so that stopping a run
# prints a short message instead of a traceback.
try:
    #train(.001,False,False) #train from scratch
    #train(.001,True,True)    #continue training from pretrained weights @epoch500
    train(.001,True,False)  #continue from the latest checkpoint in model_path
except KeyboardInterrupt:
    print('Exiting Training')

preprocessing 30 word vocab
preprocessed words 18426 -> 2954

Building Model...
INFO:tensorflow:Restoring parameters from ./models/tensorflow\model-0
Current Cost:  1.59634 	 Epoch 0/100 	 Iter 0/35000
Current Cost:  1.49798 	 Epoch 0/100 	 Iter 128/35000
Current Cost:  1.73572 	 Epoch 0/100 	 Iter 256/35000
Current Cost:  1.45889 	 Epoch 0/100 	 Iter 384/35000
Current Cost:  1.51052 	 Epoch 0/100 	 Iter 512/35000
Current Cost:  1.56077 	 Epoch 0/100 	 Iter 640/35000
Current Cost:  1.42602 	 Epoch 0/100 	 Iter 768/35000
Current Cost:  1.48855 	 Epoch 0/100 	 Iter 896/35000
Current Cost:  1.66747 	 Epoch 0/100 	 Iter 1024/35000
Current Cost:  1.61204 	 Epoch 0/100 	 Iter 1152/35000
Current Cost:  1.58725 	 Epoch 0/100 	 Iter 1280/35000
Current Cost:  1.6045 	 Epoch 0/100 	 Iter 1408/35000
Current Cost:  1.40805 	 Epoch 0/100 	 Iter 1536/35000
Current Cost:  1.38357 	 Epoch 0/100 	 Iter 1664/35000
Current Cost:  1.73667 	 Epoch 0/100 	 Iter 1792/35000
Current Cost:  1.5288 	 Epoch 0/100 

Current Cost:  1.45915 	 Epoch 0/100 	 Iter 18688/35000
Current Cost:  1.78413 	 Epoch 0/100 	 Iter 18816/35000
Current Cost:  1.35915 	 Epoch 0/100 	 Iter 18944/35000
Current Cost:  1.52611 	 Epoch 0/100 	 Iter 19072/35000
Current Cost:  1.46554 	 Epoch 0/100 	 Iter 19200/35000
Current Cost:  1.49497 	 Epoch 0/100 	 Iter 19328/35000
Current Cost:  1.41442 	 Epoch 0/100 	 Iter 19456/35000
Current Cost:  1.60927 	 Epoch 0/100 	 Iter 19584/35000
Current Cost:  1.74925 	 Epoch 0/100 	 Iter 19712/35000
Current Cost:  1.72036 	 Epoch 0/100 	 Iter 19840/35000
Current Cost:  1.5811 	 Epoch 0/100 	 Iter 19968/35000
Current Cost:  1.59382 	 Epoch 0/100 	 Iter 20096/35000
Current Cost:  1.46185 	 Epoch 0/100 	 Iter 20224/35000
Current Cost:  1.30933 	 Epoch 0/100 	 Iter 20352/35000
Current Cost:  1.56178 	 Epoch 0/100 	 Iter 20480/35000
Current Cost:  1.46363 	 Epoch 0/100 	 Iter 20608/35000
Current Cost:  1.62294 	 Epoch 0/100 	 Iter 20736/35000
Current Cost:  1.60703 	 Epoch 0/100 	 Iter 20864

Current Cost:  1.37849 	 Epoch 1/100 	 Iter 2176/35000
Current Cost:  1.81502 	 Epoch 1/100 	 Iter 2304/35000
Current Cost:  1.49012 	 Epoch 1/100 	 Iter 2432/35000
Current Cost:  1.33244 	 Epoch 1/100 	 Iter 2560/35000
Current Cost:  1.43094 	 Epoch 1/100 	 Iter 2688/35000
Current Cost:  1.31124 	 Epoch 1/100 	 Iter 2816/35000
Current Cost:  1.62725 	 Epoch 1/100 	 Iter 2944/35000
Current Cost:  1.81875 	 Epoch 1/100 	 Iter 3072/35000
Current Cost:  1.60313 	 Epoch 1/100 	 Iter 3200/35000
Current Cost:  1.57913 	 Epoch 1/100 	 Iter 3328/35000
Current Cost:  1.48369 	 Epoch 1/100 	 Iter 3456/35000
Current Cost:  1.58554 	 Epoch 1/100 	 Iter 3584/35000
Current Cost:  1.28614 	 Epoch 1/100 	 Iter 3712/35000
Current Cost:  1.49805 	 Epoch 1/100 	 Iter 3840/35000
Current Cost:  1.3209 	 Epoch 1/100 	 Iter 3968/35000
Current Cost:  1.63672 	 Epoch 1/100 	 Iter 4096/35000
Current Cost:  1.46343 	 Epoch 1/100 	 Iter 4224/35000
Current Cost:  1.5301 	 Epoch 1/100 	 Iter 4352/35000
Current Cost