In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow.keras import layers
import os

import string
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import random
import time
from pathlib import Path

import re
from IPython import display

import tensorflow_hub as hub
import bert
from tensorflow.keras.models import Model       # Keras is the new high level API for TensorFlow
import math

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


##### Preprocess the input sentences

In [3]:
dictionary_path = './dictionary'
vocab = np.load(dictionary_path + '/vocab.npy')
print('there are {} vocabularies in total'.format(len(vocab)))

word2Id_dict = dict(np.load(dictionary_path + '/word2Id.npy'))
id2word_dict = dict(np.load(dictionary_path + '/id2Word.npy'))
print('Word to id mapping, for example: %s -> %s' % ('flower', word2Id_dict['flower']))
print('Id to word mapping, for example: %s -> %s' % ('1', id2word_dict['1']))
print('Tokens: <PAD>: %s; <RARE>: %s' % (word2Id_dict['<PAD>'], word2Id_dict['<RARE>']))

there are 5427 vocabularies in total
Word to id mapping, for example: flower -> 1
Id to word mapping, for example: 1 -> flower
Tokens: <PAD>: 5427; <RARE>: 5428


In [4]:
def sent2IdList(line, MAX_SEQ_LENGTH=20):
    MAX_SEQ_LIMIT = MAX_SEQ_LENGTH
    padding = 0
    
    # data preprocessing, remove all puntuation in the texts
    prep_line = re.sub('[%s]' % re.escape(string.punctuation), ' ', line.rstrip())
    prep_line = prep_line.replace('-', ' ')
    prep_line = prep_line.replace('-', ' ')
    prep_line = prep_line.replace('  ', ' ')
    prep_line = prep_line.replace('.', '')
    tokens = prep_line.split(' ')
    tokens = [
        tokens[i] for i in range(len(tokens))
        if tokens[i] != ' ' and tokens[i] != ''
    ]
    l = len(tokens)
    padding = MAX_SEQ_LIMIT - l
    
    # make sure length of each text is equal to MAX_SEQ_LENGTH, and replace the less common word with <RARE> token
    for i in range(padding):
        tokens.append('<PAD>')
    line = [
        word2Id_dict[tokens[k]]
        if tokens[k] in word2Id_dict else word2Id_dict['<RARE>']
        for k in range(len(tokens))
    ]

    return line

text = "the flower shown has yellow anther red pistil and bright red petals."
print(text)
print(sent2IdList(text))

the flower shown has yellow anther red pistil and bright red petals.
['9', '1', '82', '5', '11', '70', '20', '31', '3', '29', '20', '2', '5427', '5427', '5427', '5427', '5427', '5427', '5427', '5427']


##### hyper parameter 

In [5]:
hparas = {
    'MAX_SEQ_LENGTH': 20,                       # maximum sequence length
    'EMBED_DIM': 256,                                  # word embedding dimension
    'VOCAB_SIZE': len(word2Id_dict),          # size of dictionary of captions
    'RNN_HIDDEN_SIZE': 128,                     # number of RNN neurons
    'CODE_DIM':128,                                     # dim of code after both rnn_encoder and cnn_encoder, need to be the same in order to calculate cosine similarity
    'DENSE_DIM': 128,                                  # number of neurons in dense layer
    'IMAGE_SIZE': [64, 64, 3],                      # render image size
    'BATCH_SIZE': 64,
    'LR': 1e-4,
    'LR_DECAY': 0.5,
    'BETA_1': 0.5,
    'N_EPOCH': 200,               
    'PRINT_FREQ': 1,                                      # printing frequency of loss
    'N_SAMPLE': 70504                                    # size of training data
}

##### dataset 

In [6]:
data_path = './dataset'
df = pd.read_pickle(data_path + '/text2ImgData.pkl')
num_training_sample = len(df)
n_images_train = num_training_sample
print('There are %d image in training data' % (n_images_train))

There are 7370 image in training data


In [7]:
df.head(5)

Unnamed: 0_level_0,Captions,ImagePath
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
6734,"[[9, 2, 17, 9, 1, 6, 14, 13, 18, 3, 41, 8, 11,...",./102flowers/image_06734.jpg
6736,"[[4, 1, 5, 12, 2, 3, 11, 31, 28, 68, 106, 132,...",./102flowers/image_06736.jpg
6737,"[[9, 2, 27, 4, 1, 6, 14, 7, 12, 19, 5427, 5427...",./102flowers/image_06737.jpg
6738,"[[9, 1, 5, 8, 54, 16, 38, 7, 12, 116, 325, 3, ...",./102flowers/image_06738.jpg
6739,"[[4, 12, 1, 5, 29, 11, 19, 7, 26, 70, 5427, 54...",./102flowers/image_06739.jpg


In [8]:
captions = df['Captions'].values[0]
len(captions)

9

##### We use every sentence (1~10 sentences) corresponding to the image to create the dataset.

In [9]:
filenames = data_path + '/text2ImgData.pkl'

# load the training data into two NumPy arrays
df = pd.read_pickle(filenames)
captions = df['Captions'].values
image_paths = df['ImagePath'].values
caption = []
image_path = []
# each image has 1 to 10 corresponding captions
# we choose one of them randomly for training
for i in range(len(captions)):
    im_path = image_paths[i]
    for c in captions[i]:
        caption.append (c)
        image_path.append (im_path)
#  caption.append(random.choice(captions[i]))
caption = np.asarray(caption)
caption = caption.astype(np.int)

print(caption.shape)
print(len(image_path))

(70504, 20)
70504


##### 'dataset_generator' is the function for creating dataset which uses every sentence corresponding to the image, whereas 'dataset_generator_random' is the function for creating dataset which only randomly picks a sentence from every sentence corresponding to the image. In the final decision, we use 'dataset_generator' for creating dataset.

In [10]:
# in this competition, you have to generate image in size 64x64x3
IMAGE_HEIGHT = 64
IMAGE_WIDTH = 64
IMAGE_CHANNEL = 3

def training_data_generator(caption, image_path):
    # load in the image according to image path
    img = tf.io.read_file(image_path)
    img = tf.image.decode_image(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = (img*2 ) - 1 # -1 to 1
    img.set_shape([None, None, 3])
    img = tf.image.resize(img, size=[IMAGE_HEIGHT, IMAGE_WIDTH])
    img.set_shape([IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNEL])
    caption = tf.cast(caption, tf.int32)
    return img, caption

def dataset_generator(batch_size, data_generator, with_image = True):
    # assume that each row of `features` corresponds to the same row as `labels`.
    # assert caption.shape[0] == image_path.shape[0]
    
    dataset = tf.data.Dataset.from_tensor_slices((caption, image_path))
    dataset = dataset.map(data_generator,    num_parallel_calls=6)
    dataset = dataset.shuffle(len(caption)).batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=5000)
    hparas['N_SAMPLE'] = 70504 
    return dataset

def dataset_generator_random(filenames, batch_size, data_generator):
    # load the training data into two NumPy arrays
    df = pd.read_pickle(filenames)
    captions = df['Captions'].values
    caption = []
    # each image has 1 to 10 corresponding captions
    # we choose one of them randomly for training
    for i in range(len(captions)):
        caption.append(random.choice(captions[i]))
    caption = np.asarray(caption)
    caption = caption.astype(np.int)
    image_path = df['ImagePath'].values
    
    # assume that each row of `features` corresponds to the same row as `labels`.
    assert caption.shape[0] == image_path.shape[0]
    
    dataset = tf.data.Dataset.from_tensor_slices((caption, image_path))
    dataset = dataset.map(data_generator, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.shuffle(len(caption)).batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=5000)
    # tf.print(caption.shape[0])
    return dataset

##### fake_dataset is a dataset that help us to create unpaired data examples. Those unpaired data examples are images that paired with wrong captions.

In [11]:
dataset = dataset_generator(hparas['BATCH_SIZE'] , training_data_generator)
fake_dataset = dataset_generator(hparas['BATCH_SIZE']  , training_data_generator)

In [12]:
fake_dataset                   # first dimension: batch size

<PrefetchDataset shapes: ((64, 64, 64, 3), (64, 20)), types: (tf.float32, tf.int32)>

#####  Model

##### The RNN encoder

In [13]:
from tensorflow.keras.layers import Conv2D, BatchNormalization, LeakyReLU, Conv2DTranspose, GRU, Embedding,Dense
from tensorflow.keras.models import Sequential

class RnnEncoder(tf.keras.Model):
    """
    
    """
    def __init__(self, hparas):
        super(RnnEncoder, self).__init__()
        self.hparas = hparas
        
        # params 
        self.batch_size = self.hparas['BATCH_SIZE']
        self.code_dim = self.hparas['CODE_DIM'] 

        # layers 
        # shape = (batch_size, max_lenth)
        self.embedding = Embedding(self.hparas['VOCAB_SIZE'], self.hparas['EMBED_DIM'])
        # shape = (batch_size, max_lenth, embedding_dim)
        self.RENet = GRU(self.hparas['RNN_HIDDEN_SIZE'], return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
        # shape = (batch_size, rnn_hidden_size)
        
        # self.dense = Dense(self.code_dim) 
        
    def call(self, text, hidden):
        text_embed = self.embedding (text) 
        whole_sequence_output, final_state = self.RENet(text_embed, initial_state  = hidden)
        # whole_sequence_output => shape = (batch_size, 20, RNN_HIDDEN_SIZE)
        return whole_sequence_output[:, -1, :]                   # result of the last input
    
    def initialize_hidden_state(self, current_batch_size ):
        # return the initial hidden state 
        return tf.zeros((current_batch_size, self.hparas['RNN_HIDDEN_SIZE']))

##### The CNN encoder

In [14]:
from tensorflow.keras.layers import Conv2D, BatchNormalization, LeakyReLU, Flatten
from tensorflow.keras.models import Sequential

class CnnEncoder(tf.keras.Model):
    """
    
    """
    def __init__(self, hparas):
        super(CnnEncoder, self).__init__()
        self.hparas = hparas
        
        # parameter
        self.batch_size = self.hparas['BATCH_SIZE']
        self.n_filter = 64
        self.code_dim = self.hparas['CODE_DIM'] 
        
        # layers 
        # shape = (batch_size, 64, 64, 3)
        self.con1 = Conv2D(filters = self.n_filter* 1 , kernel_size=4,  strides = 2)
        self.act1 = LeakyReLU()
        # shape = (batch_size, 32, 32, n_filter) 
        self.con2 = Conv2D(filters = self.n_filter* 2 , kernel_size=4,  strides = 2)
        self.bat2 = BatchNormalization()
        self.act2 = LeakyReLU()
        # shape = (batch_size, 16, 16, n_filter*2) 
        self.con3 = Conv2D(filters = self.n_filter* 4 , kernel_size=4,  strides = 2)
        self.bat3 = BatchNormalization()
        self.act3 = LeakyReLU()
        # shape = (batch_size, 8, 8, n_filter*4) 
        self.con4 = Conv2D(filters = self.n_filter* 8 , kernel_size=4,  strides = 2)
        self.bat4 = BatchNormalization()
        self.act4 = LeakyReLU()
        self.flatten = Flatten()
        self.dense = Dense(self.code_dim)
        # shape = (batch_size, code_dim)
        
    def call(self, img, training = True):
        x = self.con1(img)
        x = self.act1(x)

        x = self.con2(x)
        x = self.bat2(x, training=training)
        x = self.act2(x)

        x = self.con3(x)
        x = self.bat3(x, training=training)
        x = self.act3(x)

        x = self.con4(x)
        x = self.bat4(x, training=training)
        x = self.act4(x)
        x = self.flatten(x)
        img_code = self.dense(x)

        return img_code

In [15]:
cnn_encoder = CnnEncoder(hparas)
rnn_encoder = RnnEncoder(hparas)

In [16]:
cnn_optimizer = tf.keras.optimizers.Adam(hparas['LR'])
rnn_optimizer = tf.keras.optimizers.Adam(hparas['LR'])

##### In our training process, we want to make the output of RNN encoder and CNN encoder as similar as possible. Therefore, we compare the output of RNN encoder and CNN encoder using cosine similarity and calculate the loss. The input and ouput shape is written in comments.

In [17]:
def cosine_similarity(v1, v2):
    """
    cost = (v1 dot v2 ) /( (sqrt (v1 dot v1)*  sqrt (v2  dot v2) ) ) 
    v1:  shape = (batch_size, code_dim )
    v2:  shape = (batch_size, code_dim )
    """
    cost = tf.reduce_sum(tf.multiply(v1, v2), axis=1 ) / ( tf.sqrt(tf.reduce_sum(tf.multiply(v1, v1), axis=1)) * tf.sqrt(tf.reduce_sum(tf.multiply(v2, v2), 1)) )
    # tf.print(cost.shape)
    return cost

In [18]:
@tf.function
def train_step (real_image, real_caption, wrong_image, wrong_caption):
    '''
        image + caption (pair data)
        image + wrong_caption (unpair data)
    '''
    hidden = rnn_encoder.initialize_hidden_state(real_image.shape[0])                 # real_image[0] = batch_size
    
    with tf.GradientTape() as cnn_tape, tf.GradientTape() as rnn_tape:
        real_ImgCode = cnn_encoder (real_image, training = True)
        wrong_ImgCode = cnn_encoder (wrong_image, training = True)
        real_CapCode = rnn_encoder (real_caption, hidden)
        wrong_CapCode = rnn_encoder (wrong_caption, hidden) 
    
        alpha = 0.2  # margin alpha
        loss = tf.reduce_mean(tf.maximum(0., alpha - cosine_similarity(real_ImgCode, real_CapCode) + cosine_similarity(real_ImgCode, wrong_ImgCode))) + \
                   tf.reduce_mean(tf.maximum(0., alpha - cosine_similarity(real_ImgCode, real_CapCode) + cosine_similarity(wrong_ImgCode, real_CapCode)))

    # backprop 
    grad_c = cnn_tape.gradient(loss, cnn_encoder.trainable_variables)
    grad_r = rnn_tape.gradient(loss, rnn_encoder.trainable_variables)
    
    cnn_optimizer.apply_gradients(zip(grad_c, cnn_encoder.trainable_variables))
    rnn_optimizer.apply_gradients(zip(grad_r, rnn_encoder.trainable_variables))
    
    return loss
    

##### checkpoint

In [19]:
# checkpoint for cnn encoder
CNN_CKPT_DIR = './ckpts/cnn_encoder'
cnn_ckpt = tf.train.Checkpoint(step=tf.Variable(1), cnn_encoder=cnn_encoder)
cnn_ckpt_manager = tf.train.CheckpointManager(cnn_ckpt, CNN_CKPT_DIR, max_to_keep=100)

In [20]:
# checkpoint for rnn encoder
RNN_CKPT_DIR = './ckpts/rnn_encoder'
rnn_ckpt = tf.train.Checkpoint(step=tf.Variable(1), rnn_encoder=rnn_encoder)
rnn_ckpt_manager = tf.train.CheckpointManager(rnn_ckpt, RNN_CKPT_DIR, max_to_keep=100)

##### training 

In [21]:
start_epoch = 0
steps_per_epoch = int(hparas['N_SAMPLE'] / hparas['BATCH_SIZE'])
encoder_loss = []
for ep in range (hparas['N_EPOCH']):
    start_time = time.time()

    total_loss = 0
    
    # dataset = dataset_generator(data_path + '/text2ImgData.pkl', hparas['BATCH_SIZE'] , training_data_generator)
    # fake_dataset = dataset_generator(data_path + '/text2ImgData.pkl', hparas['BATCH_SIZE']  , training_data_generator)
    
    
    for (real_image, real_caption),(wrong_image, wrong_caption) in zip(dataset, fake_dataset):
        loss = train_step(real_image, real_caption, wrong_image, wrong_caption)
        total_loss += loss.numpy() 
    total_loss = total_loss / steps_per_epoch
    encoder_loss.append(total_loss)
    print ("Epoch %3d   Time taken for 1 epoch %d sec   Loss % .5f " % (ep,  time.time() - start_time, total_loss )) 
    
    # save checkpoint for each epoch
    if ep % 1 == 0:
        save_path = cnn_ckpt_manager.save()
        print("Saved cnn checkpoint for step {}: {}".format(ep, save_path))
        save_path = rnn_ckpt_manager.save()
        print("Saved rnn checkpoint for step {}: {}".format(ep, save_path))

Epoch   0   Time taken for 1 epoch 271 sec   Loss  0.23787 
Saved cnn checkpoint for step 0: ./ckpts/cnn_encoder\ckpt-1
Saved rnn checkpoint for step 0: ./ckpts/rnn_encoder\ckpt-1
Epoch   1   Time taken for 1 epoch 276 sec   Loss  0.22433 
Saved cnn checkpoint for step 1: ./ckpts/cnn_encoder\ckpt-2
Saved rnn checkpoint for step 1: ./ckpts/rnn_encoder\ckpt-2
Epoch   2   Time taken for 1 epoch 279 sec   Loss  0.13841 
Saved cnn checkpoint for step 2: ./ckpts/cnn_encoder\ckpt-3
Saved rnn checkpoint for step 2: ./ckpts/rnn_encoder\ckpt-3
Epoch   3   Time taken for 1 epoch 282 sec   Loss  0.12560 
Saved cnn checkpoint for step 3: ./ckpts/cnn_encoder\ckpt-4
Saved rnn checkpoint for step 3: ./ckpts/rnn_encoder\ckpt-4
Epoch   4   Time taken for 1 epoch 281 sec   Loss  0.11429 
Saved cnn checkpoint for step 4: ./ckpts/cnn_encoder\ckpt-5
Saved rnn checkpoint for step 4: ./ckpts/rnn_encoder\ckpt-5
Epoch   5   Time taken for 1 epoch 286 sec   Loss  0.08909 
Saved cnn checkpoint for step 5: ./ckpts

Epoch  45   Time taken for 1 epoch 292 sec   Loss  0.03131 
Saved cnn checkpoint for step 45: ./ckpts/cnn_encoder\ckpt-46
Saved rnn checkpoint for step 45: ./ckpts/rnn_encoder\ckpt-46
Epoch  46   Time taken for 1 epoch 293 sec   Loss  0.03182 
Saved cnn checkpoint for step 46: ./ckpts/cnn_encoder\ckpt-47
Saved rnn checkpoint for step 46: ./ckpts/rnn_encoder\ckpt-47
Epoch  47   Time taken for 1 epoch 292 sec   Loss  0.03167 
Saved cnn checkpoint for step 47: ./ckpts/cnn_encoder\ckpt-48
Saved rnn checkpoint for step 47: ./ckpts/rnn_encoder\ckpt-48
Epoch  48   Time taken for 1 epoch 292 sec   Loss  0.03112 
Saved cnn checkpoint for step 48: ./ckpts/cnn_encoder\ckpt-49
Saved rnn checkpoint for step 48: ./ckpts/rnn_encoder\ckpt-49
Epoch  49   Time taken for 1 epoch 293 sec   Loss  0.03043 
Saved cnn checkpoint for step 49: ./ckpts/cnn_encoder\ckpt-50
Saved rnn checkpoint for step 49: ./ckpts/rnn_encoder\ckpt-50
Epoch  50   Time taken for 1 epoch 292 sec   Loss  0.03036 
Saved cnn checkpoint

Epoch  90   Time taken for 1 epoch 293 sec   Loss  0.02126 
Saved cnn checkpoint for step 90: ./ckpts/cnn_encoder\ckpt-91
Saved rnn checkpoint for step 90: ./ckpts/rnn_encoder\ckpt-91
Epoch  91   Time taken for 1 epoch 293 sec   Loss  0.02093 
Saved cnn checkpoint for step 91: ./ckpts/cnn_encoder\ckpt-92
Saved rnn checkpoint for step 91: ./ckpts/rnn_encoder\ckpt-92
Epoch  92   Time taken for 1 epoch 293 sec   Loss  0.02102 
Saved cnn checkpoint for step 92: ./ckpts/cnn_encoder\ckpt-93
Saved rnn checkpoint for step 92: ./ckpts/rnn_encoder\ckpt-93
Epoch  93   Time taken for 1 epoch 295 sec   Loss  0.02088 
Saved cnn checkpoint for step 93: ./ckpts/cnn_encoder\ckpt-94
Saved rnn checkpoint for step 93: ./ckpts/rnn_encoder\ckpt-94
Epoch  94   Time taken for 1 epoch 294 sec   Loss  0.02032 
Saved cnn checkpoint for step 94: ./ckpts/cnn_encoder\ckpt-95
Saved rnn checkpoint for step 94: ./ckpts/rnn_encoder\ckpt-95
Epoch  95   Time taken for 1 epoch 294 sec   Loss  0.02033 
Saved cnn checkpoint

Epoch 134   Time taken for 1 epoch 296 sec   Loss  0.01657 
Saved cnn checkpoint for step 134: ./ckpts/cnn_encoder\ckpt-135
Saved rnn checkpoint for step 134: ./ckpts/rnn_encoder\ckpt-135
Epoch 135   Time taken for 1 epoch 296 sec   Loss  0.01606 
Saved cnn checkpoint for step 135: ./ckpts/cnn_encoder\ckpt-136
Saved rnn checkpoint for step 135: ./ckpts/rnn_encoder\ckpt-136
Epoch 136   Time taken for 1 epoch 296 sec   Loss  0.01588 
Saved cnn checkpoint for step 136: ./ckpts/cnn_encoder\ckpt-137
Saved rnn checkpoint for step 136: ./ckpts/rnn_encoder\ckpt-137
Epoch 137   Time taken for 1 epoch 296 sec   Loss  0.01590 
Saved cnn checkpoint for step 137: ./ckpts/cnn_encoder\ckpt-138
Saved rnn checkpoint for step 137: ./ckpts/rnn_encoder\ckpt-138
Epoch 138   Time taken for 1 epoch 296 sec   Loss  0.01602 
Saved cnn checkpoint for step 138: ./ckpts/cnn_encoder\ckpt-139
Saved rnn checkpoint for step 138: ./ckpts/rnn_encoder\ckpt-139
Epoch 139   Time taken for 1 epoch 296 sec   Loss  0.01552 


KeyboardInterrupt: 

##### After training for 148 epochs, our loss is 0.01495. Then, we use this encoder to train conditional gan.

##### In testing, we check whether our text encoder can get high cosine similarity if the inputs are paired image and caption. 

In [None]:
# testing
dataset = dataset_generator(data_path + '/text2ImgData.pkl', hparas['BATCH_SIZE'], training_data_generator)
fake_dataset = dataset_generator(data_path + '/text2ImgData.pkl', hparas['BATCH_SIZE'], training_data_generator)


for (real_image, real_caption),(wrong_image, wrong_caption) in zip(dataset, fake_dataset):
    hidden = rnn_encoder.initialize_hidden_state(real_image.shape[0])

    real_ImgCode = cnn_encoder (real_image, training = False)
    wrong_ImgCode = cnn_encoder (wrong_image, training = False)
    real_CapCode = rnn_encoder (real_caption, hidden)
    wrong_CapCode = rnn_encoder (wrong_caption, hidden)
    tf.print(tf.reduce_sum(cosine_similarity(real_ImgCode,real_CapCode),0)/hparas['BATCH_SIZE'])
    tf.print(tf.reduce_sum(cosine_similarity(real_ImgCode,wrong_CapCode),0)/hparas['BATCH_SIZE'])
    tf.print(tf.reduce_sum(cosine_similarity(wrong_ImgCode,real_CapCode),0)/hparas['BATCH_SIZE'])