<a href="https://colab.research.google.com/github/viswambhar-yasa/image_captioning/blob/master/training_policy_net.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/viswambhar-yasa/image_captioning

In [None]:
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile


def downloading_extraction(link, extraction_path='.'):
    url = urlopen(link)
    zipfile = ZipFile(BytesIO(url.read()))
    zipfile.extractall(path=extraction_path)


if __name__ == "__main__":
    images_link = 'https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip'
    downloading_extraction(images_link)
    text_link = "https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip"
    downloading_extraction(text_link)


In [46]:
import tensorflow as tf
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Embedding, LSTM, BatchNormalization, Bidirectional
from tensorflow.keras.applications import Xception, InceptionV3
from tensorflow.keras.models import Model
from tensorflow.python.keras.layers.recurrent import GRU


def image_encoder(img_input, trainable_layers=0, CNN_Type='Xception', Embed_Size=256, display=False):
    print('Building CNN model')
    if CNN_Type == 'Xception':
        cnn_pre_trained_model = Xception(include_top=False, weights='imagenet', input_tensor=img_input)
    else:
        cnn_pre_trained_model = InceptionV3(include_top=False, weights='imagenet', input_tensor=img_input)
    for i, layer in enumerate(cnn_pre_trained_model.layers):
        if len(cnn_pre_trained_model.layers) - i < trainable_layers:
            layer.trainable = True
        else:
            layer.trainable = False
    cnn_inputs = cnn_pre_trained_model.inputs
    base_model = cnn_pre_trained_model.output
    base_model = GlobalAveragePooling2D(name='global_average_pooling')(base_model)
    embed_image = tf.keras.layers.Dense(Embed_Size, activation='tanh', name='embed_image')(base_model)
    feature_extraction_model = Model(inputs=cnn_inputs, outputs=embed_image, name='CNN encoder model')
    print('CNN model {output shape}:', embed_image.shape)
    if display:
        tf.keras.utils.plot_model(feature_extraction_model, to_file='base_model.png', show_shapes=True)
    return feature_extraction_model


def txt_decoder(rnn_input, Embed_Size=256, Bi_Direction=False, RNN_Type='LSTM', RNN_Layers=2):
    print('Building RNN model')
    for i in range(RNN_Layers):
        x = BatchNormalization()(rnn_input)
        if RNN_Type == 'LSTM':
            if i == (RNN_Layers - 1):
                if Bi_Direction:
                    rnn_out = Bidirectional(LSTM(int(Embed_Size/2)))(x)
                else:
                    rnn_out = LSTM(Embed_Size)(x)
            else:
                if Bi_Direction:
                    rnn_out = Bidirectional(LSTM(int(Embed_Size/2), return_sequences=True))(x)
                else:
                    rnn_out = LSTM(Embed_Size, return_sequences=True)(x)
        else:
            if i == (RNN_Layers - 1):
                if Bi_Direction:
                    rnn_out = Bidirectional(GRU(Embed_Size))(x)
                else:
                    rnn_out = GRU(Embed_Size)(x)
            else:
                if Bi_Direction:
                    rnn_out = Bidirectional(GRU(Embed_Size/2, return_sequences=True))(x)
                else:
                    rnn_out = GRU(Embed_Size, return_sequences=True)(x)
        rnn_input = rnn_out
    return rnn_out


def Caption_model_gen(NET, img_shape=(256, 256, 3), vocab_size=5000, Embed_Size=256, max_length=20, display=False):
    img_input = tf.keras.Input(shape=img_shape)
    cnn_model = image_encoder(img_input, trainable_layers=0, CNN_Type='InceptionV3', display=False)
    embed_image = tf.keras.layers.Dense(Embed_Size, activation='tanh')(cnn_model.output)

    text_input = tf.keras.Input(shape=(max_length,))
    Embedding_layer = Embedding(input_dim=vocab_size, output_dim=Embed_Size, input_length=max_length, mask_zero=True)(
        text_input)

    whole_seq_output = txt_decoder(Embedding_layer, Embed_Size=Embed_Size,
                                                                          Bi_Direction=False, RNN_Type='LSTM',
                                                                          RNN_Layers=3)
    print('final_carry_state {rnn output shape}:', whole_seq_output.shape)
    rnn_output = whole_seq_output
    if NET == 'policy':
        image_txt_embed = tf.keras.layers.add([embed_image, rnn_output])
        print('Image and text {add shape}:', image_txt_embed.shape)
        policy_net_output = tf.keras.layers.Dense(vocab_size, activation='softmax')(image_txt_embed)
        policy_net_model = Model(inputs=[img_input, text_input], outputs=policy_net_output, name='Policy_Net')

        print('output {shape}', policy_net_output.shape)
        print('Policy Net built successfully \n')
        if display:
            tf.keras.utils.plot_model(policy_net_model, to_file='policy_net.png', show_shapes=True)
        return policy_net_model
    elif NET == 'value':
        image_txt_embed = tf.keras.layers.concatenate([embed_image, rnn_output], axis=-1)
        print('Image and text {concat shape}:', image_txt_embed.shape)
        hidden_layer_1 = Dense(1024, activation='tanh', name='MLP_layer1')(image_txt_embed)
        hidden_layer_2 = Dense(512, activation='tanh', name="MLP_layer2")(hidden_layer_1)
        value_net_outputs = Dense(1, activation='tanh', name='decoder_output')(hidden_layer_2)
        value_net_model = Model(inputs=[img_input, text_input], outputs=value_net_outputs, name='Value_Net')
        print('output {shape}', value_net_outputs.shape)
        print('Value Net built successfully \n')
        if display:
            tf.keras.utils.plot_model(value_net_model, to_file='value_net.png', show_shapes=True)
        return value_net_model
    else:
        feature_vector = Dense(512, activation='tanh')(embed_image)
        text_sequence_vector = Dense(512, activation='tanh', name='rnn_linear')(rnn_output)
        print('Image feature vector shape:', feature_vector.shape)
        print('Text sequence vector shape:', text_sequence_vector.shape)
        reward_model = Model(inputs=[img_input, text_input], outputs=[feature_vector, text_sequence_vector],
                             name='reward net model')
        print('Reward Net built successfully \n')
        if display:
            tf.keras.utils.plot_model(reward_model, to_file='reward_net.png', show_shapes=True)
        return reward_model


if __name__ == "__main__":
    print('TensorFlow Version', tf.__version__)
    print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
    #actor_model = Caption_model_gen('policy')
    #critic_model = Caption_model_gen('value')
    #reward = Caption_model_gen('reward')


TensorFlow Version 2.8.0-rc0
Num GPUs Available:  1


In [47]:
text = open('./Flickr8k.token.txt', 'r', encoding='utf-8').read()

In [48]:
description_map=dict()
for lines in text.split('\n'):
  line_split = lines.split('\t')
  if line_split == ['']:
      continue
  image_id = line_split[0][:-2]
  image_des = line_split[1]
  #if image_id in img_dic:
  if image_id not in description_map:
    description_map[image_id] = list()
  description_map[image_id].append(image_des)
caption_list = []
for img_id, des_list in description_map.items():
    for i in range(len(des_list)):
        caption = des_list[i]
        caption = ''.join(caption)
        caption = caption.split(' ')
        caption = [word.lower() for word in caption if len(word) > 1 and word.isalpha()]
        caption = ' '.join(caption)
        des_list[i] = 'startseq ' + caption + ' endseq'
        caption_list.append('startseq ' + caption + ' endseq')
max_length = max(len(des.split()) for des in caption_list)
print('max_length of captions', max_length)
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=1000,oov_token='<unknw>')
tokenizer.fit_on_texts(caption_list)

max_length of captions 33


In [49]:
import os
files = os.listdir("./Flicker8k_Dataset")
len(files)

8091

In [50]:
word_index=tokenizer.word_index

In [51]:
word_index={value:key for key, value in word_index.items()}

In [52]:
len(word_index)

8360

In [53]:
len(description_map.keys())

8092

In [54]:
description_map['2258277193_586949ec62.jpg.1']

['startseq people waiting for the subway endseq',
 'startseq some people looking out windows in large building endseq',
 'startseq three people are waiting on train platform endseq',
 'startseq three people standing at station endseq',
 'startseq two woman and one man standing near train tracks endseq']

In [55]:
description_map['2258277193_586949ec62.jpg'] = description_map['2258277193_586949ec62.jpg.1']
del description_map['2258277193_586949ec62.jpg.1']

In [56]:
del description_map['2258277193_586949ec62.jpg']

In [57]:
description_map1=dict()

In [58]:
files=os.listdir("./Flicker8k_Dataset")

In [59]:
for key,value in description_map.items():
  if key in files:
    description_map1[key]=value

In [60]:
print(len(tokenizer.word_counts))

8359


In [61]:
print(len(description_map1.keys()))

8091


In [62]:
from os import path, mkdir
from random import sample
from sklearn.model_selection import train_test_split
from pandas import Series
num_images = 4000
if not path.exists('./subsets'):
    mkdir('./subsets')
if(num_images != len(description_map1.keys())):
    description_map_subset = dict(sample(description_map1.items(),num_images))
    train_images_id, test_images_id = train_test_split(Series(description_map_subset.keys()),test_size=0.125,random_state=8)
    train_images_id, val_imgs_id = train_test_split(Series(description_map_subset.keys()),test_size=0.125, random_state=8)
    train_images_id.to_csv('./subsets/Flickr8k_images_train.txt',sep=' ',index=False,header=False)
    test_images_id.to_csv('./subsets/Flickr8k_images_test.txt',sep=' ',index=False,header=False)
    val_imgs_id.to_csv('./subsets/Flickr8k_images_val.txt',sep=' ',index=False,header=False)
else:
    description_map_subset = description_map1

since Python 3.9 and will be removed in a subsequent version.
  description_map_subset = dict(sample(description_map1.items(),num_images))


In [63]:
token_cap_dic = dict()
print('Vocab size',len(tokenizer.word_counts))
for img_id, des_list in description_map_subset.items():
    for i in range(len(des_list)):
        caption = des_list[i]
        cap_token = tokenizer.texts_to_sequences([str(caption)])
        if img_id not in token_cap_dic:
            token_cap_dic[img_id] = list()
        token_cap_dic[img_id].append(cap_token)

Vocab size 8359


In [64]:
import tensorflow as tf


class data_processing:
    def __init__(self, text_file_path):
        self.text_file_path = text_file_path
        self.tokenizer = None

    def extraction_captions(self, images_id_text):
        description_map = dict()
        text = open(self.text_file_path, 'r', encoding='utf-8').read()
        images = open(images_id_text, 'r', encoding='utf-8').read()
        img_dic = []
        for img_id in images.split('\n'):
            img_dic.append(img_id)
        for lines in text.split('\n'):
            line_split = lines.split('\t')
            if line_split == ['']:
                continue
            image_id = line_split[0][:-2]
            image_des = line_split[1]
            if image_id in img_dic:
                if image_id not in description_map:
                    description_map[image_id] = list()
                description_map[image_id].append(image_des)
        return description_map

    def cleaning_sequencing_captions(self, images_id_text):
        captions_dic = self.extraction_captions(images_id_text)
        caption_list = []
        for img_id, des_list in captions_dic.items():
            for i in range(len(des_list)):
                caption = des_list[i]
                caption = ''.join(caption)
                caption = caption.split(' ')
                caption = [word.lower() for word in caption if len(word) > 1 and word.isalpha()]
                caption = ' '.join(caption)
                des_list[i] = 'startseq ' + caption + ' endseq'
                caption_list.append('startseq ' + caption + ' endseq')
        max_length = max(len(des.split()) for des in caption_list)
        print('max_length of captions', max_length)
        return caption_list,captions_dic

    def tokenization(self, captions_for_token, num_wrds=5000) -> None:
        tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_wrds, oov_token='<unknw>')
        tokenizer.fit_on_texts(captions_for_token)
        self.tokenizer = tokenizer
        return tokenizer

    def sentence_tokenizing(self, captions_dic) -> dict:
        token_cap_dic = dict()
        print('Vocab size', self.tokenizer.num_words)
        for img_id, des_list in captions_dic.items():
            for i in range(len(des_list)):
                caption = des_list[i]
                cap_token = self.tokenizer.texts_to_sequences([str(caption)])
                if img_id not in token_cap_dic:
                    token_cap_dic[img_id] = list()
                token_cap_dic[img_id].append(cap_token)
        return token_cap_dic

In [65]:
import tensorflow as tf
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.inception_v3 import InceptionV3, preprocess_input
import numpy as np
from data_processing import data_processing


def load_preprocess_img(img_path):
    img = load_img(img_path, target_size=(256, 256, 3))
    x = img_to_array(img)
    x /= 255.0
    return x


def captions_generation(captions_dic, vocab_size, image_pth_rt, max_length=25, num_photos_per_batch=5, num_captions=1):
    images, input_text_seq, output_text = list(), list(), list()
    batch_iter = 0
    batch_keys = []
    while True:
        for key, desc_list in captions_dic.items():
            # print(key)
            batch_keys.append(key)
            batch_iter += 1
            caption = 0
            # retrieve the photo feature

            photo = load_preprocess_img(image_pth_rt + key)
            
            for desc in desc_list:
                caption += 1
                desc = np.squeeze(desc)
                input_sequence = []
                out_text=[]
                for i in range(0, len(desc)-1):
                    input_sequence.append(desc[:i ])
                    out_text.append(desc[i+1])
                    images.append(photo)
                
                input_seq = tf.keras.preprocessing.sequence.pad_sequences(input_sequence, maxlen=max_length,
                                                                          padding='post')
                #input_text = input_seq[:, :-1]
                #out_text = input_seq[:, -1]
                output_sequence = tf.keras.utils.to_categorical(out_text, num_classes=vocab_size)
                input_text_seq.append(input_seq)
                output_text.append(output_sequence)
                if caption == num_captions:
                    break
            if batch_iter == num_photos_per_batch:
                input_text_seq = np.concatenate(input_text_seq)
                output_text = np.concatenate(output_text)
                #print(batch_keys[-5:])
                yield [[np.array(images), np.array(input_text_seq)], np.array(output_text)]
                images, input_text_seq, output_text = list(), list(), list()
                batch_iter = 0

In [66]:
import matplotlib.pyplot as plt

In [67]:
import tensorflow as tf
#from data_processing import data_processing
#from data_generator import captions_generation
import pickle

print('TensorFlow Version', tf.__version__)
vocab_size = 1000
max_length = 10


captions_text_path = './Flickr8k.token.txt'
captions_extraction = data_processing(captions_text_path)
trn_images_id_text = r'./subsets/Flickr8k_images_train.txt'
train_cleaned_seq, train_cleaned_dic = captions_extraction.cleaning_sequencing_captions(trn_images_id_text)
val_images_id_text = r'./subsets/Flickr8k_images_val.txt'
val_cleaned_seq, val_cleaned_dic = captions_extraction.cleaning_sequencing_captions(val_images_id_text)
test_images_id_text = r'./subsets/Flickr8k_images_test.txt'
test_cleaned_seq, test_cleaned_dic = captions_extraction.cleaning_sequencing_captions(test_images_id_text)
tokenizer=captions_extraction.tokenization(train_cleaned_seq, vocab_size)
print("No of captions: Training-" + str(len(train_cleaned_seq) / 5) + " Validation-" + str(
    len(val_cleaned_seq) / 5) + " test-" + str(len(test_cleaned_seq) / 5))

train_cap_tok = captions_extraction.sentence_tokenizing(train_cleaned_dic)
val_cap_tok = captions_extraction.sentence_tokenizing(val_cleaned_dic)
test_cap_tok = captions_extraction.sentence_tokenizing(test_cleaned_dic)

image_pth_rt = r"./Flicker8k_Dataset/" #+ r"\\"
trn_dataset = captions_generation(train_cap_tok, vocab_size, image_pth_rt, max_length,5,1)
val_dataset = captions_generation(val_cap_tok, vocab_size, image_pth_rt, max_length)

tokenizer

TensorFlow Version 2.8.0-rc0
max_length of captions 33
max_length of captions 27
max_length of captions 27
No of captions: Training-3500.0 Validation-500.0 test-500.0
Vocab size 1000
Vocab size 1000
Vocab size 1000


In [68]:
trn_dataset_whole = captions_generation(token_cap_dic, 1000, image_pth_rt, max_length,5,1)

In [69]:
len(train_cleaned_seq)

17500

In [70]:
import PIL
inputs, outputs = next(iter(trn_dataset_whole))
print(inputs[0].shape, inputs[1].shape, outputs.shape)

(39, 256, 256, 3) (39, 10) (39, 1000)


In [71]:
print(tokenizer)

None


In [72]:
actor_model = Caption_model_gen(NET='policy', vocab_size=1000, Embed_Size=256, max_length=max_length,display=True)
#actor_model.summary()
actor_model.compile(loss=tf.keras.losses.categorical_crossentropy,
                    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
                    metrics=['accuracy'])

Building CNN model
CNN model {output shape}: (None, 256)
Building RNN model
final_carry_state {rnn output shape}: (None, 256)
Image and text {add shape}: (None, 256)
output {shape} (None, 1000)
Policy Net built successfully 

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [73]:
def scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)


lr_callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

checkpoint_filepath = '/content'
early_stop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss', patience=10)

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='accuracy',
    mode='auto')

callback = [model_checkpoint_callback]

history = actor_model.fit(trn_dataset_whole,epochs=100,steps_per_epoch=12, shuffle=False, validation_data=val_dataset,validation_steps=1)
#history = actor_model.fit(trn_dataset, steps_per_epoch=10, epochs=100, shuffle=False,callbacks=lr_callback)
model_parameters = history.history

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100

KeyboardInterrupt: 

In [None]:
tf.keras.backend.clear_session()

In [None]:
f = open("./output/history_policy_model_lstm_3.pkl", "wb")
pickle.dump(model_parameters, f)
f.close()   

In [None]:
actor_model.save_weights('./output/policy_net_weights.h5')

In [None]:
actor_model.load_weights('./output/policy_net_weights.h5')

In [None]:
import os
from tensorboard.plugins import projector

In [None]:
weights = tf.Variable(actor_model.layers[-23].get_weights()[0][1:])
# Create a checkpoint from embedding, the filename and key are the
# name of the tensor.
log_dir=r'./output/sample_data'
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))
# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

In [None]:
actor_model.layers[-23]

In [None]:
# Now run tensorboard against on log data we just saved.
#!tensorboard --logdir /logs/
from collections.abc import Mapping
!tensorboard

In [None]:
print(history.history.keys())
f = open("./output/history_policy_model_lstm_3.pkl", "wb")
pickle.dump(model_parameters, f)
f.close()   

In [None]:
   import matplotlib.pyplot as plt

In [None]:
plt.plot(history.history['loss'], label="loss")
plt.plot(history.history['val_loss'])
plt.legend()

In [None]:
plt.plot(history.history['accuracy'], label="accuracy")
plt.plot(history.history['val_accuracy'])
plt.legend()

In [None]:
plt.semilogx(history.history["lr"],history.history['loss'])
plt.axis([0.001,0.0001,4.5,6.5])

In [None]:
actor_model.save_weights('./output/policy_net_model_3.h5')

In [None]:
#actor_model.load_weights()

In [None]:
def caption_greedy(policy_net,image,tokenizer,word_index,max_length=20):
  caption='startseq'
  for i in range(max_length):
    input_seq=caption.split(' ')
    tokenization=tokenizer.texts_to_sequences(input_seq)
    padding=tf.keras.preprocessing.sequence.pad_sequences([tokenization],maxlen=max_length) 
    predicted_word_index=np.argmax(policy_net.predict([tf.expand_dims(image,axis=0),padding]))
    predicted_word=word_index[predicted_word_index]
    caption+=' '+predicted_word
    if predicted_word =='endseq':
      break
  return caption

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=3000,oov_token='<unknw>')
tokenizer.fit_on_texts(caption_list)

In [None]:
test_img=list(test_cap_tok.keys())[250]
test_photo = load_preprocess_img(image_pth_rt + test_img)
cap=caption_greedy(actor_model,test_photo,tokenizer,word_index,max_length=max_length)
plt.imshow(test_photo)
plt.title(cap)

In [None]:
test_dataset = captions_generation(test_cap_tok, vocab_size, image_pth_rt, max_length)

In [None]:
import nltk
def captions_eval(captions_dic, vocab_size, image_pth_rt,tokenizer, max_length=25, num_captions=1):
    images, input_text_seq, output_text = list(), list(), list()
    batch_iter = 0
    batch_keys = []
    while True:
        for key, desc_list in captions_dic.items():
            # print(key)
            batch_keys.append(key)
            batch_iter += 1
            caption = 0
            # retrieve the photo feature

            photo = load_preprocess_img(image_pth_rt + key)
            for desc in desc_list:
                caption += 1
                desc = np.squeeze(desc)
                input_sequence = []
                for i in range(1, len(desc)):
                    input_sequence.append(desc[:i + 1])
                    images.append(photo)
                input_seq = tf.keras.preprocessing.sequence.pad_sequences(input_sequence, maxlen=max_length,
                                                                          padding='pre')
                if caption == num_captions:
                    break
                #predicted_cap=caption_greedy(policy_net,photo,tokenizer)
                #BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], predicted_cap, weights = [1])
                #print(BLEUscore)
          

In [None]:
actor_model.evaluate(test_dataset)

In [None]:
tf.keras.backend.clear_session()