In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from utils import load_caption_data, create_caption_dict, load_train_image_names, load_image


In [2]:
#Below file contains text that consists of imagename and 5 captions for each image
meta_file = 'dataset/Flickr8k_text/Flickr8k.token.txt'

#Load the meta file
text = load_caption_data(meta_file)

# Create a dictionary with {image_name:[caption1, caption2,.....], ....}
caption_dict = create_caption_dict(text)

# File with image names of train files
train_path_file = 'dataset/Flickr8k_text/Flickr_8k.trainImages.txt'

train_img_names = set(load_train_image_names(train_path_file))


## Define Model
* Create a new model by removing the dense layers and only keep the feature extraction layers

In [3]:
'''
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3 as inception
from tqdm import tqdm
#Define Model
model = inception(include_top = False, weights = 'imagenet')

input_layer = model.input
intermediate_layer = model.layers[-1].output

#Group a new model to extract features only (not classify)
new_model = tf.keras.Model(input_layer, intermediate_layer)
'''

"\nimport tensorflow as tf\nfrom tensorflow.keras.applications import InceptionV3 as inception\nfrom tqdm import tqdm\n#Define Model\nmodel = inception(include_top = False, weights = 'imagenet')\n\ninput_layer = model.input\nintermediate_layer = model.layers[-1].output\n\n#Group a new model to extract features only (not classify)\nnew_model = tf.keras.Model(input_layer, intermediate_layer)\n"

In [4]:

image_files = 'dataset/Flicker8k_Dataset/'
'''
train_img_paths = sorted([image_files + name + '.jpg' for name in train_img_names])

image_dataset = tf.data.Dataset.from_tensor_slices(train_img_paths)

image_dataset = image_dataset.map(load_image, num_parallel_calls=2).batch(16)
'''


"\ntrain_img_paths = sorted([image_files + name + '.jpg' for name in train_img_names])\n\nimage_dataset = tf.data.Dataset.from_tensor_slices(train_img_paths)\n\nimage_dataset = image_dataset.map(load_image, num_parallel_calls=2).batch(16)\n"

In [5]:
'''
for img, path in tqdm(image_dataset):
    features = new_model(img)
    
    
    features = tf.reshape(features, (features.shape[0], -1, features.shape[3]))

    #The feature is now of dim 16, 7,7, 2048

    for feature, pth in zip(features, path):
        #Convert path from tensor to string
        path_to_feature = pth.numpy().decode('utf-8')
        np.save(path_to_feature, feature.numpy())

'''


    

"\nfor img, path in tqdm(image_dataset):\n    features = new_model(img)\n    \n    \n    features = tf.reshape(features, (features.shape[0], -1, features.shape[3]))\n\n    #The feature is now of dim 16, 7,7, 2048\n\n    for feature, pth in zip(features, path):\n        #Convert path from tensor to string\n        path_to_feature = pth.numpy().decode('utf-8')\n        np.save(path_to_feature, feature.numpy())\n\n"

## Caption Processing

In [6]:
from caption_handler import clean_captions, add_sof_eof, add_token, create_tokenizer


#Preprocess the captions (text)
img_dict = clean_captions(caption_dict)

#Adds indicator for start and end of sequence
train_dict = add_token(img_dict, train_img_names)


# Create Vocabulary
tokenizer, vocab_size, caption_max_len = create_tokenizer(train_dict)

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
def prepare_training_data(data_dict, tokenizer, max_length, vocab_size):
    x , y = list() ,list()
    for img_name, captions in data_dict.items():
        img_path = image_files + img_name + '.jpg'

        for caption in captions:
            #converts the text sentences to sequences of numbers where the nums are the word's index in vocab
            words_ids = tokenizer.texts_to_sequences([caption])[0] 
            
            #Makes all words_ids vector of same length by padding 0's at the end(padding='post') of the vector
            padded_ids = pad_sequences([words_ids], maxlen = max_length, padding= 'post')[0]

            x.append(img_path)
            y.append(padded_ids)
            print(x)
            


prepare_training_data(train_dict, tokenizer, caption_max_len, vocab_size)

[1, 42, 3, 87, 170, 6, 116, 52, 387, 11, 394, 3, 27, 4417, 626, 2]
[   1   42    3   87  170    6  116   52  387   11  394    3   27 4417
  626    2    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0]
[1, 18, 313, 64, 195, 119, 2]
[  1  18 313  64 195 119   2   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
[1, 39, 18, 116, 64, 195, 2064, 2]
[   1   39   18  116   64  195 2064    2    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0]
[1, 39, 18, 116, 4, 394, 19, 60, 2064, 2]
[   1   39   18  116    4  394   19   60 2064    2    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0]
[1, 39, 18, 3, 87, 170, 313, 64, 195, 2903, 2]
[   1   39   18    3   87  170  313   64  195 2903    2    0    0    0
    0    0    0    0    0   

KeyboardInterrupt: 