## Image Embedding

The objective of this file is to use pre-trained inceptionV3 model as a feature extracter for images in training data.

[Input image]===>InceptionV3==>(None,2048)===>Embedding Layer[2048x300] ==>(None,300)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pickle import dump

In [2]:
#Standard variable
PATH = "Flickr8k_text/"

In [3]:
# opening text file
with open(PATH+"Flickr8k.token.txt") as f:
    data = f.read()

In [4]:
# dictionary contining key as image_id and value as list of captions.
descriptions = dict()

In [5]:
try:
    for el in data.split("\n"):
        tokens = el.split()
        image_id , image_desc = tokens[0],tokens[1:]

        # dropping .jpg from image id
        image_id = image_id.split(".")[0]

        image_desc = " ".join(image_desc)
        
        # check if image_id is already present or not
        if image_id in descriptions:
            descriptions[image_id].append(image_desc)
        else:
            descriptions[image_id] = list()
            descriptions[image_id].append(image_desc)
except Exception as e: 
    print("Exception got :- \n",e)

Exception got :- 
 list index out of range


In [6]:
descriptions["1000268201_693b08cb0e"]

['A child in a pink dress is climbing up a set of stairs in an entry way .',
 'A girl going into a wooden building .',
 'A little girl climbing into a wooden playhouse .',
 'A little girl climbing the stairs to her playhouse .',
 'A little girl in a pink dress going into a wooden cabin .']

##### Preprocessing captions and adding [startseq + caption + endseq]

In [7]:
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [8]:
for k in descriptions.keys():
    value = descriptions[k]
    caption_list = []
    for ec in value:
        
        # replaces specific and general phrases
        sent = decontracted(ec)
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\"', ' ')
        sent = sent.replace('\\n', ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        
        # startseq is for kick starting the partial sequence generation and endseq is to stop while predicting.
        # for more referance please check https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/
        image_cap = 'startseq ' + sent.lower() + ' endseq'
        caption_list.append(image_cap)
    descriptions[k] = caption_list

In [9]:
descriptions["1000268201_693b08cb0e"]

['startseq a child in a pink dress is climbing up a set of stairs in an entry way  endseq',
 'startseq a girl going into a wooden building  endseq',
 'startseq a little girl climbing into a wooden playhouse  endseq',
 'startseq a little girl climbing the stairs to her playhouse  endseq',
 'startseq a little girl in a pink dress going into a wooden cabin  endseq']

In [13]:
# save the file for further use
dump(descriptions,open("descriptions.pkl","wb"))

### Make a model

We are considering Inception over Vgg 

VGG:
1. Slow to train
2. Very large weight file approx. 553MB
3. Weight File Available

ImageNet:
1. Weight file is light i.e. approx 96MB
2. Fast to train


In [10]:
# Importing necessary modules

from keras.applications.inception_v3 import InceptionV3,preprocess_input
from keras.layers import Dense,BatchNormalization,Dropout,Embedding,RepeatVector
from keras.preprocessing.image import load_img, img_to_array

from keras.models import Sequential
from keras.models import Model




In [11]:
# Since we are using this as feature extractor, the last softmax layer is not useful for us.
inception = InceptionV3(weights='imagenet')



Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5


In [12]:
inception.summary()

Model: "inception_v3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 299, 299, 3)]        0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 149, 149, 32)         864       ['input_1[0][0]']             
                                                                                                  
 batch_normalization (Batch  (None, 149, 149, 32)         96        ['conv2d[0][0]']              
 Normalization)                                                                                   
                                                                                                  
 activation (Activation)     (None, 149, 149, 32)         0         ['batch_normalizati

In [13]:
# pop the last softmax layer and freezing the remaining layers
inception.layers.pop()

for layer in inception.layers:
    layer.trainable = False

In [14]:
inception.layers[-2].output

<KerasTensor: shape=(None, 2048) dtype=float32 (created by layer 'avg_pool')>

##### image ==> [Inception + Embedding ] ==> Feature Extracted image of shape (None,300)

In [16]:
# building the final model
final_model = Model(inputs = inception.input,outputs = inception.layers[-1].output)

In [17]:
final_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 299, 299, 3)]        0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 149, 149, 32)         864       ['input_1[0][0]']             
                                                                                                  
 batch_normalization (Batch  (None, 149, 149, 32)         96        ['conv2d[0][0]']              
 Normalization)                                                                                   
                                                                                                  
 activation (Activation)     (None, 149, 149, 32)         0         ['batch_normalization[0][0

In [18]:
TARGET_SIZE = (299,299)

In [22]:
# code for image imbedding i.e converting image to 300 dimentional

train_image_extracted = dict()
with open("Flickr8k_text/Flickr_8k.trainImages.txt","r") as f:
    data = f.read()
    
try:
    for el in data.split("\n"):
        tokens = el.split(".")
        image_id = tokens[0]
        img = load_img("Flicker8k_Dataset/{}.jpg".format(image_id),target_size=TARGET_SIZE)
        # Converting image to array
        img_array = img_to_array(img)
        nimage = preprocess_input(img_array)
        # Adding one more dimesion
        nimage = np.expand_dims(nimage, axis=0)    
        fea_vec = final_model.predict(nimage)
        train_image_extracted[image_id] = np.reshape(fea_vec, fea_vec.shape[1]) # reshape from (1, 2048) to (2048, )


except Exception as e:
    print("Exception got :- \n",e)

Exception got :- 
 [Errno 2] No such file or directory: 'Flicker8k_Dataset/.jpg'


In [23]:
# save the file 
dump(train_image_extracted,open("train_image_extracted.pkl","wb"))

In [25]:
print(final_model.output.shape)

(None, 1000)
