### 07. Validation

Run predictions for images in Validation Set (40.5K images) and save captions to a json file.

Use the json file to run performance measurements in 8th notebook

In [1]:
import time
import json
import pickle

import numpy as np
import matplotlib.pyplot as plt
from keras.models import model_from_json, load_model
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.sequence import pad_sequences
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.models import Model

from pycocotools.coco import COCO
from tqdm import tqdm

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# standard variables
MAX_LENGTH = 52
VOCAB_SIZE = 6321 # Using unique words as vocab here.
NPIX = 299
TARGET_SIZE = (NPIX,NPIX,3)
EMBEDDING_SIZE = 300

In [3]:
def load_pickle(path):
    with open(path, "rb") as f:
        return pickle.load(f)

index_and_words = load_pickle("index_and_words.pkl")
ix_to_word = index_and_words['ix_to_word']
word_to_ix = index_and_words['word_to_ix']

In [4]:
modified_inception = load_model("modified_inception.h5", compile=False)

In [5]:
def load_model_from_json(path):
    with open(path,"r") as f:
        model = model_from_json(f.read())
    print("Model loaded successfully")
    return model

In [6]:
# model = load_model("coco_19th_epoch.h5", compile=False)
model = load_model_from_json("model.json")

# loading the weights of the model
model.load_weights("model_weights/amey_19.h5")

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model loaded successfully


In [7]:
def beam_search_predictions(photo, beam_index=3):
    start = [word_to_ix["startseq"]]
    
    # start_word[0][0] = index of the starting word
    # start_word[0][1] = probability of the word predicted
    start_word = [[start, 0.0]]
    
    while len(start_word[0][0]) < MAX_LENGTH:
        temp = []
        for s in start_word:
            par_caps = pad_sequences([s[0]], maxlen=MAX_LENGTH, padding='post')
            preds = model.predict([photo, par_caps], verbose=0)
            
            # Getting the top <beam_index>(n) predictions
            word_preds = np.argsort(preds[0])[-beam_index:]
            
            # creating a new list so as to put them via the model again
            for w in word_preds:
                next_cap, prob = s[0][:], s[1]
                next_cap.append(w)
                prob += preds[0][w]
                temp.append([next_cap, prob])
                    
        start_word = temp
        # Sorting according to the probabilities
        start_word = sorted(start_word, reverse=False, key=lambda l: l[1])
        # Getting the top words
        start_word = start_word[-beam_index:]
    
    start_word = start_word[-1][0]
    intermediate_caption = [ix_to_word[i] for i in start_word]

    final_caption = []
    
    for i in intermediate_caption:
        if i != 'endseq':
            final_caption.append(i)
        else:
            break
    
    final_caption = ' '.join(final_caption[1:])
    return final_caption

In [8]:
def greedySearch(photo):
    in_text = 'startseq'
    for i in range(MAX_LENGTH):
        sequence = [word_to_ix[w] for w in in_text.split() if w in word_to_ix]
        sequence = pad_sequences([sequence], maxlen=MAX_LENGTH)
        yhat = model.predict([photo,sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = ix_to_word[yhat]
        in_text += ' ' + word
        if word == 'endseq':
            break
    final = in_text.split()
    final = final[1:-1]
    final = ' '.join(final)
    return final

In [9]:
def getFeatureVector(imgPath):
    try:
        img = load_img(imgPath, target_size=TARGET_SIZE)
    except OSError as e:
        print("Problem with image:",e)
    
    # Converting image to array
    img_array = img_to_array(img)
    nimage = preprocess_input(img_array)
    # Adding one more dimesion
    nimage = np.expand_dims(nimage, axis=0)    
    fea_vec = modified_inception.predict(nimage)
    return np.reshape(fea_vec, fea_vec.shape[1]) # reshape from (1, 2048) to (2048, )

#### Loading and storing Validation Set

Load all images from validation set (val2014) and store their feature vectors in a pickle.

In [10]:
dataDir='coco'
dataType='val2014'
annFile='{}/annotations/captions_{}.json'.format(dataDir,dataType)

In [11]:
coco=COCO(annFile)

loading annotations into memory...
Done (t=0.75s)
creating index...
index created!


In [12]:
def getImgPath(imgId):
    padding = "0" * (12  - len(str(imgId)))
    imgName = "{}{}".format(padding, imgId)
    return "{}/{}/COCO_{}_{}.jpg".format(dataDir, dataType, dataType, imgName)

In [13]:
imgIds = coco.getImgIds()

In [14]:
val_set_feature_matrix = dict()

#### only run when feature matrix for validation set is unavaiable.

In [None]:
# code for image imbedding i.e converting image to 300 dimentional

try:
    for imgId in tqdm(imgIds, ascii=True, desc="Generating Val Set Matrix"):
        img = load_img(getImgPath(imgId), target_size=TARGET_SIZE)
        # Converting image to array
        img_array = img_to_array(img)
        nimage = preprocess_input(img_array)
        # Adding one more dimesion
        nimage = np.expand_dims(nimage, axis=0)    
        fea_vec = modified_inception.predict(nimage)
        val_set_feature_matrix[imgId] = np.reshape(fea_vec, fea_vec.shape[1]) # reshape from (1, 2048) to (2048, )


except Exception as e:
    print("Exception got :- \n",e)

In [21]:
len(val_set_feature_matrix)

40504

In [22]:
def save_embedding_matrix(matrix):
    with open("val_set_feature_matrix.pkl","wb") as f:
        pickle.dump(matrix, f)

save_embedding_matrix(val_set_feature_matrix)
print(modified_inception.output.shape)

(None, 2048)


#### Load pickle and run validations

In [14]:
def load_pickle(path):
    with open(path, "rb") as f:
        return pickle.load(f)

def dump_results(results, algo):
    filename = "{}/results/captions_{}_{}_results.json".format(dataDir, dataType, algo)
    with open(filename, "w") as f:
        json.dump(results, f)

In [15]:
val_set_feature_matrix = load_pickle("validation_set/val_set_feature_matrix.pkl")

In [16]:
images = list(val_set_feature_matrix.keys())
results = list()
#     imgPath = getImgPath(imgId)
start = time.time()
for imgId in tqdm(images, ascii=True, desc="Generating Captions for Val Set"):
    image = val_set_feature_matrix[imgId]
    image = image.reshape((1,2048))
    record = {"image_id": imgId,
              "caption": beam_search_predictions(image, beam_index=5)}
    results.append(record)

print("Captioned {} images in {:.2f}s".format(len(images), time.time() - start))
dump_results(results, "beam_search_k5")

Generating Captions for Val Set:   0%|          | 14/40504 [00:29<24:38:33,  2.19s/it]

KeyboardInterrupt: 