In [None]:
import sys
sys.path.append("..")
import download_utils

In [None]:
# NOTE(review): presumably makes pre-downloaded Keras resources available to this
# notebook — confirm against download_utils.
download_utils.link_all_keras_resources()

In [None]:
import tensorflow as tf
from tensorflow.contrib import keras
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
L = keras.layers
K = keras.backend

import utils
import time
import zipfile
import json
from collections import defaultdict
import re
import random
from random import choice
import grading_utils
import os
from keras_utils import reset_tf_session
import tqdm_utils

### Extract image features

We use a pre-trained InceptionV3 model as the CNN encoder
(https://research.googleblog.com/2016/03/train-your-own-image-classifier-with.html) and extract its last hidden layer as an embedding:

In [None]:
# Side length of the square `input_shape` handed to utils.apply_model during feature extraction.
IMG_SIZE = 299

In [None]:
# we take the last hidden layer of InceptionV3 as an image embedding

def get_cnn_encoder():
    """Build the CNN encoder that turns images into embedding vectors.

    Returns:
        tuple: `(model, preprocess_for_model)` where `model` is a Keras model
        mapping the InceptionV3 inputs to the globally average-pooled output
        of InceptionV3 built with `include_top=False`, and
        `preprocess_for_model` is the matching
        `keras.applications.inception_v3.preprocess_input` function.
    """
    # Switch Keras to inference mode before building the graph: the encoder
    # is only used for feature extraction, never trained here.
    K.set_learning_phase(False)
    model = keras.applications.InceptionV3(include_top=False)
    preprocess_for_model = keras.applications.inception_v3.preprocess_input

    # Collapse the spatial feature map into a single vector per image.
    pooled_output = keras.layers.GlobalAveragePooling2D()(model.output)
    model = keras.models.Model(model.inputs, pooled_output)
    return model, preprocess_for_model

##### load pre-trained model
reset_tf_session()
encoder, preprocess_for_model = get_cnn_encoder()

##### extract train features
train_img_embeds, train_img_fns = utils.apply_model(
    "train2014.zip", encoder, preprocess_for_model, input_shape=(IMG_SIZE, IMG_SIZE))
utils.save_pickle(train_img_embeds, "train_img_embeds.pickle")
utils.save_pickle(train_img_fns, "train_img_fns.pickle")

##### extract validation features
# The embeddings must be bound to `val_img_embeds`: that is the name saved below.
val_img_embeds, val_img_fns = utils.apply_model(
    "val2014.zip", encoder, preprocess_for_model, input_shape=(IMG_SIZE, IMG_SIZE))
utils.save_pickle(val_img_embeds, "val_img_embeds.pickle")
# File name has no leading space so that the later read_pickle("val_img_fns.pickle") finds it.
utils.save_pickle(val_img_fns, "val_img_fns.pickle")

##### sample images for learners
def sample_zip(fn_in, fn_out, rate=0.01, seed=42):
    """Copy a random subset of the members of one zip archive into another.

    Args:
        fn_in: path of the source zip archive.
        fn_out: path of the archive to create (opened in 'w' mode, so an
            existing file is overwritten).
        rate: probability with which each member is kept.
        seed: value passed to `np.random.seed` so the selection is repeatable.
            Note that this reseeds NumPy's global random state.
    """
    np.random.seed(seed)
    with zipfile.ZipFile(fn_in) as fin, zipfile.ZipFile(fn_out, 'w') as fout:
        # One independent draw per member, in archive order.
        sampled = filter(lambda _: np.random.rand() < rate, fin.filelist)
        for zinfo in sampled:
            fout.writestr(zinfo, fin.read(zinfo))
            
# Create the small sampled archives for both splits (default rate and seed).
sample_zip(fn_in='train2014.zip', fn_out='train2014_sample.zip')
sample_zip(fn_in='val2014.zip', fn_out='val2014_sample.zip')

In [None]:
# load prepared embeddings
train_img_embeds = utils.read_pickle("train_img_embeds.pickle")
train_img_fns = utils.read_pickle("train_img_fns.pickle")
val_img_embeds = utils.read_pickle("val_img_embeds.pickle")
val_img_fns = utils.read_pickle("val_img_fns.pickle")
# check shapes: each split should have one file name per embedding row
print(train_img_embeds.shape, len(train_img_fns))
print(val_img_embeds.shape, len(val_img_fns))

In [None]:
# list the sampled image archives present in the working directory
[entry for entry in os.listdir(".") if entry.endswith("_sample.zip")]

###  Extract captions for images

In [None]:
#extract captions from zip

def get_captions_for_fns(fns, zip_fn, zip_json_path):
    """Look up the captions of the given image files in a zipped annotation JSON.

    Args:
        fns: iterable of image file names to look up.
        zip_fn: path of the zip archive that contains the annotation file.
        zip_json_path: path of the JSON file inside the archive. The JSON must
            have an "images" list (items with "id" and "file_name") and an
            "annotations" list (items with "image_id" and "caption").

    Returns:
        list: one list of caption strings per entry of `fns`, in the same order.

    Raises:
        KeyError: if a file name in `fns` has no caption in the JSON.
    """
    # Context manager so the archive handle is closed once the JSON is read.
    with zipfile.ZipFile(zip_fn) as zf:
        j = json.loads(zf.read(zip_json_path).decode("utf8"))
    id_to_fn = {img["id"]: img["file_name"] for img in j["images"]}
    fn_to_caps = defaultdict(list)
    for cap in j["annotations"]:
        fn_to_caps[id_to_fn[cap["image_id"]]].append(cap["caption"])
    # Convert to a plain dict so that unknown file names raise instead of
    # silently yielding an empty caption list.
    fn_to_caps = dict(fn_to_caps)
    return [fn_to_caps[fn] for fn in fns]

# Captions for both splits live in the same archive, in separate JSON files.
train_captions = get_captions_for_fns(
    fns=train_img_fns,
    zip_fn="captions_train-val2014.zip",
    zip_json_path="annotations/captions_train2014.json",
)
val_captions = get_captions_for_fns(
    fns=val_img_fns,
    zip_fn="captions_train-val2014.zip",
    zip_json_path="annotations/captions_val2014.json",
)

# sanity check: one caption list per image file name
print(len(train_img_fns), len(train_captions))
print(len(val_img_fns), len(val_captions))

In [None]:
# look at training example (each has 5 captions)

def show_training_example(train_img_fns, train_captions, example_idx=0):
    """Plot one image from "train2014_sample.zip" with its captions as the title.

    You can change `example_idx` to see different images.

    Args:
        train_img_fns: image file names, aligned with `train_captions`.
        train_captions: list of caption lists, one per file name.
        example_idx: index into the archive members whose base name appears
            in `train_img_fns`.

    Raises:
        IndexError: if `example_idx` is out of range for the matching members.
    """
    captions_by_file = dict(zip(train_img_fns, train_captions))
    all_files = set(train_img_fns)
    # Context manager so the archive handle is released after the image is read.
    with zipfile.ZipFile("train2014_sample.zip") as zf:
        # Archive members carry a directory prefix; compare on the base name only.
        found_files = [
            zinfo for zinfo in zf.filelist
            if zinfo.filename.rsplit("/")[-1] in all_files
        ]
        example = found_files[example_idx]
        img = utils.decode_image_from_buf(zf.read(example))
    plt.imshow(utils.image_center_crop(img))
    plt.title("\n".join(captions_by_file[example.filename.rsplit("/")[-1]]))
    plt.show()
    
# example_idx picks one of the sampled images that have captions; 142 is just one choice
show_training_example(train_img_fns, train_captions, example_idx= 142)

###  Prepare captions for training

In [None]:
# preview captions data: the caption lists of the first two training images
train_captions[:2]

In [None]:
# special vocabulary tokens: batch padding, out-of-vocabulary word,
# start of sentence, end of sentence
PAD, UNK, START, END = "#PAD#", "#UNK#", "#START#", "#END#"
 

# split sentence into tokens (split into lowercased words)
def split_sentence(sentence):
    """Split a sentence into lowercased word tokens.

    The sentence is lowercased and split on runs of non-word characters;
    empty fragments (e.g. from leading/trailing punctuation) are dropped.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    return [token for token in re.split(r'\W+', sentence.lower()) if token]

def generate_vocabulary(train_captions):
    """
    Return {token: index} for all train tokens (words) that occur 5 times or more,
        indices run from 0 to N-1 in sorted token order, where N is the number of
        unique tokens in the resulting dictionary.
    Sentences are split into tokens with `split_sentence`.
    PAD (for batch padding), UNK (unknown, out of vocabulary),
        START (start of sentence) and END (end of sentence) tokens are always
        part of the vocabulary.

    `train_captions` is a list with one list of caption strings per image.
    """
    token_counts = defaultdict(int)
    for image_captions in train_captions:
        for sentence in image_captions:
            # Count inside the sentence loop so every caption of an image
            # contributes, not just the last one.
            for token in split_sentence(sentence):
                token_counts[token] += 1

    vocab = [token for token, count in token_counts.items() if count >= 5]
    vocab += [PAD, UNK, START, END]

    return {token: index for index, token in enumerate(sorted(vocab))}
    
def caption_tokens_to_indices(captions, vocab):
    """
    `captions` argument is an array of arrays:
    [
        [
            "image1 caption1",
            "image1 caption2",
            ...
        ],
        [
            "image2 caption1",
            "image2 caption2",
            ...
        ],
        ...
    ]
    Each sentence is split into tokens with `split_sentence`.
    Tokens are replaced with vocabulary indices, using UNK for unknown words
    (out of vocabulary). START and END tokens are added to the start and end of
    each sentence respectively.
    For the example above the result is:
    [
        [
            [vocab[START], vocab["image1"], vocab["caption1"], vocab[END]],
            [vocab[START], vocab["image1"], vocab["caption2"], vocab[END]],
            ...
        ],
        ...
    ]
    The input `captions` is not modified.
    """
    start_idx, end_idx, unk_idx = vocab[START], vocab[END], vocab[UNK]

    indexed = []
    for image_captions in captions:
        indexed_image = []
        for sentence in image_captions:
            token_ids = [vocab.get(token, unk_idx) for token in split_sentence(sentence)]
            indexed_image.append([start_idx] + token_ids + [end_idx])
        # One entry per image, holding one index list per caption.
        indexed.append(indexed_image)
    return indexed

In [None]:
# build the token -> index vocabulary and its index -> token inverse
vocab = generate_vocabulary(train_captions)
vocab_inverse = dict(zip(vocab.values(), vocab.keys()))
print(len(vocab))

In [None]:
# convert every caption of both splits into a list of vocabulary indices
train_captions_indexed = caption_tokens_to_indices(captions=train_captions, vocab=vocab)
val_captions_indexed = caption_tokens_to_indices(captions=val_captions, vocab=vocab)

### Captions have different lengths, but we need to batch them; that's why we will add PAD tokens so that all sentences have an equal length.

We will crunch LSTM through all the tokens, but we will ignore padding tokens during loss calculation.

In [None]:
# we will use this during training
def batch_captions_to_matrix(batch_captions, pad_idx, max_len=None):
    """
    `batch_captions` is an array of arrays:
    [
        [vocab[START], ..., vocab[END]],
        [vocab[START], ..., vocab[END]],
        ...
    ]
    Put vocabulary indexed captions into np.array of shape (len(batch_captions), columns),
        where "columns" is max(map(len, batch_captions)) when max_len is None
        and "columns" = min(max_len, max(map(len, batch_captions))) otherwise.
    Captions longer than "columns" are truncated; shorter ones are padded
    with pad_idx on the right. An empty batch yields an array of shape (0, 0).
    Input example: [[1, 2, 3], [4, 5]]
    Output example: np.array([[1, 2, 3], [4, 5, pad_idx]]) if max_len=None
    Output example: np.array([[1, 2], [4, 5]]) if max_len=2
    Output example: np.array([[1, 2, 3], [4, 5, pad_idx]]) if max_len=100
    """
    # default=0 keeps an empty batch from raising inside max().
    longest = max(map(len, batch_captions), default=0)
    columns = longest if max_len is None else min(max_len, longest)

    # Start from an all-padding matrix, then overwrite the leading part of
    # each row with the (possibly truncated) caption.
    matrix = np.full((len(batch_captions), columns), fill_value=pad_idx)
    for row, caption in enumerate(batch_captions):
        keep = min(len(caption), columns)
        if keep:
            matrix[row, :keep] = caption[:keep]

    return matrix