In [2]:
import pandas as pd
import numpy as np
import pickle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import pdb
import time
import gc
from scipy.sparse import csr_matrix, lil_matrix

from gensim.models import KeyedVectors

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Dataset Formation

In the other notebooks, I pre-processed the captions for each image, and created an "image embedding" by extracting an intermediate representation from a pre-trained deep neural network called VGG16. 

Now, I will process each image/caption pair to create a dataset that can be used to train an LSTM. This will require fitting a tokenizer on the captions, and converting each caption into a set of training examples, where each word is to be predicted by its predecesessors. 



## 1. Load image features and captions

In [3]:
with open("../data/features/train_features.pkl", "rb") as handle:
    train_features = pickle.load(handle)

In [4]:
with open("../data/features/valid_features.pkl", "rb") as handle:
    valid_features = pickle.load(handle)

In [5]:
# load the captions
train_captions = pd.read_csv("../data/split_lists/train_ids.csv", dtype = str)
valid_captions = pd.read_csv("../data/split_lists/valid_ids.csv", dtype = str)

In [6]:
valid_captions.head()

Unnamed: 0,photo_id,caption
0,_ExrVJTjGcChfzLH51etAw,shanghai rainbow trout
1,yPUPhsJvT6yx6l8QwShw1Q,grill rainbow trout
2,zvESg-w2JIBL5FhU7F2d-g,chicken parm
3,uqdXqfB8MXW6XU7Hk1gGIQ,mcg holiday jazz
4,VMedbsDZnCxmCE3Pndvtng,dining room


In [7]:
# does everything make sense, in terms of shapes? 
print(valid_captions.shape[0] == len(valid_features))
print(train_captions.shape[0] == len(train_features))

True
True


## 2. Build a Tokenizer

Now we need to build a tokenizer, so that we can vectorize our words in a consistent way. 
First, I'll add start and end tokens to the begining of each caption to mark the start/end of the caption. 

In [6]:
train_captions.caption = "startseq " + train_captions.caption + " endseq"
valid_captions.caption = "startseq " + valid_captions.caption + " endseq"

In [7]:
train_captions.head()

Unnamed: 0,photo_id,caption
0,lHhMNhCA7rAZmi-MMfF3ZA,startseq bakery area endseq
1,kHrkA-8BY4tC-rejiJNhBQ,startseq the hangars house band tuned up check...
2,ljStFDx0XFg8jSbJIRhvGA,startseq thats the mans butt ask him about bee...
3,JoB_QTE2Hjr1NT0AgYdWzQ,startseq bbq shrimp endseq
4,PvdNq213kqk7N9raOcefEw,startseq sharkfin pie endseq


In [8]:
# fit a tokenizer 
tokenizer = Tokenizer()

In [9]:
all_captions = np.concatenate([train_captions.caption.values,valid_captions.caption.values])

In [10]:
# fit tokenizer
tokenizer.fit_on_texts(all_captions.astype(str))

In [13]:
# store the vocabulary size
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

30212

In [12]:
mkdir ../data/tokenizer

In [13]:
with open("../data/tokenizer/tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle)

## 3. Build a consolidated dataset. 

To build a language model-type network, we need to convert each sequence into a set of training examples for each caption. for example, the sentence _little girl is running in field_, along with the feature of the corresponding photo, will be transformed into the following sets of training examples: 

```
X1,		    X2 (input sequence), 						y (target)
photo-features	startseq, 									little
photo-features	startseq, little,							girl
photo-features	startseq, little, girl, 					running
photo-features	startseq, little, girl, running, 			in
photo-features	startseq, little, girl, running, in, 		field
photo-features	startseq, little, girl, running, in, field, endseq
```

This example is taken from [Dr. Brownlee's awesome blog. ](https://machinelearningmastery.com/develop-a-deep-learning-caption-generation-model-in-python/)

In [14]:
def encode_and_pad(caption, sequence_length = 15):
    # questions encoded as index vectors
    encoded = tokenizer.texts_to_sequences([caption])
    # padded squences to be of length [sequence_length]
    padded = pad_sequences(encoded, 
                            maxlen = sequence_length,
                            padding = "post", 
                            truncating = "post")[0]
    return(padded)

In [15]:
def encode(caption):
    # questions encoded as index vectors
    encoded = tokenizer.texts_to_sequences([caption])[0]
    return (encoded)

In [95]:
# convert a dictionary of {photo_id : photo-featres} pairs and a dataframe of captions into two numpy arrays
# that can be used as a consolidated training dataset
def consolidate_dataset(features_dict, captions_df, sequence_length = 15):
    # keep track of the photo features and caption sequenes in lists
    X_photos, X_captions = [], []
    y = [] # build response vector
    e = 0
    prevtime = time.time()
    for photo_id in captions_df['photo_id']:
        # if the photo_id is not in the feature dictionary, move on
        if photo_id not in features_dict:
            continue
        if e % 1000 == 0:
            print ("reached %d in %f sec" % (e, time.time() - prevtime))
            prevtime = time.time()
        e += 1
        """
        For each word in the caption, add a copy of the photo features to the features list,
        as well as the vectorization of the caption up to and including the current word
        """
        current_feature = features_dict[photo_id][0]
        current_caption = str(captions_df.loc[captions_df.photo_id == photo_id].iloc[0]["caption"])
        current_caption_split = current_caption.split()
        for i in range(1,len(current_caption.split())):
            # add a copy of the photo features
            X_photos.append(current_feature)
            # encode the input and output sequence
            in_words, out_word = " ".join(current_caption_split[:i]), current_caption_split[i]
            in_seq = encode_and_pad(in_words, sequence_length = sequence_length)
            # add the training sequences and responses to list
            X_captions.append(in_seq)
            y.append(encode(out_word))
    # return all three
    return(X_photos, X_captions, y)

Now, running this function to get our training data and labels, as well as our validation data and labels:

In [96]:
X_valid_photos, X_valid_captions, y_valid = consolidate_dataset(valid_features, valid_captions, sequence_length=15)

reached 0 in 0.000296 sec
reached 1000 in 3.907506 sec
reached 2000 in 3.458567 sec
reached 3000 in 4.025187 sec
reached 4000 in 3.572920 sec
reached 5000 in 3.725117 sec
reached 6000 in 5.110378 sec
reached 7000 in 4.129266 sec
reached 8000 in 4.464129 sec
reached 9000 in 4.558153 sec
reached 10000 in 3.578678 sec
reached 11000 in 3.458837 sec
reached 12000 in 3.604255 sec
reached 13000 in 3.849694 sec
reached 14000 in 4.159780 sec
reached 15000 in 4.086161 sec
reached 16000 in 4.044107 sec
reached 17000 in 3.463907 sec
reached 18000 in 3.289455 sec
reached 19000 in 3.366791 sec
reached 20000 in 3.502182 sec


In [101]:
X_train_photos, X_train_captions, y_train = consolidate_dataset(train_features, train_captions, sequence_length=15)

reached 0 in 0.340574 sec
reached 1000 in 10.293230 sec
reached 2000 in 10.069726 sec
reached 3000 in 10.746813 sec
reached 4000 in 10.988190 sec
reached 5000 in 12.416748 sec
reached 6000 in 22.106808 sec
reached 7000 in 11.373367 sec
reached 8000 in 10.917947 sec
reached 9000 in 10.799975 sec
reached 10000 in 11.161716 sec
reached 11000 in 11.027584 sec
reached 12000 in 11.317970 sec
reached 13000 in 10.280054 sec
reached 14000 in 10.294515 sec
reached 15000 in 10.376237 sec
reached 16000 in 10.167927 sec
reached 17000 in 11.045395 sec
reached 18000 in 10.711648 sec
reached 19000 in 10.562855 sec
reached 20000 in 10.798727 sec
reached 21000 in 10.465860 sec
reached 22000 in 10.307129 sec
reached 23000 in 10.314936 sec
reached 24000 in 10.243981 sec
reached 25000 in 10.410780 sec
reached 26000 in 12.681155 sec
reached 27000 in 11.305082 sec
reached 28000 in 10.430054 sec
reached 29000 in 11.315182 sec
reached 30000 in 13.327277 sec
reached 31000 in 9.614445 sec
reached 32000 in 9.5421

## 4. Convert and save as numpy arrays

In [45]:
X_valid_photos = np.array(X_valid_photos, dtype = np.float32)

In [100]:
X_valid_captions = np.array(X_valid_captions, dtype = np.int16)

In [42]:
y_valid = np.array(y_valid, dtype = np.int16)

In [38]:
mkdir ../data/preprocessed

In [48]:
def save_npy(path, arr):
    with open(path, "wb") as handle:
        np.save(path, arr)

In [105]:
# save_npy("../data/preprocessed/X_valid_photos.npy", X_valid_photos)
save_npy("../data/preprocessed/X_valid_captions.npy", X_valid_captions)
# save_npy("../data/preprocessed/y_valid.npy", y_valid)

In [50]:
X_train_photos = np.array(X_train_photos, dtype = np.float32)

In [102]:
X_train_captions= np.array(X_train_captions, dtype = np.int16)

In [50]:
y_train = np.array(y_train, np.int16)

In [104]:
# save_npy("../data/preprocessed/X_train_photos.npy", X_train_photos)
save_npy("../data/preprocessed/X_train_captions.npy", X_train_captions)
# save_npy("../data/preprocessed/y_train.npy", y_train)

## 5. Save an embedding matrix

Finally, I'll load and save an embedding matrix (with pretrained word2vec vectors) as a numpy array. 

In [59]:
embedding_model = KeyedVectors.load_word2vec_format('~/Desktop/embeddings/word2vec/GoogleNews-vectors-negative300.bin',
                                                   binary = True)

In [64]:
# Each row in the matrix is the embedding of one word in the joint datasets. 
# The row index corresponds to the integer ecoding of that word. 
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    if word in embedding_model:
        embedding_matrix[i] = embedding_model[word]

In [67]:
embedding_model["this"]

array([ 0.109375  ,  0.140625  , -0.03173828,  0.16601562, -0.07128906,
        0.01586914, -0.00311279, -0.08496094, -0.04858398,  0.05566406,
       -0.08251953, -0.02404785, -0.00665283,  0.03686523, -0.05029297,
       -0.02941895,  0.11376953, -0.06787109,  0.05639648, -0.07568359,
       -0.03857422,  0.09716797, -0.04418945, -0.12207031,  0.140625  ,
        0.08496094,  0.09667969,  0.07470703, -0.0039978 ,  0.17285156,
       -0.06933594,  0.08886719,  0.03808594, -0.00061417,  0.01184082,
        0.00164032, -0.20898438, -0.08251953,  0.08984375,  0.07910156,
        0.12353516, -0.01867676,  0.03039551,  0.04711914,  0.06542969,
       -0.01251221,  0.00152588,  0.10644531, -0.01531982, -0.04199219,
        0.16796875,  0.05175781,  0.07470703,  0.08251953,  0.01721191,
        0.01599121,  0.02734375, -0.03686523,  0.08105469, -0.06445312,
       -0.08984375,  0.10742188,  0.01153564, -0.13671875,  0.05151367,
       -0.02429199,  0.02282715,  0.12353516,  0.01531982,  0.14

In [72]:
mkdir ../data/embedding_matrix

In [73]:
save_npy("../data/embedding_matrix/embedding_matrix.npy", embedding_matrix)