In [9]:
import numpy as np
import pandas as pd
import pickle

from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, RepeatVector, TimeDistributed, Merge, Masking
from keras.layers.merge import add, concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD

# Model Design

Here we define the model, including the merger of the Convnet features and the LSTM network. I will not train my model here, as my CPU has no chance with this complex of a network, but I will verify that the loss is decreasing before deploying this model to a GPU on AWS. 

## 1. Load Data

We'll load the training data, as well as the embedding matrix

In [5]:
def load_npy(path):
    with open(path, "rb") as handle:
        arr = np.load(handle)
    handle.close()
    return (arr)

In [6]:
X_train_photos = load_npy("../data/preprocessed/X_train_photos.npy")
X_train_captions = load_npy("../data/preprocessed/X_train_captions.npy")
embedding_matrix = load_npy("../data/embedding_matrix/embedding_matrix.npy")
y_train = load_npy("../data/preprocessed/y_train.npy")

In [4]:
print(X_train_photos.shape)
print(X_train_captions.shape)
print(y_train.shape)
print(embedding_matrix.shape)

(541448, 4096)
(541448, 15)
(541448, 1)
(30212, 300)


## 2. Define model

In [2]:
VOCAB_SIZE = 30212

In [7]:
# input 1: photo features
inputs_photo = Input(shape = (4096,), name="Inputs-photo")
# add a dense layer on top of that, with ReLU activation and random dropout
drop1 = Dropout(0.5)(inputs_photo)
dense1 = Dense(256, activation='relu')(drop1)

#input 2: caption sequence
inputs_caption = Input(shape=(15,), name = "Inputs-caption")
embedding = Embedding(VOCAB_SIZE, 300,
                mask_zero = True, trainable = False,
                weights=[embedding_matrix])(inputs_caption)
drop2 = Dropout(0.5)(embedding)
lstm1 = LSTM(256)(drop2)

#decoder model
merged = concatenate([dense1, lstm1])
dense2 = Dense(256, activation='relu')(merged)
outputs = Dense(VOCAB_SIZE, activation='softmax')(dense2)
# tie it together [image, seq] [word]
model = Model(inputs=[inputs_photo, inputs_caption], outputs=outputs)
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd)
# summarize model
print(model.summary())
plot_model(model, to_file='images/model1.png', show_shapes=True, show_layer_names=False)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Inputs-caption (InputLayer)     (None, 15)           0                                            
__________________________________________________________________________________________________
Inputs-photo (InputLayer)       (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 15, 300)      9063600     Inputs-caption[0][0]             
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 4096)         0           Inputs-photo[0][0]               
__________________________________________________________________________________________________
dropout_3 

![](images/model1.png)

In [8]:
model.fit([X_train_photos,X_train_captions], to_categorical(y_train, VOCAB_SIZE), epochs = 1, verbose = 1)

Epoch 1/1
  1248/541448 [..............................] - ETA: 1:48:01 - loss: 8.9781

KeyboardInterrupt: 

## 3. Inject model

In [10]:
# input 1: photo features
inputs_photo = Input(shape = (4096,), name="Inputs-photo")
# add a dense layer on top of that, with ReLU activation and random dropout
drop1 = Dropout(0.5)(inputs_photo)
dense1 = Dense(300, activation='relu')(drop1)
# add time dimension so that this layer output shape is (None, 1, embed_size)
cnn_feats = Masking()(RepeatVector(1)(dense1))

#input 2: caption sequence
inputs_caption = Input(shape=(15,), name = "Inputs-caption")
embedding = Embedding(VOCAB_SIZE, 300,
                mask_zero = True, trainable = False,
                weights=[embedding_matrix])(inputs_caption)
# merge the models: decoder model
# Ouput shape should be (None, maxlen + 1, embed_size)
merged = concatenate([cnn_feats, embedding], axis=1)
# now feed the concatenation into a LSTM layer (many-to-many)
lstm_layer = LSTM(units=300,
                  input_shape=(15 + 1, 300),   # one additional time step for the image features
                  return_sequences=False,
                  dropout=.5)(merged)

    # create a fully connected layer to make the predictions
outputs = Dense(units=VOCAB_SIZE,activation='softmax')(lstm_layer)

model = Model(inputs=[inputs_photo, inputs_caption], outputs=outputs)
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='sparse_categorical_crossentropy', optimizer=sgd)
# summarize model
print(model.summary())
plot_model(model, to_file='images/model6.png', show_shapes=True,show_layer_names=False )



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Inputs-photo (InputLayer)       (None, 4096)         0                                            
__________________________________________________________________________________________________
dropout_5 (Dropout)             (None, 4096)         0           Inputs-photo[0][0]               
__________________________________________________________________________________________________
dense_6 (Dense)                 (None, 300)          1229100     dropout_5[0][0]                  
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 1, 300)       0           dense_6[0][0]                    
__________________________________________________________________________________________________
Inputs-cap

![](images/model6.png)

In [49]:
model.fit([X_train_photos,X_train_captions], y_train, epochs = 1, verbose = 1)

Epoch 1/1

KeyboardInterrupt: 