In [0]:
!pip install -q tensorflow==2.1.0

In [0]:
# Import TensorFlow and the Keras layers/models APIs bundled with it
import tensorflow as tf
from tensorflow.keras import layers, models

In [0]:
# IMAGE
#
# Visual encoder for 224x224 RGB inputs.
# Three groups of 3x3 ReLU convolutions (64, 128, then 256 filters); the first
# convolution of each group uses 'same' padding and the rest use the default
# padding. Every group ends with a 2x2 max-pool (stride 2), and the final
# feature map is flattened into a single vector per image.
cnn_model = models.Sequential([
    # Group 1: 64 filters
    layers.Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(224, 224, 3)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),
    # Group 2: 128 filters
    layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),
    # Group 3: 256 filters
    layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=2, strides=2),
    # Collapse the spatial feature map to one vector per image
    layers.Flatten(),
])



In [6]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the CNN.
cnn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 224, 224, 64)      1792      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 222, 222, 64)      36928     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 111, 111, 64)      0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 111, 111, 128)     73856     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 109, 109, 128)     147584    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 54, 54, 128)       0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 54, 54, 256)       2

In [0]:
# Symbolic input for a 224x224 RGB image, matching the CNN's input_shape.
image_input = tf.keras.Input(shape=(224, 224, 3))
# Apply the CNN to that input; the result is the flattened image feature tensor.
visual_mode = cnn_model(image_input)

In [0]:
# TEXT
#
# Question encoder: int32 token ids -> 256-d embeddings -> 256-unit LSTM.
# Each question is a length-100 sequence of ids drawn from a vocabulary of
# 10000 entries (the Embedding's input_dim).
# NOTE: `shape` must be a tuple. `(100)` is just the integer 100 and is only
# accepted by releases that coerce ints; `(100,)` is the portable spelling.
question_input = layers.Input(shape=(100,), dtype='int32')
embedding = layers.Embedding(input_dim=10000, output_dim=256, input_length=100)(question_input)
# The LSTM returns only its final output, i.e. one 256-d vector per question.
encoded_question = layers.LSTM(256)(embedding)

In [0]:
# Join the encoded question and the image features along the last axis
merged = layers.Concatenate()([encoded_question, visual_mode])

In [0]:
# Classification head: a softmax over 1000 output classes on the merged features
output = layers.Dense(units=1000, activation='softmax')(merged)

In [11]:
# Assemble the two-input model: (image, question) -> softmax output
vqa_model = models.Model(
    inputs=[image_input, question_input],
    outputs=output,
)
# Print the combined model's layers, shapes and connectivity
vqa_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 256)     2560000     input_2[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 256)          525312      embedding[0][0]                  
______________________________________________________________________________________________