In [1]:
import os
import string
import numpy as np

from pickle import dump
from pickle import load

from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input

from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model

from keras.layers import Input
from keras.layers import Embedding
from keras.layers import Dropout
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import add
from keras.models import Model
from keras.models import load_model
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
def extract_features(data_path):
    model = VGG16()
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
#     print(model.summary())
    
    features = {}
    for i, f in enumerate(os.listdir(data_path)):
        filename = data_path + "/" + f
        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)  # this is (224, 224, 3)
        image = image.reshape(-1, image.shape[0], image.shape[1], image.shape[2])  # this is (1, 224, 224, 3)
        image = preprocess_input(image)
        feature = model.predict(image)
        image_id = f.split(".")[0]
        features[image_id] = feature
        if i%100 == 0:
            print('Images done: ', i)
    return features

In [3]:
# data_path = "data/Flickr8k_Dataset/Flicker8k_Dataset/" 
# features = extract_features(data_path) 
# dump(features, open('data/features.pkl', 'wb')) 
# print(len(features))

In [4]:
def load_doc(filepath):
    with open(filepath, 'r') as f:
        text = f.read()
    return text

In [5]:
def load_descriptions(doc):
    
    ## Return dictionary of image_name: image_description
    
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        if len(tokens) < 2:
            continue
        
        image_id = tokens[0].split('.')[0]
        image_desc = ''.join(tokens[1:])
        
        if image_id not in mapping:
            mapping[image_id] = []
            mapping[image_id].append(image_desc)
            
    return mapping

In [6]:
filepath = 'data/Flickr8k_text/Flickr8k.token.txt'
doc = load_doc(filepath)
descriptions = load_descriptions(doc)
print('Loaded {} descriptions'.format(len(descriptions)))

Loaded 8092 descriptions


In [7]:
def clean_descriptions(descriptions):
    table = str.maketrans('', '', string.punctuation)
    for key, desc_list in descriptions.items():
        for i, desc in enumerate(desc_list):
            desc = desc.split()
            desc = [w.lower() for w in desc]
            desc = [w.translate(table) for w in desc]
            desc = [w for w in desc if len(w) > 1]
            desc = [w for w in desc if w.isalpha()]
            descriptions[key][i] = ' '.join(desc)

In [8]:
def to_vocabulary(descriptions):
    words = set()
    for key in descriptions.keys():
        for d in descriptions[key]:
            words.update(d.split())
    return words

In [9]:
def save_descriptions(descriptions, output_filepath):
    with open(output_filepath, 'w') as ofp:
        for key,desc_list in descriptions.items():
            for d in desc_list:
                ofp.write(key + ' ' + d + '\n')

In [10]:
clean_descriptions(descriptions)
vocab = to_vocabulary(descriptions)
print(len(vocab))
save_descriptions(descriptions, 'data/clean_descriptions.txt')

8029


In [11]:
def load_identifiers(filepath):
    doc = load_doc(filepath)
    ids = set()
    for line in doc.split('\n'):
        if len(line) < 1:
            continue
        ids.add(line.split('.')[0])
    return ids

In [12]:
def load_clean_descriptions(filepath, ids):
    doc = load_doc(filepath)
    descriptions = {}
    for line in doc.split('\n'):
        tokens = line.split()
        if len(tokens) < 1:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        if image_id in ids:
            if image_id not in descriptions:
                descriptions[image_id] = []
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions[image_id].append(desc)
    return descriptions

In [13]:
def load_photo_features(filepath, ids):
    features = load(open(filepath, 'rb'))
    return {k:features[k] for k in ids}

In [14]:
filepath = 'data/Flickr8k_text/Flickr_8k.trainImages.txt'
train_ids = load_identifiers(filepath)
print(len(train_ids))
train_descriptions = load_clean_descriptions('data/clean_descriptions.txt', train_ids)
print(len(train_descriptions))
train_features = load_photo_features('data/features.pkl', train_ids)
print(len(train_features))

6000
6000
6000


In [15]:
def to_lines(descriptions):
    desc_list = []
    for key in descriptions:
        for d in descriptions[key]:
            desc_list.append(d)
    return desc_list

In [16]:
def create_tokenizer(descriptions):
    tok = Tokenizer()
    tok.fit_on_texts(to_lines(descriptions))
    
    return tok

In [17]:
tokenizer = create_tokenizer(train_descriptions)

In [18]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

5960


In [19]:
def create_sequences(tokenizer, max_length, desc_list, photo):
    X1, X2, y = [], [], []
    vocab_size = len(tokenizer.word_index) + 1
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

In [26]:
def max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

In [27]:
max_length = max_length(train_descriptions)
print(max_length)

X1_train, X2_train, y_train = create_sequences(tokenizer, 
                                               max_length, 
                                               train_descriptions, 
                                               train_features)

3


In [28]:
filepath = 'data/Flickr8k_text/Flickr_8k.devImages.txt'
test_ids = load_identifiers(filepath)
print('Test Ids: ', len(test_ids))

test_descriptions = load_clean_descriptions('data/clean_descriptions.txt', test_ids)
print('Test desc: ', len(test_descriptions))

test_features = load_photo_features('data/features.pkl', test_ids)
print(len(test_features))

X1_test, X2_test, y_test = create_sequences(tokenizer, max_length, test_descriptions, test_features)

Test Ids:  1000
Test desc:  1000
1000


In [29]:
def neural_net(vocab_size, max_length):
    inputs1 = Input(shape=(4096, ))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    inputs2 = Input(shape=(max_length, ))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    decoder_1 = add([fe2, se3])
    decoder_2 = Dense(256, activation='relu')(decoder_1)
    
    outputs = Dense(vocab_size, activation='softmax')(decoder_2)
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    
    model.compile(loss='categorical_crossentropy',
                 optimizer='adam')
    
    print(model.summary())
#     plot_model(model, to_file='results/model.png', show_shapes=True)
    
    return model

In [30]:
def data_generator(tokenizer, max_length, descriptions, photos):
    while True:
        for key, desc_list in descriptions.items():
            photo = photos[key][0]
            in_img, in_seq, out_word = create_sequences(tokenizer,
                                                       max_length,
                                                       desc_list,
                                                       photo)
            
            yield [[in_img, in_seq], out_word]

In [31]:
model = neural_net(vocab_size, max_length)
filepath = 'initial_model.h5'
checkpoint = ModelCheckpoint(filepath, 
                            monitor='val_loss', 
                            verbose=True, 
                            save_best_only=True, 
                            mode='min')


generator = data_generator(tokenizer, max_length, train_descriptions, train_features)
num_epochs = 10
for i in range(num_epochs):
    model.fit_generator(generator, epochs=1, steps_per_epoch=len(descriptions), verbose=1)
    model.save('models/initial_model{}.h5'.format(i))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 3)            0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 3, 256)       1525760     input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)  

KeyboardInterrupt: 