In [166]:
import string
import numpy as np
import os
import pickle
from PIL import Image
from tqdm import tqdm_notebook as tqdm

In [167]:
from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, load_model
from keras.layers.merge import add
from keras.layers import Input, Dense, LSTM, Embedding, Dropout

In [168]:
def load_doc(filename):
    with open(filename, "r") as file:
        text = file.read()
    return text

In [169]:
def all_img_captions(filename):
    file = load_doc(filename)
    captions = file.split('\n')
    descriptions = {}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [ caption ]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions

In [170]:
def cleaning_text(captions):
    tab = str.maketrans('','',string.punctuation)
    for img, caps in captions.items():
        for i, img_caption in enumerate(caps):
            img_caption.replace("-", " ")
            desc = img_caption.split()
            
            desc = [word.lower() for word in desc]
            desc = [word.translate(tab) for word in desc]
            desc = [word for word in desc if len(word) > 1]
            desc = [word for word in desc if word.isalpha()]
            
            img_caption = ' '.join(desc)
            captions[img][i] = img_caption
    return captions

In [171]:
def text_voabulary(desriptions):
    vocab = set()
    
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    return vocab

In [139]:
def save_descriptions(descriptions, filename):
    lines = []
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + "\t" + desc)
    data = "\n".join(lines)
    with open(filename, "w") as f:
        f.write(data)

In [140]:
url_text = "Flickr8k_text/Flickr8k.token.txt"
descriptions = all_img_captions(url_text)

In [141]:
clean_descriptions = cleaning_text(descriptions)
save_descriptions(clean_descriptions, "descriptions.txt")

In [181]:
def extract_features(directory):
    model = Xception(include_top=False, pooling="avg")
    features = {}
    for img in tqdm(os.listdir(directory)):
        filename = directory + "/" + img
        image = Image.open(filename)
        image = image.resize((299, 299))
        image = np.expand_dims(image, axis=0)
        image = image/127.5
        image = image - 1.0
        
        feature = model.predict(image)
        features[img] = feature
    return features

In [182]:
data_images = "Flicker8k_Dataset"
features = extract_features(data_images)
pickle.dump(features, open("features.p", "wb"))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=8091.0), HTML(value='')))




In [186]:
features

{'1000268201_693b08cb0e.jpg': array([[0.47339666, 0.0173262 , 0.07333979, ..., 0.0855904 , 0.02102299,
         0.23766522]], dtype=float32),
 '1001773457_577c3a7d70.jpg': array([[0.0015821 , 0.11113481, 0.00037394, ..., 0.26503593, 0.35279837,
         0.05871647]], dtype=float32),
 '1002674143_1b742ab4b8.jpg': array([[0.        , 0.02488963, 0.01554057, ..., 0.        , 0.        ,
         0.1019263 ]], dtype=float32),
 '1003163366_44323f5815.jpg': array([[0.14568852, 0.00272412, 0.27776513, ..., 0.17018221, 0.11957315,
         0.09414084]], dtype=float32),
 '1007129816_e794419615.jpg': array([[0.        , 0.12443972, 0.73916227, ..., 0.00390437, 0.00997144,
         0.50172347]], dtype=float32),
 '1007320043_627395c3d8.jpg': array([[0.04136704, 0.        , 0.01274465, ..., 0.00944686, 0.6420169 ,
         0.0479213 ]], dtype=float32),
 '1009434119_febe49276a.jpg': array([[0.        , 0.        , 0.02624742, ..., 0.30528587, 0.23091768,
         0.14191085]], dtype=float32),
 '1012

In [187]:
# features = pickle.load(open("features.p"))

### load data

In [188]:
def load_photos(filename):
    file = load_doc(filename)
    photos = file.split("\n")[:-1]
    return photos

In [189]:
def load_clean_descriptions(filename, photos):
    file = load_doc(filename)
    descriptions = {}
    for line in file.split("\n"):
        words = line.split()
        if len(words) < 1:
            continue
        
        image, image_caption = words[0], words[1:]
        
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = "<START>" + ' '.join(image_caption) + '<END>'
            descriptions[image].append(desc)
    return descriptions

In [190]:
def load_features(photos):
    all_features = pickle.load(open("features.p", "rb"))
    features = {k:all_features[k] for k in photos}
    return features

In [236]:
train_data = "Flickr8k_text/Flickr_8k.trainImages.txt"

train_imgs = load_photos(train_data)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

### TOKENIZING VOCABULARY

In [237]:
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

In [238]:
from keras.preprocessing.text import Tokenizer

In [239]:
def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

In [240]:
tokenizer = create_tokenizer(train_descriptions)
pickle.dump(tokenizer, open('tokenizer.p', "wb"))
vocab_size = len(tokenizer.word_index) + 1
vocab_size

7577

In [241]:
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

max_length = max_length(descriptions)
max_length

32

In [242]:
def data_generator(descripions, features, tokenizer, max_length):
    while True:
        for key, description_list in descriptions.items():
            feature = features[key][0]
            input_img, input_seq, output_word = create_sequences(tokenizer, max_length, description_list, feature)
            yield ([input_img, input_seq], output_word)

In [243]:
def create_sequences(tokenizer, max_length, desc_list, feature):
    x1, x2, y = [], [], []
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            x1.append(feature)
            x2.append(in_seq)
            y.append(out_seq)
    return np.array(x1), np.array(x2), np.array(y)

In [244]:
[a, b], c = next(data_generator(train_descriptions, features, tokenizer, max_length))
a.shape, b.shape, c.shape

((37, 2048), (37, 32), (37, 7577))

In [245]:
from keras.utils import plot_model

def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048, ))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation="relu")(fe1)
    
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation="relu")(decoder1)
    outputs = Dense(vocab_size, activation="softmax")(decoder2)
    
    model = Model([inputs1, inputs2], outputs)
    model.compile(loss="categorical_crossentropy", optimizer='adam')
    
    print(model.summary())
    plot_model(model, to_file="model.png", show_shapes=True)
    
    return model

In [246]:
model = define_model(vocab_size, max_length)
epochs = 10
steps = len(train_descriptions)

if not os.path.exists("models"):
    os.mkdir("models")

for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save(f"models/model_{str(i)}.h5")

Model: "functional_27"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_60 (InputLayer)           [(None, 32)]         0                                            
__________________________________________________________________________________________________
input_59 (InputLayer)           [(None, 2048)]       0                                            
__________________________________________________________________________________________________
embedding_25 (Embedding)        (None, 32, 256)      1939712     input_60[0][0]                   
__________________________________________________________________________________________________
dropout_52 (Dropout)            (None, 2048)         0           input_59[0][0]                   
______________________________________________________________________________________

UnknownError: 2 root error(s) found.
  (0) Unknown:  KeyError: '1022454332_6af2c1449a.jpg'
Traceback (most recent call last):

  File "R:\Work\Anacond\lib\site-packages\tensorflow\python\ops\script_ops.py", line 244, in __call__
    ret = func(*args)

  File "R:\Work\Anacond\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 302, in wrapper
    return func(*args, **kwargs)

  File "R:\Work\Anacond\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 827, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "R:\Work\Anacond\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 814, in wrapped_generator
    for data in generator_fn():

  File "<ipython-input-242-6ce9a9d0b45f>", line 4, in data_generator
    feature = features[key][0]

KeyError: '1022454332_6af2c1449a.jpg'


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
	 [[functional_27/embedding_25/embedding_lookup/_24]]
  (1) Unknown:  KeyError: '1022454332_6af2c1449a.jpg'
Traceback (most recent call last):

  File "R:\Work\Anacond\lib\site-packages\tensorflow\python\ops\script_ops.py", line 244, in __call__
    ret = func(*args)

  File "R:\Work\Anacond\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 302, in wrapper
    return func(*args, **kwargs)

  File "R:\Work\Anacond\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py", line 827, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "R:\Work\Anacond\lib\site-packages\tensorflow\python\keras\engine\data_adapter.py", line 814, in wrapped_generator
    for data in generator_fn():

  File "<ipython-input-242-6ce9a9d0b45f>", line 4, in data_generator
    feature = features[key][0]

KeyError: '1022454332_6af2c1449a.jpg'


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_1433894]

Function call stack:
train_function -> train_function


### TESTING THE MODEL

In [None]:
import matplotlib.pyplot as plt

def extract_features(filename, model):
    image = Image.open(filename)
    image = image.resize((299, 299))
    image = np.array(image)
    
    if image.shape[2] == 4:
        image = image[..., :3]
    image = np.expand_dims(image, axis=0)
    image = image/127.5
    image = image - 1.0
    feature = model.predict(image)
    return feature


In [None]:
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [None]:
def generate_desc(model, tokenizer, photo, max_length):
    in_text = "start"
    for i in range(max_length):
        seq = tokenizer.texts_to_sequences([in_text])[0]
        seq = pad_sequence([sequence], maxlen=max_length)
        pred = model.predict([photo, sequence], verbose=0)
        pred = np.argmax(pred)
        word = word_for_id(pred, tokenizer)
        
        if word is None:
            break
        in_text += " " + word
        if word == "end":
            break
    return in_text

In [None]:
max_length = 32
tokenizer = pickle.load(open("tokenizer.p", "rb"))
model = load_model("models/model_9.h5")
xception_model = Xception(include_top=False, pooling="avg")

photo = extract_features(img_path, xception_model)
img = Image.open(img_path)

desription = generate_desc(model, tokenizer, photo, max_length)
description

In [None]:
plt.imshow(img)
plt.show()