In [1]:
import string
import numpy as np
import os
import pickle
from PIL import Image
from tqdm import tqdm_notebook as tqdm

In [2]:
from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, load_model
from keras.layers.merge import add
from keras.layers import Input, Dense, LSTM, Embedding, Dropout

In [3]:
def load_doc(filename):
    """Return the entire contents of the text file ``filename`` as one string.

    The file is opened in text mode and is closed again before returning,
    whether or not reading succeeds.
    """
    handle = open(filename, "r")
    try:
        return handle.read()
    finally:
        handle.close()

In [4]:
def all_img_captions(filename):
    """Group the captions of a tab-separated caption file by image name.

    Every non-blank line must have the form ``<image field>\\t<caption>``.
    The last two characters of the image field are cut off to form the
    dictionary key (presumably a ``#<index>`` suffix — confirm against the
    data file).

    Args:
        filename: Path of the caption file.

    Returns:
        dict mapping image name -> list of raw caption strings, in file order.

    Raises:
        ValueError: If a non-blank line contains no tab character.
    """
    descriptions = {}
    for line in load_doc(filename).split('\n'):
        # Skipping blank lines (rather than always dropping the final line)
        # keeps the last caption when the file has no trailing newline.
        if not line.strip():
            continue
        # Split on the first tab only, so a tab inside the caption is harmless.
        img, caption = line.split('\t', 1)
        descriptions.setdefault(img[:-2], []).append(caption)
    return descriptions

In [5]:
def cleaning_text(captions):
    """Normalise every caption in ``captions`` in place and return the dict.

    Each caption is processed as follows:
      1. hyphens become spaces, so "tri-colored" yields two words;
      2. words are lower-cased and stripped of ASCII punctuation;
      3. words of length 1 and words containing non-letters are dropped.

    Args:
        captions: dict mapping image name -> list of caption strings.
            The lists are modified in place.

    Returns:
        The same ``captions`` dict, now holding the cleaned captions.
    """
    table = str.maketrans('', '', string.punctuation)
    for caps in captions.values():
        for i, img_caption in enumerate(caps):
            # str.replace returns a new string, so its result has to be kept.
            words = img_caption.replace("-", " ").split()
            # Strip punctuation before filtering so that "dog." survives as
            # "dog" instead of being rejected by isalpha().
            words = [word.lower().translate(table) for word in words]
            caps[i] = ' '.join(
                word for word in words if len(word) > 1 and word.isalpha()
            )
    return captions

In [6]:
def text_voabulary(desriptions):
    """Collect the set of distinct words used across all captions.

    The function and parameter names keep their original spelling so that
    existing callers continue to work.

    Args:
        desriptions: dict mapping image name -> list of caption strings.

    Returns:
        set of every whitespace-separated word found in any caption.
    """
    vocab = set()
    # Read the argument itself rather than a notebook-level global.
    for desc_list in desriptions.values():
        for desc in desc_list:
            vocab.update(desc.split())
    return vocab

In [7]:
def save_descriptions(descriptions, filename):
    """Write all captions to ``filename``, one ``<image>\\t<caption>`` per line.

    Lines are joined with "\\n" and no trailing newline is written. An
    existing file at ``filename`` is overwritten.

    Args:
        descriptions: dict mapping image name -> list of caption strings.
        filename: Destination path.
    """
    lines = [
        key + "\t" + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    # Mode "w": the file is being created/overwritten, not read.
    with open(filename, "w") as f:
        f.write("\n".join(lines))

In [8]:
# Relative path of the caption file that all_img_captions parses.
url_text = "Flickr8k_text/Flickr8k.token.txt"
# Raw (uncleaned) captions grouped per image name.
descriptions = all_img_captions(url_text)

In [9]:
# Preview the first few images only; echoing the whole dict floods the output.
dict(list(descriptions.items())[:3])

{'1000268201_693b08cb0e.jpg': ['A child in a pink dress is climbing up a set of stairs in an entry way .',
  'A girl going into a wooden building .',
  'A little girl climbing into a wooden playhouse .',
  'A little girl climbing the stairs to her playhouse .',
  'A little girl in a pink dress going into a wooden cabin .'],
 '1001773457_577c3a7d70.jpg': ['A black dog and a spotted dog are fighting',
  'A black dog and a tri-colored dog playing with each other on the road .',
  'A black dog and a white dog with brown spots are staring at each other in the street .',
  'Two dogs of different breeds looking at each other on the road .',
  'Two dogs on pavement moving toward each other .'],
 '1002674143_1b742ab4b8.jpg': ['A little girl covered in paint sits in front of a painted rainbow with her hands in a bowl .',
  'A little girl is sitting in front of a large painted rainbow .',
  'A small girl in the grass plays with fingerpaints in front of a white canvas with a rainbow on it .',
  'T

In [10]:
def extract_features(directory):
    """Run every image in ``directory`` through Xception and collect the results.

    A headless Xception model with average pooling is built once and applied
    to each entry returned by ``os.listdir(directory)``.

    Args:
        directory: Folder whose entries are all image files PIL can open.

    Returns:
        dict mapping each entry name -> the value ``model.predict`` returned
        for that image (a batch of one).
    """
    model = Xception(include_top=False, pooling="avg")
    features = {}
    for img in tqdm(os.listdir(directory)):
        filename = os.path.join(directory, img)
        image = np.asarray(Image.open(filename))
        # One odd file must not abort a run over thousands of images, so
        # coerce every image to three channels before prediction.
        if image.ndim == 2:
            image = np.stack([image] * 3, axis=-1)  # greyscale -> 3 channels
        elif image.shape[2] == 4:
            image = image[..., :3]  # drop the alpha channel
        image = np.expand_dims(image, axis=0)
        # Maps 0 -> -1.0 and 255 -> 1.0 (assumes 8-bit pixel values).
        image = image / 127.5 - 1.0
        features[img] = model.predict(image)
    return features

In [None]:
data_images = "Flicker8k_Dataset"
features = extract_features(data_images)
# Cache the features on disk; the context manager makes sure the file is
# flushed and closed as soon as the dump finishes.
with open("features.p", "wb") as features_file:
    pickle.dump(features, features_file)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=8091.0), HTML(value='')))

### LOADING DATA

In [None]:
def load_photos(filename):
    """Read a list of image names, one per line, from ``filename``.

    Surrounding whitespace is stripped and blank lines are ignored, so the
    result is the same whether or not the file ends with a newline.

    Args:
        filename: Path of the text file listing image names.

    Returns:
        list of image-name strings in file order.
    """
    file = load_doc(filename)
    # Split on the newline escape "\n"; "/n" is a two-character literal that
    # never occurs, which would leave the whole file as a single entry.
    return [name.strip() for name in file.split("\n") if name.strip()]

In [None]:
def load_clean_descriptions(filename, photos):
    """Load the captions of the requested images, wrapped in start/end markers.

    Each non-blank line of ``filename`` is split on whitespace; the first
    token is the image name and the remaining tokens form the caption.

    Args:
        filename: Path of the cleaned caption file.
        photos: Iterable of image names to keep.

    Returns:
        dict mapping image name -> list of strings of the form
        ``"<START> caption words <END>"``. Images without any caption line
        do not appear in the result.
    """
    wanted = set(photos)  # O(1) membership tests instead of scanning a list
    descriptions = {}
    for line in load_doc(filename).split("\n"):
        words = line.split()
        if len(words) < 1:
            continue

        image, image_caption = words[0], words[1:]

        if image in wanted:
            # The markers are separated by spaces so they count as their own
            # words when captions are later measured with str.split().
            desc = "<START> " + ' '.join(image_caption) + " <END>"
            descriptions.setdefault(image, []).append(desc)
    return descriptions

In [None]:
def load_features(photos, features_path="features.p"):
    """Load pickled image features and keep only the requested images.

    Args:
        photos: Iterable of image names to select.
        features_path: Pickle file holding a dict of image name -> feature.
            Defaults to "features.p".

    Returns:
        dict mapping each name in ``photos`` to its stored feature.

    Raises:
        KeyError: If a requested image has no entry in the pickle file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook (or another trusted source) produced.
    with open(features_path, "rb") as features_file:
        all_features = pickle.load(features_file)
    return {k: all_features[k] for k in photos}

In [None]:
# Path of the text file that load_photos reads the training image names from.
train_data = "Flickr8k_text/Flickr_8k.trainImages.txt"

train_imgs = load_photos(train_data)
# NOTE(review): no visible cell calls save_descriptions, so "descriptions.txt"
# presumably has to exist on disk already — confirm, or write it before this cell.
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
# Features are read back from the "features.p" cache, restricted to train_imgs.
train_features = load_features(train_imgs)

### TOKENIZING VOCABULARY

In [None]:
def dict_to_list(descriptions):
    """Flatten a dict of caption lists into one list of captions.

    Captions keep the dict's iteration order, then their order within each list.
    """
    return [
        caption
        for caption_list in descriptions.values()
        for caption in caption_list
    ]

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
def create_tokenizer(descriptions):
    """Build a Tokenizer and fit it on every caption in ``descriptions``.

    Returns:
        The fitted Tokenizer instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(dict_to_list(descriptions))
    return fitted

In [None]:
tokenizer = create_tokenizer(train_descriptions)
# Persist the fitted tokenizer so the inference cells can reload it from
# "tokenizer.p"; the context manager closes the file once it is written.
with open('tokenizer.p', "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
# +1 because Keras word indices start at 1; index 0 is reserved.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

In [None]:
def max_length(descriptions):
    """Return the word count of the longest caption in ``descriptions``.

    Words are counted by whitespace splitting. Raises ValueError when there
    are no captions at all.
    """
    word_counts = [len(caption.split()) for caption in dict_to_list(descriptions)]
    return max(word_counts)

# NOTE(review): this rebinds the name `max_length` from the function defined
# above to its integer result, so the call only works once per definition.
# NOTE(review): the length is measured on `descriptions` (all raw captions),
# not on `train_descriptions` — confirm that this is intended.
max_length = max_length(descriptions)
max_length

In [None]:
def data_generator(descripions, features, tokenizer, max_length):
    """Endlessly yield one training batch per image.

    The parameter name ``descripions`` keeps its original spelling so that
    existing callers continue to work.

    Args:
        descripions: dict mapping image name -> list of caption strings.
        features: dict mapping image name -> feature array; element ``[0]``
            of each entry is used (assumes a leading batch axis of size 1 —
            confirm against the feature extractor).
        tokenizer: Fitted tokenizer, passed through to create_sequences.
        max_length: Sequence length, passed through to create_sequences.

    Yields:
        ``[[input_img, input_seq], output_word]`` as returned by
        create_sequences for one image. After the last image the generator
        starts over with the first.
    """
    while True:
        # Iterate over the argument, not over a notebook-level global.
        for key, description_list in descripions.items():
            feature = features[key][0]
            input_img, input_seq, output_word = create_sequences(
                tokenizer, max_length, description_list, feature
            )
            yield [[input_img, input_seq], output_word]

In [None]:
def create_sequences(tokenizer, max_length, desc_list, feature):
    """Expand the captions of one image into next-word training samples.

    For a caption encoded as ``[w0, w1, ..., wn]`` one sample is produced per
    prefix: the input is ``[w0 .. w(i-1)]`` padded to ``max_length`` and the
    target is ``wi`` one-hot encoded.

    Reads the notebook-level ``vocab_size`` for the one-hot width.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Length every input sequence is padded to.
        desc_list: Caption strings belonging to one image.
        feature: Image feature vector, repeated for every sample.

    Returns:
        Tuple of three arrays ``(x1, x2, y)``: image features, padded input
        sequences and one-hot targets, one row per sample.
    """
    x1, x2, y = [], [], []
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            # The input is the padded prefix; only the target is one-hot encoded.
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            x1.append(feature)
            x2.append(in_seq)
            y.append(out_seq)
    return np.array(x1), np.array(x2), np.array(y)

In [None]:
# Pull a single batch to check the array shapes before training. Uses
# train_features (reloaded from disk) so the cell does not depend on the
# feature-extraction cell having been run in this kernel session.
[a, b], c = next(data_generator(train_descriptions, train_features, tokenizer, max_length))
a.shape, b.shape, c.shape

In [None]:
from keras.utils import plot_model

def define_model(vocab_size, max_length):
    """Build and compile the two-input captioning model.

    The image branch maps a 2048-value feature vector to 256 units; the text
    branch embeds a token sequence and summarises it with a 256-unit LSTM.
    The two 256-wide outputs are added and decoded into a softmax over the
    vocabulary.

    Side effects: prints the model summary and writes the architecture
    diagram to "model.png".

    Args:
        vocab_size: Number of token classes (embedding input size and width
            of the output layer).
        max_length: Length of the token sequences fed to the text branch.

    Returns:
        The compiled Keras model taking ``[image_features, sequence]``.
    """
    # Image feature branch.
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation="relu")(fe1)

    # Caption sequence branch; mask_zero makes the layers ignore padding.
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    # Must be 256 wide so it can be added to fe2 element-wise.
    se3 = LSTM(256)(se2)

    # Merge the two branch outputs (not the two image-branch layers).
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation="relu")(decoder1)
    outputs = Dense(vocab_size, activation="softmax")(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss="categorical_crossentropy", optimizer='adam')

    # summary() prints by itself; wrapping it in print() adds a stray "None".
    model.summary()
    plot_model(model, to_file="model.png", show_shapes=True)

    return model

In [None]:
model = define_model(vocab_size, max_length)
epochs = 10
# One generator step yields the samples of one image.
steps = len(train_descriptions)

# exist_ok lets the cell be re-run without failing on the existing folder.
os.makedirs("models", exist_ok=True)

for i in range(epochs):
    # A fresh generator per epoch; training runs one epoch at a time so a
    # checkpoint can be written after each of them.
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save(f"models/model_{i}.h5")

### TESTING THE MODEL

In [None]:
import matplotlib.pyplot as plt

def extract_features(filename, model):
    """Compute the feature of a single image with an already-built model.

    NOTE: this definition shares its name with the directory-based
    ``extract_features`` defined earlier in the notebook and replaces it
    once this cell has run.

    Args:
        filename: Path of the image file.
        model: Object with a ``predict`` method that accepts a batch of
            images.

    Returns:
        Whatever ``model.predict`` returns for the single-image batch.
    """
    # A PIL image has no .shape; convert to an array before inspecting it.
    image = np.asarray(Image.open(filename))

    if image.ndim == 2:
        image = np.stack([image] * 3, axis=-1)  # greyscale -> 3 channels
    elif image.shape[2] == 4:
        image = image[..., :3]  # drop the alpha channel

    # The remaining steps apply to every image, not only to 4-channel ones.
    image = np.expand_dims(image, axis=0)
    # Maps 0 -> -1.0 and 255 -> 1.0 (assumes 8-bit pixel values).
    image = image / 127.5 - 1.0
    return model.predict(image)


In [None]:
def word_for_id(integer, tokenizer):
    """Look up the word whose index in ``tokenizer.word_index`` is ``integer``.

    Returns the first matching word, or None when no word has that index.
    """
    matches = (
        candidate
        for candidate, position in tokenizer.word_index.items()
        if position == integer
    )
    return next(matches, None)

In [None]:
def generate_desc(model, tokenizer, photo, max_length):
    """Generate a caption for ``photo`` one word at a time.

    Starting from the text "start", the most probable next word is appended
    repeatedly until "end" is produced, an index maps to no word, or
    ``max_length`` words have been added.

    Args:
        model: Captioning model whose ``predict`` takes ``[photo, sequence]``.
        tokenizer: Fitted tokenizer used to encode the text so far and to
            map predicted indices back to words.
        photo: Image feature batch passed to the model unchanged.
        max_length: Padded sequence length and upper bound on added words.

    Returns:
        The generated text, including the leading "start" and, if it was
        produced, the trailing "end".
    """
    in_text = "start"
    for i in range(max_length):
        seq = tokenizer.texts_to_sequences([in_text])[0]
        seq = pad_sequences([seq], maxlen=max_length)
        pred = model.predict([photo, seq], verbose=0)
        pred = np.argmax(pred)
        word = word_for_id(pred, tokenizer)

        if word is None:
            break
        in_text += " " + word
        if word == "end":
            break
    return in_text

In [None]:
# NOTE(review): hard-coded; presumably the value computed by max_length()
# during training — confirm.
max_length = 32
# NOTE: unpickling can execute arbitrary code; only load a tokenizer file
# that this notebook wrote.
with open("tokenizer.p", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
# The training loop saves models/model_0.h5 ... models/model_9.h5; load the last.
model = load_model("models/model_9.h5")
xception_model = Xception(include_top=False, pooling="avg")

# Image to caption; point this at any image file.
img_path = "Flicker8k_Dataset/1000268201_693b08cb0e.jpg"
photo = extract_features(img_path, xception_model)
img = Image.open(img_path)

description = generate_desc(model, tokenizer, photo, max_length)
description

In [None]:
# Show the image on an explicitly created figure and axes.
fig, ax = plt.subplots()
ax.imshow(img)
plt.show()