In [1]:
pip install keras

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install pillow

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load

In [3]:
from keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.utils import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
from keras.layers import add

In [4]:
from tqdm import tqdm, tqdm_notebook
import pandas as pd
tqdm.pandas()

In [13]:
#loading a text file into memory

def load_doc(doc):
    file = open(doc, 'r')
    text = file.read()
    file.close()
    return text

#get all images with captions

def img_captions(doc):
    file = load_doc(doc)
    captions = file.split('\n')
    description = {}
    for cap in captions[:-1]:
        img, cap = cap.split('\t')
        if img[:-2] not in description:
            description[img[:-2]] = [cap]
        else:
            description[img[:-2]].append(cap)
    return description

In [14]:
#removing punctuations and words containing numbers

def clean(captions):
    table = str.maketrans('', '', string.punctuation)
    for img, caps in captions.items():
        for i, img_cap in enumerate(caps):
            img_cap.replace("-", " ")
            desc = img_cap.split()
            
            #all letters in lowercase
            desc = [word.lower() for word in desc]
            
            #remove punctuation
            desc = [word.translate(table) for word in desc]
            
            #remove 's and 'a'
            desc = [word for word in desc if(len(word)>1)]
            
            #remove tokens with numbers
            desc = [word for word in desc if(word.isalpha())]
            
            img_cap = ' '.join(desc)
            captions[img][i] = img_cap
            
    return captions

In [15]:
def text_vocab(description):
    vocab = set()
    for key in description.keys():
        [vocab.update(d.split()) for d in description[key]]
        
    return vocab

def save_desc(description, doc):
    lines = list()
    
    for key, desc_list in description.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc)
        data = "\n".join(lines)
        file = open(doc, "w")
        file.write(data)
        file.close()

In [16]:
data_text = r"C:\Users\vasun\Desktop\Image Caption Generator\Flickr8k_text"
data_img = r"C:\Users\vasun\Desktop\Image Caption Generator\Flickr8k_Dataset\Flicker8k_Dataset"

doc = data_text + "/" + "Flickr8k.token.txt"
description = img_captions(doc)
print("Length of descriptions =", len(description))

clean_desc = clean(description)
vocab = text_vocab(clean_desc)
print("Length of vocabulary =", len(vocab))

save_desc(clean_desc, "descriptions.txt")

Length of descriptions = 8092
Length of vocabulary = 8763


In [25]:
model = Xception(include_top=False, pooling='avg')

In [None]:
def extract(directory):
    model = Xception(include_top=False, pooling='avg')
    features = {}
    for img in tqdm(os.listdir(directory)):
        filename = directory + "/" + img
        image = Image.open(filename)
        image = image.resize((299, 299))
        image = np.expand_dims(image, axis=0)
        image = image/127.5
        image = image - 1.0
        
        feature = model.predict(image)
        features[img] = feature
    
    return features

features = extract(data_img)
dump(features, open("features.p", "wb"))

In [26]:
features = load(open("features.p", "rb"))

In [27]:
def load_img(filename):
    file = load_doc(filename)
    photos = file.split("\n")[:-1]
    return photos

def load_clean_desc(filename, photos):
    file = load_doc(filename)
    description = {}
    for line in file.split("\n"):
        words = line.split()
        if len(words) < 1:
            continue
        
        img, image_captions = words[0], words[1:]
        
        if img in photos:
            if img not in description:
                description[img] = []
            desc = '<start> ' + " ".join(image_captions) + ' <end>'
            description[img].append(desc)
    
    return description

In [35]:
def load_features(photos):
    all_feature = load(open("features.p", "rb"))
    features = {k: all_feature[k] for k in photos}
    return features

filename = data_text + "/" + "Flickr_8k.trainImages.txt"
train_img = load_img(filename)
train_desc = load_clean_desc("descriptions.txt", train_img)
train_feature = load_features(train_img)

In [36]:
def dict2list(description):
    all_desc = []
    for key in description:
        [all_desc.append(d) for d in description[key]]
    return all_desc

from keras.preprocessing.text import Tokenizer

def create_token(description):
    desc_list = dict2list(description)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

tokenizer = create_token(train_desc)
dump(tokenizer, open('tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1
print("Size of Vocabulary =", vocab_size)

Size of Vocabulary = 7577


In [37]:
def max_length(description):
    desc_list = dict2list(description)
    return max(len(d.split()) for d in desc_list)

max_length = max_length(description)
print("Maximum length of description =", max_length)

Maximum length of description = 32


In [38]:
def data_generator(description, features, tokenizer, max_length):
    while 1:
        for key, desc in description.items():
            feat = features[key][0]
            input_img, input_seq, output_desc = create_seq(tokenizer, max_length, desc, feat)
            yield[[input_img, input_seq], output_desc]

def create_seq(tokenizer, max_length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen = max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

[a,b], c = next(data_generator(train_desc, features, tokenizer, max_length))
a.shape, b.shape, c.shape            

((47, 2048), (47, 32), (47, 7577))

In [54]:
from keras.utils import plot_model


def define_model(vocab_size, max_length):
    #features from the CNN model squeezed from 2048 to 256 nodes
    input_1 = Input(shape=(2048,))
    feat_1 = Dropout(0.5)(input_1)
    feat_2 = Dense(256, activation='relu')(feat_1)
    
    #LSTM sequence model
    input_2 = Input(shape=(max_length,))
    seq_1 = Embedding(vocab_size, 256, mask_zero=True)(input_2)
    seq_2 = Dropout(0.5)(seq_1)
    seq_3 = LSTM(256)(seq_2)
    
    #Merging CNN model and LSTM model
    decoder_1 = add([feat_2, seq_3])
    decoder_2 = Dense(256, activation='relu')(decoder_1)
    output = Dense(vocab_size, activation='softmax')(decoder_2)
    
    model = Model(inputs=[input_1, input_2], outputs = output)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    print(model.summary())
    plot_model(model, to_file="model.png", show_shapes=True)
    
    return model

In [55]:
#training model
print('Datset: ', len(train_img))
print('Descriptions: ', len(train_desc))
print('Photos: ', len(train_feature))
print('Vocabulary Size: ', vocab_size)
print('Description Length: ', max_length)

model = define_model(vocab_size, max_length)
epochs = 10
steps = len(train_desc)

Datset:  6000
Descriptions:  6000
Photos:  6000
Vocabulary Size:  7577
Description Length:  32
Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_19 (InputLayer)          [(None, 32)]         0           []                               
                                                                                                  
 input_18 (InputLayer)          [(None, 2048)]       0           []                               
                                                                                                  
 embedding_7 (Embedding)        (None, 32, 256)      1939712     ['input_19[0][0]']               
                                                                                                  
 dropout_14 (Dropout)           (None, 2048)         0           ['input_18[0][0]']             

In [None]:
os.mkdir("models")
for i in range(epochs):
    generator = data_generator(train_desc, train_feature, tokenizer, max_length)
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save("models/model_" + str(i) + ".h5")