In [1]:
!git clone "https://github.com/tnagire1/flickr8k_dataset"
#Flikr 8K data set

Cloning into 'flickr8k_dataset'...
remote: Enumerating objects: 8252, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (76/76), done.[K
remote: Total 8252 (delta 22), reused 74 (delta 13), pack-reused 8162[K
Receiving objects: 100% (8252/8252), 2.48 GiB | 14.55 MiB/s, done.
Resolving deltas: 100% (35/35), done.
Checking out files: 100% (8139/8139), done.


In [2]:
#importing packages
import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load
import numpy as np
from tensorflow.keras.applications import VGG19
from keras.applications.xception import Xception, preprocess_input
from tensorflow.keras.utils import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Add
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout
import matplotlib.pyplot as plt
import time
from pylab import *
from tqdm.notebook import tqdm
tqdm().pandas()



0it [00:00, ?it/s]

In [3]:
#Reads the file and extraxts the test.(Generic)
def load_doc(filename):
    # Opening the file as read only
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text



In [4]:
# get all imgs with their captions and keep in a dictionary with image as key
def all_img_captions(filename):
    file = load_doc(filename)
    captions = file.split('\n')
    descriptions ={}
    for caption in captions[:-1]:
        img, caption = caption.split('\t')
        if img[:-2] not in descriptions:
            descriptions[img[:-2]] = [caption]
        else:
            descriptions[img[:-2]].append(caption)
    return descriptions

In [5]:
##Data cleaning- lower casing, removing puntuations and words containing numbers
def cleaning_text(captions):
    table = str.maketrans('','',string.punctuation)
    for img,caps in captions.items():
        for i,img_caption in enumerate(caps):

            img_caption.replace("-"," ")
            desc = img_caption.split()

            #converts to lower case
            desc = [word.lower() for word in desc]
            #remove punctuation from each token
            desc = [word.translate(table) for word in desc]
            #remove hanging 's and a 
            desc = [word for word in desc if(len(word)>1)]
            #remove tokens with numbers in them
            desc = [word for word in desc if(word.isalpha())]
            #convert back to string

            img_caption = ' '.join(desc)
            captions[img][i]= img_caption
    return captions

In [6]:
def text_vocabulary(descriptions):
    # build vocabulary of all unique words from  dictionary as input
    vocab = set()
    
    for key in descriptions.keys():
        [vocab.update(d.split()) for d in descriptions[key]]
    
    return vocab

In [7]:
#All descriptions in one file 
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc )
    data = "\n".join(lines)
    file = open(filename,"w")
    file.write(data)
    file.close()

In [8]:
#Extracting featuers and converting into gaussian distribution(1) Z score normalization.
def extract_features(directory):
        model = VGG19( include_top=False, pooling='avg' )
        features = {}
        for img in tqdm(os.listdir(directory)):
            filename = directory + "/" + img
            image = Image.open(filename)
            image = image.resize((224,224))
            image = np.expand_dims(image, axis=0)
            #image = preprocess_input(image)
            image = image/127.5
            image = image - 1.0
            
            feature = model.predict(image)
            features[img] = feature
        return features

In [9]:
dataset_text = "/content/flickr8k_dataset/Flicker8k_text"
dataset_images = "/content/flickr8k_dataset/Flicker8k_Dataset"

In [10]:
#Save all features in a file.
features = extract_features(dataset_images)
dump(features, open("/content/features.p","wb"))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5


  0%|          | 0/8091 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


In [11]:
#we prepare our text data
filename = dataset_text + "/" + "Flickr8k.token.txt"
#loading the file that contains all data
#mapping them into descriptions dictionary img to 5 captions
descriptions = all_img_captions(filename)
print("Length of descriptions =" ,len(descriptions))

#cleaning the descriptions
clean_descriptions = cleaning_text(descriptions)

#building vocabulary 
vocabulary = text_vocabulary(clean_descriptions)
print("Length of vocabulary = ", len(vocabulary))

#saving each description to file 
save_descriptions(clean_descriptions, "/content/descriptions.txt")

Length of descriptions = 8092
Length of vocabulary =  8763


In [12]:
features = load(open("/content/features.p","rb"),allow_pickle=True)

In [13]:
#load the data 
def load_photos(filename):
    file = load_doc(filename)
    photos = file.split("\n")[:-1]
    return photos


def load_clean_descriptions(filename, photos):   
    #loading clean_descriptions
    file = load_doc(filename)
    descriptions = {}
    for line in file.split("\n"):
        
        words = line.split()
        if len(words)<1 :
            continue
    
        image, image_caption = words[0], words[1:]
        
        if image in photos:
            if image not in descriptions:
                descriptions[image] = []
            desc = '<start> ' + " ".join(image_caption) + ' <end>'
            descriptions[image].append(desc)

    return descriptions


def load_features(photos):
    #loading all features
    all_features = load(open("features.p","rb"),allow_pickle=True)
    #selecting only needed features
    features = {k:all_features[k] for k in photos}
    return features

In [14]:
#converting dictionary to clean list of descriptions
def dict_to_list(descriptions):
    all_desc = []
    for key in descriptions.keys():
        [all_desc.append(d) for d in descriptions[key]]
    return all_desc

#creating tokenizer class 
#this will vectorise text corpus
#each integer will represent token in dictionary 

from keras.preprocessing.text import Tokenizer

def create_tokenizer(descriptions):
    desc_list = dict_to_list(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(desc_list)
    return tokenizer

In [15]:
filename = dataset_text + "/" + "Flickr_8k.trainImages.txt"

#train = loading_data(filename)
train_imgs = load_photos(filename)
train_descriptions = load_clean_descriptions("descriptions.txt", train_imgs)
train_features = load_features(train_imgs)

In [16]:
# give each word a index, and store that into tokenizer.p pickle file
tokenizer = create_tokenizer(train_descriptions)
dump(tokenizer, open('tokenizer.p', 'wb'))
vocab_size = len(tokenizer.word_index) + 1
vocab_size 

7577

In [17]:
#calculate maximum length of descriptions
def max_length(descriptions):
    desc_list = dict_to_list(descriptions)
    return max(len(d.split()) for d in desc_list)

max_length = max_length(descriptions)
max_length

32

In [18]:
features['1000268201_693b08cb0e.jpg'][0]

array([2.79231220e-01, 0.00000000e+00, 1.36423260e-01, 1.05445199e-01,
       0.00000000e+00, 0.00000000e+00, 4.81055416e-02, 3.68111357e-02,
       1.27884857e-02, 8.42721537e-02, 0.00000000e+00, 1.52101610e-02,
       7.46809602e-01, 0.00000000e+00, 8.92920792e-02, 1.06946543e-01,
       1.15925604e-02, 4.74472404e-01, 3.45508382e-02, 0.00000000e+00,
       7.82214105e-02, 5.75450696e-02, 2.65959487e-03, 3.34641218e-01,
       1.16082259e-01, 0.00000000e+00, 1.72352999e-01, 7.88996816e-02,
       3.51406962e-01, 1.74881853e-02, 4.66460884e-02, 8.22420120e-02,
       1.17414623e-01, 8.88379365e-02, 2.21417233e-01, 0.00000000e+00,
       1.15018860e-02, 3.58790606e-02, 0.00000000e+00, 0.00000000e+00,
       4.11673263e-03, 5.72843710e-03, 4.42179143e-01, 6.21027686e-03,
       3.44771259e-02, 1.62028633e-02, 1.89500470e-02, 0.00000000e+00,
       1.83661003e-03, 1.33829936e-01, 9.74355489e-02, 0.00000000e+00,
       6.81812540e-02, 3.11861350e-03, 2.47477457e-01, 8.72633278e-01,
      

In [19]:
#create input-output sequence pairs from the image description.

#data generator, used by model.fit_generator()
def data_generator(descriptions, features, tokenizer, max_length):
    while 1:
        for key, description_list in descriptions.items():
            #retrieve photo features
            feature = features[key][0]
            input_image, input_sequence, output_word = create_sequences(tokenizer, max_length, description_list, feature)
            yield [[input_image, input_sequence], output_word]         

def create_sequences(tokenizer, max_length, desc_list, feature):
    X1, X2, y = list(), list(), list()
    # walk through each description for the image
    for desc in desc_list:
        # encode the sequence
        seq = tokenizer.texts_to_sequences([desc])[0]
        # split one sequence into multiple X,y pairs
        for i in range(1, len(seq)):
            # split into input and output pair
            in_seq, out_seq = seq[:i], seq[i]
            # pad input sequence
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            # encode output sequence
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            # store
            X1.append(feature)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

In [20]:
[a,b],c = next(data_generator(train_descriptions, features, tokenizer, max_length))
a.shape, b.shape, c.shape

((47, 512), (47, 32), (47, 7577))

In [21]:
from keras.utils.vis_utils import plot_model

# define the captioning model
def define_model(vocab_size, max_length):
    
    # features from the CNN model squeezed from 2048 to 256 nodes
    inputs1 = Input(shape=(512,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    # LSTM sequence model
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256,return_sequences=True)(se2)
    se4 = Dense(256, activation='relu')(se3)
    se5 = LSTM(256)(se4)
    se6 = Dense(256, activation='relu')(se5)

    # Merging both models
    decoder1 = Add()([fe2, se6])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    # tie it together [image, seq] [word]
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    # summarize model
    print(model.summary())
    plot_model(model, to_file='model.png', show_shapes=True)
    
    return model

In [22]:
define_model(7767,34)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 34)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 34, 256)      1988352     ['input_3[0][0]']                
                                                                                                  
 dropout_1 (Dropout)            (None, 34, 256)      0           ['embedding[0][0]']              
                                                                                                  
 lstm (LSTM)                    (None, 34, 256)      525312      ['dropout_1[0][0]']              
                                                                                              

<keras.engine.functional.Functional at 0x7f73b3729bb0>

In [24]:
# train our model, skip this as already trained
print('Dataset: ', len(train_imgs))
print('Descriptions: train=', len(train_descriptions))
print('Photos: train=', len(train_features))
print('Vocabulary Size:', vocab_size)
print('Description Length: ', max_length)

batch=64
model = define_model(vocab_size, max_length)
#uncomment below line for continuing training of pre trained models
#model = load_model('/content/models/dense/model_vgg190.h5')
epochs = 9
steps = len(train_descriptions)//batch

#making a directory models to save our models
os.mkdir("models")

for i in range(epochs):
    generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
    model.fit(generator, epochs=500, steps_per_epoch= steps, verbose=1)
    #path to save trainned model
    model.save("/content/model_vgg19_" + str(i) + ".h5")

Dataset:  6000
Descriptions: train= 6000
Photos: train= 6000
Vocabulary Size: 7577
Description Length:  32
Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 32, 256)      1939712     ['input_7[0][0]']                
                                                                                                  
 dropout_5 (Dropout)            (None, 32, 256)      0           ['embedding_2[0][0]']            
                                                                                                  
 lstm_4 (LSTM)                  (None, 32, 256)      525312      ['dropout_5[0][0]']

Testing part

In [25]:
def extract_features(filename, model):
        try:
            image = Image.open(filename)
        except e:
            raise e
            print("ERROR: Couldn't open image! Make sure the image path and extension is correct")
        image = image.resize((224,224))
        image = np.array(image)
        # for images that has 4 channels, we convert them into 3 channels
        if image.shape[2] == 4: 
            image = image[..., :3]
        image = np.expand_dims(image, axis=0)
        image = image/127.5
        image = image - 1.0
        feature = model.predict(image)
        return feature

In [26]:
def word_for_id(integer, tokenizer):
 for word, index in tokenizer.word_index.items():
     if index == integer:
         return word
 return None

In [27]:
def generate_desc(model, tokenizer, photo, max_length):
    in_text = 'start'
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        pred = model.predict([photo,sequence], verbose=0)
        pred = np.argmax(pred)
        word = word_for_id(pred, tokenizer)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'end':
            break
    return in_text

In [29]:
max_length = 32
tokenizer = np.load("/content/tokenizer.p",allow_pickle=True)
#Load the model you want to test with
model = load_model('/content/model_vgg19_0.h5')

In [31]:
testdir="/content/flickr8k_dataset/test_images"
for img in tqdm(os.listdir(testdir)):
  print(img)

  0%|          | 0/17 [00:00<?, ?it/s]

test_img_14.jpg
test_img_08.jpg
test_img_09.jpg
test_img_19.jpg
test_image_05.jpg
test_img_1.jpg
test_img_07.jpg
test_img_17.jpg
test_img_16.jpg
test_image_06.jpg
test_image_3.jpg
test_img_12.jpg
test_img_2.jpg
test_img_13.jpg
test_img_11.jpg
test_img_04.jpg
test_img_15.jpg


In [33]:
for img in tqdm(os.listdir(testdir)):
  if(img != ".ipynb_checkpoints"):
    img_path=testdir+"/"+img
    VGG19_Model = VGG19( include_top=False, pooling='avg' )
    photo = extract_features(img_path, VGG19_Model)
    image = Image.open(img_path)
    description = generate_desc(model, tokenizer, photo, max_length)
    print("\n\n")
    print(description)
    imshow(image)
    show()

  0%|          | 0/17 [00:00<?, ?it/s]

In [34]:
filename = dataset_text + "/" + "Flickr_8k.testImages.txt"
test_imgs = load_photos(filename)
test_descriptions = load_clean_descriptions("descriptions.txt", test_imgs)
test_features = load_features(test_imgs)

In [None]:
test_output={}
for line in tqdm(open(filename)):
  img_path=dataset_images+"/"+line.split("\t")[0].strip()
  VGG19_Model = VGG19(include_top=False, pooling="avg")
  photo = extract_features(img_path, VGG19_Model)
  image = Image.open(img_path.strip())
  description = generate_desc(model, tokenizer, photo, max_length)
  test_output[img_path]=description

In [68]:
test_out={}
skip=len("/content/flickr8k_dataset/Flicker8k_Dataset/")
skip_start=len("start ")
skip_end=len(" end")
for i in test_output:
  test_out[i[skip:]]=test_output[i][skip_start:-skip_end].split(" ")

In [69]:
skip_start=len("<start> ")
skip_end=len(" <end>")
test_desc={}
for key in test_descriptions:
   test_desc[key]=[]
   for i in test_descriptions[key]:
     test_desc[key].append(i[skip_start:-skip_end].split(" "))

In [70]:
pred=[]
actual=[]
for key in test_desc:
  pred.append(test_out[key])
  actual.append(test_desc[key])

In [71]:
from nltk.translate.bleu_score import corpus_bleu
print('BLEU-1: %f' % corpus_bleu(actual, pred, weights=(1.0, 0, 0, 0)))
print('BLEU-2: %f' % corpus_bleu(actual, pred, weights=(0.5, 0.5, 0, 0)))
print('BLEU-3: %f' % corpus_bleu(actual, pred, weights=(0.3, 0.3, 0.3, 0)))
print('BLEU-4: %f' % corpus_bleu(actual, pred, weights=(0.25, 0.25, 0.25, 0.25)))

BLEU-1: 0.355999
BLEU-2: 0.178890
BLEU-3: 0.118668
BLEU-4: 0.047096
