In [None]:
import os
import pickle
import re

import numpy as np
from tqdm.notebook import tqdm

from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical



In [None]:
# Dataset root; the notebook reads `Images/` and `captions.txt` from here.
base_dir = '/kaggle/input/flickr8k'
# Output directory; the feature pickle and the trained model are written here.
working_dir = '/kaggle/working'


In [None]:
# Build VGG16 with its default constructor arguments, then re-wrap it so the
# network's output is the second-to-last layer's activations instead of the
# final layer's output. These activations are used as the image features.
model = VGG16()
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line underneath the table.
model.summary()

In [None]:
# Maps image id (file name up to the first dot) -> array returned by model.predict().
features = {}

# Folder that holds the raw pictures.
directory = os.path.join(base_dir, 'Images')

# Run every file in the folder through the feature extractor once.
# tqdm shows a single progress bar for the whole loop; predict() is silenced
# (verbose=0) so it does not print its own progress line for each image.
for img in tqdm(os.listdir(directory), desc='Extracting features'):
    img_path = os.path.join(directory, img)
    # Load the picture resized to 224x224 so every input has the same size.
    image = load_img(img_path, target_size=(224, 224))
    # Convert the pixels to a numpy array.
    image = img_to_array(image)
    # Add a leading batch axis of size 1: (h, w, c) -> (1, h, w, c).
    image = image.reshape(1, image.shape[0], image.shape[1], image.shape[2])
    # Apply the VGG16 input preprocessing.
    image = preprocess_input(image)
    feature = model.predict(image, verbose=0)
    # The image id is the file name without its extension.
    image_id = img.split('.')[0]
    features[image_id] = feature
    

In [None]:
# Persist the extracted features to a binary file with pickle.
# The context manager closes (and therefore flushes) the file even if dump()
# raises, instead of leaving an open handle behind.
with open(os.path.join(working_dir, 'features.pkl'), 'wb') as f:
    pickle.dump(features, f)

In [None]:
# Sanity check: read the pickle back and print the shape of the first five entries.
with open(os.path.join(working_dir, 'features.pkl'), 'rb') as f:
    loaded_features = pickle.load(f)

first_five = list(loaded_features.items())[:5]
for key, value in first_five:
    print(value.shape)


In [None]:
# Load the caption file into a single string.
with open(os.path.join(base_dir, 'captions.txt')) as ff:
    next(ff)  # drop the first line (treated as a header row)
    captions = ff.read()

In [None]:
# Total number of captions: one per non-blank line. Blank entries (such as the
# empty string split('\n') leaves behind after a trailing newline) are not
# captions and must not be counted.
length = [line for line in captions.split('\n') if line.strip()]
print(len(length))

In [None]:
# Inspect the first caption line and see how it breaks apart on commas.
first = captions.split('\n')[0]
print(first)

tokens = first.split(',')
print(len(tokens))
# Image id = first field with the file extension removed.
print(tokens[0].split('.')[0])
# Second comma-separated field (the caption text, or its first part).
print(tokens[1])

In [None]:
# Maps image id (file name without extension) -> list of that image's raw captions.
caption_map = {}

for line in captions.split('\n'):
    # Skip blank / near-empty lines.
    if len(line) < 2:
        continue
    # Split on the FIRST comma only. The caption itself may contain commas, and
    # splitting on every comma would keep just the text up to the first of them.
    image_file, sep, caption = line.partition(',')
    if not sep:
        # No comma at all: the line carries no caption, so there is nothing to store.
        continue
    # NOTE(review): captions that contain commas are presumably wrapped in
    # double quotes by the CSV writer - confirm against captions.txt. Strip one
    # wrapping pair so the quotes do not become part of the caption.
    if len(caption) >= 2 and caption[0] == '"' and caption[-1] == '"':
        caption = caption[1:-1]
    # Drop the file extension to get the image id.
    caption_id = image_file.split('.')[0]
    # Create the list on first sight of this id, then append the caption.
    caption_map.setdefault(caption_id, []).append(caption)

In [None]:
# Number of distinct image ids that have at least one caption.
len(caption_map)

In [None]:
# Function that preprocesses the caption text.
def clean(caption_map):
    """Normalise every caption in ``caption_map`` in place.

    Each caption is lower-cased, stripped of every character that is not an
    ASCII letter or whitespace, reduced to its words of two or more
    characters, and wrapped in the ``startseq`` / ``endseq`` markers.

    Args:
        caption_map: dict mapping a key to a list of caption strings. The
            lists are modified in place.

    Returns:
        None.

    Note:
        The markers are added unconditionally, so calling this twice on the
        same map wraps every caption in a second pair of markers.
    """
    for captions in caption_map.values():
        for i, caption in enumerate(captions):
            caption = caption.lower()
            # A real regex substitution is required here: str.replace() treats
            # its pattern as literal text and would remove nothing. Whitespace
            # is kept so the words stay separated.
            caption = re.sub(r'[^a-z\s]', '', caption)
            # split() with no argument already collapses runs of whitespace.
            words = [word for word in caption.split() if len(word) > 1]
            captions[i] = ' '.join(['startseq'] + words + ['endseq'])

In [None]:
# Inspect the captions stored for one image (this cell sits before the clean() call).
caption_map['1000268201_693b08cb0e']

In [None]:
# Normalise every caption in place. clean() mutates caption_map and adds the
# startseq/endseq markers each time it runs, so run this cell once per freshly
# built caption_map.
clean(caption_map)

In [None]:
# The same image's captions again (this cell sits after the clean() call).
caption_map['1000268201_693b08cb0e']

In [None]:
# Flatten the per-image caption lists into one list, in dictionary order.
all_captions = [caption for key in caption_map for caption in caption_map[key]]

In [None]:
# Total number of captions across all images.
len(all_captions)

In [None]:
# First ten entries of the flattened caption list.
all_captions[:10]

In [None]:
# Fit a tokenizer on every caption so each word gets an integer index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)

# One more than the number of distinct words.
# NOTE(review): the +1 presumably reserves index 0 for padding (the model's
# Embedding layer uses mask_zero=True) - confirm word indices start at 1.
vocab_size = 1 + len(tokenizer.word_index)


In [None]:
# Show the vocabulary size (len(tokenizer.word_index) + 1).
print(vocab_size)

In [None]:
# Length of the longest caption, counted in whitespace-separated tokens.
# Used later as the padded sequence length.
max_len = max(map(lambda caption: len(caption.split()), all_captions))
print(max_len)

In [None]:
# Train/test split over image ids: the first 90% (in dictionary order, no
# shuffling) go to training, the remaining 10% to testing.
image_ids = list(caption_map)
split = int(0.9 * len(image_ids))
train, test = image_ids[:split], image_ids[split:]

In [None]:
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Yield training batches for the captioning model, forever.

    Every caption is expanded into one sample per prefix: the input is the
    padded token prefix and the target is the one-hot encoded next token.
    A batch is yielded each time ``batch_size`` images have been expanded,
    so the number of samples in a batch varies with caption count and length.

    Args:
        data_keys: sized collection of keys to iterate over (repeatedly).
        mapping: ``mapping[key]`` is the list of caption strings for ``key``.
        features: ``features[key][0]`` is the image feature vector for ``key``.
        tokenizer: object whose ``texts_to_sequences`` encodes a caption.
        max_length: length every input prefix is padded to.
        vocab_size: number of classes for the one-hot target.
        batch_size: number of images (not samples) per yielded batch.

    Yields:
        ``[X1, X2], y`` where ``X1`` holds image features, ``X2`` the padded
        prefixes and ``y`` the one-hot next-token targets, all numpy arrays.

    Raises:
        ValueError: if ``data_keys`` is empty or ``batch_size`` is below 1.
            Either would otherwise loop forever without yielding. Because
            this is a generator, the error surfaces on the first ``next()``.
    """
    if len(data_keys) == 0:
        raise ValueError("data_keys must not be empty")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    X1, X2, y = [], [], []
    n = 0  # images accumulated into the batch being built
    while True:
        for key in data_keys:
            n += 1
            for caption in mapping[key]:
                # Encode the caption as a list of word indices.
                seq = tokenizer.texts_to_sequences([caption])[0]
                # One (prefix -> next token) sample per position after the first.
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    # Pad the prefix to the fixed model input length.
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # One-hot encode the target token.
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]

                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                yield [np.array(X1), np.array(X2)], np.array(y)
                # Start a fresh batch.
                X1, X2, y = [], [], []
                n = 0

In [None]:
# Peek at the first six image ids stored in `features`.
for count, keys in enumerate(features, start=1):
    print(keys)
    if count > 5:
        break

# Number of feature values per image. Each stored array keeps the leading
# batch axis of size 1 from predict() (the generator indexes it with [0]),
# so len() would report 1; the feature count is the size of the last axis.
length = features["3226254560_2f8ac147ea"].shape[-1]


In [None]:
# Show one stored feature array and its shape.
arr = np.array(features["3226254560_2f8ac147ea"])
print(features["3226254560_2f8ac147ea"])
# The last dimension is the number of features per image (the model's image
# input below is declared with 4096).
print(arr.shape)

# Model Creation


In [None]:
# This network takes two inputs (image features and a partial caption), so its
# layers cannot simply be stacked in a Sequential model; it is wired together
# with the Keras functional API instead.
# NOTE: this cell rebinds `model`, which until now held the VGG16 feature extractor.
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, LSTM, add
from tensorflow.keras.utils import plot_model

# Image branch: 4096 image features -> dropout -> 256-unit dense layer.
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: padded word indices -> 256-d embedding (index 0 is masked)
# -> dropout -> 256-unit LSTM.
input2 = Input(shape=(max_len,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(input2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# Decoder: add the two 256-d branch outputs element-wise, then predict a
# probability for every word in the vocabulary.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, input2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Draw the model graph with tensor shapes.
plot_model(model, show_shapes=True)



In [None]:
# Print the layer-by-layer summary of the captioning model.
model.summary()

In [None]:
# Training configuration.
batch_size = 32
epochs = 20
# The generator yields once per `batch_size` images, so one pass over the
# training ids is len(train) // batch_size steps. Dividing by the int
# `batch_size` keeps `steps` an int; a float divisor such as `32.` would make
# it a float, which is not a valid step count.
steps = len(train) // batch_size

for i in range(epochs):
    # A fresh generator per epoch restarts iteration at the first training id.
    generator = data_generator(train, caption_map, features, tokenizer, max_len, vocab_size, batch_size)
    # Fit for exactly one epoch on that generator.
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)

In [None]:
# Save the trained captioning model as best_model.h5 in the working directory.
model.save(working_dir+'/best_model.h5')

Function to turn an index into a word


In [None]:
# Reverse vocabulary lookup: index -> word.
def idx_to_word(key, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``key``.

    Args:
        key: integer index to look up.
        tokenizer: object exposing a ``word_index`` dict of word -> index.

    Returns:
        The first matching word, or ``None`` when no word has that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == key)
    return next(matches, None)

        
    

In [None]:
# Generate a caption for one image, one word at a time.
def predict_caption(model, image, tokenizer, max_len):
    """Greedily decode a caption for ``image``.

    Starting from ``startseq``, the caption so far is encoded and padded to
    ``max_len``, the model is asked for next-word probabilities, and the most
    probable word is appended. Decoding stops after ``max_len`` words, when
    ``endseq`` is produced, or when the predicted index maps to no word.

    Args:
        model: model whose ``predict([image, sequence])`` returns word probabilities.
        image: image features passed to the model as its first input.
        tokenizer: tokenizer used to encode the caption and to map indices to words.
        max_len: padded sequence length and the maximum number of generated words.

    Returns:
        The caption as a space-separated string, beginning with ``startseq``.
    """
    words = ['startseq']
    for _ in range(max_len):
        # Encode the caption so far and pad it to the model's input length.
        encoded = tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded = pad_sequences([encoded], maxlen=max_len)
        # Choose the most probable next word.
        probabilities = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(np.argmax(probabilities), tokenizer)
        if next_word is None:
            break
        words.append(next_word)
        if next_word == 'endseq':
            break
    return ' '.join(words)


        

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Score the model on the held-out image ids: for each one, compare the
# predicted caption against all of its stored captions.
actual, predicted = [], []

for key in tqdm(test, desc='Captioning test images'):
    # Reference captions, tokenised on whitespace. A dedicated name is used so
    # the module-level `captions` string (raw caption text read from the file)
    # is not overwritten, which would break re-running the earlier cells.
    reference_captions = [caption.split() for caption in caption_map[key]]
    # Predicted caption, tokenised the same way.
    y_pred = predict_caption(model, features[key], tokenizer, max_len).split()
    actual.append(reference_captions)
    predicted.append(y_pred)

# Corpus-level BLEU with unigram-only and unigram+bigram weights.
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(image_name):
    """Print the stored and the predicted captions for an image, then show it.

    Args:
        image_name: file name of the picture inside the ``Images`` folder,
            e.g. ``"1001773457_577c3a7d70.jpg"``. The part before the first
            dot is used as the key into ``caption_map`` and ``features``.
    """
    image_id = image_name.split('.')[0]
    image = Image.open(os.path.join(base_dir, "Images", image_name))
    reference_captions = caption_map[image_id]

    print('---------------------Actual---------------------')
    for reference in reference_captions:
        print(reference)

    # Caption generated by the model from the stored image features.
    y_pred = predict_caption(model, features[image_id], tokenizer, max_len)
    print('--------------------Predicted--------------------')
    print(y_pred)

    plt.imshow(image)

In [None]:
# Demo: show stored vs. predicted captions for this image.
generate_caption("1001773457_577c3a7d70.jpg")

In [None]:
# Demo: show stored vs. predicted captions for a second image.
generate_caption("1002674143_1b742ab4b8.jpg")

In [None]:
# Demo: show stored vs. predicted captions for a third image.
generate_caption("101669240_b2d3e7f17b.jpg")