# 1 - Get the data and prepare it

In [None]:
from google.colab import drive
drive.mount('/content/drive')
! mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d adityajn105/flickr8k
! unzip flickr8k.zip

In [None]:
from tensorflow import keras
import pickle
import os 
import numpy as np
from tqdm.notebook import tqdm
from keras.applications.vgg16 import VGG16 , preprocess_input
from tensorflow.keras.preprocessing.image import load_img , img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
### load vgg model
# Instantiate VGG16 with its default constructor arguments.
model = VGG16()

In [None]:
# Re-wrap the network so it outputs the activations of its second-to-last
# layer instead of its final layer, then print the resulting architecture.
model = keras.Model(inputs = model.inputs ,outputs = model.layers[-2].output)
model.summary()

In [None]:
#### load images and convert to features
# Directory that is scanned for image files; every entry returned by
# os.listdir is later loaded as an image.
image_path = '/content/Images'
images_list = os.listdir(image_path)

In [None]:
# Number of directory entries found under image_path.
len(images_list)

In [None]:
# image id (file name up to its first '.') -> array returned by model.predict
features = {}

In [None]:
# Run every file under image_path through the feature-extraction model and
# store the prediction under the file name's text before its first '.'.
for file_name in images_list:
  img = img_to_array(load_img(os.path.join(image_path, file_name), target_size=(224, 224)))
  # Add a leading batch axis: (h, w, c) -> (1, h, w, c).
  img = preprocess_input(np.expand_dims(img, axis=0))
  feature = model.predict(img, verbose=0)
  image_id = file_name.split('.')[0]
  features[image_id] = feature

In [None]:
# Number of distinct image ids that have a stored feature array.
len(features)

In [None]:
# Read the caption file into memory, skipping its first line
# (presumably the CSV header row — confirm against the dataset).
with open('/content/captions.txt', 'r', encoding='utf-8') as f:
  # Default of None: an empty file yields an empty caption_doc instead of
  # raising StopIteration.
  next(f, None)
  caption_doc = f.read()

In [None]:
# Group captions by image id, keeping only images that have extracted features.
mapping = {}
for line in tqdm(caption_doc.split('\n')):
  file_name, comma, rest = line.partition(',')
  if not comma:
    # No comma on this line, so it cannot be split into id + caption.
    continue
  image_id = file_name.split('.')[0]
  # Commas inside the caption text become single spaces.
  caption = rest.replace(',', ' ')
  if image_id in features:
    mapping.setdefault(image_id, []).append(caption)

In [None]:
# Spot-check: show the captions collected for one image id
# (raises KeyError if that id is not in mapping).
mapping['1119418776_58e4b93eac']

In [None]:
# Number of image ids that have at least one caption.
len(mapping)

In [None]:
import re


def clean_caption(mapping):
  """Normalise every caption in ``mapping`` in place.

  Each caption is lower-cased, stripped of every character that is neither
  an ASCII letter nor whitespace, split on whitespace, filtered of
  one-character words, and finally wrapped as ``'startseq <words> endseq'``.

  Args:
    mapping: dict mapping an image id to a list of caption strings. The
      lists are modified in place.

  Returns:
    None.
  """
  for captions in mapping.values():
    for i, caption in enumerate(captions):
      caption = caption.lower()
      # str.replace() matches its argument literally, so a regex pattern
      # passed to it never matches; use re.sub. Whitespace is preserved here
      # so that word boundaries survive until the split below.
      caption = re.sub(r'[^a-z\s]', '', caption)
      # split() with no argument also collapses runs of whitespace.
      # Words shorter than two characters are discarded.
      words = [word for word in caption.split() if len(word) > 1]
      # Join with spaces so 'endseq' is a token of its own instead of being
      # glued onto the last word of the caption.
      captions[i] = ' '.join(['startseq', *words, 'endseq'])

In [None]:
# Rewrite every caption list in mapping in place.
clean_caption(mapping)

In [None]:
# Spot-check: show the cleaned captions for one image id.
mapping['1119418776_58e4b93eac']

In [None]:
# Flatten the per-image caption lists into one list, in mapping order.
all_captions = [caption for key in mapping for caption in mapping[key]]

In [None]:
# Total number of captions across all images.
len(all_captions)

# Captions tokenized

In [None]:
# Fit a tokenizer (default settings) on every caption.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
# One more than the number of distinct words — presumably so that index 0
# stays free for padding (TODO confirm word_index is 1-based).
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Number of distinct words the tokenizer learned.
len(tokenizer.word_index)

In [None]:
# Dump the whole vocabulary, one "word  ,  index" pair per line.
for word, index in tokenizer.word_index.items():
  print(f"{word}  ,  {index}")

In [None]:
# Length of the longest caption, counted in whitespace-separated words.
# Raises ValueError if all_captions is empty.
max_len = max(len(caption.split()) for caption in all_captions)

# Train test split

In [None]:
# Positional 90% / 10% split of the image ids (no shuffling): the first 90%
# of mapping's keys become the training set, the remainder the test set.
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.9)
train = image_ids[: split]
test = image_ids[split:]

# Build data generator

In [None]:
def data_generator(data_keys, mapping, features, tokenizer, max_len, vocab_size, batch_size):
  """Endlessly yield training batches for next-word prediction.

  Every caption is expanded into one sample per prefix: for a token
  sequence ``s`` the samples are ``(s[:i], s[i])`` for ``i`` from 1 to
  ``len(s) - 1``. The prefix is padded to ``max_len`` and the target word is
  one-hot encoded over ``vocab_size`` classes.

  Relies on ``pad_sequences``, ``to_categorical`` and ``np`` being available
  in the enclosing namespace by the time the generator is first advanced.

  Args:
    data_keys: re-iterable collection of image ids; it is traversed again on
      every pass of the endless loop.
    mapping: dict of image id -> list of caption strings.
    features: dict of image id -> array whose first row is used as the
      image feature vector.
    tokenizer: object providing ``texts_to_sequences``.
    max_len: length every caption prefix is padded to.
    vocab_size: number of classes of the one-hot target.
    batch_size: number of *images* (not samples) accumulated per batch.

  Yields:
    ``((x1, x2), y)`` where ``x1`` holds image features, ``x2`` the padded
    prefixes and ``y`` the one-hot next words, all as NumPy arrays.
  """
  x1, x2, y = [], [], []
  n = 0  # images accumulated in the current batch; carries over between passes
  while True:
    for key in data_keys:
      n += 1
      for caption in mapping[key]:
        seq = tokenizer.texts_to_sequences([caption])[0]
        for i in range(1, len(seq)):
          in_seq = pad_sequences([seq[:i]], maxlen=max_len)[0]
          out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]

          x1.append(features[key][0])
          x2.append(in_seq)
          y.append(out_seq)
      if n == batch_size:
        # The two inputs are yielded as a tuple rather than a list: newer
        # Keras versions build a tf.data signature from the generator output
        # and reject Python lists there, while tuples are accepted by both
        # old and new versions. Callers unpacking `inputs, y` are unaffected.
        yield (np.array(x1), np.array(x2)), np.array(y)
        x1, x2, y = [], [], []
        n = 0

# Model Building

In [None]:
 from tensorflow.keras.utils import to_categorical
 from tensorflow.keras.layers import Dropout , Dense , Embedding ,LSTM ,Input , add

In [None]:
import tensorflow

In [None]:
### model
# Image branch: 4096-wide input vector (presumably the image feature vectors
# stored in `features` — confirm) -> dropout -> 256-unit dense layer.
inputs1 = Input((4096 ,) )
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256 , activation = 'relu')(fe1)

# Text branch: max_len token ids -> 256-dim embedding with mask_zero=True
# -> dropout -> 256-unit LSTM.
inputs2 = Input((max_len , ))
se1 = Embedding(vocab_size , 256 , mask_zero = True)(inputs2)
se2 = Dropout(0.4)(se1)

se3 = LSTM(256)(se2)

# decoder

# Merge the two 256-wide branch outputs with the `add` layer.
decoder1 = add([fe2 , se3])

decoder2 = Dense(256 , activation = 'relu')(decoder1)

# Softmax over vocab_size outputs.
outputs = Dense(vocab_size , activation = 'softmax')(decoder2)

# NOTE: this rebinds `model`, which until now referred to the VGG16-based
# network; from here on `model` is the captioning network.
model = tensorflow.keras.Model(inputs = [inputs1 , inputs2] , outputs = outputs)
model.compile(loss = 'categorical_crossentropy' , optimizer = 'adam')

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Training configuration.
epochs = 20 
batch_size = 64
# Number of full batches (of batch_size images) in one pass over `train`.
steps = len(train) // batch_size
# NOTE(review): this callback is constructed but never passed to model.fit
# below, and fit is given no validation data for 'val_loss', so as written
# it has no effect on training.
early_stopping_monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True
)
# Each iteration builds a fresh generator and trains for exactly one epoch
# of `steps` batches.
for i in range(epochs):
  genertrator = data_generator(train , mapping , features , tokenizer , max_len , vocab_size , batch_size)
  model.fit(genertrator , epochs = 1 , steps_per_epoch = steps ,  verbose =1 )


# Prediction and BLEU score

In [None]:
def idx_to_word(integer, tokenizer):
  """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

  The first matching entry (in dict order) wins; ``None`` is returned when
  no word has that index.
  """
  matches = (word for word, index in tokenizer.word_index.items() if index == integer)
  return next(matches, None)

In [None]:
# generate caption
def predict_caption(model, image, tokenizer, max_len):
  """Greedily decode a caption for ``image``.

  Starting from ``'startseq'``, the text generated so far is tokenized,
  padded to ``max_len`` and fed to ``model`` together with ``image``; the
  word with the highest score is appended. Decoding runs for at most
  ``max_len`` steps and stops early once ``'endseq'`` has been appended or
  when the predicted index maps to no word.

  Returns:
    The space-joined caption, including the leading ``'startseq'`` and,
    when it was produced, the trailing ``'endseq'``.
  """
  words = ['startseq']
  for _ in range(max_len):
    seq = tokenizer.texts_to_sequences([' '.join(words)])[0]
    seq = pad_sequences([seq], max_len)
    scores = model.predict([image, seq], verbose=0)
    word = idx_to_word(np.argmax(scores), tokenizer)
    if word is None:
      # The predicted index has no entry in the vocabulary.
      break
    words.append(word)
    if word == 'endseq':
      break
  return ' '.join(words)

In [None]:
from nltk.translate.bleu_score import corpus_bleu
# For every test image collect its reference captions and the predicted
# caption, each as a list of whitespace-separated tokens.
actual , predicted = list() , list()
for key in tqdm(test) : 
  captions = mapping[key]
  actualcaptions = [caption.split() for caption in captions]
  actual.append(actualcaptions)
  y_pred = predict_caption(model , features[key] , tokenizer , max_len)
  y_pred = y_pred.split()
  predicted.append(y_pred)
# BLEU-1: all weight on unigram precision. corpus_bleu evaluates one n-gram
# order per weight, so the conventional four entries (orders 1-4) are used
# rather than five, which would add a needless 5-gram evaluation.
print('', corpus_bleu(actual , predicted ,weights = (1.0, 0, 0, 0) ))

In [None]:
# BLEU-1 over the collected references/predictions: all weight on unigram
# precision, using the conventional four weights (n-gram orders 1-4).
print('', corpus_bleu(actual , predicted ,weights = (1.0, 0, 0, 0) ))