# Caption Preprocessing

In [1]:
import string
import pickle

In [3]:
#load all captions in memory
def load_captions(cap_data):
    """Group captions by image id.

    Args:
        cap_data: full text of the captions file, one ``<file name>,<caption>``
            record per line. The first line is skipped (presumably the CSV
            header - confirm against the data file).

    Returns:
        dict mapping image id (file name up to its first '.') to the list of
        caption strings found for that image, in file order.
    """
    #create a dict containing id corresponding to caption
    mapping = dict()
    for line in cap_data.split('\n')[1:]:
        #ignore very short caption
        if len(line) < 2:
            continue
        # split on the FIRST comma only: the caption text may itself contain
        # commas and must not be truncated at the first one
        image_file, sep, image_cap = line.partition(',')
        if not sep:
            # no caption field on this line, nothing to record
            continue
        image_id = image_file.split('.')[0] #image id
        mapping.setdefault(image_id, []).append(image_cap)
    return mapping
 
cfile_path = '../input/flickr8k/captions.txt'
# read the whole captions file; the context manager closes the handle
with open(cfile_path, 'r', encoding='utf-8') as cap_file:
    cap_data = cap_file.read()
# image id -> list of raw captions
mapping = load_captions(cap_data)

In [4]:
#Performing text preprocessing
def cap_clean(mapping):
    """Normalise every caption in ``mapping`` in place.

    Each caption is lower-cased, stripped of all ``string.punctuation``
    characters, split on single spaces, and reduced to the tokens that are
    purely alphabetic and longer than one character. The caption lists inside
    ``mapping`` are modified directly; nothing is returned.
    """
    #table that deletes every punctuation character
    drop_punct = str.maketrans('', '', string.punctuation)
    for cap_list in mapping.values():
        for idx, raw in enumerate(cap_list):
            #lower case, then remove punctuation marks
            stripped = raw.lower().translate(drop_punct)
            #remove unwanted words (single characters, anything non-alphabetic)
            kept = [tok for tok in stripped.split(' ')
                    if len(tok) > 1 and tok.isalpha()]
            #save
            cap_list[idx] = ' '.join(kept)

# clean all captions; cap_clean rewrites the lists stored in mapping in place
cap_clean(mapping)

In [5]:
#generating vocabulary
#library of unique words in our caption data
def get_vocab(mapping):
    """Return the set of distinct whitespace-separated words over all captions.

    ``mapping`` maps each key to a list of caption strings.
    """
    return {
        word
        for cap_list in mapping.values()
        for caption in cap_list
        for word in caption.split()
    }

# distinct words across every caption in mapping
vocab = get_vocab(mapping)
# size of that word set
print(len(vocab))

In [6]:
#load images to memory
#train and test split
import glob
import os
img_path = '../input/flickr8k/Images/'
#creating a list of image file names
# NOTE(review): glob does not guarantee any ordering of its results, so which
# images land in each split can differ between environments
img_list = glob.glob(img_path + '*jpg')

#train and test split
#train_images = 7000
#test_images = 1091
# The list files are opened with 'w' (not 'a') so that re-running this cell
# rewrites them instead of appending a second copy of every path.
train_path = 'trainImg.txt'
with open(train_path, 'w') as f:
    f.writelines(path + '\n' for path in img_list[:7000])
test_path = 'testImg.txt'
with open(test_path, 'w') as f:
    f.writelines(path + '\n' for path in img_list[7000:])

In [7]:
# read both path lists back as raw text; context managers close the handles
with open(train_path, 'r') as train_file:
    train_img = train_file.read()
with open(test_path, 'r') as test_file:
    test_img = test_file.read()

In [None]:
# one path per line; keep at most the first 7000 entries
train_img = train_img.split('\n')[:7000]
# show the last retained path
train_img[-1]

In [None]:
# one path per line; keep at most the first 1091 entries
test_img = test_img.split('\n')[:1091]
# show the last retained path
test_img[-1]

In [None]:
#fetch all image ids in training data
def get_dataset(data_path):
    """Return the image id for every path in ``data_path``.

    The id is the last '/'-separated component of the path, cut at its
    first '.'. Order follows the input.
    """
    return [entry.split('/')[-1].split('.')[0] for entry in data_path]

# image ids of the training paths, in the same order as train_img
train_id = get_dataset(train_img)
print(len(train_id))

In [None]:
#adding 'startseq' and 'endseq' tags to each caption in the given dataset (train or test)
def load_tagged_cap(mapping, dataset):
    """Select the captions of the images in ``dataset`` and tag them.

    Args:
        mapping: dict of image id -> list of caption strings.
        dataset: iterable of image ids to keep.

    Returns:
        A new dict with the ids of ``mapping`` that appear in ``dataset``;
        each caption is wrapped as ``'startseq ' + caption + ' endseq'``.
        ``mapping`` itself is not modified.
    """
    # build the lookup set once: testing membership against a list would scan
    # the whole list for every key in mapping
    wanted = set(dataset)
    return {
        key: ['startseq ' + line + ' endseq' for line in cap_list]
        for key, cap_list in mapping.items()
        if key in wanted
    }

# tagged captions restricted to the training ids
train_cap = load_tagged_cap(mapping, train_id)
# spot check one image; raises KeyError if this id is not in the training split
print(train_cap['3226254560_2f8ac147ea'])

# Image Processing

In [None]:
#import all required libraries
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from tensorflow.keras.preprocessing.image import load_img, img_to_array
#load Inception model for transfer learning
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input

In [None]:
# show which devices TensorFlow reports (later cells pin work to '/GPU:0')
tf.config.list_physical_devices()

In [None]:
#add to memory
def process_img(img_path):
    """Load one image file and prepare it as a single-image InceptionV3 batch.

    Args:
        img_path: path of the image file to load.

    Returns:
        The image resized to 299 x 299, converted to an array, given a
        leading batch axis and passed through InceptionV3's
        ``preprocess_input``.
    """
    # The original body ran inside ``tpu_strategy.scope()``, but no
    # ``tpu_strategy`` is defined in the visible notebook (NameError); this is
    # plain image loading and array work, so no strategy scope is used.
    #load with size 299 x 299 as inception_v3 accept that
    # target_size is (height, width) only; the channel count is not part of it
    img = load_img(img_path, target_size=(299, 299))
    img_arr = img_to_array(img)
    #expand by a dimension and scale pixels from -1 to 1 range
    img_arr = np.expand_dims(img_arr, axis=0)
    img_arr = preprocess_input(img_arr)
    return img_arr

In [None]:
from tensorflow.keras.models import Model
#get features using inception model
base_model = InceptionV3(weights='imagenet', include_top=False,
                         input_shape=(299, 299, 3), pooling=max)
model = Model(base_model.input, base_model.layers[-1].output)

In [None]:
def encode_feature(img_arr):
    with tf.device('/GPU:0'):
        feature_vec = model.predict(img_arr)
        feature_vec = np.reshape(feature_vec, (feature_vec.shape[0],
                        feature_vec.shape[1]*feature_vec.shape[2]*feature_vec.shape[3]))
        return feature_vec

In [None]:
def get_feature_map(data_img_path, data_img_id):
    """Encode every image and return its features keyed by image id.

    Args:
        data_img_path: image file paths.
        data_img_id: image ids, paired position by position with
            ``data_img_path``.

    Returns:
        dict of image id -> feature array returned by ``encode_feature``.
    """
    #creating a feature map 
    with tf.device('/GPU:0'):
        encoded_feature_map = dict()
        # size the progress bar from the data passed in; the original used the
        # global train_id, which gives a wrong total for any other dataset
        with tqdm(total=len(data_img_id)) as pbar:
            for img_path, img_id in zip(data_img_path, data_img_id):
                #preprocess image and encode the feature vector
                img_arr = process_img(img_path)
                img_vec = encode_feature(img_arr)
                encoded_feature_map[img_id] = img_vec
                pbar.update(1)
        return encoded_feature_map

In [None]:
# encode every training image; the result is keyed by the ids in train_id
with tf.device('/GPU:0'):
    encoded_feature_map = get_feature_map(train_img, train_id)

In [None]:
#save feature map
file = 'feature_map.pickle'
# the context manager closes the file even if pickling raises
with open(file, 'wb') as outfile:
    pickle.dump(encoded_feature_map, outfile)

# Tokenization

In [None]:
# flatten the per-image caption lists into one list of tagged captions
train_caption = []
for cap_list in train_cap.values():
    train_caption.extend(cap_list)

In [None]:
#max length of caption available in train data
max_len = max(len(cap.split()) for cap in train_caption)
max_len

In [None]:
#cleaning vocabulary
threshold = 10 #keep only words that occur at least `threshold` times
word_count = dict()
for cap in train_caption:
    # split() without an argument ignores repeated spaces, so a caption that
    # was cleaned down to nothing ('startseq  endseq') cannot add '' as a word
    for word in cap.split():
        word_count[word] = word_count.get(word, 0) + 1

vocab = [word for word, count in word_count.items() if count >= threshold]

#mapping each word in vocabulary with integer
# ids start at 1, so 0 is never assigned to a word
wordtoint = dict()
inttoword = dict()

for i, word in enumerate(vocab, start=1):
    wordtoint[word] = i
    inttoword[i] = word

In [None]:
# persist both lookup tables; context managers close the files even on error
with open('wordtoint.pickle', 'wb') as outfile_tokenize1:
    pickle.dump(wordtoint, outfile_tokenize1)
with open('inttoword.pickle', 'wb') as outfile_tokenize2:
    pickle.dump(inttoword, outfile_tokenize2)

In [None]:
# number of words kept after the frequency threshold
len(vocab)

In [None]:
# spot check both lookup directions; raises KeyError if 'the' was filtered out
inttoword[4], wordtoint['the']

# Data generation
Arranging the data in *input -> output* form.
Here we have two inputs, namely:
X1 -> input image features, and
X2 -> input sequence (the caption words seen so far),
and one output:
y -> output (the next word of the caption).
Using an LSTM,
I am going to feed in each caption prefix so that the LSTM network has to predict the next word of the
sequence as output, given the previous words of the sequence as input.

In [None]:
# reload the features written by the feature-extraction step
# NOTE: pickle.load must only be used on files you created yourself
with open('feature_map.pickle', 'rb') as infile:
    feature_map = pickle.load(infile)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model
X1, X2, y = [], [], []
# word ids start at 1, so the one-hot targets need len(vocab) + 1 classes
num_classes = len(vocab) + 1

for key, cap_list in train_cap.items():
    # encode_feature returns a (batch, features) array with batch == 1 here;
    # flatten it so each sample is a 1-D vector and X1 becomes 2-D, matching
    # the model's Input(shape=(2048,)) instead of a (N, 1, features) array
    img_feature = np.reshape(feature_map[key], -1)
    for cap in cap_list:
        # words dropped from the vocabulary are skipped
        seq = [wordtoint[word] for word in cap.split() if word in wordtoint]
        # one training sample per prefix: words [0, i) -> word i
        for i in range(1, len(seq)):
            in_sq, out_sq = seq[:i], seq[i]
            in_sq = pad_sequences([in_sq], maxlen=max_len)[0]
            out_sq = to_categorical([out_sq], num_classes=num_classes)[0]
            X1.append(img_feature)
            X2.append(in_sq)
            y.append(out_sq)

X1 = np.asarray(X1)
X2 = np.asarray(X2)
y = np.asarray(y)

# Creating a Embedding layer using GLOVE
GloVe (Global Vectors for Word Representation) is a pre-trained word-to-vector model. The version I am using here was trained on 6 billion tokens of English text, and each word has a 200-dimensional vector. Because it is pre-trained, I just need to find it on Kaggle and add it to my dataset.
I am going to build an embedding matrix from it so that the model gets a *vocab_size x 200d* Embedding layer.

In [None]:
embedding_map = {}
glove_path = '../input/glove6b/glove.6B.200d.txt'
# stream the file line by line rather than reading it into one huge string;
# the context manager also closes the handle
with open(glove_path, 'r', encoding='utf-8') as glove_file:
    for line in glove_file:
        val = line.rstrip().split(" ")
        if len(val) < 2:
            # blank line: no word/vector pair to store
            continue
        word = val[0]
        vec = np.asarray(val[1: ], dtype = 'float32')
        embedding_map[word] = vec

emb_dim = 200
# The shape must be passed as ONE tuple (the original call handed emb_dim to
# np.zeros as its dtype argument). Word ids run from 1 to len(vocab), so the
# matrix needs len(vocab) + 1 rows; row 0 stays all zeros.
emb_matrix = np.zeros((len(vocab) + 1, emb_dim))
for word, i in wordtoint.items():
    emb_vec = embedding_map.get(word)
    # words without a GloVe vector keep their all-zero row
    if emb_vec is not None:
        emb_matrix[i] = emb_vec

In [None]:
# display the dimensions of the embedding matrix
emb_matrix.shape

# Model design

In [None]:
from keras.layers import Flatten, Dense, LSTM, Dropout, Embedding, Activation
from keras.layers import concatenate, BatchNormalization, Input
from keras.layers.merge import add

In [None]:
# Take every layer from tensorflow.keras, the package that provides Model in
# this notebook; mixing in layers from the standalone keras package is avoided.
from tensorflow.keras.layers import Dense, Dropout, Embedding, Input, LSTM, add

# word ids run from 1 to len(vocab); index 0 is the padding value that the
# Embedding layer masks, so both the embedding and the softmax need one more
# slot than there are words
vocab_size = len(vocab) + 1

#Feature extraction
ip1 = Input(shape = (2048, ))
fe1 = Dropout(0.5)(ip1)
fe2 = Dense(256, activation = 'relu')(fe1)

#LSTM layers 
# max_len is the longest tagged training caption (sequences are padded to it)
ip2 = Input(shape = (max_len, ))
se1 = Embedding(vocab_size, emb_dim, mask_zero = True)(ip2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

#add function
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation = 'relu')(decoder1)
outputs = Dense(vocab_size, activation = 'softmax')(decoder2)
model = Model(inputs = [ip1, ip2], outputs = outputs)

In [None]:
# NOTE(review): layers[2] is assumed to be the Embedding layer; the lookup is
# index-based, so confirm with model.summary() before relying on it
model.layers[2].set_weights([emb_matrix])
# keep the loaded embedding weights fixed during training
model.layers[2].trainable = False
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')
model.fit([X1, X2], y, epochs = 50, batch_size = 256)
# NOTE(review): written as 'model1.h5', while the prediction section loads
# 'trained_model.h5' - confirm which file name is intended
model.save('model1.h5')

# Predicting output

In [None]:
from tensorflow.keras.models import load_model

In [None]:
# NOTE(review): the training cell saves to 'model1.h5'; this loads a different
# file name - confirm 'trained_model.h5' exists and is the intended model
trained_model = load_model('trained_model.h5')

In [None]:
def greedy_search(pic, trained_model):
    """Generate a caption by repeatedly taking the most probable next word.

    Args:
        pic: image features passed to the model as its first input.
        trained_model: model whose ``predict([pic, seq])`` returns next-word
            scores for the padded word-id sequence ``seq``.

    Returns:
        The generated words joined by spaces, without the 'startseq' and
        'endseq' markers.
    """
    # Uses the notebook's lookup tables wordtoint / inttoword and max_len; the
    # original referred to wordtoix / ixtoword / max_length, none of which are
    # defined in the visible notebook.
    words = ['startseq']
    for _ in range(max_len):
        seq = [wordtoint[word] for word in words if word in wordtoint]
        seq = pad_sequences([seq], maxlen = max_len)
        yhat = trained_model.predict([pic, seq])
        # index 0 is never assigned to a word, so look up defensively
        word = inttoword.get(int(np.argmax(yhat)))
        if word is None or word == 'endseq':
            break
        words.append(word)
    # Drop only the leading 'startseq'. 'endseq' is never appended, so a
    # caption that reaches max_len without ending keeps its last real word.
    return ' '.join(words[1:])

In [None]:
#for one image
def input_img(img_path):
    """Return the encoded feature array for the single image at ``img_path``."""
    #preprocess the file, then run it through the encoder
    return encode_feature(process_img(img_path))

In [None]:
#on testing data
# image ids of the test paths, in the same order as test_img
test_id = get_dataset(test_img)
# tagged reference captions for the test images
test_cap = load_tagged_cap(mapping, test_id)
# encode every test image; the result is keyed by the ids in test_id
with tf.device('/GPU:0'):
    test_features = get_feature_map(test_img, test_id)
for key, features in test_features.items():
    pred_cap = greedy_search(features, trained_model)