In [None]:
## Step 1:- Import the required libraries 


import numpy as np
from numpy import array
import matplotlib.pyplot as plt
%matplotlib inline

import string
import os
import glob
from PIL import Image
from time import time

from keras import Input, layers
from keras import optimizers
from tensorflow.keras.optimizers import Adam
from keras.preprocessing import sequence
from keras.preprocessing import image
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Embedding, Dense, Activation, Flatten, Reshape, Dropout
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add,concatenate
from keras.applications.inception_v3 import InceptionV3
from keras.applications.inception_v3 import preprocess_input
from keras.models import Model
from tensorflow.keras.utils import to_categorical


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

def Recall(y_true,y_pred):
    """Return the recall of ``y_pred`` against ``y_true`` via sklearn's ``recall_score``.

    Both arguments are forwarded unchanged. The caller in this notebook rounds
    the model outputs to 0/1 with ``np.round`` before calling.
    """
    return recall_score(y_true, y_pred)

def Precision(y_true,y_pred):
    """Return the precision of ``y_pred`` against ``y_true`` via sklearn's ``precision_score``.

    Both arguments are forwarded unchanged. The caller in this notebook rounds
    the model outputs to 0/1 with ``np.round`` before calling.
    """
    score = precision_score(y_true, y_pred)
    return score

def F_score(y_true,y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true`` via sklearn's ``f1_score``.

    Both arguments are forwarded unchanged. The caller in this notebook rounds
    the model outputs to 0/1 with ``np.round`` before calling.
    """
    score = f1_score(y_true, y_pred)
    return score

def Accuracy(y_true,y_pred):
    """Return the accuracy of ``y_pred`` against ``y_true`` via sklearn's ``accuracy_score``.

    Both arguments are forwarded unchanged. The caller in this notebook rounds
    the model outputs to 0/1 with ``np.round`` before calling.
    """
    score = accuracy_score(y_true, y_pred)
    return score




In [None]:
## Step 2 : Data Loading and Pre-Processing

# Locations of the Flickr8k caption/split files, the image folder and the GloVe vectors.
token_path = "../input/flickr8k/Flickr8k.token.txt"
train_images_path = '../input/flickr8k/Flickr_8k.trainImages.txt'
test_images_path = '../input/flickr8k/Flickr_8k.testImages.txt'
val_images_path = '../input/flickr8k/Flickr_8k.devImages.txt'
images_path = '../input/flickr-8k/Images/'
glove_path = '../input/glove6b/'

# Read the whole caption file; the context manager closes the handle, which the
# previous bare open(...).read() never did.
with open(token_path, 'r') as token_file:
    doc = token_file.read()
print(doc[:410])

In [None]:
# Map each image id (first token of a line, cut at the first '.') to the list
# of captions found for it in the caption file.
descriptions = {}
for line in doc.split('\n'):
    tokens = line.split()
    # Lines of two characters or fewer carry no caption and are skipped.
    if len(line) <= 2:
        continue
    image_id = tokens[0].split('.')[0]
    image_desc = ' '.join(tokens[1:])
    descriptions.setdefault(image_id, []).append(image_desc)
            
#print(descriptions)

In [None]:
# Normalise every caption in place: lower-case each token, strip all
# punctuation characters from it, and re-join the tokens with single spaces.
table = str.maketrans('', '', string.punctuation)
for desc_list in descriptions.values():
    for i, desc in enumerate(desc_list):
        desc_list[i] = ' '.join(word.lower().translate(table) for word in desc.split())

In [None]:
## Let's visualize an example image together with its cleaned captions
pic = '1000268201_693b08cb0e.jpg'
x=plt.imread(images_path+pic)
plt.imshow(x)
plt.show()
# Last expression of the cell: displays the caption list stored for this image id.
descriptions['1000268201_693b08cb0e']

> **Divide into positive and negative examples**

In [None]:
# Collect every distinct word that appears in any cleaned caption.
# Plain loops replace the former list comprehension, which was evaluated only
# for its side effect and built throwaway lists of None.
vocabulary = set()
for desc_list in descriptions.values():
    for d in desc_list:
        vocabulary.update(d.split())
print('Original Vocabulary Size: %d' % len(vocabulary))

In [None]:
# Flatten the caption dictionary into "<image_id> <caption>" records and join
# them into one newline-separated string.
lines = [
    key + ' ' + desc
    for key, desc_list in descriptions.items()
    for desc in desc_list
]
new_descriptions = '\n'.join(lines)

In [None]:
# Peek at the first 450 characters of the flattened "<image_id> <caption>" text.
print(new_descriptions[:450])

In [None]:
def _load_image_ids(split_path):
    """Return the image ids listed in a split file, in file order.

    Every line longer than one character contributes the text before its first
    '.' (the file name without extension). The file is closed on return.
    """
    with open(split_path, 'r') as split_file:
        return [line.split('.')[0]
                for line in split_file.read().split('\n')
                if len(line) > 1]


# One id list and one id set per split. `doc` is intentionally no longer
# reused here, so it keeps holding the caption text read earlier.
dataset1 = _load_image_ids(train_images_path)
train = set(dataset1)

dataset2 = _load_image_ids(test_images_path)
test = set(dataset2)

dataset3 = _load_image_ids(val_images_path)
val = set(dataset3)

In [None]:
# Show five training image ids; `train` is a set, so which five appear is arbitrary.
print(list(train)[:5])

In [None]:
def _read_split_file_names(split_path):
    """Return the set of lines (image file names) in a split file, closing it afterwards."""
    with open(split_path, 'r') as split_file:
        return set(split_file.read().strip().split('\n'))


def _images_in_split(image_paths, file_names):
    """Keep the paths whose part after the images_path prefix is in file_names."""
    return [p for p in image_paths if p[len(images_path):] in file_names]


# All image paths on disk, then the subset belonging to each split.
img = glob.glob(images_path + '*.jpg')

train_images = _read_split_file_names(train_images_path)
train_img = _images_in_split(img, train_images)

test_images = _read_split_file_names(test_images_path)
test_img = _images_in_split(img, test_images)

val_images = _read_split_file_names(val_images_path)
val_img = _images_in_split(img, val_images)

In [None]:
# Show the first two test image paths as a sanity check of the split filtering.
print(test_img[:2])

In [None]:
# Distribute the flattened captions over the three splits. Captions are stored
# as plain space-joined words (no start/end markers are added).
train_descriptions = {}
test_descriptions = {}
val_descriptions = {}
# (id set, target dict) pairs, checked in this order; the first split that
# contains the image id receives the caption.
split_targets = (
    (train, train_descriptions),
    (test, test_descriptions),
    (val, val_descriptions),
)
for line in new_descriptions.split('\n'):
    tokens = line.split()
    image_id, image_desc = tokens[0], tokens[1:]
    for split_ids, split_descriptions in split_targets:
        if image_id in split_ids:
            desc = ' '.join(image_desc)
            split_descriptions.setdefault(image_id, []).append(desc)
            break


In [None]:
# Flatten each split's {image_id: [captions]} mapping into one list of captions.
# Comprehensions are used so that no loop variable leaks into the notebook
# namespace: the previous loops bound `val`, overwriting the `val` set of
# validation image ids and breaking any re-run of the split cell above.
all_train_captions = [cap for caps in train_descriptions.values() for cap in caps]

all_test_captions = [cap for caps in test_descriptions.values() for cap in caps]

all_val_captions = [cap for caps in val_descriptions.values() for cap in caps]


**train_descriptions**

{'1000268201_693b08cb0e': ['a child in a pink dress is climbing up a set of stairs in an entry way', 'a girl going into a wooden building', 'a little girl climbing into a wooden playhouse', 'a little girl climbing the stairs to her playhouse', 'a little girl in a pink dress going into a wooden cabin'], ...}

In [None]:
# Total number of training captions (also the size of the pool that negative
# samples are drawn from for the training split).
a = len(all_train_captions)
print(a)

In [None]:
# Generate negative samples

import random
from random import randrange
random.seed(40)


def _add_negative_captions(split_descriptions, caption_pool, n_negatives=5):
    """Pair each image's own captions with randomly drawn mismatching captions.

    Args:
        split_descriptions: mapping image_id -> list of that image's captions.
        caption_pool: list of captions to draw negatives from.
        n_negatives: number of negative captions drawn per image.

    Returns:
        dict image_id -> {0: [negative captions], 1: the image's own caption list}.

    Draws use ``randrange`` in the same order as the former copy-pasted loops,
    so results for a given seed are unchanged.
    """
    pos_neg = dict()
    for image_id, own_captions in split_descriptions.items():
        negatives = list()
        for _ in range(n_negatives):
            # Redraw until the sampled caption is not one of this image's own.
            while True:
                rand_cap = caption_pool[randrange(len(caption_pool))]
                if rand_cap not in own_captions:
                    break
                print("Collision")
            negatives.append(rand_cap)
        pos_neg[image_id] = {0: negatives, 1: own_captions}
    return pos_neg


# Negatives for a split are drawn only from that split's own captions.
# Keeping the loops inside the helper also stops `val` (the validation id set)
# and the loop counter `i` from being overwritten at notebook level.
pos_neg_train_descriptions = _add_negative_captions(train_descriptions, all_train_captions)

pos_neg_test_descriptions = _add_negative_captions(test_descriptions, all_test_captions)

pos_neg_val_descriptions = _add_negative_captions(val_descriptions, all_val_captions)
    
        
    

In [None]:
# Count word occurrences over the captions of all three splits (train, test,
# val, in that order) and keep the words seen at least word_count_threshold times.
word_count_threshold = 10
word_counts = {}
nsents = 0
for captions in (all_train_captions, all_test_captions, all_val_captions):
    nsents += len(captions)
    for sent in captions:
        for w in sent.split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1

vocab = [w for w, count in word_counts.items() if count >= word_count_threshold]

print('Vocabulary = %d' % (len(vocab)))

In [None]:
# Word <-> index lookup tables. Indices start at 1; index 0 is left unassigned,
# which is why vocab_size is one larger than the number of words.
wordtoix = {w: ix for ix, w in enumerate(vocab, start=1)}
ixtoword = {ix: w for w, ix in wordtoix.items()}

vocab_size = len(ixtoword) + 1

# Gather the captions of every split to find the longest one (in words).
# list.extend replaces the former side-effect-only list comprehensions.
all_desc = list()
for split_descriptions in (train_descriptions, test_descriptions, val_descriptions):
    for caps in split_descriptions.values():
        all_desc.extend(caps)

lines = all_desc
max_length = max(len(d.split()) for d in lines)

print('Description Length: %d' % max_length)

**Step 3: Load the pre-trained GloVe word embeddings**

In [None]:
# Parse the GloVe text file: each line is a word followed by its vector values.
# The context manager closes the file, which was previously left open.
embeddings_index = {}
with open(os.path.join(glove_path, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Row i of the matrix holds the GloVe vector of the word with index i;
# rows for words without a GloVe entry (and row 0) stay all-zero.
embedding_dim = 200
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in wordtoix.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

**Step 4: Extract image features and build the model**

In [None]:
# InceptionV3 with ImageNet weights, including its classification head.
model = InceptionV3(weights='imagenet')

# Feature extractor: same input as InceptionV3, but the output is taken from
# the second-to-last layer instead of the final classification layer.
# NOTE(review): presumably this yields 2048-d vectors, matching the
# Input(shape=(2048,)) of the matching model built later -- confirm.
model_new = Model(model.input, model.layers[-2].output)

def preprocess(image_path):
    """Load an image file as a preprocessed batch of one.

    The image is loaded at 299x299, converted to an array, given a leading
    batch axis and passed through ``preprocess_input``.
    """
    pixels = image.img_to_array(image.load_img(image_path, target_size=(299, 299)))
    batch = np.expand_dims(pixels, axis=0)
    return preprocess_input(batch)




def encode(image):
    """Return the flat feature vector that ``model_new`` predicts for one image path."""
    batch = preprocess(image)
    features = model_new.predict(batch)
    # Drop the leading batch axis: keep only the feature dimension (shape[1]).
    return np.reshape(features, features.shape[1])

def _encode_images(image_paths):
    """Map each image's file name (its path minus the images_path prefix) to its feature vector."""
    return {p[len(images_path):]: encode(p) for p in image_paths}


# Encode every image of each split. Using a helper removes the triplicated
# loop and stops the loop variable from overwriting `img`, the list of all
# image paths built earlier.
encoding_train = _encode_images(train_img)
train_features = encoding_train

encoding_test = _encode_images(test_img)
test_features = encoding_test

encoding_val = _encode_images(val_img)
val_features = encoding_val

In [None]:
def preprocess1(path):
    """Load an image file as a preprocessed 299x299 batch of one.

    Kept for backward compatibility; it was a line-for-line copy of
    ``preprocess`` and now simply delegates to it.
    """
    return preprocess(path)

In [None]:
# Show only the first few training image paths; printing the whole list floods
# the cell output with every path in the split.
print(train_img[:5])

In [None]:
# Number of encoded images per split (train, test, val).
print(len(encoding_train))
print(len(encoding_test))
print(len(encoding_val))

In [None]:
#take the images as input
#construct the archtitecture => LeNet5, VGG16
#train the model
#calculate accuracy

from tensorflow.keras.models import Model
import tensorflow as tf
import numpy as np
import cv2
import os

import keras

from numpy.random import seed
seed(1)

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#import keras
#from keras.models import Sequential
#from keras.layers import Dense, Conv2D, MaxPool2D , Flatten
#from keras.preprocessing.image import ImageDataGenerator
 
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import backend as K
#import keras
from tensorflow.keras.models import Sequential, Model,load_model
from tensorflow.keras.optimizers import SGD, RMSprop
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from tensorflow.keras.layers import Input,Dropout, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D,MaxPool2D
from tensorflow.keras.preprocessing import image
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.applications.densenet import DenseNet201, DenseNet121
from tensorflow.keras.layers import GlobalAveragePooling2D
import matplotlib.pyplot as plt




########### MODEL ###############
from keras.layers import Dropout
from tensorflow.keras.applications import InceptionV3, Xception

# Xception-based classifier: ImageNet-pretrained base without its top, on
# 256x256 RGB inputs, followed by global average pooling, batch normalisation,
# dropout (rate 0.6) and a 200-unit softmax layer.
# NOTE(review): this rebinds `model` (previously the InceptionV3 network) and
# `x`; `model` is rebound again by the matching model in the next cell, and
# this classifier is never trained or used in the visible notebook -- confirm
# whether it is leftover code that can be removed.
base_model = Xception(include_top=False, weights='imagenet', input_shape=(256, 256, 3))

x = base_model.output
x = GlobalAveragePooling2D()(x)
x = BatchNormalization()(x)
x = Dropout(0.6,name='dropout_fc2')(x)
predictions = Dense(200, activation='softmax')(x)


model = Model(inputs=base_model.input, outputs=predictions)
model.summary()

In [None]:
# Image branch: pre-computed image feature vector -> dropout -> 256-d dense.
inputs1 = Input(shape=(2048,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Caption branch: padded word-index sequence -> embedding -> dropout -> LSTM.
inputs2 = Input(shape=(max_length,))

# Keep a handle on the embedding layer so its weights can be set directly
# instead of through a positional lookup such as model.layers[2], which
# silently targets another layer if the architecture changes.
embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True)
se1 = embedding_layer(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Merge both branches and predict a single match probability.
decoder1 = concatenate([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(1, activation='sigmoid')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.summary()

# Load the pre-computed embedding matrix and freeze it; this must happen
# before compile() so the layer is excluded from training.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


**Step 5: Build the data generators, train and evaluate the model**

In [None]:
import tensorflow as tf
tf.config.run_functions_eagerly(True)

                
def data_generator(descriptions, train_features, wordtoix, max_length, batch_size):
    """Yield batches of (image feature, caption) pairs with match labels, forever.

    Args:
        descriptions: mapping image_id -> {1: matching captions, 0: mismatching captions}.
        train_features: mapping '<image_id>.jpg' -> image feature vector.
        wordtoix: mapping word -> integer index; unknown words are dropped.
        max_length: length every index sequence is post-padded to.
        batch_size: number of *images* per batch. Each image contributes one
            positive and one negative sample per caption pair, so a batch
            holds several samples per image.

    Yields:
        ([image_features, caption_sequences], labels) as numpy arrays, where
        label 1 marks a matching caption and 0 a mismatching one.

    An empty ``descriptions`` ends the generator immediately instead of
    spinning forever without yielding.
    """
    def _encode_caption(caption):
        # Words absent from the vocabulary are skipped; shorter sequences are
        # padded at the end.
        seq = [wordtoix[word] for word in caption.split(' ') if word in wordtoix]
        return pad_sequences([seq], maxlen=max_length, padding='post')[0]

    if not descriptions:
        return

    X1, X2, y = list(), list(), list()
    n = 0
    # loop for ever over images
    while 1:
        for key, desc in descriptions.items():
            n += 1
            # retrieve the photo feature
            photo = train_features[key + '.jpg']

            # Pair positives with negatives via zip rather than assuming there
            # are exactly five of each.
            for pos_caption, neg_caption in zip(desc[1], desc[0]):
                X1.append(photo)
                X2.append(_encode_caption(pos_caption))
                y.append(1)

                X1.append(photo)
                X2.append(_encode_caption(neg_caption))
                y.append(0)

            if n == batch_size:
                yield ([array(X1), array(X2)], array(y))
                X1, X2, y = list(), list(), list()
                n = 0


# Training configuration. batch_size counts images, not samples: the generator
# yields once per batch_size images, each contributing several caption pairs.
epochs = 30
batch_size = 4
steps = len(pos_neg_train_descriptions)//batch_size
# NOTE(review): val_steps is derived from the *test* split size, yet later
# cells also use it with val_generator -- confirm both splits hold the same
# number of images.
val_steps = len(pos_neg_test_descriptions)//batch_size

# One endless generator per split, each reading that split's own features.
train_generator = data_generator(pos_neg_train_descriptions, train_features, wordtoix, max_length, batch_size)
test_generator  = data_generator(pos_neg_test_descriptions, test_features, wordtoix, max_length, batch_size)
val_generator  = data_generator(pos_neg_val_descriptions, val_features, wordtoix, max_length, batch_size)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
# Stop once val_loss has not improved for 2 epochs; keep the best full model on disk.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)
# The deprecated `period=1` argument is dropped: saving every epoch is the default.
mc = ModelCheckpoint("Caption.h5", monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='auto')
# Model.fit accepts generators directly (fit_generator is deprecated), matching
# the model.evaluate(generator) call used later. `shuffle` is omitted because
# it is ignored for generator input. Validation runs on the test-split generator.
model.fit(train_generator, epochs=epochs, steps_per_epoch=steps, validation_data=test_generator, validation_steps=val_steps, callbacks=[es, mc], verbose=1) #15 epochs

In [None]:
# Save the model as it is after training ends. Note the file name differs from
# "Caption.h5", the best-val_loss checkpoint written by ModelCheckpoint.
model.save("Captions.h5")

In [None]:
import keras
# Replace the in-memory model with a previously saved one.
# NOTE(review): absolute, environment-specific path -- adjust when running elsewhere.
model_path = "/kaggle/input/model6/Caption (1).h5"
model = keras.models.load_model(model_path)

In [None]:
# Inspect the second-to-last layer: print the shape of its first weight array.
a = model.layers[-2].get_weights()
print(a[0].shape)

In [None]:
# Loss and accuracy on the validation generator.
# NOTE(review): val_steps was computed from the test split size -- confirm it
# also fits the validation split.
model.evaluate(val_generator, steps = val_steps)

In [None]:
# For Overall
def generate_y(gen, steps):
    """Collect labels and raw model outputs from ``steps`` batches of ``gen``.

    Args:
        gen: iterator yielding ``(inputs, labels)`` batches.
        steps: number of batches to draw.

    Returns:
        ``(y_true, y_pred)`` as flat float arrays: the concatenated labels and
        the concatenated outputs of the global ``model`` for the same batches.
    """
    # Seed both lists with an empty float array so the result is float64 and
    # well-defined even when steps is 0.
    true_chunks = [np.array([])]
    pred_chunks = [np.array([])]
    for _ in range(steps):
        batch_x, batch_y = next(gen)
        true_chunks.append(np.ravel(batch_y))
        pred_chunks.append(np.ravel(model.predict(batch_x)))

    return np.concatenate(true_chunks), np.concatenate(pred_chunks)

# Gather labels and predictions for the validation generator, then threshold
# the sigmoid outputs to hard 0/1 labels by rounding.
y_true, y_pred = generate_y(val_generator, val_steps)
y_pred = np.round(y_pred)
# Positions of the positive (label 1) and negative (label 0) samples.
pos_indices = np.flatnonzero(y_true == 1)
neg_indices = np.flatnonzero(y_true == 0)

precision = Precision(y_true, y_pred)
print("Precision on Validation Dataset : ", precision)

recall = Recall(y_true, y_pred)
print("Recall on Validation Dataset    : ", recall)

f_score = F_score(y_true, y_pred)
print("F_score on Validation Dataset   : ", f_score)

accuracy = Accuracy(y_true, y_pred)
print("Accuracy on Validation Dataset  : ", accuracy)


In [None]:
# For Positive and negative samples

# Split labels and predictions by true class.
pos_y_true, pos_y_pred = y_true[pos_indices], y_pred[pos_indices]
neg_y_true, neg_y_pred = y_true[neg_indices], y_pred[neg_indices]

# POSITIVE: share of matching pairs the model labels 1.
pos_accuracy = Accuracy(pos_y_true, pos_y_pred)
print("Accuracy on Validation Dataset(only 1-class) : ", pos_accuracy)

# NEGATIVE: share of mismatching pairs the model labels 0.
neg_accuracy = Accuracy(neg_y_true, neg_y_pred)
print("Accuracy on Validation Dataset(only 0-class) : ", neg_accuracy)


In [None]:
def decode_caption(caption):
    """Convert a caption string into a batch of one padded word-index sequence.

    Words missing from ``wordtoix`` are dropped. The sequence is padded to
    ``max_length`` at the end (``padding='post'``) so inference inputs have the
    same layout as the sequences ``data_generator`` produces for training; the
    default 'pre' padding used before did not match.

    Returns:
        Array of shape (1, max_length) holding the word indices.
    """
    seq = [wordtoix[word] for word in caption.split(' ') if word in wordtoix]
    pad_seq = pad_sequences([seq], maxlen=max_length, padding='post')[0]
    return np.array([pad_seq])

# Score one free-text caption against one external image.
#key = '103106960_e8a41d64f8.jpg'
# NOTE(review): absolute, environment-specific path -- adjust when running elsewhere.
path = "/kaggle/input/picture1/photo-1455577380025-4321f1e1dca7.jpg"
# Batch of one image feature vector and one encoded caption.
X1 = np.array([encode(path)])
#X1 = np.array([val_features[key]])
X2 = decode_caption('red trees')
x=plt.imread(path)
plt.imshow(x)
plt.show()
# Model output for the (image, caption) pair; the final layer is a sigmoid.
y = model.predict([X1,X2])
print(y)

In [None]:
# Display Image, Caption , Actual answer, Predicted answer

# For the first 10 validation images: show the image, then each of its
# positive/negative captions with the true label and the model's prediction.
desc = pos_neg_val_descriptions
my_path = images_path
n = 0
for k,v in desc.items():
    X1 = []
    X2 = []
    y = []
    n += 1
    pic = k + ".jpg"
    print(pic)
    x=plt.imread(my_path+pic)
    plt.imshow(x)
    plt.show()
    photo = val_features[pic]
    neg_captions = v[0]
    pos_captions = v[1]
    captions = []
    # Interleave positive and negative captions, pairing them with zip instead
    # of assuming exactly five of each.
    for pos_caption, neg_caption in zip(pos_captions, neg_captions):
        for caption, label in ((pos_caption, 1), (neg_caption, 0)):
            captions.append(caption)
            seq = [wordtoix[word] for word in caption.split(' ') if word in wordtoix]
            # padding='post' matches the layout data_generator uses for training
            # (the default 'pre' padding was used here before).
            pad_seq = pad_sequences([seq], maxlen=max_length, padding='post')[0]

            X1.append(photo)
            X2.append(pad_seq)
            y.append([label])
    X = [np.array(X1),np.array(X2)]
    y_true = np.array(y)
    y_pred = model.predict(X)
    # Hard 0/1 decision obtained by rounding the sigmoid output.
    y_pred1 = np.round(y_pred).astype(int)

    for i in range(len(captions)):
        print(captions[i])
        print("Actual Answer    : ", y_true[i][0])
        print("Predicted Answer : ", y_pred[i][0], " => ", y_pred1[i][0])
        print()

    if(n == 10):
        break



