In [None]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import string
import os
from PIL import Image, ImageOps
import math
import glob
from pickle import dump, load
from time import time
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras.layers.wrappers import Bidirectional
from keras.layers.merge import add
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras import Input, layers
from keras import optimizers
from keras.applications.inception_v3 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [None]:
def load_doc(filename):
    """Read a UTF-8 text file and return its whole contents as one string.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    str
        The complete text of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Open the path that was actually requested: the previous version ignored
    # `filename` and always read 'results.csv'. The context manager closes the
    # handle even if read() raises.
    with open(filename, 'r', encoding='utf-8') as file:
        return file.read()

In [None]:
def load_descriptions():
    """Build a mapping from image id to the list of its captions.

    Reads 'results.csv' through ``load_doc`` and treats every line as a
    '|'-separated record: the first field is the image file name (the id is
    the text before its first '.') and the last field is the caption.

    No line is skipped, so a header row or an empty line also produces an
    entry (keyed by whatever its first field is).

    Returns
    -------
    dict
        ``{image_id: [caption, ...]}`` with captions in file order.
    """
    captions_by_id = {}
    for row in load_doc('results.csv').split('\n'):
        fields = row.split('|')
        # First field names the image; last field carries the caption text.
        image_id = fields[0].split('.')[0]
        captions_by_id.setdefault(image_id, []).append(fields[-1])
    return captions_by_id

In [None]:
# Parse the captions file into {image_id: [caption, ...]}.
descriptions=load_descriptions()

In [None]:
# Show the first five image ids as a quick sanity check.
list(descriptions.keys())[:5]

In [None]:
# Display the captions stored for image id '1000268201'.
descriptions['1000268201']

In [None]:
# Display the captions stored for image id '1000344755'.
descriptions['1000344755']

In [None]:
def clean_descriptions(descriptions):
    """Normalise every caption in ``descriptions`` in place.

    Each caption is split on whitespace; every token is lower-cased and
    stripped of ``string.punctuation`` characters, then kept only if it is
    longer than one character and purely alphabetic. The surviving tokens are
    re-joined with single spaces and written back into the same list slot.

    Parameters
    ----------
    descriptions : dict
        ``{image_id: [caption, ...]}``; the caption lists are mutated.

    Returns
    -------
    None
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    for captions in descriptions.values():
        for idx, caption in enumerate(captions):
            kept = []
            for token in caption.split():
                token = token.lower().translate(strip_punct)
                # Drop single letters and anything containing non-letters.
                if len(token) > 1 and token.isalpha():
                    kept.append(token)
            captions[idx] = ' '.join(kept)


In [None]:
# Normalise all captions; this mutates `descriptions` in place.
clean_descriptions(descriptions)

In [None]:
# Re-inspect the captions for '1000344755' after cleaning.
descriptions['1000344755']

In [None]:
# Re-inspect the captions for '1000268201' after cleaning.
descriptions['1000268201']

In [None]:
def save_descriptions(descriptions,filename):
    """Write all captions to ``filename``, one '<image_id> <caption>' per line.

    Parameters
    ----------
    descriptions : dict
        ``{image_id: [caption, ...]}``.
    filename : str
        Destination path; an existing file is overwritten.

    Returns
    -------
    None
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    # The context manager closes the file even if write() fails, and the
    # explicit UTF-8 encoding matches what load_doc() reads back.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write('\n'.join(lines))
    

In [None]:
# Collect every distinct word that occurs in any caption.
all_desc = set()
for captions in descriptions.values():
    for caption in captions:
        all_desc.update(caption.split())

In [None]:
# `vocabulary` is another name for the same set object as `all_desc` (no copy is made).
vocabulary=all_desc
print("size of the vocabulary=",len(vocabulary))

In [None]:
def load_set(filename):
    """Return the set of identifiers listed in ``filename``.

    The file is read with ``load_doc``; for every non-empty line the
    identifier is the text that precedes the first '.' on that line.

    Parameters
    ----------
    filename : str
        Path handed to ``load_doc``.

    Returns
    -------
    set
        Unique identifiers, one per distinct line prefix.
    """
    return {
        row.split('.')[0]
        for row in load_doc(filename).split('\n')
        if len(row) >= 1
    }


In [None]:
# Collect the unique identifiers found in the captions file and report how many there are.
filename='results.csv'
train=load_set(filename)
print('dataset=',len(train))

In [None]:
# Directory that holds the training images (trailing slash is relied on when
# the prefix is sliced off the paths later).
images='./flickr30k_images/flickr30k_images/'
# `img` is the list of all .jpg paths found in that directory.
img=glob.glob(images+'*.jpg')

In [None]:
def preprocess(image_path):
    """Load an image, pad it to a square with white, and prepare it for InceptionV3.

    Parameters
    ----------
    image_path : str
        Path of the image file to load.

    Returns
    -------
    numpy.ndarray
        The result of ``preprocess_input`` applied to the 299x299 image with
        a leading batch axis added.
    """
    picture = image.load_img(image_path)
    width, height = picture.size
    # Half of the width/height difference, rounded up, is added on both sides
    # of the shorter dimension.
    pad = math.ceil(abs(width - height) / 2)
    # Wider-than-tall images get (0, pad); tall or square images get (pad, 0).
    border = (0, pad) if width > height else (pad, 0)
    picture = ImageOps.expand(picture, border=border, fill='white')

    # resample=0 is kept from the original (PIL's nearest-neighbour filter).
    picture = picture.resize((299, 299), resample=0)
    batch = np.expand_dims(image.img_to_array(picture), axis=0)
    return preprocess_input(batch)

In [None]:
# Instantiate InceptionV3 with ImageNet weights.
# NOTE(review): the name `model` is reused later for the captioning network.
model=InceptionV3(weights='imagenet')

In [None]:
# Feature extractor: same input as InceptionV3, output taken from its second-to-last layer.
model_new=Model(model.input,model.layers[-2].output)


In [None]:
def encode(image):
    """Return the feature vector that ``model_new`` produces for one image file.

    Parameters
    ----------
    image : str
        Path of the image; it is passed to ``preprocess``. The parameter name
        hides the keras ``image`` module inside this function only.

    Returns
    -------
    numpy.ndarray
        One-dimensional array of length ``prediction.shape[1]``.
    """
    prediction = model_new.predict(preprocess(image))
    # Drop the batch axis so a flat vector is returned.
    return np.reshape(prediction, prediction.shape[1])

In [None]:
# Encode every training image, keyed by file name (e.g. '123.jpg').
start = time()
encoding_train = {}
# A distinct loop variable is required: `for img in img` rebound the path list
# `img` to a single path string, so re-running the cell iterated over characters.
for img_path in img:
    encoding_train[os.path.basename(img_path)] = encode(img_path)
# Report the elapsed time once instead of printing a line for every image.
print("time tken in second=", time() - start)


In [None]:
# Persist the encoded training features so the encoding step need not be repeated.
with open('encoded_train_images.pkl','wb') as f:
    dump(encoding_train,f)

In [None]:
# Reload the cached features. The `with` block closes the file handle, which
# the bare load(open(...)) call left open.
# NOTE: pickle executes code embedded in the file; only load pickles you created.
with open('encoded_train_images.pkl', 'rb') as f:
    train_features = load(f)
print(len(train_features))

In [None]:
# Flatten the per-image caption lists into one list, in dictionary order.
all_train_captions = [cap for val in descriptions.values() for cap in val]
len(all_train_captions)

In [None]:
# Words must occur at least this many times to enter the vocabulary.
word_count_threshold = 10
nsents = len(all_train_captions)

# Count occurrences of every space-separated token across all captions.
word_counts = {}
for sent in all_train_captions:
    for token in sent.split(' '):
        if token in word_counts:
            word_counts[token] += 1
        else:
            word_counts[token] = 1

# Keep first-seen order while filtering out rare tokens.
vocab = [token for token, count in word_counts.items() if count >= word_count_threshold]
print("no of word =", len(word_counts))
print("len of vocabulary=", len(vocab))

In [None]:
# Number the vocabulary words from 1 upwards; index 0 is left unassigned.
ixtoword = dict(enumerate(vocab, start=1))
wordtoix = {word: index for index, word in ixtoword.items()}
# `ix` ends up as the next unused index, as it did with the manual counter.
ix = len(ixtoword) + 1
    

In [None]:
# Persist both lookup tables; each `with` block closes its file on exit.
with open('ixtoword.pkl', 'wb') as f1:
    dump(ixtoword, f1)

with open('wordtoix.pkl', 'wb') as f2:
    dump(wordtoix, f2)


In [None]:
# One more than the number of indexed words, because word indices start at 1 and index 0 is unassigned.
vocab_size=len(ixtoword)+1
vocab_size

In [None]:
def to_lines(descriptions):
    """Flatten ``{image_id: [caption, ...]}`` into a single list of captions.

    Parameters
    ----------
    descriptions : dict
        Mapping from image id to a list of caption strings.

    Returns
    -------
    list
        All captions, in dictionary order and then list order.
    """
    # Build the list directly; the earlier version ran a list comprehension
    # purely for its append() side effects and discarded a list of None.
    return [d for desc_list in descriptions.values() for d in desc_list]

In [None]:
# Total number of captions across all images.
print(len(to_lines(descriptions)))

In [None]:
def max_length(descriptions):
    """Return the number of whitespace-separated words in the longest caption.

    Parameters
    ----------
    descriptions : dict
        ``{image_id: [caption, ...]}``; flattened with ``to_lines``.

    Returns
    -------
    int
        Largest word count found.

    Raises
    ------
    ValueError
        If there are no captions at all (``max`` of an empty sequence).
    """
    word_totals = [len(caption.split()) for caption in to_lines(descriptions)]
    return max(word_totals)

In [None]:
# NOTE(review): this rebinds `max_length` from the function to its integer
# result. Re-running this cell without first re-running the cell that defines
# max_length() fails because an int is not callable.
max_length=max_length(descriptions)


In [None]:
# Report the longest caption length in words.
print("length of max length descriptions=",max_length)

In [None]:
def data_generator(descriptions,photos,wordtoix,max_length,num_photos_per_batch,vocab_size=None):
    """Yield training batches for the captioning model, forever.

    For every caption, each prefix of its index sequence becomes one sample:
    the padded prefix is the text input and the following word, one-hot
    encoded, is the target. A batch is emitted after every
    ``num_photos_per_batch`` images.

    Parameters
    ----------
    descriptions : dict
        ``{image_id: [caption, ...]}``.
    photos : dict
        Image features keyed by ``image_id + '.jpg'``.
    wordtoix : dict
        Word -> integer index; words missing from it are skipped.
    max_length : int
        Length every input sequence is padded to.
    num_photos_per_batch : int
        Number of images whose samples make up one batch.
    vocab_size : int, optional
        Number of classes for the one-hot targets. Defaults to one more than
        the largest index in ``wordtoix``, so the generator no longer depends
        on a module-level ``vocab_size`` variable.

    Yields
    ------
    list
        ``[[array(X1), array(X2)], array(y)]`` - photo features, padded input
        sequences and one-hot next-word targets.

    Raises
    ------
    KeyError
        If ``photos`` has no entry for ``key + '.jpg'``.
    """
    if vocab_size is None:
        # Largest word index plus one, so every index is a valid class.
        vocab_size = max(wordtoix.values(), default=0) + 1

    X1,X2,y=list(),list(),list()
    n=0

    # Loop endlessly; the caller bounds an epoch via steps_per_epoch.
    while 1:
        for key,desc_list in descriptions.items():
            n+=1
            photo=photos[key+'.jpg']
            for desc in desc_list:
                seq=[wordtoix[word] for word in desc.split(' ') if word in wordtoix ]
                # One sample per split point: words [0, i) predict word i.
                for i in range(1,len(seq)):
                    in_seq,out_seq=seq[:i],seq[i]
                    in_seq=pad_sequences([in_seq],maxlen=max_length)[0]

                    out_seq=to_categorical([out_seq],num_classes=vocab_size)[0]

                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)

            if n==num_photos_per_batch:
                yield [[array(X1),array(X2)],array(y)]
                # Start collecting the next batch from scratch.
                X1,X2,y=list(),list(),list()
                n=0

In [None]:
# Path of the GloVe vectors file (a file path, despite the variable name).
glove_dir='glove.6B.200d.txt'
embedding_index={}

# The context manager closes the file even if a line fails to parse.
with open(glove_dir,encoding="utf-8") as f:
    for line in f:
        values=line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # First token is the word, the remaining tokens are its coefficients.
        word=values[0]
        coefs=np.asarray(values[1:],dtype='float32')
        embedding_index[word]=coefs

# Interpolate the count; previously "%s" was printed literally.
print("found %s word vectors" % len(embedding_index))

In [None]:
# One row per word index; rows stay all-zero for words without a pretrained vector.
embedding_dim = 200
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for token, row in wordtoix.items():
    vector = embedding_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
# Sanity check: expected to be (vocab_size, embedding_dim).
embedding_matrix.shape

In [None]:
# Captioning network with two inputs.
# NOTE(review): statement order is left untouched because a later cell
# addresses a layer by position (model.layers[2]).

# Image branch: 2048-value feature vector -> dropout -> 256-unit dense layer.
inputs1=Input(shape=(2048,))
fe1=Dropout(0.5)(inputs1)
fe2=Dense(256,activation='relu')(fe1)
# Text branch: sequence of max_length word indices.
inputs2=Input(shape=(max_length,))

# mask_zero=True makes index 0 (padding) a masked value.
se1=Embedding(vocab_size,embedding_dim,mask_zero=True)(inputs2)
se2=Dropout(0.5)(se1)
se3=LSTM(256)(se2)
# Merge both 256-unit branches, then predict a distribution over vocab_size words.
decoder1=add([fe2,se3])
decoder2=Dense(256,activation='relu')(decoder1)
outputs=Dense(vocab_size,activation='softmax')(decoder2)
# NOTE(review): this rebinds `model`, which earlier held the InceptionV3 instance.
model=Model(inputs=[inputs1,inputs2],outputs=outputs)



In [None]:
# Print the layer-by-layer summary of the captioning model.
model.summary()

In [None]:
# Inspect the layer at position 2, whose weights are set in the next cell.
model.layers[2]

In [None]:
# Locate the embedding layer by type instead of by position: model.layers[2]
# silently targets the wrong layer if the architecture is ever edited.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))
# Load the pretrained vectors and keep them fixed during training.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [None]:
# Compile with categorical cross-entropy (targets are one-hot) and the 'adam' optimizer.
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [None]:
# Training configuration.
epochs=9
number_pics_per_batch=3
# Batches per pass over the data (integer division drops any remainder).
# NOTE(review): this is computed before the next cell deletes two keys from
# `descriptions`, so len(descriptions) is slightly larger here.
steps=len(descriptions)//number_pics_per_batch

In [None]:
# Remove the entries created by the CSV header row and by empty lines.
# pop(..., None) keeps the cell re-runnable; `del` raised KeyError the second time.
descriptions.pop('image_name', None)
descriptions.pop('', None)

In [None]:
 # Make sure the checkpoint directory exists; saving fails when the target
 # directory is missing.
 os.makedirs('./model_weights', exist_ok=True)
 # Save the untrained model as checkpoint 0.
 model.save('./model_weights/model_'+str(0)+'.h5')

In [None]:
# Train one pass at a time and write a checkpoint after each pass.
# range(epochs + 1) runs epochs + 1 passes (indices 0..epochs).
for i in range(0, epochs + 1):
    checkpoint_path = './model_weights/model_' + str(i) + '.h5'
    generator = data_generator(
        descriptions,
        train_features,
        wordtoix,
        max_length,
        number_pics_per_batch,
    )
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save(checkpoint_path)

In [None]:
# Load the checkpoint written after pass index 7.
# NOTE(review): `new_model` is not referenced by any later cell shown here;
# training continues on `model`. Confirm whether resuming from this checkpoint
# was intended.
import tensorflow as tf
new_model=tf.keras.models.load_model('./model_weights/model_7.h5')

In [None]:
# Recompile for the second training phase with an explicit learning rate of 1e-4.
model.compile(loss='categorical_crossentropy',optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.0001))
epochs = 10
# Fixed typo: this was `number_pics_per_bath`, which left
# `number_pics_per_batch` (the value the training loop passes to the
# generator) unchanged while `steps` was computed for batches of 6.
number_pics_per_batch = 6
steps = len(descriptions)//number_pics_per_batch

In [None]:
# Second training phase; checkpoints continue the numbering at 10.
# range(epochs + 1) runs epochs + 1 passes (indices 0..epochs).
for i in range(0, epochs + 1):
    checkpoint_path = './model_weights/model_' + str(i + 10) + '.h5'
    generator = data_generator(
        descriptions,
        train_features,
        wordtoix,
        max_length,
        number_pics_per_batch,
    )
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save(checkpoint_path)

In [None]:
# Replace `model` with a network loaded from disk and recompile it.
# NOTE(review): no cell shown here writes 'final_model.h5' (checkpoints are
# saved as model_<n>.h5); the file presumably has to be created by hand.
import tensorflow as tf
model=tf.keras.models.load_model('./model_weights/final_model.h5')
model.compile(loss='categorical_crossentropy',optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.0001))


In [None]:
def greedySearch(photo):
    """Generate a caption for ``photo`` by repeatedly taking the most likely next word.

    Starting from 'startseq', the text so far is mapped to word indices,
    padded to ``max_length`` and fed to ``model`` together with ``photo``.
    The arg-max word is appended, and decoding stops at 'endseq', after
    ``max_length`` steps, or when the predicted index has no word.

    Parameters
    ----------
    photo : numpy.ndarray
        Image features in the form ``model`` expects as its first input.

    Returns
    -------
    str
        The generated words joined by spaces, without the 'startseq' and
        'endseq' markers.
    """
    # NOTE(review): words absent from `wordtoix` (possibly 'startseq' itself)
    # are silently dropped from the model input - confirm the training
    # captions were wrapped in startseq/endseq.
    in_text = 'startseq'
    for _ in range(max_length):
        seq = [wordtoix[w] for w in in_text.split() if w in wordtoix]
        seq = pad_sequences([seq], maxlen=max_length)
        yhat = model.predict([photo, seq], verbose=0)
        # Index 0 is never assigned to a word; look it up defensively instead
        # of raising KeyError.
        word = ixtoword.get(int(np.argmax(yhat)))
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break

    words = in_text.split()[1:]  # drop the leading 'startseq'
    # Only strip the terminator when it was actually produced; slicing with
    # [1:-1] unconditionally discarded a real word whenever decoding hit the
    # length limit.
    if words and words[-1] == 'endseq':
        words = words[:-1]
    return ' '.join(words)

In [None]:
# Folder with the images to caption (trailing slash is relied on when the prefix is sliced off).
test_folder='./test/'
test_images=glob.glob(test_folder+'*.jpg')

In [None]:
# Encode every test image, keyed by the path with the folder prefix removed.
start = time()
encoding_test = {}
prefix_len = len(test_folder)
for img in test_images:
    file_name = img[prefix_len:]
    encoding_test[file_name] = encode(img)
    print("time tken in second=", time() - start)

In [None]:
# Caption every encoded test image.
pics = list(encoding_test.keys())
for pic in pics:
    # Use `photo`, not `image`: assigning to `image` overwrote the keras
    # `image` module that preprocess() relies on, so encoding any further
    # picture after this cell failed.
    photo = encoding_test[pic].reshape((1, 2048))
    print("greedy:", greedySearch(photo))
    print('\n')