In [1]:
def readFile(path,encoding="utf-8"):
    """Return the full text content of the file at ``path``.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The whole file as a single string.
    """
    with open(path,encoding=encoding) as f:
        return f.read()

In [2]:
# Read the Flickr8k token file into one string; it is split into lines in the next cell.
captions=readFile("flickr8k/Flickr_Data/Flickr_Data/Flickr_TextData/Flickr8k.token.txt")

In [3]:
 # One entry per line. Blank lines are skipped explicitly instead of blindly
 # dropping the last element, which loses a real caption when the file has no
 # trailing newline and lets interior blank lines through to the parser.
 captions=[line for line in captions.split("\n") if line.strip()]

In [4]:
# Report how many caption lines were loaded.
print(len(captions))

40460


In [5]:
import re

In [6]:
def cleanSentence(sentence):
    """Normalise a caption to lowercase alphabetic words.

    The text is lowercased, every run of characters outside a-z is turned
    into a single space, and one-letter tokens are discarded.

    Args:
        sentence: Raw caption text.

    Returns:
        The remaining words joined by single spaces.
    """
    lowered=sentence.lower()
    words=re.sub(r"[^a-z]+"," ",lowered).split()
    return " ".join(word for word in words if len(word)>1)

In [7]:
# Group the cleaned captions by image id (the file name without its extension).
description={}
for line in captions:
    fields=line.split("\t")
    image_key=fields[0].split(".")[0]
    description.setdefault(image_key,[]).append(cleanSentence(fields[1]))

In [8]:
# Spot-check the cleaned captions stored for one image id.
description.get("1000268201_693b08cb0e")

['child in pink dress is climbing up set of stairs in an entry way',
 'girl going into wooden building',
 'little girl climbing into wooden playhouse',
 'little girl climbing the stairs to her playhouse',
 'little girl in pink dress going into wooden cabin']

# Creating a vocabulary

In [10]:
# Flatten every cleaned caption into one list of words (duplicates kept so
# that frequencies can be counted afterwards). Built directly instead of using
# a comprehension purely for its append side effect.
tot_word=[
    word
    for sentences in description.values()
    for sentence in sentences
    for word in sentence.split()
]

In [11]:
# Print the total number of words across all cleaned captions.
print(len(tot_word))

373837


In [12]:
import collections

In [13]:
# Count the occurrences of each word and keep the result as a plain dict.
freq_cnt=dict(collections.Counter(tot_word))

In [17]:
# Keep only the words that occur more than `threshold` times.
# Note: from here on `tot_word` holds the vocabulary, not every word occurrence.
threshold=10
vocab=freq_cnt.items()
new_vocab=[(word,count) for word,count in vocab if count>threshold]
tot_word=[word for word,_count in new_vocab]

In [18]:
# Print the size of the thresholded vocabulary.
print(len(tot_word))

1845


# Reading train and test data files

In [54]:
# Load the train/test split files; each is read as a single string here and
# turned into a list of image ids in the next cell.
train=readFile("./flickr8k/Flickr_Data/Flickr_Data/Flickr_TextData/Flickr_8k.trainImages.txt")
test=readFile("./flickr8k/Flickr_Data/Flickr_Data/Flickr_TextData/Flickr_8k.testImages.txt")

In [55]:
# Turn each split file into a list of image ids (file name without extension).
# Blank lines are skipped explicitly rather than dropping the last element, so
# a file without a trailing newline does not lose its final image.
train=[name.split(".")[0] for name in train.split("\n") if name.strip()]
test=[name.split(".")[0] for name in test.split("\n") if name.strip()]

In [56]:
# Preview the first ten training image ids.
train[:10]

['2513260012_03d33305cf',
 '2903617548_d3e38d7f88',
 '3338291921_fe7ae0c8f8',
 '488416045_1c6d903fe0',
 '2644326817_8f45080b87',
 '218342358_1755a9cce1',
 '2501968935_02f2cd8079',
 '2699342860_5288e203ea',
 '2638369467_8fc251595b',
 '2926786902_815a99a154']

In [132]:
# Wrap every training caption in explicit start/end markers.
train_descriptions={
    img_id:["startseq "+sentence+" endseq" for sentence in description[img_id]]
    for img_id in train
}

In [133]:
# Show the wrapped captions of one training image.
# (The original cell referenced the misspelt, undefined name `train_desciptions`.)
train_descriptions["2513260012_03d33305cf"]

['startseq black dog is running after white dog in the snow endseq',
 'startseq black dog chasing brown dog through snow endseq',
 'startseq two dogs chase each other across the snowy ground endseq',
 'startseq two dogs play together in the snow endseq',
 'startseq two dogs running through low lying body of water endseq']

# Extracting features using transfer learning

In [62]:
from keras.applications.resnet50 import ResNet50
import numpy as np


In [65]:
# Load ResNet50 with ImageNet weights for 224x224 RGB inputs and list its layers.
# NOTE: the name `model` is reused later in this notebook for the captioning model.
model=ResNet50(weights='imagenet',input_shape=(224,224,3))
model.summary()

Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_2[0][0]                    
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
bn_conv1 (BatchNormalization)   (None, 112, 112, 64) 256         co

In [71]:
from keras.models import *
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing import image

In [70]:
# Feature extractor: same input as ResNet50, but stops at the second-to-last layer.
model_new=Model(model.input,model.layers[-2].output)

In [83]:
def preprocessImage(img):
    """Load an image file and prepare it as a one-image batch for ResNet50.

    Args:
        img: Path of the image file to load.

    Returns:
        An array of shape (1, 224, 224, 3) that has been passed through the
        ResNet50 ``preprocess_input`` function.
    """
    # target_size is (height, width); the channel count is not part of it.
    img=image.load_img(img,target_size=(224,224))
    img=image.img_to_array(img)
    # Add the leading batch axis expected by model.predict.
    img=np.expand_dims(img,axis=0)
    img=preprocess_input(img)
    return img

In [84]:
# Directory holding the Flickr8k image files (note the trailing slash).
IMG_PATH="./flickr8k/Flickr_Data/Flickr_Data/Images/"

In [90]:
def encodeImage(img):
    """Encode one image file into a flat feature vector.

    Args:
        img: Path of the image file.

    Returns:
        The output of ``model_new`` for the preprocessed image, flattened to
        one dimension.
    """
    batch=preprocessImage(img)
    features=model_new.predict(batch)
    return features.reshape((-1))

In [95]:
# Check the shape of the encoding produced for a single image.
encodeImage(IMG_PATH+"2699342860_5288e203ea.jpg").shape

(2048,)

In [99]:
# Encode every training image, reporting progress every 100 images.
train_encodings={}
n_train=len(train)
for i,img_id in enumerate(train):
    train_encodings[img_id]=encodeImage("{}/{}.jpg".format(IMG_PATH,img_id))
    if not i%100:
        print("encoding image {}/{}".format(i,n_train))

encoding image 0/6000
encoding image 100/6000
encoding image 200/6000
encoding image 300/6000
encoding image 400/6000
encoding image 500/6000
encoding image 600/6000
encoding image 700/6000
encoding image 800/6000
encoding image 900/6000
encoding image 1000/6000
encoding image 1100/6000
encoding image 1200/6000
encoding image 1300/6000
encoding image 1400/6000
encoding image 1500/6000
encoding image 1600/6000
encoding image 1700/6000
encoding image 1800/6000
encoding image 1900/6000
encoding image 2000/6000
encoding image 2100/6000
encoding image 2200/6000
encoding image 2300/6000
encoding image 2400/6000
encoding image 2500/6000
encoding image 2600/6000
encoding image 2700/6000
encoding image 2800/6000
encoding image 2900/6000
encoding image 3000/6000
encoding image 3100/6000
encoding image 3200/6000
encoding image 3300/6000
encoding image 3400/6000
encoding image 3500/6000
encoding image 3600/6000
encoding image 3700/6000
encoding image 3800/6000
encoding image 3900/6000
encoding ima

In [101]:
#store the data on the disk
import pickle

In [102]:
# Write the training-image encodings to disk with pickle.
with open("train_encodings.pkl","wb") as out_file:
    pickle.dump(train_encodings,out_file)

In [104]:
# Encode every test image, reporting progress every 100 images.
test_encodings={}
n_test=len(test)
for i,img_id in enumerate(test):
    test_encodings[img_id]=encodeImage("{}/{}.jpg".format(IMG_PATH,img_id))
    if not i%100:
        print("encoding image {}/{}".format(i,n_test))

encoding image 0/1000
encoding image 100/1000
encoding image 200/1000
encoding image 300/1000
encoding image 400/1000
encoding image 500/1000
encoding image 600/1000
encoding image 700/1000
encoding image 800/1000
encoding image 900/1000


In [105]:
# Write the test-image encodings to disk with pickle.
with open("test_encodings.pkl","wb") as out_file:
    pickle.dump(test_encodings,out_file)

In [106]:
# Start with empty word <-> index mappings (filled in a later cell).
word_2_index={}
index_2_word={}
# Add the sequence markers to the vocabulary. The membership check keeps this
# cell idempotent: re-running it no longer appends duplicate markers, which
# would otherwise inflate vocab_size.
for token in ('startseq','endseq'):
    if token not in tot_word:
        tot_word.append(token)

In [121]:
# One extra slot because word indices start at 1; index 0 is used as the padding value.
vocab_size=len(tot_word)+1
print(vocab_size)

1848


In [108]:
# Number the vocabulary from 1 upwards and record the mapping in both directions.
for idx,word in enumerate(tot_word,start=1):
    word_2_index[word]=idx
    index_2_word[idx]=word

In [120]:
# Check one entry in both directions: word -> index and index -> word.
print(word_2_index["beautiful"],index_2_word[435])

435 beautiful


In [118]:
# Find the length, in tokens (startseq/endseq included), of the longest
# training caption. Uses `train_descriptions`; the original cell referenced the
# misspelt, undefined name `train_desciptions`.
maxlen=max(
    (len(sentence.split()) for sentences in train_descriptions.values() for sentence in sentences),
    default=0,
)
print(maxlen)

35


In [128]:
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

# Create a data generator

In [129]:
def data_generator(train_descriptions,train_encodings,word_2_index,max_len,batch_size,num_classes=None):
    """Endlessly yield training batches for the captioning model.

    Every caption is expanded into (prefix, next word) samples: for a caption
    encoded as ``seq``, each ``seq[:i]`` is paired with ``seq[i]``. Words that
    are missing from ``word_2_index`` are skipped.

    Args:
        train_descriptions: Mapping of image id -> list of caption strings.
        train_encodings: Mapping of image id -> image feature vector.
        word_2_index: Mapping of word -> integer index.
        max_len: Length to which every caption prefix is post-padded with 0.
        batch_size: Number of *images* (not samples) gathered per batch.
        num_classes: Width of the one-hot target vectors. Defaults to the
            largest index in ``word_2_index`` plus one, so the function no
            longer depends on a global ``vocab_size``.

    Yields:
        ``([image_features, padded_prefixes], one_hot_next_words)`` where each
        element is a NumPy array holding all samples of the batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (the loop would
            otherwise spin forever without yielding).
    """
    if batch_size<1:
        raise ValueError("batch_size must be at least 1")
    if num_classes is None:
        num_classes=max(word_2_index.values(),default=0)+1
    X1,X2,y=[],[],[]
    n=0
    while True:
        for key,desc_list in train_descriptions.items():
            n+=1
            photo=train_encodings[key]
            for desc in desc_list:
                seq=[word_2_index[word] for word in desc.split() if word in word_2_index]
                for i in range(1,len(seq)):
                    xi=pad_sequences([seq[:i]],maxlen=max_len,value=0,padding='post')[0]
                    yi=to_categorical([seq[i]],num_classes=num_classes)[0]
                    X1.append(photo)
                    X2.append(xi)
                    y.append(yi)
            if n==batch_size:
                # Keras documents generator output as an (inputs, targets) tuple.
                yield ([np.array(X1),np.array(X2)],np.array(y))
                X1,X2,y=[],[],[]
                n=0
                    
            

In [139]:
# Open the GloVe vectors file as UTF-8 text; it is read line by line in the next cell.
f= open("./glove.6B.50d.txt",encoding='utf8')

In [140]:
# Parse the opened GloVe file: the first token of each line is the word, the
# remaining tokens are its vector components.
embedding_index={}
try:
    for line in f:
        values=line.split()
        word=values[0]
        embeddings=np.array(values[1:],dtype='float32')
        embedding_index[word]=embeddings
finally:
    # The handle was opened without a `with` block, so release it here.
    f.close()

In [143]:
# Check the dimensionality of one loaded word vector.
print(embedding_index['beautiful'].shape)

(50,)


In [144]:
def getEmbeddings(emb_dim=50,index_to_word=None,embeddings=None,num_rows=None):
    """Build the embedding matrix for the vocabulary.

    Row ``idx`` holds the pre-trained vector of the word mapped to ``idx``;
    rows whose word has no pre-trained vector stay all zeros.

    Args:
        emb_dim: Dimensionality of the word vectors (number of columns).
        index_to_word: Mapping of index -> word. Defaults to the notebook's
            global ``index_2_word``.
        embeddings: Mapping of word -> vector. Defaults to the notebook's
            global ``embedding_index``.
        num_rows: Number of rows in the matrix. Defaults to the notebook's
            global ``vocab_size``.

    Returns:
        A NumPy array of shape ``(num_rows, emb_dim)``.
    """
    if index_to_word is None:
        index_to_word=index_2_word
    if embeddings is None:
        embeddings=embedding_index
    if num_rows is None:
        num_rows=vocab_size
    matrix=np.zeros((num_rows,emb_dim))
    for idx,word in index_to_word.items():
        embedding_vector=embeddings.get(word)
        if embedding_vector is not None:
            matrix[idx]=embedding_vector
    return matrix

In [148]:
# Build the embedding matrix for the vocabulary and check its shape.
embedding_matrix=getEmbeddings()
print(embedding_matrix.shape)

(1848, 50)


In [165]:
from keras.layers import *

In [167]:
# Image branch: 2048-d feature vector -> dropout -> 256-unit dense layer.
input_img=Input(shape=(2048,))
input_img1=Dropout(0.4)(input_img)
input_img2=Dense(256,activation='relu')(input_img1)

In [168]:
# Caption branch: padded index sequence -> 50-d embedding -> dropout -> LSTM(256).
# mask_zero=True makes the embedding treat index 0 as padding.
input_cap=Input(shape=(maxlen,))
input_cap1=Embedding(input_dim=vocab_size,output_dim=50,mask_zero=True)(input_cap)
input_cap2=Dropout(0.4)(input_cap1)
input_cap3=LSTM(256)(input_cap2)

In [170]:
# Decoder: sum the two 256-d branches, then predict a distribution over the vocabulary.
decoder1=add([input_img2,input_cap3])
decoder2=Dense(256,activation='relu')(decoder1)
outputs=Dense(vocab_size,activation='softmax')(decoder2)

In [171]:
# Assemble the two-input captioning model.
# NOTE: this rebinds `model`, which previously referred to the ResNet50 network.
model=Model(inputs=[input_img,input_cap],outputs=outputs)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 35)           0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 2048)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 35, 50)       92400       input_5[0][0]                    
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 2048)         0           input_4[0][0]                    
__________________________________________________________________________________________________
dropout_3 

In [183]:
# Initialise the embedding layer with the pre-trained matrix and freeze it.
# The layer is looked up by type rather than by a hard-coded position in
# model.layers, which depends on how Keras orders the layers of the graph.
embedding_layer=next(layer for layer in model.layers if isinstance(layer,Embedding))
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable=False

In [189]:
# Compile with categorical cross-entropy (targets are one-hot vectors) and the Adam optimizer.
model.compile(loss='categorical_crossentropy',optimizer='adam')

# Training the model

In [185]:
epochs=20
# batch_size counts images per generator batch, not individual samples.
batch_size=3
# Count the images from train_descriptions (the mapping the generator walks)
# instead of `train`: that name is rebound to a function further down, which
# made this cell fail with a TypeError when re-run.
steps_per_epoch=len(train_descriptions)//batch_size

In [187]:
def train():
    """Train the captioning model and save a checkpoint after every epoch.

    Runs ``epochs`` passes; each pass creates a fresh data generator, fits the
    model for ``steps_per_epoch`` steps and writes the model to
    ``./model_weights/epoch<N>.h5``.

    NOTE: defining this function rebinds the name ``train``, which earlier in
    the notebook refers to the list of training image ids.
    """
    import os  # local import: only needed to create the checkpoint folder

    # Saving fails if the target directory does not exist, so create it first.
    os.makedirs("./model_weights",exist_ok=True)
    for i in range(epochs):
        generator=data_generator(train_descriptions,train_encodings,word_2_index,maxlen,batch_size)
        model.fit_generator(generator,steps_per_epoch=steps_per_epoch)
        model.save("./model_weights/epoch{}.h5".format(i+1))

In [188]:
# Start training; a model file is saved after each epoch.
train()

Instructions for updating:
Use tf.cast instead.
Epoch 1/1
  89/2000 [>.............................] - ETA: 15:44 - loss: 5.6936

KeyboardInterrupt: 