In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the project folder: the dataset, weight and
# embedding paths used by the following cells are relative, so they resolve
# against whatever directory is current here.
%cd '/content/drive/MyDrive/Image_Caption'

In [1]:
import tensorflow as tf
from keras.preprocessing import image
from keras.applications.resnet50 import ResNet50
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, RepeatVector, Dense, LSTM
from keras.layers import Embedding, Dropout, TimeDistributed, Concatenate
from keras.layers import Activation
from keras.optimizers import Adam
from keras.layers import add
from keras.callbacks import ModelCheckpoint
import os
import numpy as np
from PIL import Image




In [23]:
folder = "Flickr_Data/Flickr_Data/Images/"
# Sorted so that the processing order (and the insertion order of the feature
# dictionary) does not depend on the order the filesystem lists entries in.
images = sorted(os.listdir(folder))

# ResNet50 initialised from a weights file stored next to the notebook.
image_model = ResNet50(weights='ResNet50/resnet50_weights_tf_dim_ordering_tf_kernels.h5')
# Feature extractor: same input as ResNet50, but the output is taken from the
# second-to-last layer, i.e. the final classification layer is dropped.
model_new = tf.keras.Model(image_model.input, image_model.layers[-2].output)

# Store image features in a dictionary keyed by image file name.
img_features = dict()
for img in images:
    img_path = os.path.join(folder, img)
    if not os.path.isfile(img_path):
        # Sub-directories or other non-file entries cannot be loaded as images.
        continue
    img1 = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img1)
    # The model expects a batch axis, so wrap the single image in a batch of 1.
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    # verbose=0: otherwise one progress bar is printed for every single image.
    fea_x = model_new.predict(x, verbose=0)
    # Drop the batch axis again so each image maps to a flat 1-D feature vector.
    fea_x1 = np.reshape(fea_x, fea_x.shape[1])
    img_features[img] = fea_x1



In [24]:
fn = "Flickr_Data/Flickr_Data/Flickr_TextData/Flickr8k.token.txt"
# Context manager so the file handle is closed even if reading fails.
with open(fn, 'r') as f:
    capts = f.read()

# Group all captions by image file name, for reference.
# A usable line has the form "<image file>#<suffix>\t<caption text>": the part
# before '#' is the dictionary key and the text after the tab is the caption.
captions = dict()

for line in capts.split("\n"):
    txt = line.split('\t')
    if len(txt) < 2:
        # Blank line (e.g. the trailing newline) or a line without a tab:
        # skip it and keep going instead of silently abandoning the rest of
        # the file.
        continue
    image_fn = txt[0].split('#')[0]
    captions.setdefault(image_fn, []).append(txt[1])
    

def getCaptions(path):
    """Build the cleaned caption lists for the images named in a split file.

    Args:
        path: Text file with one image file name per line.

    Returns:
        dict mapping each listed image file name to its captions, taken from
        the module-level ``captions`` dictionary. Every caption keeps only its
        purely alphabetic space-separated tokens and is wrapped as
        "startseq ... endseq". Blank lines and image names that have no entry
        in ``captions`` are skipped; processing continues with the next line.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    # Context manager so the file handle is always released.
    with open(path, 'r') as f:
        capts = f.read()

    desc = dict()
    for line in capts.split("\n"):
        image_id = line.strip()
        if not image_id or image_id not in captions:
            # One unknown or empty id must not discard all the following ones.
            continue

        for des in captions[image_id]:
            # Drop punctuation and tokens containing digits or symbols.
            w = [word for word in des.split(" ") if word.isalpha()]
            desc.setdefault(image_id, []).append("startseq " + " ".join(w) + " endseq")

    return desc

# Caption dictionaries for the training list and the dev (validation) list,
# built in that order.
train_caps, val_caps = (
    getCaptions(split_file)
    for split_file in (
        "Flickr_Data/Flickr_Data/Flickr_TextData/Flickr_8k.trainImages.txt",
        "Flickr_Data/Flickr_Data/Flickr_TextData/Flickr_8k.devImages.txt",
    )
)

In [25]:
# Sanity check: show the type and the first four entries of each caption split.
for pos, (name, caps) in enumerate((("train_caps", train_caps), ("val_caps", val_caps))):
    if pos:
        # Blank separator line between the two previews.
        print("")
    print(name + " type: ", type(caps))
    print("First few (key, value) pairs of " + name + ":")
    for i, (k, v) in enumerate(caps.items()):
        print(k, v)
        if i>2:
            break

train_caps type:  <class 'dict'>
First few (key, value) paris of train_caps:
2513260012_03d33305cf.jpg ['startseq A black dog is running after a white dog in the snow endseq', 'startseq Black dog chasing brown dog through snow endseq', 'startseq Two dogs chase each other across the snowy ground endseq', 'startseq Two dogs play together in the snow endseq', 'startseq Two dogs running through a low lying body of water endseq']
2903617548_d3e38d7f88.jpg ['startseq A little baby plays croquet endseq', 'startseq A little girl plays croquet next to a truck endseq', 'startseq The child is playing croquette by the truck endseq', 'startseq The kid is in front of a car with a put and a ball endseq', 'startseq The little boy is playing with a croquet hammer and ball beside the car endseq']
3338291921_fe7ae0c8f8.jpg ['startseq A brown dog in the snow has something hot pink in its mouth endseq', 'startseq A brown dog in the snow holding a pink hat endseq', 'startseq A brown dog is holding a pink sh

In [26]:
# Flatten the per-image caption lists into one list of training sentences.
train_captions = [caption for desc_list in train_caps.values() for caption in desc_list]

# Tokenize top 5000 words in Train Captions.
# num_words only limits which word ids the tokenizer emits when encoding text;
# word_index still lists every word seen, and vocab_size is recomputed from it
# in the next cell.
vocab_size = 5000
# Raw string: the filter set contains a literal backslash followed by ']',
# which is an invalid escape sequence in a normal string literal.
# '<' and '>' are not in the filter set, so the "<unk>" token is kept intact.
tokenizer = Tokenizer(num_words=vocab_size,
                      oov_token="<unk>",
                      filters=r'!"#$%&()*+.,-/:;=?@[\]^_`{|}~ ')
tokenizer.fit_on_texts(train_captions)
# Convenience aliases for the two lookup directions (word -> id, id -> word).
word_index = tokenizer.word_index
index_word = tokenizer.index_word

In [27]:
# One slot more than the number of known words: the recorded word ids start at
# 1, so id 0 stays free (pad_sequences fills with 0 by default).
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: {}'.format(vocab_size))

Vocabulary Size: 7278


In [28]:
# Sanity check: show the type and the first four entries of both vocabulary
# lookup tables.
for pos, (name, table) in enumerate((("word_index", word_index), ("index_word", index_word))):
    if pos:
        # Blank separator line between the two previews.
        print("")
    print(name + " type: ", type(table))
    print("First few (key, value) pairs of " + name + ":")
    for i, (k, v) in enumerate(table.items()):
        print(k, v)
        if i>2:
            break

word_index type:  <class 'dict'>
First few (key, value) paris of word_index:
<unk> 1
a 2
startseq 3
endseq 4

index_word type:  <class 'dict'>
First few (key, value) pairs of index_word:
1 <unk>
2 a
3 startseq
4 endseq


In [29]:
# Per-split feature dictionaries: image file name -> feature vector, restricted
# to the images that have captions in the respective split.
train_fns = list(train_caps.keys())
train_set = {k: img_features[k] for k in train_fns}
val_fns = list(val_caps.keys())
val_set = {k: img_features[k] for k in val_fns}

fn_test = "Flickr_Data/Flickr_Data/Flickr_TextData/Flickr_8k.testImages.txt"
# Context manager so the file handle is closed after reading.
with open(fn_test, 'r') as f:
    t = f.read()

test_fns = t.split("\n")
# Skip blank entries instead of blindly dropping the last element: this works
# whether the file ends with zero, one or several newlines.
test_set = {k.strip(): img_features[k.strip()] for k in test_fns if k.strip()}

In [32]:
# Sanity check: show the type and the first four entries of each feature split.
feature_splits = (("train_set", train_set), ("val_set", val_set), ("test_set", test_set))
for pos, (name, split) in enumerate(feature_splits):
    if pos:
        # Blank separator line between consecutive previews.
        print("")
    print(name + " type: ", type(split))
    print("First few (key, value) pairs of " + name + ":")
    for i, (k, v) in enumerate(split.items()):
        print(k, v)
        if i>2:
            break

train_set type:  <class 'dict'>
First few (key, value) paris of train_set:
2513260012_03d33305cf.jpg [0.3470798  0.51159126 0.08728857 ... 1.1897299  0.04043837 0.1527107 ]
2903617548_d3e38d7f88.jpg [0.         0.00307793 0.09360002 ... 0.43103483 0.37985367 0.91657543]
3338291921_fe7ae0c8f8.jpg [0.7674452  0.5342245  0.38325223 ... 0.0209866  0.18429367 0.04270015]
488416045_1c6d903fe0.jpg [6.2481093e-01 3.2965469e-01 1.7831034e-04 ... 1.2753816e+00 4.6392554e-01
 4.9036142e-01]

val_set type:  <class 'dict'>
First few (key, value) pairs of val_set:
2090545563_a4e66ec76b.jpg [0.19071439 0.23291121 0.         ... 0.01300229 1.470331   0.03179051]
3393035454_2d2370ffd4.jpg [0.02049527 0.49463907 0.6934476  ... 0.         0.09734656 0.        ]
3695064885_a6922f06b2.jpg [0.26762405 0.6193744  0.01225693 ... 0.3813506  1.8359742  1.2442331 ]
1679557684_50a206e4a9.jpg [0.5639569  0.46300268 0.47889507 ... 0.41534156 0.86708325 0.23478833]

test_set type:  <class 'dict'>
First few (key, val

In [33]:
def create_sequences(tokenizer, max_length, desc_list, photo, vocab_size):
    """Expand one image's captions into next-word training samples.

    Every caption is encoded with ``tokenizer``; for each position after the
    first token, the tokens before it (padded to ``max_length``) become the
    text input and the token at that position, one-hot encoded over
    ``vocab_size`` classes, becomes the target. ``photo`` is repeated once per
    sample.

    Returns:
        Tuple of three arrays with one row per sample: image inputs, padded
        text inputs and one-hot targets.
    """
    image_inputs, text_inputs, targets = [], [], []
    for caption in desc_list:
        token_ids = tokenizer.texts_to_sequences([caption])[0]
        # Position `cut` splits the caption into "seen so far" / "next word".
        for cut, next_id in enumerate(token_ids[1:], start=1):
            padded_prefix = pad_sequences([token_ids[:cut]], maxlen=max_length)[0]
            one_hot_next = to_categorical([next_id], num_classes=vocab_size)[0]
            image_inputs.append(photo)
            text_inputs.append(padded_prefix)
            targets.append(one_hot_next)
    return np.array(image_inputs), np.array(text_inputs), np.array(targets)

In [34]:
# Fixed sequence length shared by the pipeline: caption prefixes are padded to
# it, and the model's RepeatVector / Embedding layers are sized with it.
# NOTE(review): 34 is presumably the token count of the longest training
# caption — confirm against the data.
max_length = 34

In [35]:
def data_generator(descriptions, photos, tokenizer, max_length, vocab_size):
    """Endlessly yield one training batch per image.

    Cycles over ``descriptions`` (image id -> captions) forever. For each
    image, the feature vector is looked up in ``photos`` and expanded with
    ``create_sequences``; the yielded item is
    ``([image_inputs, text_inputs], targets)``.
    """
    while True:
        for image_id, image_captions in descriptions.items():
            image_batch, prefix_batch, next_word_batch = create_sequences(
                tokenizer, max_length, image_captions, photos[image_id], vocab_size
            )
            yield [image_batch, prefix_batch], next_word_batch

In [36]:
# Pull a single batch from a fresh generator and print the shape of each part:
# image inputs, text inputs, then targets.
generator = data_generator(train_caps, train_set, tokenizer, max_length, vocab_size)
inputs, outputs = next(generator)
image_batch, prefix_batch = inputs
for batch_part in (image_batch, prefix_batch, outputs):
    print(batch_part.shape)

(50, 2048)
(50, 34)
(50, 7278)


In [37]:
# word -> embedding vector, read from the GloVe text file
# (one word followed by its coefficients per line).
embeddings_index = {}
# Context manager so the file is closed even if a line fails to parse.
with open("Glove/glove.6B.200d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

# One 200-dim row per tokenizer word id (plus the unused row 0).
vocab_size = len(word_index) + 1
embedding_dim = 200
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Words without a GloVe vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


In [38]:
# Image branch: project the 2048-dim feature vector to embedding_dim and repeat
# it max_length times so it can be concatenated with the text branch step by
# step. This rebinds `image_model` (previously the ResNet50 instance); the
# feature extractor `model_new` keeps its own reference to that network.
image_model = tf.keras.models.Sequential()

image_model.add(Dense(embedding_dim, input_shape=(2048,), activation='relu'))
image_model.add(RepeatVector(max_length))

# Text branch: embedding -> LSTM -> per-timestep projection to embedding_dim.
# Keep a direct handle on the embedding layer so its weights can be set without
# relying on the layer's position inside the merged model.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length)

language_model = tf.keras.models.Sequential()

language_model.add(embedding_layer)
language_model.add(LSTM(256, return_sequences=True))
language_model.add(TimeDistributed(Dense(embedding_dim)))

# Decoder: merge both branches along the feature axis and predict a
# distribution over the vocabulary for the next word.
conca = Concatenate()([image_model.output, language_model.output])
x = LSTM(128, return_sequences=True)(conca)
x = LSTM(512, return_sequences=False)(x)
x = Dense(vocab_size)(x)
out = Activation('softmax')(x)
model_1 = Model(inputs=[image_model.input, language_model.input], outputs = out)

# Load the pre-computed embedding matrix and freeze it. Freezing must happen
# before compile() for it to take effect in training.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

model_1.compile(loss='categorical_crossentropy', optimizer = Adam(learning_rate = 0.0001), metrics=['accuracy'])

In [39]:
epochs = 50
# The generator yields one batch per image, so one step per captioned training
# image makes each fit() call a full pass over the training set.
steps = len(train_caps)
for epoch in range(epochs):
    # Start every pass from a fresh generator positioned at the first image.
    generator = data_generator(train_caps, train_set, tokenizer, max_length, vocab_size)
    # Train for exactly one pass.
    model_1.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)




KeyboardInterrupt: 

In [None]:
# Save model_1 to an .h5 file; the path is relative, so the file is written to
# the current working directory.
model_1.save("Image_Caption_Trained_model.h5")
# translate a word id back into its word
def word_for_id(integer, tokenizer):
    """Return the word stored under ``integer`` in ``tokenizer.index_word``, or None if absent."""
    id_to_word = tokenizer.index_word
    if integer in id_to_word:
        return id_to_word[integer]
    return None

# produce a caption for one image, word by word
def generate_desc(model, tokenizer, photo, max_length):
    """Greedily decode a caption for a single image feature vector.

    Starts from 'startseq' and, for at most ``max_length`` rounds, encodes the
    text produced so far, pads it to ``max_length``, asks ``model`` for the
    next-word scores and appends the highest-scoring word. Decoding stops early
    when the predicted id has no word or when 'endseq' has been appended.

    Returns:
        The generated text, including the leading 'startseq' (and 'endseq' if
        it was predicted).
    """
    # The model consumes batches, so give the single photo a batch axis.
    photo_batch = np.expand_dims(photo, axis=0)
    caption = 'startseq'
    for _ in range(max_length):
        # Encode and pad everything generated so far.
        token_ids = tokenizer.texts_to_sequences([caption])[0]
        padded = pad_sequences([token_ids], maxlen=max_length)
        scores = model.predict([photo_batch, padded], verbose=0)
        # Greedy choice: take the id with the highest score.
        next_word = word_for_id(np.argmax(scores), tokenizer)
        if next_word is None:
            # The predicted id has no word attached; nothing sensible to append.
            break
        caption = caption + ' ' + next_word
        if next_word == 'endseq':
            break
    return caption

In [None]:
# test the predict function
# Take a feature vector that earlier cells have already computed. `photo` is
# only assigned by a later cell, so relying on it here would fail on a fresh
# top-to-bottom run.
sample_photo = next(iter(train_set.values()))
tmpimg1 = np.expand_dims(np.array(sample_photo), axis=0)
print(tmpimg1.shape)
# Seed the text input with the start token, looked up by name rather than by a
# hard-coded id.
tmpcap1 = pad_sequences([[word_index['startseq']]], maxlen=max_length)
print(tmpcap1.shape)
tmpout1 = model_1.predict([tmpimg1, tmpcap1], verbose=0)
print(tmpout1.shape)
print(tmpout1)

In [2]:
def image_to_feat_vec(imagePath):
    """Return the flat feature vector that ``model_new`` produces for one image file.

    The image is loaded at 224x224, converted to an array, given a batch axis,
    run through ``preprocess_input`` and then through ``model_new``; the batch
    axis of the prediction is removed before returning.
    """
    pixels = image.img_to_array(image.load_img(imagePath, target_size=(224, 224)))
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    features = model_new.predict(batch)
    return np.reshape(features, features.shape[1])

# Caption one sample image: extract its features, decode a caption, print it,
# and leave the opened image as the cell's last expression so it is displayed.
imagePath = "GarageImages/GarageImages/image1086.jpg"
photo = image_to_feat_vec(imagePath)
predicted_caption = generate_desc(model_1, tokenizer, photo, max_length)
print("Predicted Caption:", predicted_caption)
Image.open(imagePath)

NameError: name 'image' is not defined

In [1]:
# Same demo for a second image.
# NOTE(review): this path points outside the working directory, unlike the
# other data paths in this notebook — confirm it exists in the target runtime.
imagePath = "../input/garage-detection-unofficial-ssl-challenge/GarageImages/GarageImages/image1185.jpg"
photo = image_to_feat_vec(imagePath)
predicted_caption = generate_desc(model_1, tokenizer, photo, max_length)
print("Predicted Caption:", predicted_caption)
Image.open(imagePath)

NameError: name 'image_to_feat_vec' is not defined