In [None]:
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import nltk

In [None]:
# Mount Google Drive at /content/drive; later cells read the GloVe file and the pickled
# training data from /content/drive/MyDrive and write model checkpoints there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


---
## Data Loader (Generator)

Create the training samples (image features plus a partial caption) and their corresponding target (the next word of the caption).

In [None]:
# Caption length in tokens: used as the padding length for partial captions in the data
# generator and as the shape of the caption Input layer.
# NOTE(review): this name shadows the built-in max() for the rest of the notebook. Later cells
# read it as `max`, so renaming it requires updating those cells as well.
max = 30
# Vocabulary size: number of classes of the one-hot target, number of rows of the embedding
# matrix, and width of the model's softmax output.
vocab_size = 1803
from keras.utils import pad_sequences, to_categorical
def data_generator(train_descriptions,Encoded_train,word_to_idx,max_length_caption,batch_size,num_classes=None):
    """Endlessly yield training batches for the captioning model.

    Every caption is converted to a list of word ids (words missing from
    ``word_to_idx`` are dropped). For each position ``i >= 1`` in that list one
    sample is produced: the image features, the first ``i`` ids padded with
    zeros to ``max_length_caption``, and the one-hot encoding of id ``i``.

    A batch is emitted once all captions of ``batch_size`` images have been
    processed, so the samples of one image are never split across batches.
    Images that contribute no samples do not trigger an (empty) batch; the
    batch is simply extended until it contains at least one sample.

    Args:
        train_descriptions: mapping ``image key -> list of caption strings``.
        Encoded_train: mapping ``image key -> image feature vector``.
        word_to_idx: mapping ``word -> integer id`` (0 is used for padding).
        max_length_caption: length every partial caption is padded to.
        batch_size: number of images (not samples) per yielded batch.
        num_classes: size of the one-hot target. Defaults to the
            notebook-level ``vocab_size`` when omitted.

    Yields:
        ``[[image_features, padded_partial_captions], one_hot_targets]`` where
        each element is a NumPy array with one row per sample.

    Raises:
        KeyError: if an image key is missing from ``Encoded_train``.
        ValueError: if a complete pass over ``train_descriptions`` produces no
            sample at all (otherwise the generator would loop forever).
    """
    # Resolve the target width once; fall back to the notebook-level constant.
    n_classes = vocab_size if num_classes is None else num_classes

    x1, x2, y = [], [], []
    images_in_batch = 0
    while True:
        samples_this_pass = 0
        for key, desc_list in train_descriptions.items():
            images_in_batch += 1
            photo = Encoded_train[key]
            for desc in desc_list:
                seq = [word_to_idx[word] for word in desc.split() if word in word_to_idx]
                for i in range(1, len(seq)):
                    # 0 denotes padding words
                    xi = pad_sequences([seq[:i]], maxlen=max_length_caption, value=0, padding="post")[0]
                    yi = to_categorical([seq[i]], num_classes=n_classes)[0]

                    x1.append(photo)
                    x2.append(xi)
                    y.append(yi)
                    samples_this_pass += 1

            # Checked once per image (after all of its captions). `>=` rather than `==`
            # so the counter cannot overshoot the threshold and stall the generator.
            if images_in_batch >= batch_size and y:
                yield [[np.array(x1), np.array(x2)], np.array(y)]
                x1, x2, y = [], [], []
                images_in_batch = 0

        if samples_this_pass == 0:
            raise ValueError(
                "data_generator: train_descriptions produced no training samples"
            )

## Word Embeddings

Using GloVe 6B (50d): pre-trained 50-dimensional word vectors for a 400K-word vocabulary, trained on a corpus of 6 billion tokens.

In [None]:
# Parse the GloVe text file into a {word: vector} lookup.
# Each line holds a word followed by its vector components, separated by whitespace.
embedding_idx = {}
# The context manager closes the file even if parsing a line raises.
with open("/content/drive/MyDrive/glove.6B.50d.txt",encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        embedding = np.array(values[1:],dtype='float')
        embedding_idx[word] = embedding

In [None]:
# Spot-check the parsed embeddings: .get returns the stored vector, or None if the word is absent.
embedding_idx.get('cannon')

array([-0.34126 , -0.06117 ,  0.93339 , -0.48938 , -0.16882 ,  0.76947 ,
        0.11972 , -0.11674 , -0.54492 , -0.44433 , -0.44316 , -0.10517 ,
       -0.086709,  0.13392 , -1.0489  , -0.10756 ,  0.4921  ,  0.62526 ,
       -1.5142  , -0.88477 ,  0.28845 ,  0.26258 ,  0.10793 , -0.92586 ,
       -0.043537, -0.67035 ,  0.22245 ,  0.28499 ,  0.32135 , -0.22713 ,
        1.0602  , -0.89463 , -0.14558 , -0.041262,  0.19645 ,  0.65909 ,
        0.85064 , -0.34087 ,  0.19177 ,  0.49908 ,  1.0036  ,  0.70362 ,
       -0.41316 , -0.47519 ,  0.94353 ,  0.14839 , -0.42612 , -0.5682  ,
       -0.6244  ,  0.30874 ])

In [None]:
def embedded_matrix(embedding_dim=50):
    """Build the initial weight matrix for the caption Embedding layer.

    Reads three notebook-level names, which must be defined before the call:
    ``vocab_size`` (number of rows), ``word_to_idx`` (word -> row index) and
    ``embedding_idx`` (word -> pre-trained vector).

    Args:
        embedding_dim: length of each pre-trained vector. Defaults to 50, the
            width of the GloVe file loaded by this notebook.

    Returns:
        A ``(vocab_size, embedding_dim)`` float array. Row ``idx`` holds the
        pre-trained vector of the word mapped to ``idx``; rows of words without
        a pre-trained vector (and unused rows) stay all-zero.
    """
    matrix = np.zeros((vocab_size, embedding_dim))
    for word, idx in word_to_idx.items():
        embedded_vector = embedding_idx.get(word)

        # Words missing from the pre-trained vocabulary keep their zero row.
        if embedded_vector is not None:
            matrix[idx] = embedded_vector

    return matrix

In [None]:
# The resulting array is stored under the same name as the builder function, so after the
# first run `embedded_matrix` is no longer callable. Guard the call so that re-running this
# cell does not raise "'numpy.ndarray' object is not callable".
# NOTE(review): a distinct name for the array would be cleaner, but later cells read
# `embedded_matrix`, so the name is kept here.
if callable(embedded_matrix):
    embedded_matrix = embedded_matrix()
embedded_matrix.shape

(1803, 50)

In [None]:
# Inspect one row of the embedding matrix (row index 1800).
# NOTE(review): this row was originally described as the start-of-sequence token, but the
# recorded output is identical to the vector printed earlier for 'cannon' — confirm which
# word maps to index 1800 in word_to_idx.
embedded_matrix[1800]

array([-0.34126 , -0.06117 ,  0.93339 , -0.48938 , -0.16882 ,  0.76947 ,
        0.11972 , -0.11674 , -0.54492 , -0.44433 , -0.44316 , -0.10517 ,
       -0.086709,  0.13392 , -1.0489  , -0.10756 ,  0.4921  ,  0.62526 ,
       -1.5142  , -0.88477 ,  0.28845 ,  0.26258 ,  0.10793 , -0.92586 ,
       -0.043537, -0.67035 ,  0.22245 ,  0.28499 ,  0.32135 , -0.22713 ,
        1.0602  , -0.89463 , -0.14558 , -0.041262,  0.19645 ,  0.65909 ,
        0.85064 , -0.34087 ,  0.19177 ,  0.49908 ,  1.0036  ,  0.70362 ,
       -0.41316 , -0.47519 ,  0.94353 ,  0.14839 , -0.42612 , -0.5682  ,
       -0.6244  ,  0.30874 ])

## Model Architecture

In [None]:
from keras.layers import Dense,Input, Dropout, Embedding, LSTM, add
#  Image feature extractor
# Takes one 2048-dimensional feature vector per image, applies dropout, then projects it to 256 units.
input_img_features = Input(shape=(2048,))
se1 = Dropout(0.3)(input_img_features)
se2 = Dense(256,activation='relu')(se1)

#  Partial caption processing
# Takes a sequence of `max` token ids (`max` is the notebook-level caption length, not the built-in).
# mask_zero=True makes the Embedding layer treat id 0 as padding, matching the zero padding the
# data generator applies to partial captions.
input_captions = Input(shape=(max,))
fe1 = Embedding(input_dim=vocab_size,output_dim=50,mask_zero=True)(input_captions)
fe2 = Dropout(0.3)(fe1)
lstm_layer = LSTM(256)(fe2)

# Merge the two branches and decode them
# NOTE: `add` sums the two 256-unit tensors element-wise; it does not concatenate them.
decoder = add([se2,lstm_layer])
decoder2 = Dense(256,activation='relu')(decoder)
# Softmax over the vocabulary: one probability per word id for the next word.
outputs = Dense(vocab_size,activation='softmax')(decoder2)

Combined Model

In [None]:
from keras import Model
# Assemble the two-input model: [image features, partial caption] -> next-word distribution.
IC_model = Model(inputs=[input_img_features,input_captions],outputs=outputs)
IC_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 30)]                 0         []                            
                                                                                                  
 input_1 (InputLayer)        [(None, 2048)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 30, 50)               90150     ['input_2[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 2048)                 0         ['input_1[0][0]']             
                                                                                              

In [None]:
# Locate the Embedding layer by type instead of by its position in IC_model.layers, so this
# cell keeps working if layers are added to or reordered in the model.
embedding_layer = next(layer for layer in IC_model.layers if isinstance(layer, Embedding))
# Load the pre-built embedding matrix and freeze it so training does not update the vectors.
# trainable is set before compile() so the compiled model picks it up.
embedding_layer.set_weights([embedded_matrix])
embedding_layer.trainable = False
IC_model.compile(loss='categorical_crossentropy',optimizer='adam')

---

## Training the Model

In [None]:
import pickle
# Load the pickled image-feature lookup; the data generator indexes it by image key.
# NOTE: pickle.load can execute arbitrary code — only load files you created or trust.
with open("/content/drive/MyDrive/Encoded_train.pkl","rb") as f:
    encoding_train = pickle.load(f)

In [2]:
# Load the word -> integer id mapping used to encode captions and to index the embedding matrix.
# NOTE(review): embedded_matrix() in an earlier cell reads word_to_idx, so on a fresh kernel
# this cell has to run before that one; consider moving it above the embeddings section.
# NOTE: pickle.load can execute arbitrary code — only load files you created or trust.
with open('/content/drive/MyDrive/word_to_idx.pkl', 'rb') as f:
    word_to_idx = pickle.load(f)

# Load the training captions; the data generator iterates it as
# {image key: list of caption strings}.
with open('/content/drive/MyDrive/train_descriptions.pkl', 'rb') as f:
    train_descriptions = pickle.load(f)

In [None]:
# Number of passes over the training data; train() saves one checkpoint per pass.
epochs = 20
# Number of images (keys of train_descriptions) the data generator groups into one batch.
batch_size = 3
# Batches drawn per epoch. Integer division: a remainder of fewer than batch_size images
# does not get a step of its own.
steps = len(train_descriptions)//batch_size

def train():
    """Fit IC_model one epoch at a time and save a checkpoint after every epoch.

    Uses the notebook-level settings (``epochs``, ``batch_size``, ``steps``, ``max``)
    and data (``train_descriptions``, ``encoding_train``, ``word_to_idx``).
    Checkpoints are written as ``model_<epoch index>.h5`` under
    ``/content/drive/MyDrive/Model_weights/``.
    """
    for epoch_idx in range(epochs):
        # A new generator per epoch starts again from the first image with empty buffers.
        batches = data_generator(train_descriptions,encoding_train,word_to_idx,max,batch_size)
        IC_model.fit(batches,epochs=1,steps_per_epoch=steps)

        checkpoint_path = "/content/drive/MyDrive/Model_weights/model_"+str(epoch_idx)+".h5"
        IC_model.save(checkpoint_path)

In [None]:
# Run the training loop defined in the previous cell.
train()



  saving_api.save_model(


