In [1]:
import json
import scipy.io
import time
import numpy as np
import pandas as pd
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Embedding, Dense, RepeatVector, Activation, Bidirectional
from keras.optimizers import Adam, RMSprop
import ast
from keras.models import Model
from keras.layers import concatenate

In [2]:
dataset_path = 'data/flickr8k/dataset.json'
print('BasicDataProvider: reading %s' % (dataset_path, )) 
dataset = json.load(open(dataset_path, 'r'))

BasicDataProvider: reading data/flickr8k/dataset.json


In [3]:
f = open('flickr8k_dataset.txt', 'w')
f.write("filename\timage_id\tcaption_id\tcaptions\tsplit\n")
a = []
for i in dataset['images']:
    for n,j in enumerate(i['sentids']):

        f.write(i['filename']+ "\t" + str(i['imgid']) + "\t" + str(j) + "\t" +
         str(['<start>']+i['sentences'][n]['tokens']+['.'])+"\t"+i['split']+ "\n")
f.close()

In [2]:
def preProBuildWordVocab(dataset='flickr8k', word_count_threshold=5):
  # count up all word counts so that we can threshold
  # this shouldnt be too expensive of an operation
  print('preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold, ))
  if dataset == 'flickr8k':
      sentences = pd.read_csv('flickr8k_dataset.txt', delimiter='\t')['captions'].to_list()
  t0 = time.time()
  word_counts = {}
  nsents = 0
  max_len = 0
  for sent in sentences:
    tokened = ast.literal_eval(sent)
    if len(tokened) > max_len:
      max_len = len(tokened)
    nsents += 1
    for w in tokened:
      word_counts[w] = word_counts.get(w, 0) + 1
  vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
  print('filtered words from %d to %d in %.2fs' % (len(word_counts), len(vocab), time.time() - t0))

  ixtoword = {}
  ixtoword[0] = '.'  # period at the end of the sentence. make first dimension be end token
  wordtoix = {}
  wordtoix['#START#'] = 0 # make first vector be the start token
  ix = 1
  for w in vocab:
    wordtoix[w] = ix
    ixtoword[ix] = w
    ix += 1

  # compute bias vector, which is related to the log probability of the distribution
  # of the labels (words) and how often they occur. We will use this vector to initialize
  # the decoder weights, so that the loss function doesnt show a huge increase in performance
  # very quickly (which is just the network learning this anyway, for the most part). This makes
  # the visualizations of the cost function nicer because it doesn't look like a hockey stick.
  # for example on Flickr8K, doing this brings down initial perplexity from ~2500 to ~170.
  # word_counts['.'] = nsents
  # bias_init_vector = np.array([1.0*word_counts[ixtoword[i]] for i in ixtoword])
  # bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies
  # bias_init_vector = np.log(bias_init_vector)
  # bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range
  return wordtoix, ixtoword,max_len,len(wordtoix)#, bias_init_vector

In [3]:
wordtoix, ixtoword,max_len,voc_len = preProBuildWordVocab()

preprocessing word counts and creating vocab based on word count threshold 5
filtered words from 8385 to 2943 in 1.07s


In [4]:
def data_generator(wordtoix,max_len,batch_size = 32):
        captions = []
        images = []

        df = pd.read_csv('flickr8k_training_dataset.txt', delimiter='\t')
        df = df.sample(frac=1)
        iter = df.iterrows()
        c = []
        imgs = []
        for i in range(df.shape[0]):
                x = next(iter)
                c.append(ast.literal_eval(x[1][3]))
                imgs.append(x[1][1])
        features_path = 'data/flickr8k/vgg_feats.mat'
        features_struct = scipy.io.loadmat(features_path)['feats']
        count = 0
        while True:
            for text,im in zip(c,imgs):
                current_image = features_struct[:,im]
                word_idx = [wordtoix[i] for i in text if i in wordtoix]
                word_idx.append(0)
                captions.append(word_idx)
                count+=1
                images.append(current_image)
                if count>=batch_size:
                    images = np.asarray(images)
                    captions = pad_sequences(captions, maxlen=max_len, padding='post')
                    yield [[images, captions], captions]
                    captions = []
                    images = []
                    count = 0

In [14]:
a = data_generator(wordtoix,max_len,batch_size = 32)
b = next(a)


In [6]:
# image_model = Sequential([
#        Dense(256, input_shape=(max_len,), activation='relu'),
#       RepeatVector(max_len)
#       ])
image_model = Sequential()
image_model.add(Dense(256, input_shape=(4096,), activation='relu'))
image_model.add(RepeatVector(max_len))
# caption_model = Sequential([
#           Embedding(voc_len,256, input_length=max_len),
#           LSTM(256, return_sequences=True)
#                     ])
caption_model = Sequential()
caption_model.add(Embedding(max_len,256, input_length=max_len))
caption_model.add(LSTM(256, return_sequences=True))

In [115]:
# final_model = Sequential([
#                         Concatenate((image_model, caption_model)),
#                         Bidirectional(LSTM(256, return_sequences=False)),
#                         Dense(voc_len),
#                         Activation('softmax')
#                     ])

In [7]:
concat_layer = concatenate([image_model.output, caption_model.output],axis=1)
x = Bidirectional(LSTM(256, return_sequences=False))(concat_layer)
x = Dense(max_len)(x)
out = Activation('softmax')(x)

In [8]:
final_model = Model([image_model.input, caption_model.input],[out])
final_model.compile(loss='categorical_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])

In [32]:
final_model([b[0][0],b[0][1].astype('float32')]).shape

TensorShape([32, 2944])

In [None]:
final_model.fit_generator(data_generator(wordtoix,max_len,batch_size = 256), epochs=1)

In [8]:
final_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 dense_input (InputLayer)       [(None, 4096)]       0           []                               
                                                                                                  
 embedding_input (InputLayer)   [(None, 39)]         0           []                               
                                                                                                  
 dense (Dense)                  (None, 256)          1048832     ['dense_input[0][0]']            
                                                                                                  
 embedding (Embedding)          (None, 39, 256)      9984        ['embedding_input[0][0]']        
                                                                                              

In [1]:
from keras_version.model import keras_model
from keras_version.data_generator import *

In [15]:
from ast import literal_eval
import pandas as pd
import scipy.io
import numpy as np
from keras.utils import pad_sequences
def build_generator(wordtoix, max_len, batch_size=32, dataset='flickr8k'):
    captions = []
    images = []
    df = pd.read_csv('flickr8k_training_dataset.txt', delimiter='\t')
    df = df.sample(frac=1)
    iter = df.iterrows()
    c = []
    imgs = []
    for i in range(df.shape[0]):
        x = next(iter)
        c.append(literal_eval(x[1][3]))
        imgs.append(x[1][1])
    features_path = f'data/{dataset}/vgg_feats.mat'
    features_struct = scipy.io.loadmat(features_path)['feats']
    count = 0
    while True:
        for text, im in zip(c, imgs):
            current_image = features_struct[:, im]
            word_idx = [wordtoix[i] for i in text if i in wordtoix]
            word_idx.append(0)
            captions.append(word_idx)
            count += 1
            images.append(current_image)
            if count >= batch_size:
                images = np.asarray(images)
                captions = pad_sequences(
                    captions, maxlen=max_len, padding='post')
                yield [[images, captions], captions]
                captions = []
                images = []
                count = 0


In [3]:
data = data_generator()
data.build_vocab()

preprocessing word counts and creating vocab based on word count threshold 5
Length of the word index: 2944


In [4]:
generatort = build_generator(data._wordtoix,data._max_len)

In [5]:
model = keras_model(data._max_len)

In [6]:
model.compile(metrics=['Accuracy'],loss='categorical_crossentropy',optimizer='Adam')

In [7]:
model.train(generatort,epochs=1)

     90/Unknown - 8s 37ms/step - loss: 16715.2148 - Accuracy: 0.1132

KeyboardInterrupt: 

In [1]:

from keras_version.model import keras_model
from keras_version.data_generator import *


data = data_generator()
data.build_vocab()
generatort = build_generator(data._wordtoix,data._max_len)
model = keras_model(data._max_len)
model.compile(metrics=['Accuracy'],loss='categorical_crossentropy',optimizer='Adam')
model.train(generatort,epochs=1)

preprocessing word counts and creating vocab based on word count threshold 5
Length of the word index: 2944
    115/Unknown - 9s 37ms/step - loss: 13992.9307 - Accuracy: 0.1125

KeyboardInterrupt: 