In [8]:
import pickle
import random

import h5py
import numpy as np
import pandas as pd
import os

In [9]:
# Directory holding the preprocessed artefacts this notebook reads.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
root_path = '/home/eric/Documents/Hashtag-recommendation-for-social-images/image_text_hashtagging/datasets/image_text/preprocessed_data'
training_filename = os.path.join(root_path, 'training_data.txt')
validation_filename = os.path.join(root_path, 'validation_data.txt')
image_features_filename = 'inception_image_name_to_features.h5'

# data_parameters.log is parsed as space-delimited rows: column 0 is the key
# (including its trailing colon) and column 1 is the value.
_log_rows = np.genfromtxt(os.path.join(root_path, 'data_parameters.log'),
                          delimiter=' ', dtype='str')
data_logs = dict(zip(_log_rows[:, 0], _log_rows[:, 1]))

# presumably the +2 leaves room for begin/end markers — TODO confirm
MAX_TOKEN_LENGTH = int(data_logs['max_caption_length:']) + 2
IMG_FEATS = int(data_logs['IMG_FEATS:'])
BOS = str(data_logs['BOS:'])
EOS = str(data_logs['EOS:'])
PAD = str(data_logs['PAD:'])

In [10]:
print('Loading training dataset...')
# Each split is a '*'-delimited text file; every cell is coerced to str so the
# later cells can slice plain numpy columns.
# NOTE(review): read_table is called with its default header handling — confirm
# the files really start with a header row.
training_dataset = np.asarray(
    pd.read_table(training_filename, delimiter='*'), dtype=str)
# Second name for the same array.
train_data = training_dataset

print('Loading validation dataset...')
validation_dataset = np.asarray(
    pd.read_table(validation_filename, delimiter='*'), dtype=str)

Loading training dataset...
Loading validation dataset...


In [12]:
print('Loading vocabulary...')
# Open the pickles in `with` blocks so the file handles are closed as soon as
# loading finishes instead of being left for the garbage collector.
# NOTE(security): pickle.load can execute arbitrary code from the file — only
# load vocabulary files produced by a trusted preprocessing run.
with open(os.path.join(root_path, 'word_to_id.p'), 'rb') as vocab_file:
    word_to_id = pickle.load(vocab_file)
with open(os.path.join(root_path, 'id_to_word.p'), 'rb') as vocab_file:
    id_to_word = pickle.load(vocab_file)
# Width of the one-hot caption vectors built in later cells.
VOCABULARY_SIZE = len(word_to_id)

Loading vocabulary...


In [14]:
# Open the image-feature store read-only. The handle stays open because
# get_image_features() indexes into it for every image it looks up.
image_names_to_features = h5py.File(os.path.join(root_path,image_features_filename), 'r')
# `data` is the split that the cells below slice and iterate over.
data = training_dataset

In [15]:
# Column 1 of the training rows is the text fed to the tokenizer; the batching
# loop further down reads this same column as the caption.
tweets=data[:,1]

In [17]:
# Sanity check: show the first text entry.
print(tweets[0])

heaven earth follow


In [19]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Passed to the tokenizer as num_words, i.e. the cap on its working vocabulary.
max_words = 5000
# lower=True: text is lowercased before it is tokenized.
tokenizer = Tokenizer(num_words=max_words, lower=True)
# Build the word index from the training text.
tokenizer.fit_on_texts(tweets)

Using TensorFlow backend.
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [20]:
# Fixed width of every padded tweet sequence.
maxlen = 180
def get_features(text_series):
    """
    Convert raw texts into fixed-width integer sequences.

    Each text is mapped to word ids by the module-level ``tokenizer`` (which
    must already be fitted), then ``pad_sequences`` brings every sequence to
    ``maxlen`` entries using its default padding/truncation settings.
    """
    return pad_sequences(tokenizer.texts_to_sequences(text_series),
                         maxlen=maxlen)
tweets_vec = get_features(tweets)

In [21]:
# Inspect one padded sequence (row 2) to eyeball the padding layout.
print(tweets_vec[2])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0 2490   32   85    4    5  316  258   17]


In [22]:
BATCH_SIZE = 16
def make_empty_batch():
    """Allocate zero-filled buffers for one batch of BATCH_SIZE samples.

    Returns a 4-tuple, in this order:
      captions -- (BATCH_SIZE, MAX_TOKEN_LENGTH, VOCABULARY_SIZE)
      images   -- (BATCH_SIZE, MAX_TOKEN_LENGTH, IMG_FEATS)
      targets  -- (BATCH_SIZE, MAX_TOKEN_LENGTH, VOCABULARY_SIZE)
      tweets   -- (BATCH_SIZE, maxlen)

    Note the order: targets come before tweets, which differs from the
    argument order of wrap_in_dictionary().
    """
    one_hot_shape = (BATCH_SIZE, MAX_TOKEN_LENGTH, VOCABULARY_SIZE)
    caption_buffer = np.zeros(one_hot_shape)
    image_buffer = np.zeros((BATCH_SIZE, MAX_TOKEN_LENGTH, IMG_FEATS))
    tweet_buffer = np.zeros((BATCH_SIZE, maxlen))
    # Separate allocation: targets must not alias the caption buffer.
    target_buffer = np.zeros(one_hot_shape)
    return caption_buffer, image_buffer, target_buffer, tweet_buffer


In [23]:
def get_one_hot_target(one_hot_caption):
    """Return the caption shifted one time step earlier.

    Row ``t`` of the result is row ``t + 1`` of ``one_hot_caption``; the last
    row is left all-zero. The input array is not modified, and the result has
    the same shape and dtype as the input.
    """
    shifted = np.zeros_like(one_hot_caption)
    np.copyto(shifted[:-1, :], one_hot_caption[1:, :])
    return shifted

In [24]:
def wrap_in_dictionary(one_hot_caption,image_features,tweets,
                           one_hot_target):
    """Bundle one batch into a two-element ``[inputs, targets]`` list.

    ``inputs`` maps 'text', 'image' and 'tweets' to the corresponding
    arguments; ``targets`` maps 'output' to ``one_hot_target``.
    The keys presumably match the model's layer names — TODO confirm.
    """
    model_inputs = {'text': one_hot_caption,
                    'image': image_features,
                    'tweets': tweets}
    model_targets = {'output': one_hot_target}
    return [model_inputs, model_targets]

In [25]:
def format_to_one_hot(caption):
    """One-hot encode a whitespace-tokenized caption.

    BOS is prepended to the tokens (no EOS is appended). Tokens missing from
    ``word_to_id`` are skipped, so later tokens move up to fill the gap.
    Returns a ``(MAX_TOKEN_LENGTH, VOCABULARY_SIZE)`` array with one 1 per
    kept token; unused trailing rows stay all-zero.

    Raises IndexError if more than MAX_TOKEN_LENGTH tokens are kept.
    """
    tokens = [BOS]
    tokens.extend(caption.split())
    encoded = np.zeros((MAX_TOKEN_LENGTH, VOCABULARY_SIZE))
    row = 0
    for token in tokens:
        if token not in word_to_id:
            # Out-of-vocabulary token: skipped without consuming a row.
            continue
        encoded[row, word_to_id[token]] = 1
        row += 1
    return encoded

In [26]:
def get_image_features( image_name):
    """Look up an image's feature vector and repeat it for every time step.

    Reads ``image_names_to_features[image_name]['image_features']`` and
    returns a float64 array of shape ``(MAX_TOKEN_LENGTH, IMG_FEATS)`` whose
    rows are all copies of that vector.
    """
    feature_vector = image_names_to_features[image_name]['image_features'][:]
    repeated = np.zeros((MAX_TOKEN_LENGTH, IMG_FEATS))
    # Broadcasting writes the same vector into every row in one assignment.
    repeated[:, :] = feature_vector
    return repeated

In [27]:

# Build the first batch only and print its shapes as a smoke test of the
# encoding helpers defined in the cells above.
image_names = data[:,0].tolist()
empty_batch = make_empty_batch()
# make_empty_batch() returns (captions, images, targets, tweets) in that order.
captions_batch, images_batch, targets_batch, tweets_batch = empty_batch
batch_counter = 0

for data_arg, image_name in enumerate(image_names):
    # Column 1 holds the text; tweets_vec is indexed by the same row number.
    caption = data[data_arg, 1]
    one_hot_caption = format_to_one_hot(caption)
    captions_batch[batch_counter, :, :] = one_hot_caption
    targets_batch[batch_counter, :, :] = get_one_hot_target(one_hot_caption)
    images_batch[batch_counter, :, :] = get_image_features(image_name)
    tweets_batch[batch_counter, :] = tweets_vec[data_arg]

    if batch_counter == BATCH_SIZE - 1:
        # Batch is full: wrap it, report the shapes, and stop.
        yield_dictionary = wrap_in_dictionary(captions_batch, images_batch,
                                              tweets_batch, targets_batch)
        print(yield_dictionary[0]['text'].shape)
        print(yield_dictionary[0]['image'].shape)
        print(yield_dictionary[0]['tweets'].shape)
        print(yield_dictionary[1]['output'].shape)
        # The buffer-reset statements that used to follow this break could
        # never run and were removed. If this loop is ever extended to emit
        # several batches, allocate fresh buffers, set batch_counter to 0 and
        # `continue` here, so the increment below does not skip row 0.
        break

    batch_counter = batch_counter + 1

(16, 72, 1001)
(16, 72, 2048)
(16, 180)
(16, 72, 1001)
