In [1]:
import numpy as np

In [2]:
# Directory layout used by every later cell.
# NOTE(review): root_folder is an absolute, machine-specific path; adjust it
# when running this notebook on another machine.
root_folder = '/home/lyt/code/vqa-concept'
data_folder = root_folder + '/dataTVQA'
fea_folder = data_folder + '/image-feature/bottomup'

In [3]:
def merge_embeddings(embedding_names):
    """Load several embedding tables and concatenate them over their shared words.

    Args:
        embedding_names: one or more embedding file names joined by ``'+'``;
            each name is handed to ``load_embeddings``.

    Returns:
        dict mapping every word present in *all* tables to a 1-D array made of
        that word's vector from each table, concatenated in the order the
        names were given. Keys follow the word order of the first table, so
        the result is the same on every run. The dict is empty when the tables
        share no word.

    Note:
        If a word occurs more than once inside one table, the row of its last
        occurrence is used.
    """
    names = embedding_names.split('+')
    vocabs = []
    vecs = []
    for name in names:
        vocab, vec = load_embeddings(name)
        vocabs.append(vocab)
        vecs.append(vec)

    # Words that every table knows about.
    shared = set(vocabs[0])
    for vocab in vocabs[1:]:
        shared &= set(vocab)
    # Keep the first table's order (de-duplicated) instead of the arbitrary
    # iteration order of a set, which changes with string hash randomisation.
    final_vocab = [w for w in dict.fromkeys(vocabs[0]) if w in shared]

    final_vec = []
    for vocab, vec in zip(vocabs, vecs):
        w2i = dict(zip(vocab, range(len(vocab))))
        # An explicit integer dtype keeps the index array valid even when
        # final_vocab is empty (a bare np.array([]) would be float64 and
        # cannot be used for indexing).
        inds = np.array([w2i[w] for w in final_vocab], dtype=np.intp)
        final_vec.append(vec[inds])
    # Side-by-side concatenation: one row per shared word.
    final_vec = np.hstack(final_vec)

    return dict(zip(final_vocab, final_vec))

def load_embeddings(name):
    """Read a text embedding file into a vocabulary and a float32 matrix.

    The file is read from ``<data_folder>/word-embedding/<name>``. Every
    non-blank line is expected to hold a token, a single space, and then the
    vector components separated by whitespace. Blank lines are skipped.

    Args:
        name: file name inside the ``word-embedding`` directory.

    Returns:
        ``(vocab, vecs)`` where ``vocab`` is a tuple of tokens in file order
        and ``vecs`` is a float32 array reshaped to ``(-1, vec_size)``;
        ``vec_size`` is inferred from the first entry.

    Raises:
        ValueError: if the file holds no entries, if a line has a token but no
            vector part, or if the parsed values cannot be reshaped into rows
            of ``vec_size``.
    """
    emb_path = '{}/word-embedding/{}'.format(data_folder, name)
    vocab = []
    vecs_txt = []
    # Explicit UTF-8 so non-ASCII tokens decode identically under any locale.
    with open(emb_path, encoding='utf-8') as f:
        for line in f:
            entry = line.rstrip()
            if not entry:
                # Blank lines (typically at the end of the file) carry no data.
                continue
            word, sep, vec_txt = entry.partition(' ')
            if not sep:
                raise ValueError(
                    'malformed embedding line in {}: {!r}'.format(emb_path, entry))
            vocab.append(word)
            vecs_txt.append(vec_txt)
    if not vocab:
        raise ValueError('no embeddings found in {}'.format(emb_path))
    # infer vector dimension from the first entry
    vec_size = len(vecs_txt[0].split())
    # parsing one joined string with fromstring (text mode) is faster than loadtxt
    vecs = np.fromstring(' '.join(vecs_txt), dtype='float32', sep=' ')
    vecs = vecs.reshape(-1, vec_size)
    return tuple(vocab), vecs

In [4]:
# Embedding file(s) to use; several names can be joined with '+' and are
# then concatenated by merge_embeddings.
WORD_EMBEDDINGS = 'bert.PCA.300d.txt'
word_vec = merge_embeddings(WORD_EMBEDDINGS)
# Every vector has the same length, so any key works for reading the size.
aword = next(iter(word_vec.keys()))
emb_size = word_vec[aword].shape[0]

In [5]:
# One class label per line of objects_vocab.txt. A placeholder entry is put in
# front, so the labels read from the file start at index 1.
# NOTE(review): presumably this lines up with the 1601-wide class axis of the
# class-prob arrays loaded below - confirm.
with open('{}/objects_vocab.txt'.format(data_folder)) as f:
    objects_vocab = ['__no_objects__'] + f.read().splitlines()

In [6]:
def get_class_embedding(class_name, word_vec, emb_size):
    """Build one embedding for a class label from pretrained word vectors.

    ``class_name`` may list several comma-separated synonyms, each made of one
    or more whitespace-separated words. The synonym with the highest fraction
    of words found in ``word_vec`` is chosen (the first one wins ties), and
    the vectors of its known words are averaged.

    Args:
        class_name: label text, e.g. ``'tennis racket,tennis racquet'``.
        word_vec: mapping from word to a vector of length ``emb_size``.
        emb_size: length of the returned vector.

    Returns:
        ``(pretrained_avail, vec)``: ``pretrained_avail`` is True when at
        least one word of the chosen synonym is in ``word_vec``; ``vec`` is a
        float32 array of shape ``(emb_size,)`` holding the average, or all
        zeros when nothing was found.
    """
    synonyms = class_name.split(',')
    act_num = []    # number of known words per synonym
    act_ratio = []  # fraction of known words per synonym
    for label in synonyms:
        words = label.split()
        act = sum(1 for word in words if word in word_vec)
        act_num.append(act)
        # An empty synonym (e.g. '' or the gap in 'a,,b') has no words; score
        # it 0 instead of dividing by zero.
        act_ratio.append(act / len(words) if words else 0.0)
    act_idx = max(range(len(act_num)), key=lambda x: act_ratio[x])
    vec = np.zeros((emb_size,), dtype='float32')
    pretrained_avail = act_num[act_idx] > 0
    if pretrained_avail:
        for word in synonyms[act_idx].split():
            if word in word_vec:
                vec += word_vec[word]
        # Average over the words that actually contributed.
        vec /= act_num[act_idx]
    return pretrained_avail, vec

In [7]:
# Input paths: one class-prob array per dataset split under fea_folder.
trainfile, valfile, testfile = (
    '{}/{}_100_class-prob.npy'.format(fea_folder, split)
    for split in ('train', 'val', 'test')
)

In [8]:
# Load the class-prob arrays of the three splits, in train / val / test order.
trainX, valX, testX = (
    np.load(path) for path in (trainfile, valfile, testfile)
)

In [9]:
# One embedding row per entry of objects_vocab. Rows start as zeros and are
# only overwritten when get_class_embedding reports a pretrained vector.
obj_emb = np.zeros((len(objects_vocab), emb_size), dtype='float32')
fill_cnt = 0
for i, line in enumerate(objects_vocab):
    avail, vec = get_class_embedding(line, word_vec, emb_size)
    if not avail:
        # Nothing found for this label: leave its row at zero.
        continue
    obj_emb[i] = vec
    fill_cnt += 1
print('[debug] class embedding filling count: {}/{}'.format(fill_cnt, len(objects_vocab)))
# Shapes of the three inputs and of the class-embedding table, one per line.
print(trainX.shape, valX.shape, testX.shape, obj_emb.shape, sep='\n')

[debug] class embedding filling count: 1596/1601
(21953, 100, 1601)
(3166, 100, 1601)
(3289, 100, 1601)
(1601, 300)


In [10]:
# Contract the last axis of trainX with the rows of obj_emb; per the recorded
# output, (21953, 100, 1601) x (1601, 300) -> (21953, 100, 300).
trX = trainX.dot(obj_emb)

In [11]:
# Same projection for the validation split; per the recorded output,
# (3166, 100, 1601) x (1601, 300) -> (3166, 100, 300).
vaX = valX.dot(obj_emb)

In [12]:
# Same projection for the test split; per the recorded output,
# (3289, 100, 1601) x (1601, 300) -> (3289, 100, 300).
teX = testX.dot(obj_emb)

In [13]:
# Sanity check: shapes of the projected train / val / test arrays, one per line.
print(trX.shape, vaX.shape, teX.shape, sep='\n')

(21953, 100, 300)
(3166, 100, 300)
(3289, 100, 300)


In [14]:
# Output paths for the projected arrays. This rebinds trainfile / valfile /
# testfile, which pointed at the class-prob inputs until now.
trainfile, valfile, testfile = (
    '{}/{}_100_class-emb.npy'.format(fea_folder, split)
    for split in ('train', 'val', 'test')
)

In [15]:
# Write each projected array to its class-emb path, in train / val / test order.
for out_path, features in ((trainfile, trX), (valfile, vaX), (testfile, teX)):
    np.save(out_path, features)