In [1]:
import json
import numpy as np

In [2]:
from sklearn.decomposition import PCA
import os
import pickle

In [3]:
# Project root. It can be overridden through the CONCEPT_VQA_ROOT environment
# variable so the notebook is not tied to one machine's directory layout; when
# the variable is unset the original location is used.
root_folder = os.environ.get('CONCEPT_VQA_ROOT', '/home/lyt/code/concept_vqa')
# Data folder under the project root; the embedding files and the PCA cache
# are addressed relative to it.
data_folder = '{}/dataTVQA'.format(root_folder)
# Number of values per vector in the source embedding file.
ori_emb_size = 768
# Number of PCA components kept, i.e. the width of the reduced embeddings.
new_emb_size = 300

In [4]:
def load_vocab_emb(emb_file, emb_size, encoding='utf-8'):
    """Read a text embedding table with one ``token v1 v2 ... vN`` entry per line.

    Args:
        emb_file: Path of the text file to read.
        emb_size: Number of values expected after the token on every line.
        encoding: Text encoding used to decode the file.

    Returns:
        A pair ``(vocab, vecs)`` where ``vocab`` is a tuple of tokens in file
        order and ``vecs`` is a float32 array of shape ``(len(vocab), emb_size)``.
        A file without entries yields ``((), <array of shape (0, emb_size)>)``.

    Raises:
        ValueError: If a line has no vector part, or if the number of values on
            a line differs from ``emb_size``.
    """
    vocab = []
    rows = []
    # Iterate over the file object rather than calling str.splitlines():
    # splitlines() also breaks on characters such as \x0c, \x1c, \x85 and
    # \u2028, which would cut a token containing one of them in two. File
    # iteration only splits on real line endings.
    with open(emb_file, encoding=encoding) as f:
        for line_no, line in enumerate(f, 1):
            line = line.rstrip('\n')
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            word, sep, vec_txt = line.partition(' ')
            if not sep:
                raise ValueError(
                    '{}:{}: expected "token v1 ... v{}", found no vector'.format(
                        emb_file, line_no, emb_size))
            vec = np.fromstring(vec_txt, dtype='float32', sep=' ')
            # Validate per row: parsing everything as one flat buffer and
            # reshaping afterwards would let a short or long row silently
            # shift every following vector.
            if vec.size != emb_size:
                raise ValueError(
                    '{}:{}: expected {} values, found {}'.format(
                        emb_file, line_no, emb_size, vec.size))
            vocab.append(word)
            rows.append(vec)
    if not rows:
        return (), np.empty((0, emb_size), dtype='float32')
    return tuple(vocab), np.stack(rows)

In [5]:
def write_vocab_emb(emb_file, vocab, emb, encoding='utf-8'):
    """Write an embedding table as text, one ``token v1 v2 ... vN`` entry per line.

    Values are formatted with ``'{:f}'`` (six decimals). Lines are joined with
    ``'\\n'`` and no newline is written after the last entry.

    Args:
        emb_file: Path of the text file to create or overwrite.
        vocab: Iterable of string tokens, one per row of ``emb``.
        emb: 2-D array-like of numbers with one row per token.
        encoding: Text encoding used for the output file.

    Raises:
        ValueError: If ``emb`` is not 2-D or its row count differs from the
            number of tokens.
    """
    vocab = list(vocab)
    emb = np.asarray(emb)
    if emb.ndim != 2:
        raise ValueError('emb must be 2-D, got {} dimension(s)'.format(emb.ndim))
    # zip() would silently drop the surplus rows or tokens, so reject a
    # mismatch before the output file is touched.
    if len(vocab) != emb.shape[0]:
        raise ValueError(
            'vocab has {} tokens but emb has {} rows'.format(len(vocab), emb.shape[0]))
    with open(emb_file, 'w', encoding=encoding) as f:
        # Stream row by row so the formatted table is never held in memory
        # as a whole.
        for idx, (word, row) in enumerate(zip(vocab, emb)):
            if idx:
                f.write('\n')
            f.write(' '.join([word] + ['{:f}'.format(x) for x in row.tolist()]))

In [6]:
# Load the source embedding table; its file name carries ori_emb_size.
# `vocab` and `dic_emb` are read again by the PCA, transform and write cells.
emb_file = '{}/word-embedding/bert.{}d.txt'.format(data_folder, ori_emb_size)
print('[Load] {} ...'.format(emb_file))
vocab, dic_emb = load_vocab_emb(emb_file, ori_emb_size)
# Report the loaded shape (the recorded output shows (400000, 768)).
print('[Info] Dict emb shape: {}'.format(dic_emb.shape))

[Load] /home/lyt/code/concept_vqa/dataTVQA/word-embedding/bert.768d.txt ...
[Info] Dict emb shape: (400000, 768)


In [7]:
# Fit the PCA projection once and cache it on disk; the cache file name carries
# both the source and the target dimension.
pcafile = '{}/word-embedding/pca_dict.bert.{}-{}.pkl'.format(data_folder, ori_emb_size, new_emb_size)
if os.path.exists(pcafile):
    # NOTE: unpickling can execute arbitrary code stored in the file. Only
    # reuse a cache that was produced by this notebook.
    with open(pcafile, 'rb') as f:
        pca = pickle.load(f)
else:
    # A fixed random_state keeps a fresh fit reproducible when scikit-learn
    # selects a randomized SVD solver.
    pca = PCA(n_components=new_emb_size, random_state=0)
    pca.fit(dic_emb)
    # Dump to a temporary file and rename it into place, so an interrupted
    # dump never leaves a truncated cache that the next run would load.
    tmp_pcafile = pcafile + '.tmp'
    with open(tmp_pcafile, 'wb') as f:
        pickle.dump(pca, f)
    os.replace(tmp_pcafile, pcafile)

In [8]:
# Project the loaded embeddings with the fitted (or cached) PCA model.
new_dic_emb = pca.transform(dic_emb)
# Report the reduced shape (the recorded output shows (400000, 300)).
print('[Info] New dict emb shape: {}'.format(new_dic_emb.shape))

[Info] New dict emb shape: (400000, 300)


In [9]:
# Write the reduced embeddings; the file name carries the reduced width taken
# from new_dic_emb.shape[1]. This rebinds `emb_file`, which previously held the
# input path.
# NOTE(review): the recorded output below shows `bert.300d.txt` (no `PCA.`),
# which does not match this pattern -- the output looks stale; confirm which
# file name downstream code expects.
emb_file = '{}/word-embedding/bert.PCA.{}d.txt'.format(data_folder, new_dic_emb.shape[1])
print('[Write] {} ...'.format(emb_file))
write_vocab_emb(emb_file, vocab, new_dic_emb)

[Write] /home/lyt/code/concept_vqa/dataTVQA/word-embedding/bert.300d.txt ...
