In [1]:
import numpy as np
from sklearn.decomposition import PCA
import os
import pickle

In [2]:
# Project root. Defaults to the author's original location, but can be
# redirected without editing the notebook by exporting VQA_CONCEPT_ROOT.
root_folder = os.environ.get('VQA_CONCEPT_ROOT', '/home/lyt/code/vqa-concept')
# Dataset directory under the project root.
data_folder = '{}/dataTVQA'.format(root_folder)
# Directory holding the image/OCR feature files read and written below.
fea_folder = '{}/image-feature/bottomup'.format(data_folder)

In [3]:
# Data partition to build features for. Choose one:
#   'trval'   -> train on Tr; test on Val
#   'trvalte' -> train on Tr+Val; test on Te
split = 'trvalte'

# Variant tag embedded in the OCR feature file names. Other values the author
# switched between (with their original notes):
#   ''             -> ocr_bert_DR
#   '.0init'       -> ocr_bert_DR.0init
#   '.clean.0init' -> ocr_bert_DR.clean.0init
suffix = '.assemble.0init'

In [4]:
# Width of the input embeddings; it is also part of the source directory name.
emb_size = 768
src_dir = '{}/ocr_bert.new.{}'.format(fea_folder, emb_size)

# Per-partition OCR feature files share one naming template.
trainfile, valfile, testfile = (
    '{}/{}_50_ocr{}.npy'.format(src_dir, part, suffix)
    for part in ('train', 'val', 'test')
)
# The itoa embedding lives in a split-specific sub-directory and carries no suffix.
itoafile = '{}/{}/{}_emb.npy'.format(src_dir, split, 'itoa')

In [5]:
# Read the four input arrays from disk, in the same order as the path names.
trainX, valX, testX, itoaX = (
    np.load(path) for path in (trainfile, valfile, testfile, itoafile)
)

In [6]:
# Sanity check: report the shape of every freshly loaded array, one per line.
for arr in (trainX, valX, testX, itoaX):
    print(arr.shape)

(21953, 50, 768)
(3166, 50, 768)
(3289, 50, 768)
(9452, 768)


In [7]:
# Collapse the leading axes so every embedding becomes one row, e.g.
# (21953, 50, 768) -> (1097650, 768).
# The row width is taken from each array's own last axis rather than from the
# global `emb_size`: that global is rebound to the PCA output size further down,
# and re-running this cell afterwards would otherwise silently reshape the data
# into rows of the wrong width. Using shape[-1] also makes the cell idempotent.
trainX = trainX.reshape(-1, trainX.shape[-1])
valX = valX.reshape(-1, valX.shape[-1])
testX = testX.reshape(-1, testX.shape[-1])

In [8]:
# Confirm the flattening: the three partitions should now be 2-D.
for arr in (trainX, valX, testX, itoaX):
    print(arr.shape)

(1097650, 768)
(158300, 768)
(164450, 768)
(9452, 768)


In [9]:
# Stack all embeddings the PCA is fitted on. Which partitions take part depends
# on `split`: 'trval' uses train + val + itoa, 'trvalte' additionally uses test.
if split == 'trval':
    X = np.concatenate((trainX, valX, itoaX), 0)
elif split == 'trvalte':
    X = np.concatenate((trainX, valX, testX, itoaX), 0)
else:
    # Fail loudly instead of leaving X unbound (or, worse, silently reusing a
    # stale X left in the kernel by an earlier run).
    raise ValueError(
        "unknown split {!r}; expected 'trval' or 'trvalte'".format(split)
    )
print(X.shape)

(1429852, 768)


In [10]:
# Target dimensionality of the PCA projection.
# NOTE: this rebinds `emb_size` (previously the input width); the cells below
# rely on the new value for reshaping and for building output paths.
emb_size = 300
pcafile = '{}/ocr_bert.new.{}/{}/pca{}.pkl'.format(fea_folder, emb_size, split, suffix)
if os.path.exists(pcafile):
    # Reuse the cached model. Unpickling can execute arbitrary code, so this
    # path must only ever point at a file written by this notebook.
    with open(pcafile, 'rb') as f:
        pca = pickle.load(f)
else:
    # Create the output directory *before* fitting, so a missing directory is
    # discovered immediately rather than after the expensive fit has finished.
    pca_dir = os.path.dirname(pcafile)
    if not os.path.isdir(pca_dir):
        os.makedirs(pca_dir)
    # Fixed seed: depending on the solver scikit-learn selects for this input
    # size, the decomposition may be randomized; pinning it keeps the cached
    # model reproducible.
    pca = PCA(n_components=emb_size, random_state=0)
    pca.fit(X)
    with open(pcafile, 'wb') as f:
        pickle.dump(pca, f)

In [11]:
# Project every array with the fitted PCA, keeping the partitions separate.
trX, vaX, teX, iaX = (
    pca.transform(block) for block in (trainX, valX, testX, itoaX)
)

In [12]:
# Sanity check: shapes of the projected arrays, one per line.
for arr in (trX, vaX, teX, iaX):
    print(arr.shape)

(1097650, 300)
(158300, 300)
(164450, 300)
(9452, 300)


In [13]:
# Regroup the projected rows into blocks of 50 per sample, e.g.
# (1097650, 300) -> (21953, 50, 300). `emb_size` is the PCA output width here.
trX, vaX, teX = (
    proj.reshape(-1, 50, emb_size) for proj in (trX, vaX, teX)
)

In [14]:
# Final check before saving: the partitions are 3-D again, itoa stays 2-D.
for arr in (trX, vaX, teX, iaX):
    print(arr.shape)

(21953, 50, 300)
(3166, 50, 300)
(3289, 50, 300)
(9452, 300)


In [15]:
# Destination paths for the reduced features. These deliberately reuse (and
# overwrite) the *file variables that earlier pointed at the input files.
out_dir = '{}/ocr_bert.new.{}/{}'.format(fea_folder, emb_size, split)
trainfile, valfile, testfile = (
    '{}/{}_50_ocr{}.npy'.format(out_dir, part, suffix)
    for part in ('train', 'val', 'test')
)
# Unlike the input itoa file, the output name includes the suffix.
itoafile = '{}/{}_emb{}.npy'.format(out_dir, 'itoa', suffix)

In [16]:
# Write each reduced array to its destination, in train/val/test/itoa order.
for out_path, reduced in (
    (trainfile, trX),
    (valfile, vaX),
    (testfile, teX),
    (itoafile, iaX),
):
    np.save(out_path, reduced)