In [1]:
import numpy as np
from sklearn.decomposition import PCA
import os
import pickle
from config import cfg

In [2]:
# Sizes that later cells plug into folder/file names and reshape calls.
# NOTE(review): the shapes recorded in this notebook's outputs end in 768,
# not 1024 -- confirm emb_size matches the files actually being loaded.
emb_size, ocr_num = 1024, 50

In [3]:
# Embedding variant; appended to the 'bert.<emb_size>' folder name.
# Other variants that have been used here:
#   ''                 -> ocr_bert_DR
#   '.0init.OCRnorm'   -> ocr_bert_DR.0init.OCRnorm
#   '.clean.0init'     -> ocr_bert_DR.clean.0init
#   '.assemble.0init'
suffix = '.0init'  # ocr_bert_DR.0init

# Split selector; decides which arrays are stacked to fit the PCA and which
# sub-folder the answer embeddings are read from.
#   'TVA'  : train on Tr; test on Val
#   'TVEA' : train on Tr+Val; test on Te
split = 'TVA'

In [4]:
# Input locations: one OCR embedding array per data partition, plus the
# answer embeddings kept in the sub-folder named after `split`.
load_folder = '{}/bert.{}{}'.format(cfg.OCR_DIR, emb_size, suffix)
trainfile = load_folder + '/' + '{}_{}_ocr.npy'.format('train', ocr_num)
valfile = load_folder + '/' + '{}_{}_ocr.npy'.format('val', ocr_num)
testfile = load_folder + '/' + '{}_{}_ocr.npy'.format('test', ocr_num)
itoafile = load_folder + '/' + '{}/ans_emb.npy'.format(split)

In [5]:
# Read the four embedding arrays from disk, in the same order as before.
trainX, valX, testX, itoaX = map(
    np.load, (trainfile, valfile, testfile, itoafile)
)

In [6]:
# Shapes of the arrays as loaded from disk.
for loaded in (trainX, valX, testX, itoaX):
    print(loaded.shape)

(21953, 50, 768)
(3166, 50, 768)
(3289, 50, 768)
(8205, 768)


In [7]:
# Collapse the leading axes so every row is a single embedding vector.
# The row width is read from each array's own last axis rather than from the
# `emb_size` global: that global is rebound further down the notebook and may
# not match the loaded data, in which case reshape either raises or -- when
# the element count happens to divide evenly -- silently regroups values
# across embedding vectors. Using the array's own width also makes this cell
# safe to re-run on already-flattened arrays.
trainX = trainX.reshape(-1, trainX.shape[-1])
valX = valX.reshape(-1, valX.shape[-1])
testX = testX.reshape(-1, testX.shape[-1])

In [8]:
# Shapes after flattening (itoaX is printed too, for comparison).
for flat in (trainX, valX, testX, itoaX):
    print(flat.shape)

(1097650, 768)
(158300, 768)
(164450, 768)
(8205, 768)


In [9]:
# Stack the rows the PCA will be fitted on; which partitions are included
# depends on `split`.
if split == 'TVA':
    X = np.concatenate((trainX, valX, itoaX), 0)
elif split == 'TVEA':
    X = np.concatenate((trainX, valX, testX, itoaX), 0)
else:
    # Fail loudly: without this branch an unknown split would leave X
    # undefined (or stale from a previous run of this cell).
    raise ValueError(
        "unknown split {!r}; expected 'TVA' or 'TVEA'".format(split)
    )
print(X.shape)

(1264155, 768)


In [10]:
# Target dimensionality of the PCA projection.
# NOTE: this rebinds the `emb_size` global set near the top of the notebook.
# Later cells read the new value, and re-running the earlier input-path cell
# after this one would point it at a different folder.
emb_size = 300
save_folder = '{}/bert.{}{}.{}'.format(cfg.OCR_DIR, emb_size, suffix, split)
if not os.path.isdir(save_folder):
    os.makedirs(save_folder)

# The fitted PCA is cached next to the outputs so the fit runs only once per
# (emb_size, suffix, split) combination.
pcafile = '{}/pca.pkl'.format(save_folder)
if os.path.exists(pcafile):
    # SECURITY: pickle.load can execute arbitrary code; only load a cache
    # file that this notebook (or another trusted source) wrote.
    with open(pcafile, 'rb') as f:
        pca = pickle.load(f)
else:
    # Fixed seed so the fit is reproducible when scikit-learn selects a
    # randomized SVD solver.
    pca = PCA(n_components=emb_size, random_state=0)
    pca.fit(X)
    with open(pcafile, 'wb') as f:
        pickle.dump(pca, f)

In [11]:
# Project every array with the fitted PCA, in the same order as before.
trX, vaX, teX, iaX = map(pca.transform, (trainX, valX, testX, itoaX))

In [12]:
# Shapes right after the PCA projection.
for projected in (trX, vaX, teX, iaX):
    print(projected.shape)

(1097650, 300)
(158300, 300)
(164450, 300)
(8205, 300)


In [13]:
# Regroup the projected rows into (samples, ocr_num, width) blocks.
# `ocr_num` replaces the hard-coded 50 so the array layout always agrees with
# the count embedded in the output file names, and the width is read from the
# array itself so this cell does not depend on the current value of the
# `emb_size` global.
trX = trX.reshape(-1, ocr_num, trX.shape[-1])
vaX = vaX.reshape(-1, ocr_num, vaX.shape[-1])
teX = teX.reshape(-1, ocr_num, teX.shape[-1])

In [14]:
# Final shapes of the arrays about to be written out.
for final in (trX, vaX, teX, iaX):
    print(final.shape)

(21953, 50, 300)
(3166, 50, 300)
(3289, 50, 300)
(8205, 300)


In [15]:
# Output locations inside save_folder.
# NOTE: these rebind the same variable names that earlier held the input
# paths, so re-running the load cell after this one would read these files.
trainfile = save_folder + '/' + '{}_{}_ocr.npy'.format('train', ocr_num)
valfile = save_folder + '/' + '{}_{}_ocr.npy'.format('val', ocr_num)
testfile = save_folder + '/' + '{}_{}_ocr.npy'.format('test', ocr_num)
itoafile = save_folder + '/' + 'ans_emb.npy'

In [16]:
# Write each reduced array to its matching output path.
for out_path, reduced in (
    (trainfile, trX),
    (valfile, vaX),
    (testfile, teX),
    (itoafile, iaX),
):
    np.save(out_path, reduced)