-
Notifications
You must be signed in to change notification settings - Fork 23
/
online_compress_21k.py
51 lines (44 loc) · 1.31 KB
/
online_compress_21k.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# -*- coding: utf-8 -*-
import numpy as np
import os
from sklearn import preprocessing, decomposition
# Layout of the raw feature file: a flat binary of float32 vectors,
# one 21841-dim feature per row, read in fixed-size batches below.
root_path = '/mnt/disk/data/'
model_name1 = root_path + 'train_feat_21k_21k_6crop'
feature_dim = 21841

# --- One-off offline step (kept for reproducibility) --------------------
# The PCA basis was fitted once on the first 30k training vectors and the
# mean/components were dumped to .npy files, roughly:
#   x = np.fromfile(model_name1, dtype=np.float32,
#                   count=feature_dim * 30000).reshape(-1, feature_dim)
#   preprocessing.normalize(x, copy=False)
#   mean = np.mean(x, axis=0)
#   x -= mean
#   svd = decomposition.TruncatedSVD(n_components=2048, algorithm='arpack')
#   svd.fit(x)
#   np.save('online_mean_21k_2048.npy', mean)
#   np.save('online_components_21k_2048.npy', svd.components_)
# ------------------------------------------------------------------------

# Projection parameters produced by the offline fit above.
mean = np.load('online_mean_21k_2048.npy')
comp = np.load('online_components_21k_2048.npy')

batch_size = 10000  # rows per chunk streamed from the raw feature file
filename = root_path + 'feat_21k_21k_6crop_2048' + '_train'

# Start from a fresh output file; only "file does not exist" is expected
# here, so catch OSError rather than swallowing everything.
try:
    os.remove(filename)
except OSError:
    pass

# Stream the raw features in batches: L2-normalize each row, center with
# the training mean, project onto the 2048-dim PCA basis, append to the
# output file. Reading sequentially from one open handle visits exactly
# the same byte offsets as the original per-iteration seek, without
# leaking a file handle per batch.
with open(model_name1, 'rb') as f_in, open(filename, 'ab') as f_out:
    for i in range(10000):
        x = np.fromfile(f_in, dtype=np.float32,
                        count=feature_dim * batch_size).reshape(-1, feature_dim)
        if x.shape[0] == 0:
            # Input size was an exact multiple of batch_size; nothing left.
            # (normalize() would raise on a 0-sample array.)
            break
        preprocessing.normalize(x, copy=False)
        x = x - mean
        x = x.dot(comp.T)
        x.tofile(f_out)
        print(i, x.shape)
        if x.shape[0] < batch_size:
            break