forked from WebAssembly/binaryen
-
Notifications
You must be signed in to change notification settings - Fork 0
/
entropy.cpp
236 lines (198 loc) · 7.59 KB
/
entropy.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
/*
* Copyright 2021 WebAssembly Community Group participants
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <numeric>
#include "entropy.h"
#include "mixed_arena.h"
namespace wasm {
namespace Entropy {
// Given a vector of frequencies of items, compute their entropy. That is, if
// given [50, 50] then each index has equal probability, so this is like a fair
// coin, and the entropy is 1. The result is measured in bits.
#if 0
static double computeEntropy(const std::vector<size_t>& freqs) {
double totalFreq = 0;
for (auto freq : freqs) {
totalFreq += freq;
}
if (totalFreq == 0) {
return 0;
}
double mean = 0;
for (auto freq : freqs) {
auto probability = double(freq) / totalFreq;
// A probability close to 0 does not contribute to the mean.
if (probability >= 0.001) {
mean += -double(probability) * log2(probability);
}
}
// We have computed the entropy, which is the mean bits per item. To get
// the total number of bits, multiply by the total frequency.
return mean * totalFreq;
}
#endif
struct PatternHash {
static const size_t PatternCacheSize = 1 << 24;
// Use a simple and fast hash, see http://www.cse.yorku.ca/~oz/hash.html
uint32_t hash = 5381;
void note(uint8_t byte) {
hash = ((hash << 5) + hash) ^ byte;
}
size_t get() {
return hash & (PatternCacheSize - 1);
}
};
static double estimateCompressedBytesInternal(const std::vector<uint8_t>& data) {
// Allocate everything in an arena for speed.
MixedArena arena;
double totalBits = 0;
// brotli will actally pick the optimal window out of 1K-16MB. gzip uses 32K.
// Use 64k which represents gzip++;
const size_t WindowSize = 64 * 1024;
// Deflate/gzip have this as the maximum pattern size. Brotli can have up to
// 16MB.
const size_t MaxPatternSize = 258;
const size_t MeaninglessIndex = -1;
struct PatternInfo {
// The index in |data| of the last time we saw this pattern. This marks the
// first byte in that appearance.
size_t lastStart = MeaninglessIndex;
bool is() {
return lastStart != MeaninglessIndex;
}
// We could also store stuff like the pattern length, last char, etc., to
// reduce hash collisions? Trusting the hash blindly is not good enough on ue4.
};
// A map of hash values to pattern info for that hash.
std::vector<PatternInfo> patternCache(PatternHash::PatternCacheSize);
// Tracks the frequency of each byte we emitted, of the bytes we emit when we
// fail to find a pattern. Start with equal probability of all bytes.
// TODO: we need to clear out old bytes (older than the window) somehow -
// track them, or randomly?
std::vector<size_t> byteFreqs(1 << 8, 1);
size_t totalByteFreqs = 256;
size_t i = 0;
while (i < data.size()) {
//std::cout << "seek pattern from " << i << '\n';
// Starting at the i-th byte, see how many bytes forward we can look while
// still finding something in the trie.
PatternHash pattern;
size_t j = i;
// We will note the last time we saw the pattern that we found to be
// repeating.
size_t previousLastStart = MeaninglessIndex;
bool emitByte = true;
while (1) {
if (j == data.size()) {
// Nothing to do, and not even a new byte to emit.
emitByte = false;
break;
}
if (j - i == MaxPatternSize) {
// This pattern is unreasonably-large; stop and emit a new byte without
// adding a pattern this size.
break;
}
// Add j to the pattern and see if we have seen that as well.
pattern.note(data[j]);
auto& info = patternCache[pattern.get()];
if (info.is()) {
// Note the last start of the parent before we look at the child. If we
// end up stopping at the child, then the repeating pattern is in the
// parent, and its |lastStart| is when last we saw the pattern.
previousLastStart = info.lastStart;
////std::cout << "prev last: " << previousLastStart << '\n';
// Mark that we have seen this pattern starting at i. We need to do that
// regardless of our actions later: this is either one we've seen too
// long ago, and so it counts as new, or it is actually new. Either way,
// |i| is the lastStart for it.
info.lastStart = i;
////std::cout << "set node's last to " << i << '\n';
if (i - info.lastStart >= WindowSize) {
// This is a too-old pattern. We must emit a byte for it as if it is
// a new pattern here.
break;
}
// Otherwise, we saw the pattern recently enough, and can proceed.
j++;
continue;
}
// Otherwise, we failed to find the pattern, so this extra
// character at data[j] is new. Emit a byte, and stop.
info.lastStart = i;
////std::cout << "created new node with last of " << i << '\n';
break;
}
if (previousLastStart != MeaninglessIndex) {
// We proceeded past the root, that is, we actually found at least one
// repeating byte of some pattern before we stopped to emit a new byte.
// To represent the size of that pattern in the compressed output,
// estimate it as if emitting the optimal number of bits for the distance.
//assert(i != previousLastStart);
//std::cout << "emit ref to known pattern: " << (i - previousLastStart) << " , of size " << (j - i) << '\n';
if (i != previousLastStart) {
totalBits += log2(i - previousLastStart);
}
//std::cout << "atotal " << totalBits << '\n';
// Also we estimate the bits for the size in a similar way.
assert(j != i);
totalBits += log2(j - i);
//std::cout << "btotal " << totalBits << '\n';
}
if (emitByte) {
auto byte = data[j];
//std::cout << "emit byte " << int(byte) << '\n';
// To estimate how many bits we need to emit the new byte, use the factor
// that the byte would have when computing the entropy.
totalBits += -log2(double(byteFreqs[byte]) / double(totalByteFreqs));
//std::cout << "ctotal " << totalBits << '\n';
// Update the frequency of the byte we are emitting.
byteFreqs[byte]++;
totalByteFreqs++;
}
// Continue after this pattern. TODO: fill in lastStarts of subpatterns?
i = j + 1;
}
return totalBits / 8;
}
double estimateCompressedBytes(const std::vector<uint8_t>& data) {
#if 1
// brotli will actally pick the optimal window out of 1K-16MB. gzip uses 32K.
// Use 64k which represents gzip + overlaps.
size_t ChunkSize = 64 * 1024;
size_t start = 0;
std::vector<uint8_t> temp;
size_t chunks = 0;
double total = 0;
while (start < data.size()) {
auto end = std::min(start + ChunkSize, data.size());
temp.resize(end - start);
std::copy(data.begin() + start, data.begin() + end, temp.begin());
total += estimateCompressedBytesInternal(temp);
chunks++;
start += ChunkSize;// / 2; // overlap like the window slides
}
//std::cout << "final final: " << (ratios / double(chunks)) << '\n';
return total;
#else
return estimateCompressedBytesInternal(data);
#endif
}
} // namespace Entropy
} // namespace wasm