forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 1
/
math.h
35 lines (28 loc) · 1.08 KB
/
math.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#pragma once
#include <cstdint>
namespace caffe2 {
namespace math {
// Quantizes floating-point inputs and packs them into a compressed "fused"
// byte array. The function returns void; the result is presumably written
// through output_data (the only non-const pointer parameter).
//
// The "fused" representation stores [bitwidth][tail][min][max] together
// with the quantized data in one array. Since we store 8/bitwidth
// quantized values in one byte, the last buckets of some bytes may have
// unused bits. In total, *tail* buckets are unused.
// We encode *bitwidth* and *tail* at the beginning,
// followed by 32-bit floating-point data representing min and max.
// | bitwidth | tail | min | max | ... int8 data ... |
// |    1B    |  1B  | 4B  | 4B  | ...output_data....|
// In output_data: the b-th bucket of the i-th byte stores
// the i-th data of the b-th segment of input row
//
// Parameters (the implementation is not visible from this header; the notes
// below are inferred from the names and the layout above -- confirm against
// the definition before relying on them):
//   input_data    - floats to be quantized.
//   output_data   - destination for the fused bytes. NOTE(review): per the
//                   layout above the header occupies 1 + 1 + 4 + 4 = 10 bytes
//                   ahead of the packed data; the caller presumably has to
//                   size the buffer accordingly.
//   input_size    - presumably the number of floats in input_data.
//   bitwidth      - bits per quantized value. The "8/bitwidth values per
//                   byte" packing suggests it is expected to divide 8;
//                   TODO confirm which values are accepted.
//   random        - presumably enables randomized rounding; TODO confirm.
//   random_buffer - presumably the random values consumed when `random` is
//                   true; confirm whether it may be null otherwise.
void quantize_and_compress(
const float* input_data,
std::uint8_t* output_data,
std::uint64_t input_size,
std::uint64_t bitwidth,
bool random,
const float* random_buffer);
// Counterpart to quantize_and_compress: judging by the name and signature,
// this unpacks a compressed byte array and converts it back to floats
// written through output_data.
//
// NOTE(review): the implementation is not visible from this header, so the
// following are assumptions to confirm against the definition:
//   input_data  - presumably a buffer in the fused layout
//                 | bitwidth | tail | min | max | packed data |, which would
//                 explain why no bitwidth parameter is taken here.
//   output_data - destination for the reconstructed floats; the caller
//                 presumably must provide enough room for every decoded value.
//   input_size  - unclear from the declaration whether this counts the bytes
//                 of input_data (header included) or the number of decoded
//                 values -- TODO confirm.
void decompress_and_dequantize(
const std::uint8_t* input_data,
float* output_data,
std::uint64_t input_size);
} // namespace math
} // namespace caffe2