forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 1
/
math_cpu_base.cc
161 lines (144 loc) · 5.08 KB
/
math_cpu_base.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
// Implements the math functions for CPU.
// The implementation in this file allows us to route the underlying numerical
// computation library to different compiler options (-mno-avx2 or -mavx2).
#include <cfloat>
#include <cmath>
#include <cstdint>
#include "common.h"
#include "math.h"
using std::uint64_t;
using std::uint8_t;
namespace caffe2 {
namespace math {
static constexpr double QEPSILON = 1e-8;

// Quantizes `input_size` floats down to `bitwidth` bits each (bitwidth must
// divide 8, i.e. 1, 2, 4, or 8) and packs them into `output_data`.
//
// Output layout:
//   byte 0      : bitwidth
//   byte 1      : tail -- number of unused value slots in the last payload byte
//   bytes 2..9  : minimum and maximum input element, stored as raw floats
//   bytes 10..  : packed payload, ceil(input_size / (8 / bitwidth)) bytes
//
// NOTE(review): payload bytes are OR-ed into the buffer, so the caller must
// supply a zero-initialized `output_data` of at least 10 + segment_size
// bytes -- confirm with callers.
//
// When `random` is true, stochastic rounding driven by `random_buffer`
// (presumably one uniform [0, 1) sample per input element -- verify against
// callers) is used; otherwise values are rounded to nearest. Empty input
// leaves the stored min/max at +/-infinity.
void quantize_and_compress__base(
    const float* input_data,
    uint8_t* output_data,
    uint64_t input_size,
    uint64_t bitwidth,
    bool random,
    const float* random_buffer) {
  const uint64_t data_per_byte = 8 / bitwidth;
  uint64_t tail = input_size % data_per_byte;
  tail = tail ? data_per_byte - tail : 0;
  // Payload byte count; also the distance between consecutive bit planes.
  const uint64_t segment_size = (input_size + data_per_byte - 1) / data_per_byte;

  // Dynamic range of the input. Counters are uint64_t throughout this
  // function: the original `int` counters mixed signed/unsigned comparison
  // and overflowed (signed overflow is UB) for inputs over INT_MAX elements.
  float minimum_element = INFINITY, maximum_element = -INFINITY;
  for (uint64_t i = 0; i < input_size; ++i) {
    // Ternaries (not std::min/max) preserve the original NaN behavior.
    minimum_element =
        input_data[i] < minimum_element ? input_data[i] : minimum_element;
    maximum_element =
        input_data[i] > maximum_element ? input_data[i] : maximum_element;
  }

  // Header: bitwidth, tail, then the float range used for dequantization.
  output_data[0] = bitwidth;
  output_data[1] = tail;
  reinterpret_cast<float*>(output_data + 2)[0] = minimum_element;
  reinterpret_cast<float*>(output_data + 2)[1] = maximum_element;

  const float gap =
      (maximum_element - minimum_element) / ((1 << bitwidth) - 1.0f);
  // QEPSILON avoids division by zero when all inputs are equal (gap == 0).
  const float gap_inverse = 1. / (gap + QEPSILON);
  const uint8_t max_q = (1 << bitwidth) - 1;

  uint64_t bit_start = 0;
  if (random) {
    // Stochastic rounding: floor(x + u) with u ~ U[0, 1) is an unbiased
    // rounding of x. Each pass packs one value per payload byte at bit
    // offset `bit_start`.
    for (uint64_t start = 0; start < input_size; start += segment_size) {
      const uint64_t stride = start + segment_size <= input_size
          ? segment_size
          : input_size - start;
      for (uint64_t i = 0; i < stride; ++i) {
        const float fval = input_data[start + i];
        const float thetimes = (fval - minimum_element) * gap_inverse;
        float rounded = std::floor(thetimes + random_buffer[start + i]);
        // Clamp to the representable quantization range [0, max_q].
        rounded = rounded < static_cast<float>(max_q)
            ? rounded
            : static_cast<float>(max_q);
        rounded = rounded > 0.0f ? rounded : 0.0f;
        const uint8_t qval = rounded;
        output_data[10 + i] |= static_cast<uint8_t>(qval << bit_start);
      }
      bit_start += bitwidth;
    }
  } else {
    // Deterministic path: clamp first, then round to nearest.
    for (uint64_t start = 0; start < input_size; start += segment_size) {
      const uint64_t stride = start + segment_size <= input_size
          ? segment_size
          : input_size - start;
      for (uint64_t i = 0; i < stride; ++i) {
        const float fval = input_data[start + i];
        float thetimes = (fval - minimum_element) * gap_inverse;
        thetimes = thetimes < static_cast<float>(max_q)
            ? thetimes
            : static_cast<float>(max_q);
        thetimes = thetimes > 0.0f ? thetimes : 0.0f;
        const uint8_t qval = std::nearbyint(thetimes);
        output_data[10 + i] |= static_cast<uint8_t>(qval << bit_start);
      }
      bit_start += bitwidth;
    }
  }
}
// Declaration of the AVX2-specialized variant with the same signature as the
// base implementation; its definition lives in a translation unit compiled
// with -mavx2 (see the comment at the top of this file).
decltype(quantize_and_compress__base) quantize_and_compress__avx2;
// Public entry point. Dispatches between the AVX2 and the portable base
// implementation.
// NOTE(review): AVX2_DO / BASE_DO are macros from common.h; presumably
// AVX2_DO invokes quantize_and_compress__avx2 and returns when AVX2 is
// available at runtime, and BASE_DO unconditionally calls the __base
// fallback -- confirm against the macro definitions.
void quantize_and_compress(
    const float* input_data,
    uint8_t* output_data,
    uint64_t input_size,
    uint64_t bitwidth,
    bool random,
    const float* random_buffer) {
  AVX2_DO(
      quantize_and_compress,
      input_data,
      output_data,
      input_size,
      bitwidth,
      random,
      random_buffer);
  BASE_DO(
      quantize_and_compress,
      input_data,
      output_data,
      input_size,
      bitwidth,
      random,
      random_buffer);
}
// Unpacks a buffer produced by quantize_and_compress__base back into floats.
//
// Input layout (see quantize_and_compress__base):
//   byte 0      : bitwidth (1, 2, 4, or 8)
//   byte 1      : tail -- unused padding slots in the last payload byte
//   bytes 2..9  : minimum / maximum element, stored as raw floats
//   bytes 10..  : packed payload (input_size - 10 bytes)
//
// NOTE(review): assumes input_size >= 10 and a well-formed header; a shorter
// buffer underflows the unsigned arithmetic below -- callers must validate.
void decompress_and_dequantize__base(
    const uint8_t* input_data,
    float* output_data,
    uint64_t input_size) {
  // Recover the value range recorded at compression time.
  const float minimum_element =
      reinterpret_cast<const float*>(input_data + 2)[0];
  const float maximum_element =
      reinterpret_cast<const float*>(input_data + 2)[1];
  const uint64_t bitwidth = input_data[0];
  // QEPSILON mirrors the epsilon added to the step during compression so
  // quantized levels map back to (approximately) the original values.
  const float gap =
      (maximum_element - minimum_element) / ((1 << bitwidth) - 1.f) +
      QEPSILON; // for exact recovering

  const uint64_t tail = input_data[1];
  const uint64_t output_size = (input_size - 10) * (8 / bitwidth) - tail;

  // Decode one bit plane per pass. Counters are uint64_t: the original `int`
  // counters mixed signed/unsigned comparison and overflowed for outputs
  // over INT_MAX elements.
  const uint64_t segment_size = input_size - 10;
  const uint8_t mask = (1 << bitwidth) - 1; // hoisted: loop-invariant
  uint64_t bit_start = 0;
  for (uint64_t start = 0; start < output_size; start += segment_size) {
    const uint64_t stride = start + segment_size <= output_size
        ? segment_size
        : output_size - start;
    for (uint64_t i = 0; i < stride; ++i) {
      output_data[start + i] =
          ((input_data[10 + i] >> bit_start) & mask) * gap + minimum_element;
    }
    bit_start += bitwidth;
  }
}
// Declaration of the AVX2-specialized variant with the same signature as the
// base implementation; defined in a translation unit compiled with -mavx2.
decltype(decompress_and_dequantize__base) decompress_and_dequantize__avx2;
// Public entry point. Dispatches between the AVX2 and the portable base
// implementation via the AVX2_DO / BASE_DO macros from common.h; presumably
// AVX2_DO returns early when the AVX2 path is taken -- confirm there.
void decompress_and_dequantize(
    const uint8_t* input_data,
    float* output_data,
    uint64_t input_size) {
  AVX2_DO(decompress_and_dequantize, input_data, output_data, input_size);
  BASE_DO(decompress_and_dequantize, input_data, output_data, input_size);
}
} // namespace math
} // namespace caffe2