-
Notifications
You must be signed in to change notification settings - Fork 6
/
tsc_amd64.go
219 lines (172 loc) · 5.34 KB
/
tsc_amd64.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
package tsc
import (
"math"
"time"
"github.com/templexxx/cpu"
)
// Configs of calibration.
// See tools/calibrate for details.
const (
	// samples is the number of (start, end) sample pairs collected per Calibrate run.
	samples = 128
	// sampleDuration is the sleep between the two readings of one sample pair,
	// giving the regression a time span to fit the tsc->ns slope over.
	sampleDuration = 16 * time.Millisecond
	// getClosestTSCSysRetries is the probe count used when hunting for the
	// tightest tsc/system-clock pairing (see getClosestTSCSys).
	getClosestTSCSysRetries = 256
)
// init tries to switch UnixNano to the TSC-based implementation at load time.
func init() {
	// Result deliberately ignored: reset only reassigns UnixNano on supported
	// hardware; on failure the package presumably keeps whatever fallback
	// UnixNano was initialized to elsewhere (not visible in this file).
	_ = reset()
}
// reset calibrates the tsc clock and installs the fastest UnixNano
// implementation the CPU supports. It reports whether the TSC path
// could be enabled at all.
func reset() bool {
	if !isHardwareSupported() {
		return false
	}
	Calibrate()
	// Pick the implementation: a serializing (fenced) read when the TSC is
	// in order, otherwise the cheaper out-of-order variants, preferring FMA.
	switch {
	case !IsOutOfOrder():
		UnixNano = unixNanoTSC16Bfence
	case cpu.X86.HasFMA:
		UnixNano = unixNanoTSCFMA
	default:
		UnixNano = unixNanoTSC16B
	}
	return true
}
// isHardwareSupported reports whether this machine can run the TSC clock:
// an invariant TSC (or a system clock source already set to "tsc") plus AVX.
// The positive result is cached in the package-level `supported` flag.
func isHardwareSupported() bool {
	if supported == 1 {
		return true
	}
	// Invariant TSC guarantees the counters are reset together and tick at the
	// same frequency across CPUs. Some VMs cap CPUID's max extended function
	// below 0x80000007, hiding the flag; in that case trust the kernel's
	// choice of "tsc" as clock source instead.
	invariant := cpu.X86.HasInvariantTSC || GetCurrentClockSource() == "tsc"
	if !invariant {
		// Neither CPUID nor the Linux clock source confirms an invariant TSC.
		return false
	}
	// AVX is required both by instructions in tsc_amd64.s and for the
	// 16-byte atomic store/load (see internal/xatomic for details).
	// CPUs with invariant TSC but without AVX are practically unheard of.
	if !cpu.X86.HasAVX {
		return false
	}
	supported = 1
	return true
}
// Calibrate calibrates tsc clock.
//
// It's a good practice that run Calibrate periodically (e.g., 5 min is a good
// start because the auto NTP adjust is always every 11 min).
func Calibrate() {
	if !isHardwareSupported() {
		return
	}

	// Collect `samples` pairs of (tsc, sys) readings separated by
	// sampleDuration, then least-squares fit sys = coeff*tsc + offset.
	tscReadings := make([]float64, 0, samples*2)
	sysReadings := make([]float64, 0, samples*2)
	for i := 0; i < samples; i++ {
		_, tscBegin, sysBegin := getClosestTSCSys(getClosestTSCSysRetries)
		time.Sleep(sampleDuration)
		_, tscEnd, sysEnd := getClosestTSCSys(getClosestTSCSysRetries)
		tscReadings = append(tscReadings, float64(tscBegin), float64(tscEnd))
		sysReadings = append(sysReadings, float64(sysBegin), float64(sysEnd))
	}

	coeff, offset := simpleLinearRegression(tscReadings, sysReadings)
	storeOffsetCoeff(OffsetCoeffAddr, offset, coeff)
	storeOffsetFCoeff(OffsetCoeffFAddr, float64(offset), coeff)
}
// simpleLinearRegression fits sys = coeff*tsc + offset by ordinary least
// squares over the paired samples in tscs and syss (same length, non-empty).
//
// It returns the slope coeff (wall-clock ns per tsc tick) and the intercept
// truncated to an int64 offset. If all tscs values are identical the
// denominator is zero and coeff is NaN — callers supply spread-out samples.
func simpleLinearRegression(tscs, syss []float64) (coeff float64, offset int64) {
	tmean, wmean := float64(0), float64(0)
	for _, v := range tscs {
		tmean += v
	}
	for _, v := range syss {
		wmean += v
	}
	tmean /= float64(len(tscs))
	wmean /= float64(len(syss))

	denominator, numerator := float64(0), float64(0)
	for i := range tscs {
		dt := tscs[i] - tmean
		numerator += dt * (syss[i] - wmean)
		denominator += dt * dt // direct multiply instead of math.Pow(dt, 2): same value, no call overhead
	}
	coeff = numerator / denominator
	return coeff, int64(wmean - coeff*tmean)
}
// CalibrateWithCoeff recomputes the clock offset from a caller-supplied
// coefficient and stores the new (offset, coeff) pair.
//
// Not thread safe, only for testing.
func CalibrateWithCoeff(c float64) {
	if !Supported() {
		return
	}
	// Take one tight (tsc, sys) pairing and derive the offset that makes
	// c*tsc + offset equal the system clock right now.
	_, tscNow, sysNow := getClosestTSCSys(getClosestTSCSysRetries)
	predicted := int64(float64(tscNow) * c)
	offset := sysNow - predicted
	storeOffsetCoeff(OffsetCoeffAddr, offset, c)
	storeOffsetFCoeff(OffsetCoeffFAddr, float64(offset), c)
}
// getClosestTSCSys tries to get the closest tsc register value nearby the system clock in a loop.
//
// It interleaves RDTSC and time.Now() readings, then selects the sys reading
// bracketed by the tightest pair of tsc readings. Returns:
//   - minDelta: the tsc gap around the chosen sys reading (smaller = tighter pairing),
//   - tscClock: the midpoint of the bracketing tsc pair,
//   - sys:      the chosen system clock reading in nanoseconds.
func getClosestTSCSys(n int) (minDelta, tscClock, sys int64) {
	// 256 is enough for finding the lowest sys clock cost in most cases.
	// Although time.Now() is using VDSO to get time, but it's unstable,
	// sometimes it will take more than 1000ns,
	// we have to use a big loop(e.g. 256) to get the "real" clock.
	// And it won't take a long time to finish calibrating job, only about 20µs.
	// [tscClock, wc, tscClock, wc, ..., tscClock]
	// Layout: odd indices hold sys clock readings, even indices hold tsc readings.
	timeline := make([]int64, n+n+1)
	timeline[0] = RDTSC()
	for i := 1; i < len(timeline)-1; i += 2 {
		timeline[i] = time.Now().UnixNano()
		timeline[i+1] = RDTSC()
	}
	// The minDelta is the smallest gap between two adjacent tscs,
	// which means the smallest gap between sys clock and tscClock too.
	minDelta = int64(math.MaxInt64)
	minIndex := 1 // minIndex is sys clock index where has minDelta.
	// time.Now()'s precision is only µs (on macOS),
	// which means we will get multi same sys clock in timeline,
	// and the middle one is closer to the real time in statistics.
	// Try to find the minimum delta when sys clock is in the "middle".
	for i := 1; i < len(timeline)-1; i += 2 {
		last := timeline[i]
		for j := i + 2; j < len(timeline)-1; j += 2 {
			if timeline[j] != last {
				// [i, j) is a run of identical sys readings; take the middle
				// sys index of the run (forced odd, since sys lives at odd
				// indices) and measure the tsc gap around it.
				mid := (i + j - 2) >> 1
				if isEven(mid) {
					mid++
				}
				delta := timeline[mid+1] - timeline[mid-1]
				if delta < minDelta {
					minDelta = delta
					minIndex = mid
				}
				// NOTE(review): the outer loop variable is advanced here, inside
				// the inner loop, to skip past the processed run; the inner loop
				// then keeps scanning from j. Order-sensitive — do not restructure
				// without tests.
				i = j
				last = timeline[j]
			}
		}
	}
	// Estimate the tsc value at the instant of the chosen sys reading as the
	// midpoint of the two tsc readings surrounding it.
	tscClock = (timeline[minIndex+1] + timeline[minIndex-1]) >> 1
	sys = timeline[minIndex]
	return
}
// GetInOrder gets tsc value in strictly order.
// It's used for helping calibrate to avoid out-of-order issues.
//
//go:noescape
func GetInOrder() int64

// RDTSC gets tsc value out-of-order.
//
//go:noescape
func RDTSC() int64

// unixNanoTSC16B converts tsc to a Unix nanosecond timestamp using the
// 16-byte (offset, coeff) pair; out-of-order variant. See tsc_amd64.s.
//
//go:noescape
func unixNanoTSC16B() int64

// unixNanoTSCFMA is the FMA-accelerated variant of unixNanoTSC16B,
// selected in reset when cpu.X86.HasFMA is set.
//
//go:noescape
func unixNanoTSCFMA() int64

// unixNanoTSC16Bfence is the fenced (in-order) variant of unixNanoTSC16B,
// selected in reset when the TSC is not out-of-order.
//
//go:noescape
func unixNanoTSC16Bfence() int64

// storeOffsetCoeff atomically stores offset & coeff at dst (used by Calibrate
// and CalibrateWithCoeff with OffsetCoeffAddr).
//
//go:noescape
func storeOffsetCoeff(dst *byte, offset int64, coeff float64)

// storeOffsetFCoeff is the float64-offset counterpart of storeOffsetCoeff
// (used with OffsetCoeffFAddr).
//
//go:noescape
func storeOffsetFCoeff(dst *byte, offset, coeff float64)

// Same logic as unixNanoTSC16B for checking getting offset & coeff correctly.
//
//go:noescape
func LoadOffsetCoeff(src *byte) (offset int64, coeff float64)