-
-
Notifications
You must be signed in to change notification settings - Fork 5.6k
/
Copy pathrem_pio2.jl
293 lines (267 loc) · 9.35 KB
/
rem_pio2.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
# This file is a part of Julia. Except for the rem_pio2_kernel, and
# cody_waite_* methods (see below) license is MIT: https://julialang.org/license
# rem_pio2_kernel and cody_waite_* methods are heavily based on FDLIBM code:
# __ieee754_rem_pio2, that is made available under the following licence:
## Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
##
## Developed at SunPro, a Sun Microsystems, Inc. business.
## Permission to use, copy, modify, and distribute this
## software is freely granted, provided that this notice
## is preserved.
# Bits of 1/2π
# 1/2π == sum(x / 0x1p64^i for i,x = enumerate(INV_2PI))
# Can be obtained by:
#
# setprecision(BigFloat, 4096)
# I = 0.5/big(pi)
# for i = 1:19
# I *= 0x1p64
# k = trunc(UInt64, I)
# @printf "0x%016x,\n" k
# I -= k
# end
const INV_2PI = (
0x28be_60db_9391_054a,
0x7f09_d5f4_7d4d_3770,
0x36d8_a566_4f10_e410,
0x7f94_58ea_f7ae_f158,
0x6dc9_1b8e_9093_74b8,
0x0192_4bba_8274_6487,
0x3f87_7ac7_2c4a_69cf,
0xba20_8d7d_4bae_d121,
0x3a67_1c09_ad17_df90,
0x4e64_758e_60d4_ce7d,
0x2721_17e2_ef7e_4a0e,
0xc7fe_25ff_f781_6603,
0xfbcb_c462_d682_9b47,
0xdb4d_9fb3_c9f2_c26d,
0xd3d1_8fd9_a797_fa8b,
0x5d49_eeb1_faf9_7c5e,
0xcf41_ce7d_e294_a4ba,
0x9afe_d7ec_47e3_5742,
0x1580_cc11_bf1e_daea)
@inline function cody_waite_2c_pio2(x::Float64, fn, n)
pio2_1 = 1.57079632673412561417e+00
pio2_1t = 6.07710050650619224932e-11
z = muladd(-fn, pio2_1, x) # x - fn*pio2_1
y1 = muladd(-fn, pio2_1t, z) # z - fn*pio2_1t
y2 = muladd(-fn, pio2_1t, (z - y1)) # (z - y1) - fn*pio2_1t
n, DoubleFloat64(y1, y2)
end
@inline function cody_waite_ext_pio2(x::Float64, xhp)
pio2_1 = 1.57079632673412561417e+00
pio2_1t = 6.07710050650619224932e-11
pio2_2 = 6.07710050630396597660e-11
pio2_2t = 2.02226624879595063154e-21
pio2_3 = 2.02226624871116645580e-21
pio2_3t = 8.47842766036889956997e-32
fn = round(x*(2/pi)) # round to integer
# on older systems, the above could be faster with
# rf = 1.5/eps(Float64)
# fn = (x*(2/pi)+rf)-rf
r = muladd(-fn, pio2_1, x) # x - fn*pio2_1
w = fn*pio2_1t # 1st round good to 85 bit
j = xhp>>20
y1 = r-w
high = highword(y1)
i = j-((high>>20)&0x7ff)
if i>16 # 2nd iteration needed, good to 118
t = r
w = fn*pio2_2
r = t-w
w = muladd(fn, pio2_2t,-((t-r)-w))
y1 = r-w
high = highword(y1)
i = j-((high>>20)&0x7ff)
if i>49 # 3rd iteration need, 151 bits acc
t = r # will cover all possible cases
w = fn*pio2_3
r = t-w
w = muladd(fn, pio2_3t, -((t-r)-w))
y1 = r-w
end
end
y2 = (r-y1)-w
return unsafe_trunc(Int, fn), DoubleFloat64(y1, y2)
end
"""
fromfraction(f::Int128)
Compute a tuple of values `(z1,z2)` such that
``z1 + z2 == f / 2^128``
and the significand of `z1` has 27 trailing zeros.
"""
function fromfraction(f::Int128)
if f == 0
return (0.0,0.0)
end
# 1. get leading term truncated to 26 bits
s = ((f < 0) % UInt64) << 63 # sign bit
x = abs(f) % UInt128 # magnitude
n1 = Base.top_set_bit(x) # ndigits0z(x,2)
m1 = ((x >> (n1-26)) % UInt64) << 27
d1 = ((n1-128+1021) % UInt64) << 52
z1 = reinterpret(Float64, s | (d1 + m1))
# 2. compute remaining term
x2 = (x - (UInt128(m1) << (n1-53)))
if x2 == 0
return (z1, 0.0)
end
n2 = Base.top_set_bit(x2)
m2 = (x2 >> (n2-53)) % UInt64
d2 = ((n2-128+1021) % UInt64) << 52
z2 = reinterpret(Float64, s | (d2 + m2))
return (z1,z2)
end
function paynehanek(x::Float64)
# 1. Convert to form
#
# x = X * 2^k,
#
# where 2^(n-1) <= X < 2^n is an n-bit integer (n = 53, k = exponent(x)-52 )
# Computations are integer based, so reinterpret x as UInt64
u = reinterpret(UInt64, x)
# Strip x of exponent bits and replace with ^1
X = (u & significand_mask(Float64)) | (one(UInt64) << significand_bits(Float64))
# Get k from formula above
# k = exponent(x)-52
raw_exponent = ((u & exponent_mask(Float64)) >> significand_bits(Float64)) % Int
k = raw_exponent - exponent_bias(Float64) - significand_bits(Float64)
# 2. Let α = 1/2π, then:
#
# α*x mod 1 ≡ [(α*2^k mod 1)*X] mod 1
#
# so we can ignore the first k bits of α. Extract the next 3 64-bit parts of α.
#
# i.e. equivalent to
# setprecision(BigFloat,4096)
# α = 1/(2*big(pi))
# A = mod(ldexp(α,k), 1)
# z1 = ldexp(A,64)
# a1 = trunc(UInt64, z1)
# z2 = ldexp(z1-a1, 64)
# a2 = trunc(UInt64, z2)
# z3 = ldexp(z2-a2, 64)
# a3 = trunc(UInt64, z3)
# This is equivalent to
# idx, shift = divrem(k, 64)
# but divrem is slower.
idx = k >> 6
shift = k - (idx << 6)
Base.@assume_effects :nothrow :noub @inbounds if shift == 0
a1 = INV_2PI[idx+1]
a2 = INV_2PI[idx+2]
a3 = INV_2PI[idx+3]
else
# use shifts to extract the relevant 64 bit window
a1 = (idx < 0 ? zero(UInt64) : INV_2PI[idx+1] << shift) | (INV_2PI[idx+2] >> (64 - shift))
a2 = (INV_2PI[idx+2] << shift) | (INV_2PI[idx+3] >> (64 - shift))
a3 = (INV_2PI[idx+3] << shift) | (INV_2PI[idx+4] >> (64 - shift))
end
# 3. Perform the multiplication:
#
# X. 0 0 0
# × 0. a1 a2 a3
# ==============
# _. w w _
#
# (i.e. ignoring integer and lowest bit parts of result)
w1 = UInt128(X*a1) << 64 # overflow becomes integer
w2 = widemul(X,a2)
w3 = widemul(X,a3) >> 64
w = w1 + w2 + w3 # quotient fraction after division by 2π
# adjust for sign of x
w = flipsign(w,x)
# 4. convert to quadrant, quotient fraction after division by π/2:
q = (((w>>125)%Int +1)>>1) # nearest quadrant
f = (w<<2) % Int128 # fraction part of quotient after division by π/2, taking values on [-0.5,0.5)
# 5. convert quotient fraction to split precision Float64
z_hi,z_lo = fromfraction(f)
# 6. multiply by π/2
pio2 = 1.5707963267948966
pio2_hi = 1.5707963407039642
pio2_lo = -1.3909067614167116e-8
y_hi = (z_hi+z_lo)*pio2
y_lo = (((z_hi*pio2_hi - y_hi) + z_hi*pio2_lo) + z_lo*pio2_hi) + z_lo*pio2_lo
return q, DoubleFloat64(y_hi, y_lo)
end
"""
rem_pio2_kernel(x::Union{Float32, Float64})
Calculate `x` divided by `π/2` accurately for arbitrarily large `x`.
Returns a pair `(k, r)`, where `k` is the quadrant of the result
(multiple of π/2) and `r` is the remainder, such that ``k * π/2 = x - r``.
The remainder is given as a double-double pair.
`k` is positive if `x > 0` and is negative if `x ≤ 0`.
"""
@inline function rem_pio2_kernel(x::Float64) # accurate to 1e-22
xhp = poshighword(x)
# xhp <= highword(5pi/4) implies |x| ~<= 5pi/4,
if xhp <= 0x400f6a7a
# last five bits of xhp == last five bits of highword(pi/2) or
# highword(2pi/2) implies |x| ~= pi/2 or 2pi/2,
if (xhp & 0xfffff) == 0x921fb # use precise Cody Waite scheme
return cody_waite_ext_pio2(x, xhp)
end
# use Cody Waite with two constants
# xhp <= highword(3pi/4) implies |x| ~<= 3pi/4
if xhp <= 0x4002d97c
if x > 0.0
return cody_waite_2c_pio2(x, 1.0, 1)
else
return cody_waite_2c_pio2(x, -1.0, -1)
end
# 3pi/4 < |x| <= 5pi/4
else
if x > 0.0
return cody_waite_2c_pio2(x, 2.0, 2)
else
return cody_waite_2c_pio2(x, -2.0, -2)
end
end
end
# xhp <= highword(9pi/4) implies |x| ~<= 9pi/4
if xhp <= 0x401c463b
# xhp <= highword(7pi/4) implies |x| ~<= 7pi/4
if xhp <= 0x4015fdbc
# xhp == highword(3pi/2) implies |x| ~= 3pi/2
if xhp == 0x4012d97c # use precise Cody Waite scheme
return cody_waite_ext_pio2(x, xhp)
end
# use Cody Waite with two constants
if x > 0.0
return cody_waite_2c_pio2(x, 3.0, 3)
else
return cody_waite_2c_pio2(x, -3.0, -3)
end
# 7pi/4 < |x| =< 9pi/4
else
# xhp == highword(4pi/2) implies |x| ~= 4pi/2
if xhp == 0x401921fb # use precise Cody Waite scheme
return cody_waite_ext_pio2(x, xhp)
end
# use Cody Waite with two constants
if x > 0.0
return cody_waite_2c_pio2(x, 4.0, 4)
else
return cody_waite_2c_pio2(x, -4.0, -4)
end
end
end
# xhp < highword(2.0^20*pi/2) implies |x| ~< 2^20*pi/2
if xhp < 0x413921fb # use precise Cody Waite scheme
return cody_waite_ext_pio2(x, xhp)
end
# if |x| >= 2^20*pi/2 switch to Payne Hanek
return paynehanek(x)
end
@inline function rem_pio2_kernel(x::Float32)
xd = convert(Float64, x)
# use Cody Waite reduction with two coefficients
if abs(x) < Float32(pi*0x1p27) # x < 2^28 * pi/2
fn = round(xd * (2/pi))
r = fma(fn, -pi/2, xd)
y = fma(fn, -6.123233995736766e-17, r) # big(pi)/2 - pi/2 remainder
return unsafe_trunc(Int, fn), DoubleFloat32(y)
end
n, y = @noinline paynehanek(xd)
return n, DoubleFloat32(y.hi)
end