In [1]:
import numpy as np
from numba import njit

In [2]:
def serial(x):
    """Average ``sin(v)**2 + cos(v)**2`` over the elements of ``x`` in a Python loop.

    Mathematically every term equals 1, so the result is 1.0 up to
    floating-point rounding; the function exists as a benchmark workload.

    Parameters
    ----------
    x : array-like exposing ``.size`` and integer indexing
        Elements are read as ``x[k]`` for ``k`` in ``range(x.size)``
        (intended for a 1-D NumPy array).

    Returns
    -------
    float
        The accumulated sum divided by ``x.size``.

    Raises
    ------
    ZeroDivisionError
        If ``x`` is empty, because the float total is divided by the int 0.
    """
    count = x.size
    total = 0.0
    for idx in range(count):
        # sin and cos are each evaluated twice per element on purpose:
        # that is the amount of work the benchmark measures.
        sin_sq = np.sin(x[idx]) * np.sin(x[idx])
        cos_sq = np.cos(x[idx]) * np.cos(x[idx])
        total += sin_sq + cos_sq
    return total / count

@njit
def serial_numba(x):
    """Numba-compiled variant of the element-wise ``sin**2 + cos**2`` average.

    Runs the same loop as the pure-Python ``serial`` benchmark, but under
    ``@njit`` so that it is compiled to machine code.

    Parameters
    ----------
    x : array exposing ``.size`` and integer indexing
        Elements are read as ``x[k]`` for ``k`` in ``range(x.size)``
        (intended for a 1-D NumPy float array).

    Returns
    -------
    float
        The accumulated sum divided by ``x.size``; 1.0 up to rounding.
    """
    count = x.size
    total = 0.0
    for idx in range(count):
        # Keep two sin and two cos evaluations per element so the workload
        # matches the other benchmark variants.
        sin_sq = np.sin(x[idx]) * np.sin(x[idx])
        cos_sq = np.cos(x[idx]) * np.cos(x[idx])
        total += sin_sq + cos_sq
    return total / count

In [3]:
# Benchmark input: one million samples drawn by np.random.rand (unseeded).
n = 1_000_000
x = np.random.rand(n)
# Call the jitted function once on a small float64 array before timing.
# Numba compiles on the first call for a given argument type, so this keeps
# JIT compilation out of the measurements below.
serial_numba(np.random.rand(10))

# Case 1: interpreted Python loop (label reads "Python for loop").
print("Python for 循环")
%timeit serial(x)

# Case 2: the same loop compiled by Numba (label reads "Numba for loop").
print("Numba for 循环")
%timeit serial_numba(x)

# Case 3: vectorized NumPy (label reads "NumPy vectorized"). np.sin(x) and
# np.cos(x) are each evaluated twice, producing temporary arrays, before the
# dot products reduce them to scalars.
print("Numpy 向量化")
%timeit (np.dot(np.sin(x), np.sin(x)) + np.dot(np.cos(x), np.cos(x)))/len(x)

Python for 循环
3.81 s ± 46.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Numba for 循环
5.96 ms ± 173 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Numpy 向量化
33.9 ms ± 242 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [1]:
using BenchmarkTools, LinearAlgebra, LoopVectorization

# serial(x): mean of sin(x[i])^2 + cos(x[i])^2 over every index of `x`.
# Each term is mathematically 1, so the result is 1.0 up to rounding; the
# function is a benchmark workload. The loop is annotated with `@turbo`
# (LoopVectorization) and accumulates into a Float64 starting at 0.0.
function serial(x)
    acc = 0.0
    @turbo for i in eachindex(x)
        # sin and cos are written out twice each to mirror the other variants.
        acc += sin(x[i]) * sin(x[i]) + cos(x[i]) * cos(x[i])
    end
    # Divide the accumulated sum by the element count to get the mean.
    return acc / length(x)
end

serial (generic function with 1 method)

In [2]:
# Benchmark input: one million Float64 samples from rand (unseeded).
n = 1_000_000
x = rand(n)

# Case 1: the @turbo loop (label reads "Julia SIMD for loop").
# `$x` interpolates the global into the benchmark expression so that
# global-variable access is not part of the measurement; the trailing `;`
# suppresses display of the returned value.
println("Julia SIMD for 循环")
@btime serial($x);

# Case 2: broadcast version (label reads "Julia vectorized").
# Each `sin.(...)` / `cos.(...)` materializes a temporary array, and `⋅` is
# the dot product from LinearAlgebra.
println("Julia 向量化")
@btime (sin.($x) ⋅ sin.($x) + cos.($x) ⋅ cos.($x))/length($x);

Julia SIMD for 循环
  2.035 ms (0 allocations: 0 bytes)
Julia 向量化
  31.292 ms (8 allocations: 30.52 MiB)
