In [1]:
using Pkg
# Activate the project environment located at `../envs/lecture2-1`.
Pkg.activate("../envs/lecture2-1")
# Download and install the packages recorded for that environment if needed.
Pkg.instantiate()

[32m[1mActivating[22m[39m environment at `~/projects/julia-performance/envs/lecture2-1/Project.toml`
[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[?25l[2K[?25h

# Is Julia fast?
*Enough talk -- let's code*

$$
\mathrm{sum}(a) = \sum_{i=1}^{n} a_i
$$


- This material began life as a wonderful [lecture by Steven Johnson at MIT](https://github.com/stevengj/18S096-iap17/blob/master/lecture1/Boxes-and-registers.ipynb).
- With apologies to the numerical computing folks, this is not the algorithm you should use!


In [2]:
# Use a large input (10 million random Float64 values) so that per-call overhead
# is negligible compared with the time spent summing.
data = rand(10_000_000);

In [3]:
# C source for a straightforward summation loop over `n` doubles, held in a Julia
# string so that it can be handed to a C compiler.
C_code =  """
#include <stddef.h>
double c_sum(size_t n, double *X) {
    double s = 0.0;
    for (size_t i = 0; i < n; ++i) {
        s += X[i];
    }
    return s;
}
""";

In [4]:
using Libdl
# `tempname()` only returns a fresh temporary path (no file is created yet); it is
# used as the base name of the shared library, without the platform extension.
const Clib = tempname()   # temporary path for the library

# compile to a shared library by piping C_code to gcc
# (works only if you have gcc installed):
#   -fPIC -shared : build a position-independent shared library
#   -O3           : enable optimizations
#   -xc           : treat the input as C (stdin has no file extension to infer from)
#   -o <path>     : write the library to Clib plus the platform extension Libdl.dlext
#   -             : read the source from standard input

open(`gcc -fPIC -O3 -msse3 -xc -shared -o $(Clib * "." * Libdl.dlext) -`, "w") do f
    # Write the C source to the compiler's standard input.
    print(f, C_code) 
end

In [5]:
"""
    c_sum(X::Array{Float64})

Return the sum of the elements of `X`, computed by the C function `c_sum` in the
shared library referred to by `Clib`.

The element count is passed as a `size_t` and the array itself as a reference to
its `Float64` data.
"""
function c_sum(X::Array{Float64})
    n = length(X)
    return ccall(("c_sum", Clib), Cdouble, (Csize_t, Ref{Cdouble}), n, X)
end

c_sum (generic function with 1 method)

In [6]:
using PyCall

# Convert `data` to Python objects once, up front, so that the conversion cost is
# not part of what gets timed later:
# first as a Python list, then as a NumPy array.
apy_list = PyCall.array2py(data)
apy_numpy = PyObject(data)

# get the Python built-in "sum" function:
pysum = pybuiltin("sum");
# get the Numpy "sum" function:
numpy_sum = pyimport("numpy").sum;

In [7]:
# Check how the Python built-in is represented on the Julia side.
typeof(pysum)

PyObject

In [8]:
# Define a hand-written summation loop in Python via PyCall's `py"..."` string macro.
py"""
def py_sum(a):
    s = 0.0
    for x in a:
        s = s + x
    return s
"""

# Fetch the Python function defined above so it can be called from Julia.
sum_py = py"py_sum";

In [9]:
"""
    mysum(data)

Return the sum of the elements of `data`, accumulated one element at a time in
iteration order.

The accumulator starts at `zero(eltype(data))`, so an empty collection yields
that zero value.
"""
function mysum(data)
    total = zero(eltype(data))
    for element in data
        total = total + element
    end
    return total
end

mysum (generic function with 1 method)

In [10]:
# Ask Julia which method of `sum` is selected for `data`.
@which sum(data)

In [11]:
# NOTE(review): this cell fails with `UndefVarError: @benchmark not defined` because
# BenchmarkTools is only loaded in the following cell; `g` and `x` are also not
# defined anywhere in the visible notebook -- confirm whether this cell should stay.
@benchmark g(x)

LoadError: UndefVarError: @benchmark not defined

In [12]:
using BenchmarkTools

# Collect one benchmark per implementation under a descriptive label.
# `$` interpolates the value into the benchmark expression so that the lookup of
# the global variable is not part of what is measured.
suite = BenchmarkGroup()
suite["Julia handwritten"]       = @benchmarkable mysum($data)
suite["Julia builtin"]           = @benchmarkable sum($data)
suite["Simple C function"]       = @benchmarkable c_sum($data)
suite["Python builtin (list)"]   = @benchmarkable $pysum($apy_list)
suite["Python builtin (numpy)"]  = @benchmarkable $numpy_sum($apy_numpy)
suite["Python handwritten"]      = @benchmarkable $sum_py($apy_list)

# If a cache of tuned parameters already exists, use it, otherwise, tune and cache
# the benchmark parameters. Reusing cached parameters is faster and more reliable
# than re-tuning `suite` every time the file is included.
paramspath = joinpath(@__DIR__, "sum_bench.json")

if isfile(paramspath)
    # Only the `:evals` parameter is restored from the cached file.
    loadparams!(suite, BenchmarkTools.load(paramspath)[1], :evals);
else
    tune!(suite)
    BenchmarkTools.save(paramspath, params(suite));
end

6-element BenchmarkTools.BenchmarkGroup:
  tags: []
  "Julia builtin" => Benchmark(evals=1, seconds=5.0, samples=10000)
  "Julia handwritten" => Benchmark(evals=1, seconds=5.0, samples=10000)
  ⋮

In [13]:
# Run every benchmark in the suite and keep the resulting trials.
results = run(suite)

6-element BenchmarkTools.BenchmarkGroup:
  tags: []
  "Julia builtin" => Trial(4.111 ms)
  "Julia handwritten" => Trial(11.346 ms)
  ⋮

In [14]:
# Print one line per benchmark, fastest first. BenchmarkTools reports times in
# nanoseconds, so dividing by 1e6 gives milliseconds.
for (label, trial) in sort(collect(results); by = entry -> time(last(entry)))
    ms = round(time(trial) / 1e6; digits = 2)
    println(rpad(label, 25, "."), lpad(string(ms, " ms"), 20, "."))
end

Julia builtin.........................4.11 ms
Python builtin (numpy)................4.19 ms
Simple C function.....................11.3 ms
Julia handwritten....................11.35 ms
Python builtin (list).................42.7 ms
Python handwritten..................221.39 ms


# In conclusion:
The point is not that Julia has the fastest sum; the point is that a simple Julia implementation (that looks and feels like Python) is as fast as simple C, and that well-optimised Julia code can be even faster.

We can improve performance by giving the compiler more information with `@inbounds` and `@simd`.

# Aside: Which method

In [15]:
# Starting point: which method of `sum` handles `data`?
@which sum(data)

In [16]:
# Same question for the equivalent `mapreduce` call (identity map, `+` reduction).
@which mapreduce(identity, +, data)

In [17]:
# Look up the internal `Base._mapreduce` method, passing the index style of `data`.
@which Base._mapreduce(identity, Base.add_sum, IndexStyle(data), data)

In [18]:
# Look up the internal `Base.mapreduce_impl` method for the full linear index range
# of `data` (first to last index).
inds = LinearIndices(data)
@which Base.mapreduce_impl(identity, Base.add_sum, data, first(inds), last(inds))

In [19]:
# Print the block size returned by `Base.pairwise_blocksize` for this reduction, then
# look up the `mapreduce_impl` method that takes that block size as an extra argument.
@show blksize = Base.pairwise_blocksize(identity, Base.add_sum)
@which Base.mapreduce_impl(identity, Base.add_sum, data, first(inds), last(inds),blksize)

blksize = Base.pairwise_blocksize(identity, Base.add_sum) = 1024
