## First run: Julia v0.6.2

In [24]:
using ForwardDiff, BenchmarkTools, StaticArrays
import Base.zero, Base.+, Base.-, Base.*, Base./

# Field-less tag type carrying two type parameters, N and P.
# Instances are made callable in this file: the Sindx{3,7} method returns the tuple of
# seven 3-component integer offsets used by the 3-D stencil.
struct Sindx{N,P} end
# Coefficients of one stencil entry: `value` holds P numbers of type T.
# N is not used by any field; the iteration methods use N and P together to select the
# Sindx{N,P} offset table and pair value[k] with the k-th offset.
struct StencilPoint{T,N,P}
        value::NTuple{P,T}
end
# Container for a whole stencil. T, N and P appear only as type parameters (used for
# dispatch); A is the concrete type of the storage field `v`.
struct Stencil{T,N,P,A}
    v::A #Array{StencilPoint{T,N,P},N}
end
# AbstractArray{T,N} wrapper around a backing array `A` of type S.
# The Grid{T,3,7} methods in this file shift every index by +1 and report each dimension
# as size(A, d) - 2, i.e. `A` holds one extra layer of cells on every side.
struct Grid{T,N,P,S<:AbstractArray}<:AbstractArray{T,N}
    A::S
end

# Outer constructor: take T, N, P and the storage type from the array of stencil points.
function Stencil(x::Array{StencilPoint{T,N,P},N}) where {T,N,P}
    return Stencil{T,N,P,typeof(x)}(x)
end

# Offsets of the 3-D 7-point stencil, in coefficient order:
# -i, -j, -k, centre, +k, +j, +i.
function (::Sindx{3,7})()
    return (
        (-1, 0, 0), (0, -1, 0), (0, 0, -1),
        (0, 0, 0),
        (0, 0, 1), (0, 1, 0), (1, 0, 0)
    )
end

# Julia 0.6 iteration protocol (start/next/done) for a StencilPoint.
# Each step yields an (offset, coefficient) pair; offsets come from Sindx{N,P}.
function Base.start(S::StencilPoint)
    return 1
end
@inbounds function Base.next(S::StencilPoint{T,N,P}, state) where {T,N,P}
    offsets = Sindx{N,P}()()
    item = (offsets[state], S.value[state])
    return (item, state + 1)
end
function Base.done(S::StencilPoint{T,N,P}, state) where {T,N,P}
    return state > P
end

# Element type produced by iterating a StencilPoint. Iteration yields
# (offset, coefficient) pairs rather than bare coefficients, so report the pair type:
# an N-component integer offset together with a value of type T. Reporting plain T
# would mislead generic code that sizes or types containers from eltype.
Base.eltype(::Type{StencilPoint{T,N,P}}) where {T,N,P} = Tuple{NTuple{N,Int},T}
# Indexing and size for the 3-D 7-point Grid and for Stencil.
# Grid indices are shifted by one into the backing array `A`, and size reports each
# dimension of `A` minus two, so the outermost layer of `A` is never addressed by 1:size.
function Base.getindex(g::Grid{T,3,7}, i, j, k) where {T}
    return g.A[i+1, j+1, k+1]
end
function Base.getindex(S::Stencil{T,3}, i, j, k) where {T}
    return S.v[i, j, k]
end
function Base.setindex!(g::Grid{T,3,7}, a, i, j, k) where {T}
    return setindex!(g.A, a, i+1, j+1, k+1)
end
# A Grid of the same type whose backing array is zero(x.A).
function zero(x::Grid{T,N,P}) where {T,N,P}
    backing = zero(x.A)
    return Grid{T,N,P,typeof(x.A)}(backing)
end
function Base.size(g::Grid{T,3,7}) where {T}
    return (size(g.A, 1) - 2, size(g.A, 2) - 2, size(g.A, 3) - 2)
end
function Base.size(S::Stencil)
    return size(S.v)
end

# Wrap a 3-D array in a Grid{T,3,P}: allocate a zero array two cells larger along every
# dimension, copy `x` into its interior, and leave a one-cell border of zeros.
function makegrid(x::Array{T,3},P) where {T}
    padded = zeros(eltype(x), size(x, 1) + 2, size(x, 2) + 2, size(x, 3) + 2)
    padded[2:end-1, 2:end-1, 2:end-1] = x
    return Grid{T,3,P,Array{T,3}}(padded)
end

## A*b version
# Accumulate the 7-point stencil applied to x into y:
#   y[i,j,k] += sum over stencil entries of coefficient * x[i+di, j+dj, k+dk].
# Returns y.
function A_mul_B!(S::Stencil{TS,3,7}, x::Grid{Txy,3,7}, y::Grid{Txy,3,7}) where {Txy,TS}
    n1 = size(S.v, 1)
    n2 = size(S.v, 2)
    n3 = size(S.v, 3)
    for k in 1:n3, j in 1:n2, i in 1:n1
        acc = zero(Txy)
        # Iterating a StencilPoint yields (offset, coefficient) pairs.
        # Author's timing note: @inbounds with S[i,j,k] took 0.8 ms, the iterator 0.3 ms.
        @inbounds for (offset, coeff) in S.v[i,j,k]
            di, dj, dk = offset
            acc += coeff*x[i+di, j+dj, k+dk] # with inbounds - 0.3 ms
        end
        y[i,j,k] += acc
    end
    return y
end
# Assemble the 7-point stencil as an explicit sparse matrix and flatten G to a vector
# (used in this notebook as the `SA*GA` baseline for the stencil kernels).
# Cell (i,j,k) maps to row nd = (i-1)*ny*nz + (j-1)*nz + k, i.e. k varies fastest.
# Returns the tuple (SS, GG).
function fullm(S::Stencil{TS,3,7}, G) where {TS}
    nx, ny, nz = size(S)
    ncell = nx*ny*nz
    SS = spzeros(ncell, ncell)
    GG = zeros(ncell)
    for i in 1:nx, j in 1:ny, k in 1:nz
        nd = (i-1)*ny*nz+(j-1)*nz+k
        Sv = S.v[i,j,k]
        SS[nd,nd] += Sv.value[4]
        GG[nd] += G[i,j,k]
        # Off-diagonal entries: skip neighbours that fall outside the grid. Each guard
        # tests the extent of its own axis (i vs nx, j vs ny, k vs nz); testing j
        # against nx or k against ny is only correct when the grid is cubic.
        if i!=1  SS[nd,nd-ny*nz] = Sv.value[1] end
        if i!=nx SS[nd,nd+ny*nz] = Sv.value[7] end
        if j!=1  SS[nd,nd-nz] = Sv.value[2] end
        if j!=ny SS[nd,nd+nz] = Sv.value[6] end
        if k!=1  SS[nd,nd-1]  = Sv.value[3] end
        if k!=nz SS[nd,nd+1]  = Sv.value[5] end
    end
    return SS, GG
end

fullm (generic function with 1 method)

In [17]:
# Build a random N^3 7-point stencil, a random grid, and the equivalent sparse system.
N = 50
Sarray = Array{StencilPoint{Float64,3,7},3}(N, N, N);
for i in 1:N, j in 1:N, k in 1:N
    # Seven normal samples, each multiplied by a draw from {0, r} (r is one fresh normal
    # sample), so about half of the coefficients are exactly zero.
    spt = randn(7).*rand([zeros(1);randn()],7)
    # Zero the coefficients whose neighbour would lie outside the grid.
    i==1 && (spt[1] = 0.0)
    j==1 && (spt[2] = 0.0)
    k==1 && (spt[3] = 0.0)
    k==N && (spt[5] = 0.0)
    j==N && (spt[6] = 0.0)
    i==N && (spt[7] = 0.0)
    Sarray[i,j,k] = StencilPoint{Float64,3,7}(Tuple(spt))
end
S1 = Stencil(Sarray);
G1 = makegrid(randn(N,N,N),7)
SA, GA = fullm(S1, G1);
# Stored entries of SA relative to the number of in-grid stencil entries.
length(SA.nzval)/(N*N*N*7-6*N*N)

0.49979302325581393

In [18]:
# Stencil product for a P-point stencil: y[i,j,k] += sum over the P entries of
# coefficient * x at the offset neighbour. The offset table Sindx{3,P} is looked up
# inside the innermost loop. Returns y.
function A_mul_B2!(S::Stencil{TS,3,P}, x::Grid{Txy,3,P}, y::Grid{Txy,3,P}) where {Txy,TS,P}
    @inbounds for k in 1:size(S.v,3), j in 1:size(S.v,2), i in 1:size(S.v,1)
        coeffs = S[i,j,k].value
        acc = zero(Txy)
        for p in 1:P
            di, dj, dk = Sindx{3,P}()()[p]
            acc += coeffs[p]*x[i+di, j+dj, k+dk]
        end
        y[i,j,k] += acc
    end
    return y
end
# Interpolate the non-const globals with `$` so the timing does not include untyped
# global access. The `zero(G1)` allocation is still part of the measured call.
@btime A_mul_B2!($S1, $G1, zero($G1));

  1.559 ms (3 allocations: 1.07 MiB)


In [28]:
# Stencil product for a P-point stencil with the offset table hoisted out of the loops:
# y[i,j,k] += sum over the P entries of coefficient * x at the offset neighbour.
# Returns y.
function A_mul_B2!(S::Stencil{TS,3,P}, x::Grid{Txy,3,P}, y::Grid{Txy,3,P}) where {Txy,TS,P}
    Sid = Sindx{3,P}()()  # tuple of P offsets, fetched once
    @inbounds for k in 1:size(S.v,3), j in 1:size(S.v,2), i in 1:size(S.v,1)
        tmp = zero(Txy)
        Sijk = S.v[i,j,k].value # with inbounds -0.3ms
        for c in 1:P
            tmp += Sijk[c]*getindex(x, i+Sid[c][1], j+Sid[c][2], k+Sid[c][3])
        end
        y[i,j,k] += tmp
    end
    # Return the accumulated output (the previous `ysS` was an undefined name).
    return y
end
# Interpolate the non-const globals with `$` so the timing does not include untyped
# global access. The `zero(G1)` allocation is still part of the measured call.
@btime A_mul_B2!($S1, $G1, zero($G1));

  1.859 ms (3 allocations: 1.07 MiB)


In [25]:
# Interpolate the non-const globals with `$` so the timing does not include untyped
# global access. The `zero(G1)` allocation is still part of the measured call.
@btime A_mul_B!($S1, $G1, zero($G1));

  1.073 ms (3 allocations: 1.07 MiB)


In [15]:
# Sparse matrix-vector baseline; globals are interpolated with `$` for the benchmark.
@benchmark $SA*$GA

BenchmarkTools.Trial: 
  memory estimate:  1.65 MiB
  allocs estimate:  2
  --------------
  minimum time:     2.587 ms (0.00% GC)
  median time:      2.883 ms (0.00% GC)
  mean time:        3.011 ms (4.29% GC)
  maximum time:     6.234 ms (30.83% GC)
  --------------
  samples:          1652
  evals/sample:     1

In [16]:
# Interpolate the non-const globals with `$`; the `zero(G1)` allocation is still timed.
@benchmark A_mul_B2!($S1, $G1, zero($G1))

BenchmarkTools.Trial: 
  memory estimate:  1.82 MiB
  allocs estimate:  3
  --------------
  minimum time:     2.635 ms (0.00% GC)
  median time:      3.100 ms (0.00% GC)
  mean time:        3.242 ms (4.60% GC)
  maximum time:     6.507 ms (30.56% GC)
  --------------
  samples:          1535
  evals/sample:     1

# Second run: Julia v1.0.0

In [11]:
using ForwardDiff, BenchmarkTools, StaticArrays, SparseArrays, LinearAlgebra

import Base.zero
import Base.+
import Base.-
import Base.*
import Base./

# Field-less tag type carrying two type parameters, N and P.
# Instances are made callable in this file: the Sindx{3,7} method returns the tuple of
# seven 3-component integer offsets used by the 3-D stencil.
struct Sindx{N,P} end
# Coefficients of one stencil entry: `value` holds P numbers of type T.
# N is not used by any field; `iterate` uses N and P together to select the Sindx{N,P}
# offset table and pair value[k] with the k-th offset.
struct StencilPoint{T,N,P}
        value::NTuple{P,T}
end
# Container for a whole stencil. T, N and P appear only as type parameters (used for
# dispatch); A is the concrete type of the storage field `v`.
struct Stencil{T,N,P,A}
    v::A #Array{StencilPoint{T,N,P},N}
end
# AbstractArray{T,N} wrapper around a backing array `A` of type S.
# The Grid{T,3,7} methods in this file shift every index by +1 and report each dimension
# as size(A, d) - 2, i.e. `A` holds one extra layer of cells on every side.
struct Grid{T,N,P,S<:AbstractArray}<:AbstractArray{T,N}
    A::S
end

# Outer constructor: take T, N, P and the storage type from the array of stencil points.
function Stencil(x::Array{StencilPoint{T,N,P},N}) where {T,N,P}
    return Stencil{T,N,P,typeof(x)}(x)
end

# Offsets of the 3-D 7-point stencil, in coefficient order:
# -i, -j, -k, centre, +k, +j, +i.
function (::Sindx{3,7})()
    return (
        (-1, 0, 0), (0, -1, 0), (0, 0, -1),
        (0, 0, 0),
        (0, 0, 1), (0, 1, 0), (1, 0, 0)
    )
end

# Iteration protocol: yields (offset, coefficient) pairs, pairing the k-th Sindx{N,P}
# offset with the k-th stored value, and stops once `state` exceeds P.
function Base.iterate(S::StencilPoint{T,N,P}, state = 1) where {T,N,P}
    if state > P
        return nothing
    end
    offset = Sindx{N,P}()()[state]
    return ((offset, S.value[state]), state + 1)
end


# Element type produced by iterating a StencilPoint. `iterate` yields
# (offset, coefficient) pairs rather than bare coefficients, so report the pair type:
# an N-component integer offset together with a value of type T. Reporting plain T
# would mislead generic code that sizes or types containers from eltype.
Base.eltype(::Type{StencilPoint{T,N,P}}) where {T,N,P} = Tuple{NTuple{N,Int},T}
# Indexing and size for the 3-D 7-point Grid and for Stencil.
# Grid indices are shifted by one into the backing array `A`, and size reports each
# dimension of `A` minus two, so the outermost layer of `A` is never addressed by 1:size.
function Base.getindex(g::Grid{T,3,7}, i, j, k) where {T}
    return g.A[i+1, j+1, k+1]
end
function Base.getindex(S::Stencil{T,3}, i, j, k) where {T}
    return S.v[i, j, k]
end
function Base.setindex!(g::Grid{T,3,7}, a, i, j, k) where {T}
    return setindex!(g.A, a, i+1, j+1, k+1)
end
# A Grid of the same type whose backing array is zero(x.A).
function zero(x::Grid{T,N,P}) where {T,N,P}
    backing = zero(x.A)
    return Grid{T,N,P,typeof(x.A)}(backing)
end
function Base.size(g::Grid{T,3,7}) where {T}
    return (size(g.A, 1) - 2, size(g.A, 2) - 2, size(g.A, 3) - 2)
end
function Base.size(S::Stencil)
    return size(S.v)
end

# Wrap a 3-D array in a Grid{T,3,P}: allocate a zero array two cells larger along every
# dimension, copy `x` into its interior, and leave a one-cell border of zeros.
function makegrid(x::Array{T,3},P) where {T}
    padded = zeros(eltype(x), size(x, 1) + 2, size(x, 2) + 2, size(x, 3) + 2)
    padded[2:end-1, 2:end-1, 2:end-1] = x
    return Grid{T,3,P,Array{T,3}}(padded)
end

## A*b version
# Accumulate the 7-point stencil applied to x into y:
#   y[i,j,k] += sum over stencil entries of coefficient * x[i+di, j+dj, k+dk].
# Returns y.
function A_mul_B!(S::Stencil{TS,3,7}, x::Grid{Txy,3,7}, y::Grid{Txy,3,7}) where {Txy,TS}
    n1 = size(S.v, 1)
    n2 = size(S.v, 2)
    n3 = size(S.v, 3)
    for k in 1:n3, j in 1:n2, i in 1:n1
        acc = zero(Txy)
        # Iterating a StencilPoint yields (offset, coefficient) pairs.
        @inbounds for (offset, coeff) in S[i,j,k]
            di, dj, dk = offset
            acc += coeff*x[i+di, j+dj, k+dk]
        end
        y[i,j,k] += acc
    end
    return y
end
# Assemble the 7-point stencil as an explicit sparse matrix and flatten G to a vector
# (used in this notebook as the `SA*GA` baseline for the stencil kernels).
# Cell (i,j,k) maps to row nd = (i-1)*ny*nz + (j-1)*nz + k, i.e. k varies fastest.
# Returns the tuple (SS, GG).
function fullm(S::Stencil{TS,3,7}, G) where {TS}
    nx, ny, nz = size(S)
    ncell = nx*ny*nz
    SS = spzeros(ncell, ncell)
    GG = zeros(ncell)
    for i in 1:nx, j in 1:ny, k in 1:nz
        nd = (i-1)*ny*nz+(j-1)*nz+k
        Sv = S.v[i,j,k]
        SS[nd,nd] += Sv.value[4]
        GG[nd] += G[i,j,k]
        # Off-diagonal entries: skip neighbours that fall outside the grid. Each guard
        # tests the extent of its own axis (i vs nx, j vs ny, k vs nz); testing j
        # against nx or k against ny is only correct when the grid is cubic.
        if i!=1  SS[nd,nd-ny*nz] = Sv.value[1] end
        if i!=nx SS[nd,nd+ny*nz] = Sv.value[7] end
        if j!=1  SS[nd,nd-nz] = Sv.value[2] end
        if j!=ny SS[nd,nd+nz] = Sv.value[6] end
        if k!=1  SS[nd,nd-1]  = Sv.value[3] end
        if k!=nz SS[nd,nd+1]  = Sv.value[5] end
    end
    return SS, GG
end

fullm (generic function with 1 method)

In [12]:
# Build a random N^3 7-point stencil, a random grid, and the equivalent sparse system.
N = 50
Sarray = Array{StencilPoint{Float64,3,7},3}(undef,N, N, N);
for i in 1:N, j in 1:N, k in 1:N
    # Seven normal samples, each multiplied by a draw from {0, r} (r is one fresh normal
    # sample), so about half of the coefficients are exactly zero.
    spt = randn(7).*rand([zeros(1);randn()],7)
    # Zero the coefficients whose neighbour would lie outside the grid.
    i==1 && (spt[1] = 0.0)
    j==1 && (spt[2] = 0.0)
    k==1 && (spt[3] = 0.0)
    k==N && (spt[5] = 0.0)
    j==N && (spt[6] = 0.0)
    i==N && (spt[7] = 0.0)
    Sarray[i,j,k] = StencilPoint{Float64,3,7}(Tuple(spt))
end
S1 = Stencil(Sarray);
G1 = makegrid(randn(N,N,N),7)
SA, GA = fullm(S1, G1);
# Stored entries of SA relative to the number of in-grid stencil entries.
length(SA.nzval)/(N*N*N*7-6*N*N)

0.4996697674418605

In [13]:
# Interpolate the non-const globals with `$` so the timing does not include untyped
# global access. The `zero(G1)` allocation is still part of the measured call.
@btime A_mul_B!($S1, $G1, zero($G1));

  325.678 μs (3 allocations: 1.07 MiB)


In [6]:
# Sparse matrix-vector baseline; globals are interpolated with `$` for the benchmark.
@btime $SA*$GA;

  1.358 ms (2 allocations: 976.64 KiB)


# Compare kernels for a different iteration task: a Hadamard-product-like task

# For Julia 0.6

In [9]:
@which @inbounds(1)

In [7]:
## Task
for 

775.75

In [7]:
? @Propagate_inbounds

No documentation found.

Binding `@Propagate_inbounds` does not exist.


In [18]:
@propagate_inbounds_meta

LoadError: LoadError: UndefVarError: @propagate_inbounds_meta not defined
in expression starting at In[18]:1