## First Run with v0.6.2

In [1]:
#### Testing with 0.6.2 Kernel

using ForwardDiff, BenchmarkTools, StaticArrays

import Base.zero
import Base.+
import Base.-
import Base.*
import Base./

"""
    Sindx{N,P}

Empty tag type parameterised by `N` and `P`. Calling an instance returns the tuple of
index offsets registered for that `(N, P)` pair.
"""
struct Sindx{N,P} end

"""
    StencilPoint{T,N,P}

`P` values of type `T`, held as a tuple in the field `value`.
"""
struct StencilPoint{T,N,P}
    value::NTuple{P,T}
end

"""
    Stencil{T,N,P,A}

Wrapper around a container `v` of type `A`. The outer constructor `Stencil(x)` fills in
the type parameters from an `Array{StencilPoint{T,N,P},N}`.
"""
struct Stencil{T,N,P,A}
    v::A  # an Array{StencilPoint{T,N,P},N} when built through the outer constructor
end

"""
    Grid{T,N,P,S<:AbstractArray} <: AbstractArray{T,N}

Array type that wraps a backing array `A` of type `S`.
"""
struct Grid{T,N,P,S<:AbstractArray} <: AbstractArray{T,N}
    A::S
end

# Convenience constructor: infer T, N, P and the container type from the argument.
function Stencil(x::Array{StencilPoint{T,N,P},N}) where {T,N,P}
    return Stencil{T,N,P,typeof(x)}(x)
end

# Offsets of the 3-D, 7-entry stencil: the centre plus one step along each axis direction.
function (::Sindx{3,7})()
    return (
        (-1, 0, 0),
        (0, -1, 0),
        (0, 0, -1),
        (0, 0, 0),
        (0, 0, 1),
        (0, 1, 0),
        (1, 0, 0),
    )
end

# Pre-1.0 iteration protocol for StencilPoint. The state is an index into `value`; each
# step yields the pair (offset taken from Sindx{N,P}, value[state]).
Base.start(::StencilPoint) = 1

# NOTE(review): `@inbounds` wraps the method definition here, not the indexing calls in
# the body -- confirm it has the intended effect.
@inbounds function Base.next(S::StencilPoint{T,N,P}, state) where {T,N,P}
    offset = Sindx{N,P}()()[state]
    coeff = S.value[state]
    return ((offset, coeff), state + 1)
end

# Iteration is finished once the state has moved past the P-th entry.
function Base.done(::StencilPoint{T,N,P}, state) where {T,N,P}
    return state > P
end

# Iterating a StencilPoint yields `(offset, value)` pairs (an N-tuple of Int offsets and a
# coefficient of type T), so that pair type -- not T -- is what `eltype` has to report.
Base.eltype(::Type{StencilPoint{T,N,P}}) where {T,N,P} = Tuple{NTuple{N,Int},T}
# A 3-D Grid keeps one extra layer of cells around its data in `g.A` (makegrid pads every
# side by one cell for any P), so logical index (i, j, k) lives at A[i+1, j+1, k+1] and
# indices 0 and n+1 address the padding. The accessors are therefore defined for every P
# rather than only P == 7, matching what makegrid can build.
Base.getindex(g::Grid{T,3}, i, j, k) where {T} = g.A[i+1, j+1, k+1]
# Stencil indexing forwards straight to the wrapped container.
Base.getindex(S::Stencil{T,3}, i, j, k) where {T} = S.v[i, j, k]
Base.setindex!(g::Grid{T,3}, a, i, j, k) where {T} = setindex!(g.A, a, i+1, j+1, k+1)
# Same wrapper type around a zeroed copy of the padded storage.
zero(x::Grid{T,N,P}) where {T,N,P} = Grid{T,N,P,typeof(x.A)}(zero(x.A))
# Logical size excludes the padding layer on each side.
Base.size(g::Grid{T,3}) where {T} = (size(g.A, 1)-2, size(g.A, 2)-2, size(g.A, 3)-2)
Base.size(S::Stencil) = size(S.v)

# Wrap `x` in a Grid{T,3,P}: the backing array is two cells longer than `x` along every
# dimension, zero-filled, with `x` copied into its interior.
function makegrid(x::Array{T,3}, P) where {T}
    nx, ny, nz = size(x)
    padded = zeros(T, nx + 2, ny + 2, nz + 2)
    padded[2:nx+1, 2:ny+1, 2:nz+1] = x
    return Grid{T,3,P,Array{T,3}}(padded)
end

## A*b version
# Apply the 7-entry stencil `S` to `x` and accumulate the result into `y`:
# y[i,j,k] += sum over the stencil entries of coefficient * x[(i,j,k) + offset].
# Returns `y`.
function A_mul_B!(S::Stencil{TS,3,7}, x::Grid{Txy,3,7}, y::Grid{Txy,3,7}) where {Txy,TS}
    for k in 1:size(S.v, 3)
        for j in 1:size(S.v, 2)
            for i in 1:size(S.v, 1)
                acc = zero(Txy)
                # Each iteration step of a stencil point gives (offset, coefficient).
                @inbounds for (offset, coeff) in S[i,j,k]
                    di, dj, dk = offset
                    acc += coeff*x[i+di, j+dj, k+dk]
                end
                y[i,j,k] += acc
            end
        end
    end
    return y
end
"""
    fullm(S::Stencil{TS,3,7}, G) -> (SS, GG)

Assemble the stencil `S` into an explicit sparse matrix `SS` and flatten `G` into the
matching dense vector `GG`.

Point `(i, j, k)` maps to row `(i-1)*ny*nz + (j-1)*nz + k`, so `k` varies fastest.
Coefficients follow the order of `Sindx{3,7}`: entry 4 is the diagonal, entries 1/7, 2/6
and 3/5 couple to the previous/next point along `i`, `j` and `k`. A coupling that would
leave the grid is skipped.
"""
function fullm(S::Stencil{TS,3,7}, G) where {TS}
    nx, ny, nz = size(S)
    SS = spzeros(nx*ny*nz, nx*ny*nz)
    GG = zeros(nx*ny*nz)
    for i in 1:nx, j in 1:ny, k in 1:nz
        nd = (i-1)*ny*nz+(j-1)*nz+k
        Sv = S.v[i,j,k]
        SS[nd,nd] += Sv.value[4]
        GG[nd] += G[i,j,k]
        # Every direction is tested against its own extent (i/nx, j/ny, k/nz); mixing
        # them up only works when the grid is a cube.
        if i!=1  SS[nd,nd-ny*nz] = Sv.value[1] end
        if i!=nx SS[nd,nd+ny*nz] = Sv.value[7] end
        if j!=1  SS[nd,nd-nz] = Sv.value[2] end
        if j!=ny SS[nd,nd+nz] = Sv.value[6] end
        if k!=1  SS[nd,nd-1]  = Sv.value[3] end
        if k!=nz SS[nd,nd+1]  = Sv.value[5] end
    end
    return SS, GG
end

fullm (generic function with 1 method)

In [2]:
# Build a random N x N x N 7-entry stencil, a random grid, and their explicit
# sparse-matrix / vector counterparts.
N = 50
Sarray = Array{StencilPoint{Float64,3,7},3}(N, N, N);
for i in 1:N, j in 1:N, k in 1:N
    base = randn(7)
    # Each entry is scaled by a draw from {0.0, r}, r being one random number per point.
    scale = rand([zeros(1); randn()], 7)
    coeffs = base .* scale
    # Drop the couplings that would point outside the grid.
    i == 1 && (coeffs[1] = 0.0)
    j == 1 && (coeffs[2] = 0.0)
    k == 1 && (coeffs[3] = 0.0)
    k == N && (coeffs[5] = 0.0)
    j == N && (coeffs[6] = 0.0)
    i == N && (coeffs[7] = 0.0)
    Sarray[i,j,k] = StencilPoint{Float64,3,7}(Tuple(coeffs))
end
S1 = Stencil(Sarray);
G1 = makegrid(randn(N,N,N), 7)
SA, GA = fullm(S1, G1);
# Stored entries relative to the number of in-grid stencil couplings.
length(SA.nzval) / (7*N^3 - 6*N^2)

0.49938255813953486

In [3]:
# `$` splices the current values of the globals into the benchmark, so the timing does
# not include the cost of looking up untyped global variables.
@btime A_mul_B!($S1, $G1, zero($G1));

In [5]:
zG = zero(G1);

In [7]:
@code_warntype A_mul_B!(S1, G1, zG)

Variables:
  #self# <optimized out>
  S::Stencil{Float64,3,7,Array{StencilPoint{Float64,3,7},3}}
  x::Grid{Float64,3,7,Array{Float64,3}}
  y::Grid{Float64,3,7,Array{Float64,3}}
  idx::Tuple{Int64,Int64,Int64}
  value::Float64
  #temp#@_7 <optimized out>
  ix::Int64
  jy::Int64
  kz::Int64
  #temp#@_11 <optimized out>
  #temp#@_12::Int64
  i::Int64
  tmp::Float64
  #temp#@_15::Int64
  j::Int64
  #temp#@_17::Int64
  k::Int64
  #temp#@_19::Int64

Body:
  begin 
      SSAValue(15) = (Base.arraysize)((Core.getfield)(S::Stencil{Float64,3,7,Array{StencilPoint{Float64,3,7},3}}, :v)::Array{StencilPoint{Float64,3,7},3}, 3)::Int64
      SSAValue(18) = (Base.select_value)((Base.sle_int)(1, SSAValue(15))::Bool, SSAValue(15), (Base.sub_int)(1, 1)::Int64)::Int64
      #temp#@_19::Int64 = 1
      4: 
      unless (Base.not_int)((#temp#@_19::Int64 === (Base.add_int)(SSAValue(18), 1)::Int64)::Bool)::Bool goto 71
      SSAValue(19) = #temp#@_19::Int64
      SSAValue(20) = (Base.add_int)(#temp#@_19::Int64

In [4]:
# Interpolate the globals with `$` so only the sparse matrix-vector product is timed.
@btime $SA*$GA;

  1.050 ms (3 allocations: 1.07 MiB)


In [11]:
using Plots



In [12]:
Plots.spy(Array(SA[1:100, 1:100]))

LoadError: UndefVarError: findnz not defined

# Then run with v1.0.0 

In [2]:
using ForwardDiff, BenchmarkTools, StaticArrays, SparseArrays, LinearAlgebra
import Base.zero, Base.+, Base.-, Base.*, Base./, Base.@propagate_inbounds

"""
    Sindx{N,P}

Empty tag type parameterised by `N` and `P`. Calling an instance returns the tuple of
index offsets registered for that `(N, P)` pair.
"""
struct Sindx{N,P} end

"""
    StencilPoint{T,N,P}

`P` values of type `T`, held as a tuple in the field `value`.
"""
struct StencilPoint{T,N,P}
    value::NTuple{P,T}
end

"""
    Stencil{T,N,P,A}

Wrapper around a container `v` of type `A`. The outer constructor `Stencil(x)` fills in
the type parameters from an `Array{StencilPoint{T,N,P},N}`.
"""
struct Stencil{T,N,P,A}
    v::A  # an Array{StencilPoint{T,N,P},N} when built through the outer constructor
end

"""
    Grid{T,N,P,S<:AbstractArray} <: AbstractArray{T,N}

Array type that wraps a backing array `A` of type `S`.
"""
struct Grid{T,N,P,S<:AbstractArray} <: AbstractArray{T,N}
    A::S
end

# Convenience constructor: infer T, N, P and the container type from the argument.
function Stencil(x::Array{StencilPoint{T,N,P},N}) where {T,N,P}
    return Stencil{T,N,P,typeof(x)}(x)
end

# Offsets of the 3-D, 7-entry stencil: the centre plus one step along each axis direction.
function (::Sindx{3,7})()
    return (
        (-1, 0, 0),
        (0, -1, 0),
        (0, 0, -1),
        (0, 0, 0),
        (0, 0, 1),
        (0, 1, 0),
        (1, 0, 0),
    )
end

# Iteration protocol for StencilPoint. The state is an index into `value`; each step
# yields the pair (offset taken from Sindx{N,P}, value[state]) and the next index.
@propagate_inbounds function Base.iterate(S::StencilPoint{T,N,P}, state::Int=1) where {T,N,P}
    # `>` rather than `== P + 1`: any state past the last entry must end the iteration
    # instead of falling through to an out-of-range tuple access.
    state > P && return nothing
    return (Sindx{N,P}()()[state], S.value[state]), state + 1
end

# Iterating a StencilPoint yields `(offset, value)` pairs (an N-tuple of Int offsets and a
# coefficient of type T), so that pair type -- not T -- is what `eltype` has to report.
Base.eltype(::Type{StencilPoint{T,N,P}}) where {T,N,P} = Tuple{NTuple{N,Int},T}
# A 3-D Grid keeps one extra layer of cells around its data in `g.A` (makegrid pads every
# side by one cell for any P), so logical index (i, j, k) lives at A[i+1, j+1, k+1] and
# indices 0 and n+1 address the padding. The accessors are therefore defined for every P
# rather than only P == 7, matching what makegrid can build.
@propagate_inbounds Base.getindex(g::Grid{T,3}, i, j, k) where {T} = g.A[i+1, j+1, k+1]
# Stencil indexing forwards straight to the wrapped container.
@propagate_inbounds Base.getindex(S::Stencil{T,3}, i, j, k) where {T} = S.v[i, j, k]
@propagate_inbounds Base.setindex!(g::Grid{T,3}, a, i, j, k) where {T} = setindex!(g.A, a, i+1, j+1, k+1)
# Same wrapper type around a zeroed copy of the padded storage.
zero(x::Grid{T,N,P}) where {T,N,P} = Grid{T,N,P,typeof(x.A)}(zero(x.A))
# Logical size excludes the padding layer on each side.
Base.size(g::Grid{T,3}) where {T} = (size(g.A, 1)-2, size(g.A, 2)-2, size(g.A, 3)-2)
Base.size(S::Stencil) = size(S.v)

# Wrap `x` in a Grid{T,3,P}: the backing array is two cells longer than `x` along every
# dimension, zero-filled, with `x` copied into its interior.
function makegrid(x::Array{T,3}, P) where {T}
    nx, ny, nz = size(x)
    padded = zeros(T, nx + 2, ny + 2, nz + 2)
    padded[2:nx+1, 2:ny+1, 2:nz+1] = x
    return Grid{T,3,P,Array{T,3}}(padded)
end

## A*b version
# Apply the stencil `S` to `x` by iterating each stencil point's (offset, coefficient)
# pairs, and accumulate into `y`: y[i,j,k] += sum of coefficient * x[(i,j,k) + offset].
# Returns `y`. Bounds checks are disabled for the whole sweep.
function A_mul_B!(S::Stencil{TS,3,P}, x::Grid{Txy,3,P}, y::Grid{Txy,3,P}) where {Txy,TS,P}
    @inbounds for k in axes(S.v, 3)
        for j in axes(S.v, 2)
            for i in axes(S.v, 1)
                acc = zero(Txy)
                for ((di, dj, dk), coeff) in S.v[i, j, k]
                    acc += coeff * x[i + di, j + dj, k + dk]
                end
                y[i, j, k] += acc
            end
        end
    end
    return y
end
# Same product as A_mul_B!, but the offsets are fetched once from Sindx{3,P} and the
# coefficients are read by position instead of going through the iteration protocol.
# Accumulates into `y` and returns it.
function A_mul_B2!(S::Stencil{TS,3,P}, x::Grid{Txy,3,P}, y::Grid{Txy,3,P}) where {Txy,TS,P}
    offsets = Sindx{3,P}()()
    @inbounds for k in axes(S.v, 3)
        for j in axes(S.v, 2)
            for i in axes(S.v, 1)
                coeffs = S[i, j, k].value
                acc = zero(Txy)
                for c in 1:P
                    di, dj, dk = offsets[c]
                    acc += coeffs[c] * x[i + di, j + dj, k + dk]
                end
                y[i, j, k] += acc
            end
        end
    end
    return y
end
"""
    fullm(S::Stencil{TS,3,P}, G) -> (SS, GG)

Assemble the stencil `S` into an explicit sparse matrix `SS` and flatten `G` into the
matching dense vector `GG`.

Point `(i, j, k)` maps to row `(i-1)*ny*nz + (j-1)*nz + k`, so `k` varies fastest. Only
the first seven coefficients of each point are used, in the order of `Sindx{3,7}`: entry 4
is the diagonal, entries 1/7, 2/6 and 3/5 couple to the previous/next point along `i`,
`j` and `k`. A coupling that would leave the grid is skipped.
"""
function fullm(S::Stencil{TS,3,P}, G) where {TS,P}
    nx, ny, nz = size(S)
    SS = spzeros(nx*ny*nz, nx*ny*nz)
    GG = zeros(nx*ny*nz)
    for i in 1:nx, j in 1:ny, k in 1:nz
        nd = (i-1)*ny*nz+(j-1)*nz+k
        Sv = S.v[i,j,k]
        SS[nd,nd] += Sv.value[4]
        GG[nd] += G[i,j,k]
        # Every direction is tested against its own extent (i/nx, j/ny, k/nz); mixing
        # them up only works when the grid is a cube.
        if i!=1  SS[nd,nd-ny*nz] = Sv.value[1] end
        if i!=nx SS[nd,nd+ny*nz] = Sv.value[7] end
        if j!=1  SS[nd,nd-nz] = Sv.value[2] end
        if j!=ny SS[nd,nd+nz] = Sv.value[6] end
        if k!=1  SS[nd,nd-1]  = Sv.value[3] end
        if k!=nz SS[nd,nd+1]  = Sv.value[5] end
    end
    return SS, GG
end

fullm (generic function with 1 method)

In [9]:
# Build a random N x N x N 7-entry stencil, a random grid, and their explicit
# sparse-matrix / vector counterparts.
N = 50
Sarray = Array{StencilPoint{Float64,3,7},3}(undef, N, N, N);
for i in 1:N, j in 1:N, k in 1:N
    base = randn(7)
    # `[zeros(0); randn()]` has a single element, so all seven draws are that one number.
    scale = rand([zeros(0); randn()], 7)
    coeffs = base .* scale
    # Drop the couplings that would point outside the grid.
    for (onface, slot) in ((i == 1, 1), (j == 1, 2), (k == 1, 3),
                           (k == N, 5), (j == N, 6), (i == N, 7))
        onface && (coeffs[slot] = 0.0)
    end
    Sarray[i, j, k] = StencilPoint{Float64,3,7}(Tuple(coeffs))
end
S1 = Stencil(Sarray);
G1 = makegrid(randn(N, N, N), 7)
SA, GA = fullm(S1, G1);
# Stored entries relative to the number of in-grid stencil couplings.
length(SA.nzval) / (7 * N^3 - 6 * N^2)

1.0

In [6]:
# Eight-entry variant of the 3-D offsets: the seven offsets of Sindx{3,7} followed by an
# extra (0, 0, 0) entry.
function (::Sindx{3,8})()
    return (
        (-1, 0, 0),
        (0, -1, 0),
        (0, 0, -1),
        (0, 0, 0),
        (0, 0, 1),
        (0, 1, 0),
        (1, 0, 0),
        (0, 0, 0),
    )
end

# Padded indexing for grids tagged with P == 8: logical (i, j, k) is stored at
# A[i+1, j+1, k+1].
@propagate_inbounds function Base.getindex(g::Grid{T,3,8}, i, j, k) where {T}
    return g.A[i+1, j+1, k+1]
end

@propagate_inbounds function Base.setindex!(g::Grid{T,3,8}, a, i, j, k) where {T}
    return setindex!(g.A, a, i+1, j+1, k+1)
end
# Same random setup as the 7-entry case, but every stencil point carries an eighth
# coefficient fixed at 0.0.
N = 50
Sarray = Array{StencilPoint{Float64,3,8},3}(undef, N, N, N);
for i in 1:N, j in 1:N, k in 1:N
    base = randn(7)
    # Each entry is scaled by a draw from {0.0, r}, r being one random number per point.
    scale = rand([zeros(1); randn()], 7)
    coeffs = base .* scale
    # Drop the couplings that would point outside the grid.
    if i == 1
        coeffs[1] = 0.0
    end
    if j == 1
        coeffs[2] = 0.0
    end
    if k == 1
        coeffs[3] = 0.0
    end
    if k == N
        coeffs[5] = 0.0
    end
    if j == N
        coeffs[6] = 0.0
    end
    if i == N
        coeffs[7] = 0.0
    end
    Sarray[i, j, k] = StencilPoint{Float64,3,8}(Tuple([coeffs; 0.0]))
end
S1 = Stencil(Sarray);
G1 = makegrid(randn(N, N, N), 8)
SA, GA = fullm(S1, G1);
# Stored entries relative to the number of in-grid stencil couplings.
length(SA.nzval) / (7 * N^3 - 6 * N^2)

0.4996639534883721

In [7]:
# `$` splices the current values of the globals into the benchmark, so the timing does
# not include the cost of looking up untyped global variables.
@btime A_mul_B2!($S1, $G1, zero($G1));

  815.094 μs (3 allocations: 1.07 MiB)


In [3]:
import Base.@propagate_inbounds

In [17]:
# Stencil product with the offsets fetched once from Sindx{3,P} and coefficients read by
# position. Accumulates into `y` and returns it.
function A_mul_B2!(S::Stencil{TS,3,P}, x::Grid{Txy,3,P}, y::Grid{Txy,3,P}) where {Txy,TS,P}
    offsets = Sindx{3,P}()()
    @inbounds for k in axes(S.v, 3), j in axes(S.v, 2), i in axes(S.v, 1)
        # Author's note: having this lookup under @inbounds saved about 0.3 ms.
        coeffs = S[i, j, k].value
        acc = zero(Txy)
        for c in 1:P
            off = offsets[c]
            acc += coeffs[c] * x[i + off[1], j + off[2], k + off[3]]
        end
        y[i, j, k] += acc
    end
    return y
end
@btime A_mul_B2!($S1, $G1, zero($G1));

  751.277 μs (3 allocations: 1.07 MiB)


In [16]:
# Variant of the stencil product that bypasses Grid's getindex/setindex! and addresses
# the backing arrays `x.A` / `y.A` directly, adding the +1 padding shift by hand.
# Accumulates into `y` and returns it.
function A_mul_B2!(S::Stencil{TS,3,P}, x::Grid{Txy,3,P}, y::Grid{Txy,3,P}) where {Txy,TS,P}
    offsets = Sindx{3,P}()()
    @inbounds for k in axes(S.v, 3)
        for j in axes(S.v, 2)
            for i in axes(S.v, 1)
                # Author's note: having this lookup under @inbounds saved about 0.3 ms.
                coeffs = S.v[i, j, k].value
                acc = zero(Txy)
                for c in 1:P
                    di, dj, dk = offsets[c]
                    acc += coeffs[c] * x.A[i + di + 1, j + dj + 1, k + dk + 1]
                end
                y.A[i + 1, j + 1, k + 1] += acc
            end
        end
    end
    return y
end
# `$` splices the current values of the globals into the benchmark, so the timing does
# not include the cost of looking up untyped global variables.
@btime A_mul_B2!($S1, $G1, zero($G1));

  769.626 μs (3 allocations: 1.07 MiB)


In [27]:
zG = zero(G1)
@code_llvm A_mul_B2!(S1, G1, zG)


; Function A_mul_B2!
; Location: In[16]:2
define nonnull %jl_value_t addrspace(10)* @"japi1_A_mul_B2!_37876"(%jl_value_t addrspace(10)*, %jl_value_t addrspace(10)**, i32) #0 {
top:
  %3 = alloca %jl_value_t addrspace(10)**, align 8
  store volatile %jl_value_t addrspace(10)** %1, %jl_value_t addrspace(10)*** %3, align 8
  %4 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %1, align 8
  %5 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %1, i64 2
  %6 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %5, align 8
; Location: In[16]:3
; Function getproperty; {
; Location: sysimg.jl:18
  %7 = addrspacecast %jl_value_t addrspace(10)* %4 to %jl_value_t addrspace(11)*
  %8 = bitcast %jl_value_t addrspace(11)* %7 to %jl_value_t addrspace(10)* addrspace(11)*
  %9 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %8, align 8
;}
; Function axes; {
; Location: abstractarray.jl:57
; Function axes; {
; Location:

In [17]:
# Interpolate the globals with `$` so only the sparse matrix-vector product is timed.
@btime $SA*$GA;

  1.453 ms (2 allocations: 976.64 KiB)


In [87]:
# `$` splices the current values of the globals into the benchmark, so the timing does
# not include the cost of looking up untyped global variables.
@btime A_mul_B!($S1, $G1, zero($G1));

  3.047 ms (3 allocations: 1.07 MiB)


In [75]:
# `$` splices the current values of the globals into the benchmark, so the timing does
# not include the cost of looking up untyped global variables.
@btime A_mul_B2!($S1, $G1, zero($G1));

  1.692 ms (3 allocations: 1.07 MiB)


In [22]:
@code_llvm SA*GA


; Function *
; Location: /Users/osx/buildbot/slave/package_osx64/build/usr/share/julia/stdlib/v1.0/SparseArrays/src/linalg.jl:51
define nonnull %jl_value_t addrspace(10)* @"japi1_*_37895"(%jl_value_t addrspace(10)*, %jl_value_t addrspace(10)**, i32) #0 {
top:
  %gcframe = alloca %jl_value_t addrspace(10)*, i32 3
  %3 = bitcast %jl_value_t addrspace(10)** %gcframe to i8*
  call void @llvm.memset.p0i8.i32(i8* %3, i8 0, i32 24, i32 0, i1 false)
  %4 = alloca %jl_value_t addrspace(10)**, align 8
  store volatile %jl_value_t addrspace(10)** %1, %jl_value_t addrspace(10)*** %4, align 8
  %5 = call %jl_value_t*** inttoptr (i64 4397629216 to %jl_value_t*** ()*)() #2
  %6 = getelementptr %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %gcframe, i32 0
  %7 = bitcast %jl_value_t addrspace(10)** %6 to i64*
  store i64 2, i64* %7
  %8 = getelementptr %jl_value_t**, %jl_value_t*** %5, i32 0
  %9 = load %jl_value_t**, %jl_value_t*** %8
  %10 = getelementptr %jl_value_t addrspace(10)*, %jl_va

In [91]:
A_mul_B2!(S1, G1, zero(G1))

50×50×50 Grid{Float64,3,7,Array{Float64,3}}:
[:, :, 1] =
 -0.591424    0.50742     …  -2.43341     0.762652    0.0457852 
  0.0         1.01555        -0.068519   -0.0862447   0.236164  
 -0.31021    -0.188793       -0.288403    1.07372    -0.0163342 
  1.33999    -0.441516        0.290502   -0.317754    0.180335  
  3.50889    -1.09344        -0.22315     2.73823    -0.156153  
 -4.60465     3.18825     …  -0.31079    -0.222538   -0.00613052
 -0.135273    1.96966         0.804991   -0.536054    0.704434  
  3.08882     0.281173        1.68392    -2.90425    -1.01428   
 -0.116057    0.131772        1.2806      0.234359    0.0236462 
 -0.206054   -0.0316057       0.261326    0.588474    1.25335   
 -0.300393    0.658236    …  -0.647467    0.217657    0.507065  
 -2.08474    -0.00918388     -1.43574    -0.452453    0.389605  
  0.0439612  -0.579491        0.0         5.24767     0.0959032 
  ⋮                       ⋱                                     
  1.19962    -0.238956       -4.0

In [92]:
A_mul_B!(S1, G1, zero(G1))

50×50×50 Grid{Float64,3,7,Array{Float64,3}}:
[:, :, 1] =
 -0.591424    0.50742     …  -2.43341     0.762652    0.0457852 
  0.0         1.01555        -0.068519   -0.0862447   0.236164  
 -0.31021    -0.188793       -0.288403    1.07372    -0.0163342 
  1.33999    -0.441516        0.290502   -0.317754    0.180335  
  3.50889    -1.09344        -0.22315     2.73823    -0.156153  
 -4.60465     3.18825     …  -0.31079    -0.222538   -0.00613052
 -0.135273    1.96966         0.804991   -0.536054    0.704434  
  3.08882     0.281173        1.68392    -2.90425    -1.01428   
 -0.116057    0.131772        1.2806      0.234359    0.0236462 
 -0.206054   -0.0316057       0.261326    0.588474    1.25335   
 -0.300393    0.658236    …  -0.647467    0.217657    0.507065  
 -2.08474    -0.00918388     -1.43574    -0.452453    0.389605  
  0.0439612  -0.579491        0.0         5.24767     0.0959032 
  ⋮                       ⋱                                     
  1.19962    -0.238956       -4.0

In [93]:
SA*GA

125000-element Array{Float64,1}:
 -0.5914238350365022   
  0.14369514915121212  
 -0.0007047071100266973
 -1.0911852341830068   
 -1.238922920999253    
  0.08832647205031652  
  1.0218425084283953   
 -3.6464382783967886   
  0.42582532542235374  
 -0.8946493005771016   
  2.314214171764147    
 -0.407620342329592    
  0.05800123874320955  
  ⋮                    
 -0.046857695907202554 
 -2.1188374122652616   
  0.7299723004597661   
 -1.8944300325577461   
  0.09848298188036396  
  0.5142144826886544   
  0.05217600424746788  
  0.12897649081230012  
  0.6248294236652254   
  2.7502665887542403   
 -5.246110731576898    
  5.677757056242566    

In [105]:
iterate([4, 5, 6, 7],4)

(7, 5)

In [94]:
iterate([(2,3), (2,4), (3,5)])

((2, 3), 2)

In [101]:
dump(1:7:100)

StepRange{Int64,Int64}
  start: Int64 1
  step: Int64 7
  stop: Int64 99


In [115]:
# Print every (offset, coefficient) pair produced by iterating the stencil point.
foreach(println, S1[5,5,5])

In [41]:
iterate(2:5)

(2, 2)

In [47]:
typeof(1:5) <: OrdinalRange

true

In [50]:
@which iterate(1:5,2 )

In [58]:
iterate(1:5)

(1, 1)

In [60]:
S = S1[5,5,5]

StencilPoint{Float64,3,7}((-0.0, -0.0, 0.0, -3.910250200911486, -0.0, 2.4790801174526433, 2.213012647491159))

In [None]:
iterate

In [17]:
# A stencil point holds exactly P coefficients and iterates over all of them, so its
# length is P for every element type and dimension, not just Float64 / 3 / 7.
Base.length(::StencilPoint{T,N,P}) where {T,N,P} = P

In [35]:
@which Base.iterate(S1[1,1,1], 7)

In [33]:
# Print every (offset, coefficient) pair of the corner stencil point, one per line.
for (offset, coeff) in S1[1,1,1]
    println((offset, coeff))
end

((-1, 0, 0), 0.0)
((0, -1, 0), 0.0)
((0, 0, -1), 0.0)
((0, 0, 0), 0.0)
((0, 0, 1), 0.0)
((0, 1, 0), 0.0)
((1, 0, 0), 0.0)


In [36]:
@which 2*3

In [8]:
@which SA*GA

In [4]:
@code_llvm A_mul_B!(S1, G1, zero(G1))


; Function A_mul_B!
; Location: In[1]:43
define nonnull %jl_value_t addrspace(10)* @"japi1_A_mul_B!_36435"(%jl_value_t addrspace(10)*, %jl_value_t addrspace(10)**, i32) #0 {
top:
  %3 = alloca %jl_value_t addrspace(10)**, align 8
  store volatile %jl_value_t addrspace(10)** %1, %jl_value_t addrspace(10)*** %3, align 8
  %4 = alloca { [7 x double] }, align 8
  %5 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %1, align 8
  %6 = getelementptr inbounds %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %1, i64 2
  %7 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)** %6, align 8
; Function getproperty; {
; Location: sysimg.jl:18
  %8 = addrspacecast %jl_value_t addrspace(10)* %5 to %jl_value_t addrspace(11)*
  %9 = bitcast %jl_value_t addrspace(11)* %8 to %jl_value_t addrspace(10)* addrspace(11)*
  %10 = load %jl_value_t addrspace(10)*, %jl_value_t addrspace(10)* addrspace(11)* %9, align 8
;}
; Function size; {
; Location: array.jl:154
  %11 = addrspaceca

In [4]:
# Interpolate the globals with `$` so only the sparse matrix-vector product is timed.
@btime $SA*$GA;

  1.376 ms (2 allocations: 976.64 KiB)


# Compare kernels on a different iteration task: a Hadamard-product-like task

# For Julia 0.6

In [None]:
struct mytype
    

In [7]:
## Task
for 

775.75