In [6]:
import Pkg
# Install every third-party package used by this notebook in a single resolver
# pass. Each package is listed exactly once.
Pkg.add([
    "FastChebInterp",
    "ThreadsX",
    "Zygote",
    "Memoize",
    "BenchmarkTools",
    "FiniteDifferences",
]);

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\Zin Lin\.julia\environments\v1.7\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\Zin Lin\.julia\environments\v1.7\Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\Zin Lin\.julia\environments\v1.7\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\Zin Lin\.julia\environments\v1.7\Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\Zin Lin\.julia\environments\v1.7\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\Zin Lin\.julia\environments\v1.7\Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `C:\Users\Zin Lin\.julia\environments\v1.7\Project.toml`
[32m[1m  No Changes[22m[39m to `C:\Users\Zin Lin\.julia\environments\v1.7\Manifest.toml`
[32m[1m   Resolving[22m[39m package versions...


In [12]:
using DelimitedFiles
using FastChebInterp
using ThreadsX
using Base.Threads
using Zygote
using BenchmarkTools
using Memoize
using Profile

const cheb = FastChebInterp.ChebPoly

"""
    getmodel

Generates a chebyshev polynomial interpolated from the datafile. 
The latter must be in the format ipt, DoF, Re(t[freq1]), Im(t[freq2]) ... 
In other words, the dimensions of the datafile must be (order+1,2+2*nfreqs)
"""
function getmodel(lb,ub,filename)
    dat = readdlm(filename,' ',Float64,'\n')
    dat = dat[:,3:end]'
    dat = [dat[:,i] for i in 1:size(dat,2)]
    model = chebinterp(dat,lb,ub)
end

"""
    getmodels(lb, ub, filename)

Read the space-delimited data file `filename` and return a vector holding one
complex-valued Chebyshev interpolant on `[lb, ub]` per frequency.

The first two columns of the file are skipped. After that, columns come in
(real, imaginary) pairs, one pair per frequency. The interpolants are built in
a multi-threaded loop.
"""
function getmodels(lb, ub, filename)
    table = readdlm(filename, ' ', Float64, '\n')
    npairs = (size(table, 2) - 2) ÷ 2
    interps = Vector{cheb{1,ComplexF64,Float64}}(undef, npairs)
    Threads.@threads for k in 1:npairs
        # Pair k lives in columns 2k+1 (real part) and 2k+2 (imaginary part).
        recol = 2k + 1
        imcol = 2k + 2
        samples = complex.(table[:, recol], table[:, imcol])
        interps[k] = chebinterp(samples, lb, ub)
    end
    return interps
end

"""
    eval2c!(F,∂F, model,p)

In-place multi-threaded evaluation of meta-atom transmission coefficients for multiple frequencies using the chebyshev model. 

F and ∂F must be pre-allocated as
 F = Array{ComplexF64,2}(undef,#unit cells,#freqs)
∂F = Array{ComplexF64,2}(undef,#unit cells,#freqs)
"""
function eval2c!(F,∂F, model::cheb,p::Vector{Float64})
    ndof = size(p)[1]
    Threads.@threads for i in 1:ndof
        @inbounds t,∂t = chebjacobian(model,p[i])
        @inbounds @views @.  F[i,:] = complex( t[1:2:end], t[2:2:end])
        @inbounds @views @. ∂F[i,:] = complex(∂t[1:2:end],∂t[2:2:end])
    end
end

"""
    eval2c!(F, ∂F, models::AbstractVector, p::AbstractVector)

Per-frequency variant: for every index `(i, j)` of `F`, evaluate
`chebgradient(models[j], p[i])` and store the value in `F[i, j]` and the
derivative in `∂F[i, j]`. The loop over the indices of `F` is multi-threaded.
"""
function eval2c!(F,∂F, models::AbstractVector{<:cheb},p::AbstractVector)
    Threads.@threads for idx in CartesianIndices(F)
        cell, freq = idx[1], idx[2]
        val, dval = chebgradient(models[freq], p[cell])
        F[idx] = val
        ∂F[idx] = dval
    end
end

"""
Explanation: for f(z=x+iy) ∈ ℜ, Zygote returns df = ∂f/∂x + i ∂f/∂y 
The Wirtinger derivative is ∂f/∂z = 1/2 (∂f/∂x - i ∂f/∂y) = 1/2 conj(df)
The chain rule is ∂f/∂p = ∂f/∂z ∂z/∂p + ∂f/∂z' ∂z'/∂p = 2 real( ∂f/∂z ∂z/∂p ) = real( conj(df) ∂z/∂p ) 
Gradient vector gdat must be pre-allocated as
gdat = Vector{Float64}(undef,#unit cells)
"""
# function end2end!(gdat, F,∂F, model::cheb, p::Vector{Float64}, getF!::Function, f::Function, fdat::Any)
#     getF!(F,∂F, model,p)
#     ret,back = Zygote.pullback(ξ->f(ξ,fdat),F)
#     gdat[:] .= real.(vec(sum(conj.(back(1)[1]) .* ∂F, dims=2)))
#     return ret
# end

function end2end!(gdat, F,∂F, models, p, getF!, f, fdat)
    getF!(F,∂F, models,p)
    ret,back = Zygote.pullback(ξ->f(ξ,fdat),F)
    gdat[:] .= real.(vec(sum(conj.(back(1)[1]) .* ∂F, dims=2)))
    return ret
end


# setup(;ncells::Int64=3000,npix::Int64=500,nintg::Int64=5,nspatl::Int64=120,
#        Dz::Float64=5000, freqs::Vector{Float64}=[1.2,1.1,1.0,0.9,0.8],
#        lb::Float64=0.11,ub::Float64=0.68,
#        filename::String="alldat_5wavs.dat", 
#        kwargs...)

end2end! (generic function with 2 methods)

In [14]:
# Benchmark the per-frequency evaluation path (`eval2c!` on a vector of models).
lb, ub = 0.11, 0.68
filename = "alldat_5wavs.dat"
models = getmodels(lb, ub, filename)
# `getmodels` returns one interpolant per frequency, so size the buffers from it
# instead of hard-coding the number of frequencies in the data file.
nfreqs = length(models)
ncells = 10000000
p = rand(lb:0.01/ncells:ub, ncells)
F = Array{ComplexF64,2}(undef, ncells, nfreqs)
∂F = Array{ComplexF64,2}(undef, ncells, nfreqs)
@btime eval2c!($F,$∂F, $models, $p);

  35.684 s (133 allocations: 7.00 KiB)


In [7]:
# Set up the inputs for the single-model path: one interpolant from `getmodel`,
# random design parameters drawn from [lb, ub], and uninitialised output buffers
# with one row per unit cell and 5 columns.
lb = 0.11
ub = 0.68
filename = "alldat_5wavs.dat"
model = getmodel(lb, ub, filename)
ncells = 10000000
p = rand(lb:0.01/ncells:ub, ncells)
F = Matrix{ComplexF64}(undef, ncells, 5)
∂F = Matrix{ComplexF64}(undef, ncells, 5)
#@btime eval2c!($F,$∂F, $model, $p);

# Scalar test objective: the real part of the sum of all entries of `F`.
# `fdat` is accepted to match the objective signature but is not used.
function f(F, fdat)
    #sum(real.(F).*imag.(F).^2)
    total = sum(F)
    return real(total)
end

# Benchmark one objective + gradient evaluation through `end2end!`.
gdat = Vector{Float64}(undef,ncells)
# `f` ignores its data argument, so pass the value `nothing`
# (not the type `Nothing`).
bch = @benchmark end2end!($gdat, $F,$∂F, $model,$p, $eval2c!, $f, nothing)
#Profile.Allocs.@profile end2end!(gdat, F,∂F, model,p, eval2c!, f, nothing)

BenchmarkTools.Trial: 1 sample with 1 evaluation.
 Single result which took [34m10.074 s[39m (0.01% GC) to evaluate,
 with a memory estimate of [33m991.83 MiB[39m, over [33m109[39m allocations.

In [14]:
# Retrieve the allocations recorded by the allocation profiler.
# NOTE(review): this presumably needs a prior `Profile.Allocs.@profile` run in
# the same session to return anything useful — confirm.
Profile.Allocs.fetch()

Profile.Allocs.AllocResults(Profile.Allocs.Alloc[Profile.Allocs.Alloc(Vector{Any}, Base.StackTraces.StackFrame[maybe_record_alloc_to_profile at gc-alloc-profiler.h:42 [inlined], ...], 88), Profile.Allocs.Alloc(Profile.Allocs.UnknownType, Base.StackTraces.StackFrame[maybe_record_alloc_to_profile at gc-alloc-profiler.h:42 [inlined], ...], 16), Profile.Allocs.Alloc(Vector{Any}, Base.StackTraces.StackFrame[maybe_record_alloc_to_profile at gc-alloc-profiler.h:42 [inlined], ...], 56), Profile.Allocs.Alloc(Profile.Allocs.UnknownType, Base.StackTraces.StackFrame[maybe_record_alloc_to_profile at gc-alloc-profiler.h:42 [inlined], ...], 16), Profile.Allocs.Alloc(Profile.Allocs.UnknownType, Base.StackTraces.StackFrame[maybe_record_alloc_to_profile at gc-alloc-profiler.h:42 [inlined], ...], 16), Profile.Allocs.Alloc(Vector{Vector{Pair{Core.Compiler.NewSSAValue, Core.PhiNode}}}, Base.StackTraces.StackFrame[maybe_record_alloc_to_profile at gc-alloc-profiler.h:42 [inlined], ...], 48), Profile.Allocs.A

In [4]:
import Pkg; Pkg.add("Profile")

[32m[1m    Updating[22m[39m registry at `C:\Users\zinli\.julia\registries\General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m    Updating[22m[39m `C:\Users\zinli\.julia\environments\v1.8\Project.toml`
 [90m [9abbd945] [39m[92m+ Profile[39m
[32m[1m  No Changes[22m[39m to `C:\Users\zinli\.julia\environments\v1.8\Manifest.toml`


In [13]:
# Finite-difference check of the gradient computed by `end2end!`.
using FiniteDifferences
using LinearAlgebra
using Statistics  # provides `mean`, used in the error metric below

lb,ub=0.11,0.68
filename="alldat_5wavs.dat"
models = getmodels(lb,ub,filename)
nfreqs = length(models)  # one interpolant per frequency
ncells = 1000
# Stay away from the interval ends so finite-difference steps remain in [lb, ub].
p = rand(1.2*lb:0.01/ncells:0.8*ub,ncells)
F = Array{ComplexF64,2}(undef,ncells,nfreqs)
∂F = Array{ComplexF64,2}(undef,ncells,nfreqs)
gdat = Vector{Float64}(undef,ncells)
function f2(F,fdat)
    sum(real.(F).*imag.(F).^2)
end

# Reference (analytic) gradient at `p`, using the same `models` as the
# finite-difference evaluations below.
end2end!(gdat, F,∂F, models,p, eval2c!, f2, nothing)

# `end2end!` overwrites its gradient buffer on every call. Give the
# finite-difference evaluations a scratch buffer so they cannot clobber the
# reference gradient stored in `gdat`.
gscratch = similar(gdat)
tmp(x) = end2end!(gscratch, F,∂F, models,x, eval2c!, f2, nothing)
Δ = grad(central_fdm(5,1), tmp, p)[1]

# Maximum absolute error over all entries, relative to the mean gradient size.
maximum(abs.(Δ .- gdat))/mean(abs.(Δ))

1.0969977172819404e-5

In [91]:
# Scratch check: row sums of a random 3×3 matrix, returned as a plain vector.
a = rand(3, 3)
display(a)
vec(sum(a; dims=2))

3×3 Matrix{Float64}:
 0.5115    0.107716  0.761931
 0.436962  0.737397  0.0375001
 0.709564  0.873515  0.116737

3-element Vector{Float64}:
 1.3811471203333312
 1.2118592808459214
 1.6998166289301033