Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backend structure #7

Merged
merged 17 commits into from
Dec 1, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
set(SRC
allocator.f90
backend.f90
common.f90
solver.f90
tdsops.f90
time_integrator.f90
omp/backend.f90
omp/common.f90
omp/kernels_dist.f90
)
set(CUDASRC
cuda/backend.f90
cuda/common.f90
cuda/allocator.f90
cuda/exec_dist.f90
@@ -21,20 +26,34 @@ endif()
add_library(x3d2 STATIC ${SRC})
target_include_directories(x3d2 INTERFACE ${CMAKE_CURRENT_BINARY_DIR})

add_executable(xcompact xcompact.f90)
target_link_libraries(xcompact PRIVATE x3d2)

target_compile_options(x3d2 PRIVATE "-O3")
target_compile_options(xcompact PRIVATE "-O3")
target_compile_options(xcompact PRIVATE "-cpp")

if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI")
target_compile_options(x3d2 PRIVATE "-cuda")
target_compile_options(x3d2 PRIVATE "-fast")
target_link_options(x3d2 INTERFACE "-cuda")
target_compile_options(xcompact PRIVATE "-cuda")
target_compile_options(xcompact PRIVATE "-fast")

target_compile_options(xcompact PRIVATE "-DCUDA")
# target_link_options(xcompact INTERFACE "-cuda")
endif()

if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "GNU")
target_compile_options(x3d2 PRIVATE "-ffast-math")
target_compile_options(xcompact PRIVATE "-ffast-math")
endif()

find_package(OpenMP REQUIRED)
target_link_libraries(x3d2 PRIVATE OpenMP::OpenMP_Fortran)
target_link_libraries(xcompact PRIVATE OpenMP::OpenMP_Fortran)

find_package(MPI REQUIRED)
target_link_libraries(x3d2 PRIVATE MPI::MPI_Fortran)
target_link_libraries(xcompact PRIVATE MPI::MPI_Fortran)

4 changes: 4 additions & 0 deletions src/allocator.f90
Original file line number Diff line number Diff line change
@@ -63,6 +63,10 @@ module m_allocator
module procedure field_constructor
end interface field_t

type :: flist_t
class(field_t), pointer :: ptr
end type flist_t

contains

function field_constructor(dims, next, id) result(m)
131 changes: 131 additions & 0 deletions src/backend.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
module m_base_backend
use m_allocator, only: allocator_t, field_t
use m_common, only: dp
use m_tdsops, only: tdsops_t, dirps_t

implicit none

type, abstract :: base_backend_t
!! base_backend class defines all the abstract operations that the
!! solver class requires.
!!
!! For example, transport equation in solver class evaluates the
!! derivatives in x, y, and z directions, and reorders the input
!! fields as required. Then finally, combines all the directional
!! derivatives to obtain the divergence of U*.
!!
!! All these high level operations solver class executes are
!! defined here using the abstract interfaces. Every backend
!! implementation extends the present abstact backend class to
!! define the specifics of these operations based on the target
!! architecture.

real(dp) :: nu
class(allocator_t), pointer :: allocator
class(dirps_t), pointer :: xdirps, ydirps, zdirps
contains
procedure(transeq_ders), deferred :: transeq_x
procedure(transeq_ders), deferred :: transeq_y
procedure(transeq_ders), deferred :: transeq_z
procedure(transposer), deferred :: trans_x2y
procedure(transposer), deferred :: trans_x2z
procedure(sum9into3), deferred :: sum_yzintox
procedure(get_fields), deferred :: get_fields
procedure(set_fields), deferred :: set_fields
procedure(alloc_tdsops), deferred :: alloc_tdsops
end type base_backend_t

abstract interface
subroutine transeq_ders(self, du, dv, dw, u, v, w, dirps)
!! transeq equation obtains the derivatives direction by
!! direction, and the exact algorithm used to obtain these
!! derivatives are decided at runtime. Backend implementations
!! are responsible from directing calls to transeq_ders into
!! the correct algorithm.
import :: base_backend_t
import :: field_t
import :: dirps_t
implicit none

class(base_backend_t) :: self
class(field_t), intent(inout) :: du, dv, dw
class(field_t), intent(in) :: u, v, w
type(dirps_t), intent(in) :: dirps
end subroutine transeq_ders
end interface

abstract interface
subroutine transposer(self, u_, v_, w_, u, v, w)
!! transposer subroutines are straightforward, they rearrange
!! data into our specialist data structure so that regardless
!! of the direction tridiagonal systems are solved efficiently
!! and fast.
import :: base_backend_t
import :: field_t
implicit none

class(base_backend_t) :: self
class(field_t), intent(inout) :: u_, v_, w_
class(field_t), intent(in) :: u, v, w
end subroutine transposer
end interface

abstract interface
subroutine sum9into3(self, du, dv, dw, du_y, dv_y, dw_y, du_z, dv_z, dw_z)
!! sum9into3 subroutine combines all the directional velocity
!! derivatives into the corresponding x directional fields.
import :: base_backend_t
import :: field_t
implicit none

class(base_backend_t) :: self
class(field_t), intent(inout) :: du, dv, dw
class(field_t), intent(in) :: du_y, dv_y, dw_y, du_z, dv_z, dw_z
end subroutine sum9into3
end interface

abstract interface
subroutine get_fields(self, u_out, v_out, w_out, u, v, w)
!! copy the specialist data structure from device or host back
!! to a regular 3D data structure.
import :: base_backend_t
import :: dp
import :: field_t
implicit none

class(base_backend_t) :: self
real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
class(field_t), intent(in) :: u, v, w
end subroutine get_fields

subroutine set_fields(self, u, v, w, u_in, v_in, w_in)
!! copy the initial condition stored in a regular 3D data
!! structure into the specialist data structure arrays on the
!! device or host.
import :: base_backend_t
import :: dp
import :: field_t
implicit none

class(base_backend_t) :: self
class(field_t), intent(inout) :: u, v, w
real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
end subroutine set_fields
end interface

abstract interface
subroutine alloc_tdsops(self, tdsops, n, dx, operation, scheme)
import :: base_backend_t
import :: dp
import :: tdsops_t
implicit none

class(base_backend_t) :: self
class(tdsops_t), allocatable, intent(inout) :: tdsops
integer, intent(in) :: n
real(dp), intent(in) :: dx
character(*), intent(in) :: operation, scheme
end subroutine alloc_tdsops
end interface

end module m_base_backend
67 changes: 67 additions & 0 deletions src/common.f90
Original file line number Diff line number Diff line change
@@ -4,4 +4,71 @@ module m_common
integer, parameter :: dp=kind(0.0d0)
real(dp), parameter :: pi = 4*atan(1.0_dp)

type :: globs_t
integer :: nx, ny, nz
integer :: nx_loc, ny_loc, nz_loc
integer :: n_groups_x, n_groups_y, n_groups_z
real(dp) :: Lx, Ly, Lz
real(dp) :: dx, dy, dz
integer :: nproc_x = 1, nproc_y = 1, nproc_z = 1
character(len=20) :: BC_x_s, BC_x_e, BC_y_s, BC_y_e, BC_z_s, BC_z_e
end type globs_t

contains

subroutine set_pprev_pnext(xprev, xnext, yprev, ynext, zprev, znext, &
xnproc, ynproc, znproc, nrank)
Comment on lines +19 to +20
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure how you want to go about tests, but this function would be well suited to have one as it is easy to make a typo with sign/variables and it is isolated from the rest of the codebase.

implicit none

integer, intent(out) :: xprev, xnext, yprev, ynext, zprev, znext
integer, intent(in) :: xnproc, ynproc, znproc, nrank

integer :: i, ix, iy, iz

ix = modulo(nrank, xnproc)
iy = modulo((nrank - ix)/xnproc, ynproc)
iz = (nrank - ix - iy*xnproc)/(xnproc*ynproc)
! nrank == ix + iy*xnproc + iz*xnproc*ynproc

! prev and next in x direction
if (ix == 0) then
xprev = nrank + (xnproc - 1)
else
xprev = nrank - 1
end if

if (ix == xnproc - 1) then
xnext = nrank - (xnproc - 1)
else
xnext = nrank + 1
end if

! prev and next in y direction
if (iy == 0) then
yprev = nrank + (xnproc*(ynproc - 1))
else
yprev = nrank - xnproc
end if

if (iy == ynproc - 1) then
ynext = nrank - (xnproc*(ynproc - 1))
else
ynext = nrank + xnproc
end if

! prev and next in z direction
if (iz == 0) then
zprev = nrank + (xnproc*ynproc*(znproc - 1))
else
zprev = nrank - xnproc*ynproc
end if

if (iz == znproc - 1) then
znext = nrank - (xnproc*ynproc*(znproc - 1))
else
znext = nrank + xnproc*ynproc
end if

end subroutine set_pprev_pnext

end module m_common
414 changes: 414 additions & 0 deletions src/cuda/backend.f90

Large diffs are not rendered by default.

208 changes: 208 additions & 0 deletions src/omp/backend.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
module m_omp_backend
use m_allocator, only: allocator_t, field_t
use m_base_backend, only: base_backend_t
use m_common, only: dp, globs_t
use m_tdsops, only: dirps_t, tdsops_t

use m_omp_common, only: SZ

implicit none

type, extends(base_backend_t) :: omp_backend_t
!character(len=*), parameter :: name = 'omp'
integer :: MPI_FP_PREC = dp
real(dp), allocatable, dimension(:, :, :) :: &
u_recv_s, u_recv_e, u_send_s, u_send_e, &
v_recv_s, v_recv_e, v_send_s, v_send_e, &
w_recv_s, w_recv_e, w_send_s, w_send_e, &
du_send_s, du_send_e, du_recv_s, du_recv_e, &
dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, &
d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e
contains
procedure :: alloc_tdsops => alloc_omp_tdsops
procedure :: transeq_x => transeq_x_omp
procedure :: transeq_y => transeq_y_omp
procedure :: transeq_z => transeq_z_omp
procedure :: trans_x2y => trans_x2y_omp
procedure :: trans_x2z => trans_x2z_omp
procedure :: sum_yzintox => sum_yzintox_omp
procedure :: set_fields => set_fields_omp
procedure :: get_fields => get_fields_omp
end type omp_backend_t

interface omp_backend_t
module procedure init
end interface omp_backend_t

contains

function init(globs, allocator) result(backend)
implicit none

class(globs_t) :: globs
class(allocator_t), target, intent(inout) :: allocator
type(omp_backend_t) :: backend

integer :: n_halo, n_block

select type(allocator)
type is (allocator_t)
! class level access to the allocator
backend%allocator => allocator
end select

n_halo = 4
n_block = globs%n_groups_x

allocate(backend%u_send_s(SZ, n_halo, n_block))
allocate(backend%u_send_e(SZ, n_halo, n_block))
allocate(backend%u_recv_s(SZ, n_halo, n_block))
allocate(backend%u_recv_e(SZ, n_halo, n_block))
allocate(backend%v_send_s(SZ, n_halo, n_block))
allocate(backend%v_send_e(SZ, n_halo, n_block))
allocate(backend%v_recv_s(SZ, n_halo, n_block))
allocate(backend%v_recv_e(SZ, n_halo, n_block))
allocate(backend%w_send_s(SZ, n_halo, n_block))
allocate(backend%w_send_e(SZ, n_halo, n_block))
allocate(backend%w_recv_s(SZ, n_halo, n_block))
allocate(backend%w_recv_e(SZ, n_halo, n_block))

allocate(backend%du_send_s(SZ, 1, n_block))
allocate(backend%du_send_e(SZ, 1, n_block))
allocate(backend%du_recv_s(SZ, 1, n_block))
allocate(backend%du_recv_e(SZ, 1, n_block))
allocate(backend%dud_send_s(SZ, 1, n_block))
allocate(backend%dud_send_e(SZ, 1, n_block))
allocate(backend%dud_recv_s(SZ, 1, n_block))
allocate(backend%dud_recv_e(SZ, 1, n_block))
allocate(backend%d2u_send_s(SZ, 1, n_block))
allocate(backend%d2u_send_e(SZ, 1, n_block))
allocate(backend%d2u_recv_s(SZ, 1, n_block))
allocate(backend%d2u_recv_e(SZ, 1, n_block))

end function init

subroutine alloc_omp_tdsops(self, tdsops, n, dx, operation, scheme)
implicit none

class(omp_backend_t) :: self
class(tdsops_t), allocatable, intent(inout) :: tdsops
integer, intent(in) :: n
real(dp), intent(in) :: dx
character(*), intent(in) :: operation, scheme

allocate(tdsops_t :: tdsops)

select type (tdsops)
type is (tdsops_t)
tdsops = tdsops_t(n, dx, operation, scheme)
end select

end subroutine alloc_omp_tdsops

subroutine transeq_x_omp(self, du, dv, dw, u, v, w, dirps)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: du, dv, dw
class(field_t), intent(in) :: u, v, w
type(dirps_t), intent(in) :: dirps

!call self%transeq_omp_dist(du, dv, dw, u, v, w, dirps)

end subroutine transeq_x_omp

subroutine transeq_y_omp(self, du, dv, dw, u, v, w, dirps)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: du, dv, dw
class(field_t), intent(in) :: u, v, w
type(dirps_t), intent(in) :: dirps

! u, v, w is reordered so that we pass v, u, w
!call self%transeq_omp_dist(dv, du, dw, v, u, w, dirps)

end subroutine transeq_y_omp

subroutine transeq_z_omp(self, du, dv, dw, u, v, w, dirps)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: du, dv, dw
class(field_t), intent(in) :: u, v, w
type(dirps_t), intent(in) :: dirps

! u, v, w is reordered so that we pass w, u, v
!call self%transeq_omp_dist(dw, du, dv, w, u, v, dirps)

end subroutine transeq_z_omp

subroutine trans_x2y_omp(self, u_, v_, w_, u, v, w)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: u_, v_, w_
class(field_t), intent(in) :: u, v, w

end subroutine trans_x2y_omp

subroutine trans_x2z_omp(self, u_, v_, w_, u, v, w)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: u_, v_, w_
class(field_t), intent(in) :: u, v, w

end subroutine trans_x2z_omp

subroutine sum_yzintox_omp(self, du, dv, dw, &
du_y, dv_y, dw_y, du_z, dv_z, dw_z)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: du, dv, dw
class(field_t), intent(in) :: du_y, dv_y, dw_y, du_z, dv_z, dw_z

end subroutine sum_yzintox_omp

subroutine copy_into_buffers(u_send_s, u_send_e, u, n)
implicit none

real(dp), dimension(:, :, :), intent(out) :: u_send_s, u_send_e
real(dp), dimension(:, :, :), intent(in) :: u
integer, intent(in) :: n

u_send_s(:, :, :) = u(:, 1:4, :)
u_send_e(:, :, :) = u(:, n - 3:n, :)

end subroutine copy_into_buffers

subroutine set_fields_omp(self, u, v, w, u_in, v_in, w_in)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: u, v, w
real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in

u%data = u_in
v%data = v_in
w%data = w_in

end subroutine set_fields_omp

subroutine get_fields_omp(self, u_out, v_out, w_out, u, v, w)
implicit none

class(omp_backend_t) :: self
real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
class(field_t), intent(in) :: u, v, w

u_out = u%data
v_out = v%data
w_out = w%data

end subroutine get_fields_omp

end module m_omp_backend

240 changes: 240 additions & 0 deletions src/solver.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
module m_solver
use m_allocator, only: allocator_t, field_t
use m_base_backend, only: base_backend_t
use m_common, only: dp, globs_t
use m_tdsops, only: tdsops_t, dirps_t
use m_time_integrator, only: time_intg_t

implicit none

type :: solver_t
!! solver class defines the Incompact3D algorithm at a very high level.
!!
!! Procedures defined here that are part of the Incompact3D algorithm
!! are: transeq, divergence, poisson, and gradient.
!!
!! The operations these high level procedures require are provided by
!! the relavant backend implementations.
!!
!! transeq procedure obtains the derivations in x, y, and z directions
!! using the transeq_x, transeq_y, and transeq_z operations provided by
!! the backend.
!! There are two different algorithms available for this operation, a
!! distributed algorithm and the Thomas algorithm. At the solver class
!! level it isn't known which algorithm will be executed, that is decided
!! at run time and therefore backend implementations are responsible for
!! executing the right subroutines.
!!
!! Allocator is responsible from giving us a field sized array when
!! requested. For example, when the derivations in x direction are
!! completed and we are ready for the y directional derivatives, we need
!! three fields to reorder and store the velocities in y direction. Also,
!! we need three more fields for storing the results, and the get_block
!! method of the allocator is used to arrange all these memory
!! assignments. Later, when a field is no more required, release_block
!! method of the allocator can be used to make this field available
!! for later use.

real(dp) :: dt, nu

class(field_t), pointer :: u, v, w, du, dv, dw

class(base_backend_t), pointer :: backend
class(dirps_t), pointer :: xdirps, ydirps, zdirps
class(time_intg_t), pointer :: time_integrator
contains
procedure :: transeq
procedure :: run
end type solver_t

interface solver_t
module procedure init
end interface solver_t
contains

function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
result(solver)
implicit none

class(base_backend_t), target, intent(inout) :: backend
class(time_intg_t), target, intent(inout) :: time_integrator
class(dirps_t), target, intent(inout) :: xdirps, ydirps, zdirps
class(globs_t), intent(in) :: globs
type(solver_t) :: solver

real(dp), allocatable, dimension(:, :, :) :: u_init, v_init, w_init
integer :: dims(3)

real(dp) :: dx, dy, dz
integer :: nx, ny, nz

solver%backend => backend
solver%time_integrator => time_integrator

solver%xdirps => xdirps
solver%ydirps => ydirps
solver%zdirps => zdirps

solver%u => solver%backend%allocator%get_block()
solver%v => solver%backend%allocator%get_block()
solver%w => solver%backend%allocator%get_block()

! Set initial conditions
dims(:) = solver%backend%allocator%dims(:)
allocate(u_init(dims(1), dims(2), dims(3)))
allocate(v_init(dims(1), dims(2), dims(3)))
allocate(w_init(dims(1), dims(2), dims(3)))

u_init = 0
v_init = 0
w_init = 0

call solver%backend%set_fields( &
solver%u, solver%v, solver%w, u_init, v_init, w_init &
)

deallocate(u_init, v_init, w_init)
print*, 'initial conditions are set'

! Allocate fields for storing the RHS
solver%du => solver%backend%allocator%get_block()
solver%dv => solver%backend%allocator%get_block()
solver%dw => solver%backend%allocator%get_block()

nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc
dx = globs%dx; dy = globs%dy; dz = globs%dz

! Allocate and set the tdsops
call solver%backend%alloc_tdsops(solver%xdirps%der1st, nx, dx, &
'first-deriv', 'compact6')
call solver%backend%alloc_tdsops(solver%ydirps%der1st, ny, dy, &
'first-deriv', 'compact6')
call solver%backend%alloc_tdsops(solver%zdirps%der1st, nz, dz, &
'first-deriv', 'compact6')
call solver%backend%alloc_tdsops(solver%xdirps%der1st_sym, nx, dx, &
'first-deriv', 'compact6')
call solver%backend%alloc_tdsops(solver%ydirps%der1st_sym, ny, dy, &
'first-deriv', 'compact6')
call solver%backend%alloc_tdsops(solver%zdirps%der1st_sym, nz, dz, &
'first-deriv', 'compact6')
call solver%backend%alloc_tdsops(solver%xdirps%der2nd, nx, dx, &
'second-deriv', 'compact6')
call solver%backend%alloc_tdsops(solver%ydirps%der2nd, ny, dy, &
'second-deriv', 'compact6')
call solver%backend%alloc_tdsops(solver%zdirps%der2nd, nz, dz, &
'second-deriv', 'compact6')
call solver%backend%alloc_tdsops(solver%xdirps%der2nd_sym, nx, dx, &
'second-deriv', 'compact6')
call solver%backend%alloc_tdsops(solver%ydirps%der2nd_sym, ny, dy, &
'second-deriv', 'compact6')
call solver%backend%alloc_tdsops(solver%zdirps%der2nd_sym, nz, dz, &
'second-deriv', 'compact6')

end function init

subroutine transeq(self, du, dv, dw, u, v, w)
implicit none

class(solver_t) :: self
class(field_t), intent(inout) :: du, dv, dw
class(field_t), intent(in) :: u, v, w

class(field_t), pointer :: u_y, v_y, w_y, u_z, v_z, w_z, &
du_y, dv_y, dw_y, du_z, dv_z, dw_z

! -1/2(nabla u curl u + u nabla u) + nu nablasq u

! call derivatives in x direction. Based on the run time arguments this
! executes a distributed algorithm or the Thomas algorithm.
call self%backend%transeq_x(du, dv, dw, u, v, w, self%xdirps)

! request fields from the allocator
u_y => self%backend%allocator%get_block()
v_y => self%backend%allocator%get_block()
w_y => self%backend%allocator%get_block()
du_y => self%backend%allocator%get_block()
dv_y => self%backend%allocator%get_block()
dw_y => self%backend%allocator%get_block()

! reorder data from x orientation to y orientation
call self%backend%trans_x2y(u_y, v_y, w_y, u, v, w)
! similar to the x direction, obtain derivatives in y.
call self%backend%transeq_y(du_y, dv_y, dw_y, u_y, v_y, w_y, self%ydirps)

! we don't need the velocities in y orientation any more, so release
! them to open up space.
! It is important that this doesn't actually deallocate any memory,
! it just makes the corresponding memory space available for use.
call self%backend%allocator%release_block(u_y)
call self%backend%allocator%release_block(v_y)
call self%backend%allocator%release_block(w_y)

! just like in y direction, get some fields for the z derivatives.
u_z => self%backend%allocator%get_block()
v_z => self%backend%allocator%get_block()
w_z => self%backend%allocator%get_block()
du_z => self%backend%allocator%get_block()
dv_z => self%backend%allocator%get_block()
dw_z => self%backend%allocator%get_block()

! reorder from x to z
call self%backend%trans_x2z(u_z, v_z, w_z, u, v, w)
! get the derivatives in z
call self%backend%transeq_z(du_z, dv_z, dw_z, u_z, v_z, w_z, self%zdirps)

! there is no need to keep velocities in z orientation around, so release
call self%backend%allocator%release_block(u_z)
call self%backend%allocator%release_block(v_z)
call self%backend%allocator%release_block(w_z)

! gather all the contributions into the x result array
! this function does the data reordering and summations at once.
call self%backend%sum_yzintox(du, dv, dw, &
du_y, dv_y, dw_y, du_z, dv_z, dw_z)

! release all the unnecessary blocks.
call self%backend%allocator%release_block(du_y)
call self%backend%allocator%release_block(dv_y)
call self%backend%allocator%release_block(dw_y)
call self%backend%allocator%release_block(du_z)
call self%backend%allocator%release_block(dv_z)
call self%backend%allocator%release_block(dw_z)

end subroutine transeq

subroutine run(self, n_iter, u_out, v_out, w_out)
implicit none

class(solver_t), intent(in) :: self
integer, intent(in) :: n_iter
real(dp), dimension(:, :, :), intent(inout) :: u_out, v_out, w_out

integer :: i

print*, 'start run'

do i = 1, n_iter
call self%transeq(self%du, self%dv, self%dw, self%u, self%v, self%w)

! time integration
call self%time_integrator%step(self%u, self%v, self%w, &
self%du, self%dv, self%dw, self%dt)

!! pressure stuff
!call self%divergence(udiv, u, v, w)
!call self%poisson(p, udiv)
!call self%gradient(px, py, pz, p)

!! velocity correction
!call self%backend%vec3add(u, v, w, px, py, pz)
end do

print*, 'run end'

call self%backend%get_fields( &
u_out, v_out, w_out, self%du, self%dv, self%dw &
)

end subroutine run

end module m_solver
8 changes: 8 additions & 0 deletions src/tdsops.f90
Original file line number Diff line number Diff line change
@@ -36,6 +36,14 @@ module m_tdsops
module procedure tdsops_init
end interface tdsops_t

type :: dirps_t
class(tdsops_t), allocatable :: der1st, der1st_sym, &
der2nd, der2nd_sym, &
stagder_v2p, stagder_p2v, &
interpl_v2p, interpl_p2v
integer :: nrank, nproc, pnext, pprev, n
end type dirps_t

contains

function tdsops_init(n, delta, operation, scheme, n_halo, from_to, &
73 changes: 73 additions & 0 deletions src/time_integrator.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
module m_time_integrator
use m_allocator, only: allocator_t, field_t, flist_t
use m_base_backend, only: base_backend_t
use m_common, only: dp

implicit none

type :: time_intg_t
integer :: istep, nsteps, nsubsteps, order, nvars, nolds
type(flist_t), allocatable :: olds(:,:)
class(base_backend_t), pointer :: backend
class(allocator_t), pointer :: allocator
contains
procedure :: step
end type time_intg_t

interface time_intg_t
module procedure constructor
end interface time_intg_t

contains

function constructor(backend, allocator, nvars)
implicit none

type(time_intg_t) :: constructor
class(base_backend_t), pointer :: backend
class(allocator_t), pointer :: allocator
integer, intent(in), optional :: nvars

integer :: i, j

constructor%backend => backend
constructor%allocator => allocator

if (present(nvars)) then
constructor%nvars = nvars
else
constructor%nvars = 3
end if

constructor%nolds = 0

allocate(constructor%olds(constructor%nvars, constructor%nolds))

! Request all the storage for old timesteps
do i = 1, constructor%nvars
do j = 1, constructor%nolds
constructor%olds(i, j)%ptr => allocator%get_block()
end do
end do

end function constructor

subroutine step(self, u, v, w, du, dv, dw, dt)
implicit none

class(time_intg_t), intent(in) :: self
class(field_t), intent(inout) :: u, v, w
class(field_t), intent(in) :: du, dv, dw

real(dp), intent(in) :: dt

end subroutine step

subroutine adams_bashford_1st(vels, olds, coeffs)
type(flist_t) :: vels(:), olds(:)
real :: coeffs(:)

!call vec_add(vels, olds, coeffs)
end subroutine adams_bashford_1st

end module m_time_integrator
136 changes: 136 additions & 0 deletions src/xcompact.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
program xcompact
use mpi

use m_allocator
use m_base_backend
use m_common, only: pi, globs_t, set_pprev_pnext
use m_solver, only: solver_t
use m_time_integrator, only: time_intg_t
use m_tdsops, only: tdsops_t

#ifdef CUDA
use m_cuda_allocator
use m_cuda_backend
use m_cuda_common, only: SZ
use m_cuda_tdsops, only: cuda_tdsops_t
#else
use m_omp_backend
use m_omp_common, only: SZ
#endif

implicit none

type(globs_t) :: globs
class(base_backend_t), pointer :: backend
class(allocator_t), pointer :: allocator
type(solver_t) :: solver
type(time_intg_t) :: time_integrator
type(dirps_t) :: xdirps, ydirps, zdirps

#ifdef CUDA
type(cuda_backend_t), target :: cuda_backend
type(cuda_allocator_t), target :: cuda_allocator
#else
type(omp_backend_t), target :: omp_backend
type(allocator_t), target :: omp_allocator
#endif

real(dp), allocatable, dimension(:, :, :) :: u, v, w

real(dp) :: t_start, t_end
integer :: nrank, nproc, ierr, ndevs, devnum

call MPI_Init(ierr)
call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr)
call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr)

if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks'

#ifdef CUDA
ierr = cudaGetDeviceCount(ndevs)
ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin
ierr = cudaGetDevice(devnum)
#endif

! read L_x/y/z from the input file
globs%Lx = 2*pi; globs%Ly = 2*pi; globs%Lz = 2*pi

! read ns from the input file
globs%nx = 512; globs%ny = 512; globs%nz = 512

! set nprocs based on run time arguments
globs%nproc_x = 1; globs%nproc_y = 1; globs%nproc_z = 1

! Lets allow a 1D decomposition for the moment
!globs%nproc_x = nproc

xdirps%nproc = globs%nproc_x
ydirps%nproc = globs%nproc_y
zdirps%nproc = globs%nproc_z

! Better if we move this somewhere else
! Set the pprev and pnext for each rank
call set_pprev_pnext( &
xdirps%pprev, xdirps%pnext, &
ydirps%pprev, ydirps%pnext, &
zdirps%pprev, zdirps%pnext, &
xdirps%nproc, ydirps%nproc, zdirps%nproc, nrank &
)

! lets assume simple cases for now
globs%nx_loc = globs%nx/globs%nproc_x
globs%ny_loc = globs%ny/globs%nproc_y
globs%nz_loc = globs%nz/globs%nproc_z

globs%n_groups_x = globs%ny_loc*globs%nz_loc/SZ
globs%n_groups_y = globs%nx_loc*globs%nz_loc/SZ
globs%n_groups_z = globs%nx_loc*globs%ny_loc/SZ

globs%dx = globs%Lx/globs%nx
globs%dy = globs%Ly/globs%ny
globs%dz = globs%Lz/globs%nz

xdirps%n = globs%nx_loc
ydirps%n = globs%ny_loc
zdirps%n = globs%nz_loc

#ifdef CUDA
cuda_allocator = cuda_allocator_t([SZ, globs%nx_loc, globs%n_groups_x])
allocator => cuda_allocator
print*, 'CUDA allocator instantiated'

cuda_backend = cuda_backend_t(globs, allocator)
backend => cuda_backend
print*, 'CUDA backend instantiated'
#else
omp_allocator = allocator_t([SZ, globs%nx_loc, globs%n_groups_x])
allocator => omp_allocator
print*, 'OpenMP allocator instantiated'

omp_backend = omp_backend_t(globs, allocator)
backend => omp_backend
print*, 'OpenMP backend instantiated'
#endif

backend%nu = 1._dp

allocate(u(SZ, globs%nx_loc, globs%n_groups_x))
allocate(v(SZ, globs%nx_loc, globs%n_groups_x))
allocate(w(SZ, globs%nx_loc, globs%n_groups_x))

time_integrator = time_intg_t(allocator=allocator, backend=backend)
print*, 'time integrator instantiated'
solver = solver_t(backend, time_integrator, xdirps, ydirps, zdirps, globs)
print*, 'solver instantiated'

call cpu_time(t_start)

call solver%run(100, u, v, w)

call cpu_time(t_end)

print*, 'Time: ', t_end - t_start

print*, 'norms', norm2(u), norm2(v), norm2(w)

end program xcompact