xcompact3d · slaizet · Dec 1, 2023 · Sep 12, 2023 · Sep 14, 2023 · Sep 26, 2023
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -1,11 +1,16 @@
 set(SRC
   allocator.f90
+  backend.f90
   common.f90
+  solver.f90
   tdsops.f90
+  time_integrator.f90
+  omp/backend.f90
   omp/common.f90
   omp/kernels_dist.f90
 )
 set(CUDASRC
+  cuda/backend.f90
   cuda/common.f90
   cuda/allocator.f90
   cuda/exec_dist.f90
@@ -21,20 +26,34 @@ endif()
 add_library(x3d2 STATIC ${SRC})
 target_include_directories(x3d2 INTERFACE ${CMAKE_CURRENT_BINARY_DIR})
 
+add_executable(xcompact xcompact.f90)
+target_link_libraries(xcompact PRIVATE x3d2)
+
 target_compile_options(x3d2 PRIVATE "-O3")
+target_compile_options(xcompact PRIVATE "-O3")
+target_compile_options(xcompact PRIVATE "-cpp")
 
 if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI")
   target_compile_options(x3d2 PRIVATE "-cuda")
   target_compile_options(x3d2 PRIVATE "-fast")
   target_link_options(x3d2 INTERFACE "-cuda")
+  target_compile_options(xcompact PRIVATE "-cuda")
+  target_compile_options(xcompact PRIVATE "-fast")
+
+  target_compile_options(xcompact PRIVATE "-DCUDA")
+  #  target_link_options(xcompact INTERFACE "-cuda")
 endif()
 
 if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "GNU")
   target_compile_options(x3d2 PRIVATE "-ffast-math")
+  target_compile_options(xcompact PRIVATE "-ffast-math")
 endif()
 
 find_package(OpenMP REQUIRED)
 target_link_libraries(x3d2 PRIVATE OpenMP::OpenMP_Fortran)
+target_link_libraries(xcompact PRIVATE OpenMP::OpenMP_Fortran)
 
 find_package(MPI REQUIRED)
 target_link_libraries(x3d2 PRIVATE MPI::MPI_Fortran)
+target_link_libraries(xcompact PRIVATE MPI::MPI_Fortran)
+
diff --git a/src/allocator.f90 b/src/allocator.f90
@@ -63,6 +63,10 @@ module m_allocator
       module procedure field_constructor
    end interface field_t
 
+   type :: flist_t
+      class(field_t), pointer :: ptr
+   end type flist_t
+
 contains
 
    function field_constructor(dims, next, id) result(m)

diff --git a/src/backend.f90 b/src/backend.f90
@@ -0,0 +1,131 @@
+module m_base_backend
+   use m_allocator, only: allocator_t, field_t
+   use m_common, only: dp
+   use m_tdsops, only: tdsops_t, dirps_t
+
+   implicit none
+
+   type, abstract :: base_backend_t
+      !! base_backend class defines all the abstract operations that the
+      !! solver class requires.
+      !!
+      !! For example, transport equation in solver class evaluates the
+      !! derivatives in x, y, and z directions, and reorders the input
+      !! fields as required. Then finally, combines all the directional
+      !! derivatives to obtain the divergence of U*.
+      !!
+      !! All these high level operations solver class executes are
+      !! defined here using the abstract interfaces. Every backend
+      !! implementation extends the present abstact backend class to
+      !! define the specifics of these operations based on the target
+      !! architecture.
+
+      real(dp) :: nu
+      class(allocator_t), pointer :: allocator
+      class(dirps_t), pointer :: xdirps, ydirps, zdirps
+   contains
+      procedure(transeq_ders), deferred :: transeq_x
+      procedure(transeq_ders), deferred :: transeq_y
+      procedure(transeq_ders), deferred :: transeq_z
+      procedure(transposer), deferred :: trans_x2y
+      procedure(transposer), deferred :: trans_x2z
+      procedure(sum9into3), deferred :: sum_yzintox
+      procedure(get_fields), deferred :: get_fields
+      procedure(set_fields), deferred :: set_fields
+      procedure(alloc_tdsops), deferred :: alloc_tdsops
+   end type base_backend_t
+
+   abstract interface
+      subroutine transeq_ders(self, du, dv, dw, u, v, w, dirps)
+         !! transeq equation obtains the derivatives direction by
+         !! direction, and the exact algorithm used to obtain these
+         !! derivatives are decided at runtime. Backend implementations
+         !! are responsible from directing calls to transeq_ders into
+         !! the correct algorithm.
+         import :: base_backend_t
+         import :: field_t
+         import :: dirps_t
+         implicit none
+
+         class(base_backend_t) :: self
+         class(field_t), intent(inout) :: du, dv, dw
+         class(field_t), intent(in) :: u, v, w
+         type(dirps_t), intent(in) :: dirps
+      end subroutine transeq_ders
+   end interface
+
+   abstract interface
+      subroutine transposer(self, u_, v_, w_, u, v, w)
+         !! transposer subroutines are straightforward, they rearrange
+         !! data into our specialist data structure so that regardless
+         !! of the direction tridiagonal systems are solved efficiently
+         !! and fast.
+         import :: base_backend_t
+         import :: field_t
+         implicit none
+
+         class(base_backend_t) :: self
+         class(field_t), intent(inout) :: u_, v_, w_
+         class(field_t), intent(in) :: u, v, w
+      end subroutine transposer
+   end interface
+
+   abstract interface
+      subroutine sum9into3(self, du, dv, dw, du_y, dv_y, dw_y, du_z, dv_z, dw_z)
+         !! sum9into3 subroutine combines all the directional velocity
+         !! derivatives into the corresponding x directional fields.
+         import :: base_backend_t
+         import :: field_t
+         implicit none
+
+         class(base_backend_t) :: self
+         class(field_t), intent(inout) :: du, dv, dw
+         class(field_t), intent(in) :: du_y, dv_y, dw_y, du_z, dv_z, dw_z
+      end subroutine sum9into3
+   end interface
+
+   abstract interface
+      subroutine get_fields(self, u_out, v_out, w_out, u, v, w)
+         !! copy the specialist data structure from device or host back
+         !! to a regular 3D data structure.
+         import :: base_backend_t
+         import :: dp
+         import :: field_t
+         implicit none
+
+         class(base_backend_t) :: self
+         real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
+         class(field_t), intent(in) :: u, v, w
+      end subroutine get_fields
+
+      subroutine set_fields(self, u, v, w, u_in, v_in, w_in)
+         !! copy the initial condition stored in a regular 3D data
+         !! structure into the specialist data structure arrays on the
+         !! device or host.
+         import :: base_backend_t
+         import :: dp
+         import :: field_t
+         implicit none
+
+         class(base_backend_t) :: self
+         class(field_t), intent(inout) :: u, v, w
+         real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
+      end subroutine set_fields
+   end interface
+
+   abstract interface
+      subroutine alloc_tdsops(self, tdsops, n, dx, operation, scheme)
+         import :: base_backend_t
+         import :: dp
+         import :: tdsops_t
+         implicit none
+
+         class(base_backend_t) :: self
+         class(tdsops_t), allocatable, intent(inout) :: tdsops
+         integer, intent(in) :: n
+         real(dp), intent(in) :: dx
+         character(*), intent(in) :: operation, scheme
+      end subroutine alloc_tdsops
+   end interface
+
+end module m_base_backend
diff --git a/src/common.f90 b/src/common.f90
@@ -4,4 +4,71 @@ module m_common
    integer, parameter :: dp=kind(0.0d0)
    real(dp), parameter :: pi = 4*atan(1.0_dp)
 
+   type :: globs_t
+      integer :: nx, ny, nz
+      integer :: nx_loc, ny_loc, nz_loc
+      integer :: n_groups_x, n_groups_y, n_groups_z
+      real(dp) :: Lx, Ly, Lz
+      real(dp) :: dx, dy, dz
+      integer :: nproc_x = 1, nproc_y = 1, nproc_z = 1
+      character(len=20) :: BC_x_s, BC_x_e, BC_y_s, BC_y_e, BC_z_s, BC_z_e
+   end type globs_t
+
+contains
+
+   subroutine set_pprev_pnext(xprev, xnext, yprev, ynext, zprev, znext, &
+                              xnproc, ynproc, znproc, nrank)
+      implicit none
+
+      integer, intent(out) :: xprev, xnext, yprev, ynext, zprev, znext
+      integer, intent(in) :: xnproc, ynproc, znproc, nrank
+
+      integer :: i, ix, iy, iz
+
+      ix = modulo(nrank, xnproc)
+      iy = modulo((nrank - ix)/xnproc, ynproc)
+      iz = (nrank - ix - iy*xnproc)/(xnproc*ynproc)
+      ! nrank == ix + iy*xnproc + iz*xnproc*ynproc
+
+      ! prev and next in x direction
+      if (ix == 0) then
+         xprev = nrank + (xnproc - 1)
+      else
+         xprev = nrank - 1
+      end if
+
+      if (ix == xnproc - 1) then
+         xnext = nrank - (xnproc - 1)
+      else
+         xnext = nrank + 1
+      end if
+
+      ! prev and next in y direction
+      if (iy == 0) then
+         yprev = nrank + (xnproc*(ynproc - 1))
+      else
+         yprev = nrank - xnproc
+      end if
+
+      if (iy == ynproc - 1) then
+         ynext = nrank - (xnproc*(ynproc - 1))
+      else
+         ynext = nrank + xnproc
+      end if
+
+      ! prev and next in z direction
+      if (iz == 0) then
+         zprev = nrank + (xnproc*ynproc*(znproc - 1))
+      else
+         zprev = nrank - xnproc*ynproc
+      end if
+
+      if (iz == znproc - 1) then
+         znext = nrank - (xnproc*ynproc*(znproc - 1))
+      else
+         znext = nrank + xnproc*ynproc
+      end if
+
+   end subroutine set_pprev_pnext
+
 end module m_common
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
diff --git a/src/omp/backend.f90 b/src/omp/backend.f90
@@ -0,0 +1,208 @@
+module m_omp_backend
+   use m_allocator, only: allocator_t, field_t
+   use m_base_backend, only: base_backend_t
+   use m_common, only: dp, globs_t
+   use m_tdsops, only: dirps_t, tdsops_t
+
+   use m_omp_common, only: SZ
+
+   implicit none
+
+   type, extends(base_backend_t) :: omp_backend_t
+      !character(len=*), parameter :: name = 'omp'
+      integer :: MPI_FP_PREC = dp
+      real(dp), allocatable, dimension(:, :, :) :: &
+         u_recv_s, u_recv_e, u_send_s, u_send_e, &
+         v_recv_s, v_recv_e, v_send_s, v_send_e, &
+         w_recv_s, w_recv_e, w_send_s, w_send_e, &
+         du_send_s, du_send_e, du_recv_s, du_recv_e, &
+         dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, &
+         d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e
+    contains
+      procedure :: alloc_tdsops => alloc_omp_tdsops
+      procedure :: transeq_x => transeq_x_omp
+      procedure :: transeq_y => transeq_y_omp
+      procedure :: transeq_z => transeq_z_omp
+      procedure :: trans_x2y => trans_x2y_omp
+      procedure :: trans_x2z => trans_x2z_omp
+      procedure :: sum_yzintox => sum_yzintox_omp
+      procedure :: set_fields => set_fields_omp
+      procedure :: get_fields => get_fields_omp
+   end type omp_backend_t
+
+   interface omp_backend_t
+      module procedure init
+   end interface omp_backend_t
+
+ contains
+
+   function init(globs, allocator) result(backend)
+      implicit none
+
+      class(globs_t) :: globs
+      class(allocator_t), target, intent(inout) :: allocator
+      type(omp_backend_t) :: backend
+
+      integer :: n_halo, n_block
+
+      select type(allocator)
+      type is (allocator_t)
+         ! class level access to the allocator
+         backend%allocator => allocator
+      end select
+
+      n_halo = 4
+      n_block = globs%n_groups_x
+
+      allocate(backend%u_send_s(SZ, n_halo, n_block))
+      allocate(backend%u_send_e(SZ, n_halo, n_block))
+      allocate(backend%u_recv_s(SZ, n_halo, n_block))
+      allocate(backend%u_recv_e(SZ, n_halo, n_block))
+      allocate(backend%v_send_s(SZ, n_halo, n_block))
+      allocate(backend%v_send_e(SZ, n_halo, n_block))
+      allocate(backend%v_recv_s(SZ, n_halo, n_block))
+      allocate(backend%v_recv_e(SZ, n_halo, n_block))
+      allocate(backend%w_send_s(SZ, n_halo, n_block))
+      allocate(backend%w_send_e(SZ, n_halo, n_block))
+      allocate(backend%w_recv_s(SZ, n_halo, n_block))
+      allocate(backend%w_recv_e(SZ, n_halo, n_block))
+
+      allocate(backend%du_send_s(SZ, 1, n_block))
+      allocate(backend%du_send_e(SZ, 1, n_block))
+      allocate(backend%du_recv_s(SZ, 1, n_block))
+      allocate(backend%du_recv_e(SZ, 1, n_block))
+      allocate(backend%dud_send_s(SZ, 1, n_block))
+      allocate(backend%dud_send_e(SZ, 1, n_block))
+      allocate(backend%dud_recv_s(SZ, 1, n_block))
+      allocate(backend%dud_recv_e(SZ, 1, n_block))
+      allocate(backend%d2u_send_s(SZ, 1, n_block))
+      allocate(backend%d2u_send_e(SZ, 1, n_block))
+      allocate(backend%d2u_recv_s(SZ, 1, n_block))
+      allocate(backend%d2u_recv_e(SZ, 1, n_block))
+
+   end function init
+
+   subroutine alloc_omp_tdsops(self, tdsops, n, dx, operation, scheme)
+      implicit none
+
+      class(omp_backend_t) :: self
+      class(tdsops_t), allocatable, intent(inout) :: tdsops
+      integer, intent(in) :: n
+      real(dp), intent(in) :: dx
+      character(*), intent(in) :: operation, scheme
+
+      allocate(tdsops_t :: tdsops)
+
+      select type (tdsops)
+      type is (tdsops_t)
+         tdsops = tdsops_t(n, dx, operation, scheme)
+      end select
+
+   end subroutine alloc_omp_tdsops
+
+   subroutine transeq_x_omp(self, du, dv, dw, u, v, w, dirps)
+      implicit none
+
+      class(omp_backend_t) :: self
+      class(field_t), intent(inout) :: du, dv, dw
+      class(field_t), intent(in) :: u, v, w
+      type(dirps_t), intent(in) :: dirps
+
+      !call self%transeq_omp_dist(du, dv, dw, u, v, w, dirps)
+
+   end subroutine transeq_x_omp
+
+   subroutine transeq_y_omp(self, du, dv, dw, u, v, w, dirps)
+      implicit none
+
+      class(omp_backend_t) :: self
+      class(field_t), intent(inout) :: du, dv, dw
+      class(field_t), intent(in) :: u, v, w
+      type(dirps_t), intent(in) :: dirps
+
+      ! u, v, w is reordered so that we pass v, u, w
+      !call self%transeq_omp_dist(dv, du, dw, v, u, w, dirps)
+
+   end subroutine transeq_y_omp
+
+   subroutine transeq_z_omp(self, du, dv, dw, u, v, w, dirps)
+      implicit none
+
+      class(omp_backend_t) :: self
+      class(field_t), intent(inout) :: du, dv, dw
+      class(field_t), intent(in) :: u, v, w
+      type(dirps_t), intent(in) :: dirps
+
+      ! u, v, w is reordered so that we pass w, u, v
+      !call self%transeq_omp_dist(dw, du, dv, w, u, v, dirps)
+
+   end subroutine transeq_z_omp
+
+   subroutine trans_x2y_omp(self, u_, v_, w_, u, v, w)
+      implicit none
+
+      class(omp_backend_t) :: self
+      class(field_t), intent(inout) :: u_, v_, w_
+      class(field_t), intent(in) :: u, v, w
+
+   end subroutine trans_x2y_omp
+
+   subroutine trans_x2z_omp(self, u_, v_, w_, u, v, w)
+      implicit none
+
+      class(omp_backend_t) :: self
+      class(field_t), intent(inout) :: u_, v_, w_
+      class(field_t), intent(in) :: u, v, w
+
+   end subroutine trans_x2z_omp
+
+   subroutine sum_yzintox_omp(self, du, dv, dw, &
+                               du_y, dv_y, dw_y, du_z, dv_z, dw_z)
+      implicit none
+
+      class(omp_backend_t) :: self
+      class(field_t), intent(inout) :: du, dv, dw
+      class(field_t), intent(in) :: du_y, dv_y, dw_y, du_z, dv_z, dw_z
+
+   end subroutine sum_yzintox_omp
+
+   subroutine copy_into_buffers(u_send_s, u_send_e, u, n)
+      implicit none
+
+      real(dp), dimension(:, :, :), intent(out) :: u_send_s, u_send_e
+      real(dp), dimension(:, :, :), intent(in) :: u
+      integer, intent(in) :: n
+
+      u_send_s(:, :, :) = u(:, 1:4, :)
+      u_send_e(:, :, :) = u(:, n - 3:n, :)
+
+   end subroutine copy_into_buffers
+
+   subroutine set_fields_omp(self, u, v, w, u_in, v_in, w_in)
+      implicit none
+
+      class(omp_backend_t) :: self
+      class(field_t), intent(inout) :: u, v, w
+      real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
+
+      u%data = u_in
+      v%data = v_in
+      w%data = w_in
+
+   end subroutine set_fields_omp
+
+   subroutine get_fields_omp(self, u_out, v_out, w_out, u, v, w)
+      implicit none
+
+      class(omp_backend_t) :: self
+      real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
+      class(field_t), intent(in) :: u, v, w
+
+      u_out = u%data
+      v_out = v%data
+      w_out = w%data
+
+   end subroutine get_fields_omp
+
+end module m_omp_backend
+
diff --git a/src/solver.f90 b/src/solver.f90
@@ -0,0 +1,240 @@
+module m_solver
+   use m_allocator, only: allocator_t, field_t
+   use m_base_backend, only: base_backend_t
+   use m_common, only: dp, globs_t
+   use m_tdsops, only: tdsops_t, dirps_t
+   use m_time_integrator, only: time_intg_t
+
+   implicit none
+
+   type :: solver_t
+      !! solver class defines the Incompact3D algorithm at a very high level.
+      !!
+      !! Procedures defined here that are part of the Incompact3D algorithm
+      !! are: transeq, divergence, poisson, and gradient.
+      !!
+      !! The operations these high level procedures require are provided by
+      !! the relavant backend implementations.
+      !!
+      !! transeq procedure obtains the derivations in x, y, and z directions
+      !! using the transeq_x, transeq_y, and transeq_z operations provided by
+      !! the backend.
+      !! There are two different algorithms available for this operation, a
+      !! distributed algorithm and the Thomas algorithm. At the solver class
+      !! level it isn't known which algorithm will be executed, that is decided
+      !! at run time and therefore backend implementations are responsible for
+      !! executing the right subroutines.
+      !!
+      !! Allocator is responsible from giving us a field sized array when
+      !! requested. For example, when the derivations in x direction are
+      !! completed and we are ready for the y directional derivatives, we need
+      !! three fields to reorder and store the velocities in y direction. Also,
+      !! we need three more fields for storing the results, and the get_block
+      !! method of the allocator is used to arrange all these memory
+      !! assignments. Later, when a field is no more required, release_block
+      !! method of the allocator can be used to make this field available
+      !! for later use.
+
+      real(dp) :: dt, nu
+
+      class(field_t), pointer :: u, v, w, du, dv, dw
+
+      class(base_backend_t), pointer :: backend
+      class(dirps_t), pointer :: xdirps, ydirps, zdirps
+      class(time_intg_t), pointer :: time_integrator
+   contains
+      procedure :: transeq
+      procedure :: run
+   end type solver_t
+
+   interface solver_t
+      module procedure init
+   end interface solver_t
+contains
+
+   function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
+      result(solver)
+      implicit none
+
+      class(base_backend_t), target, intent(inout) :: backend
+      class(time_intg_t), target, intent(inout) :: time_integrator
+      class(dirps_t), target, intent(inout) :: xdirps, ydirps, zdirps
+      class(globs_t), intent(in) :: globs
+      type(solver_t) :: solver
+
+      real(dp), allocatable, dimension(:, :, :) :: u_init, v_init, w_init
+      integer :: dims(3)
+
+      real(dp) :: dx, dy, dz
+      integer :: nx, ny, nz
+
+      solver%backend => backend
+      solver%time_integrator => time_integrator
+
+      solver%xdirps => xdirps
+      solver%ydirps => ydirps
+      solver%zdirps => zdirps
+
+      solver%u => solver%backend%allocator%get_block()
+      solver%v => solver%backend%allocator%get_block()
+      solver%w => solver%backend%allocator%get_block()
+
+      ! Set initial conditions
+      dims(:) = solver%backend%allocator%dims(:)
+      allocate(u_init(dims(1), dims(2), dims(3)))
+      allocate(v_init(dims(1), dims(2), dims(3)))
+      allocate(w_init(dims(1), dims(2), dims(3)))
+
+      u_init = 0
+      v_init = 0
+      w_init = 0
+
+      call solver%backend%set_fields( &
+         solver%u, solver%v, solver%w, u_init, v_init, w_init &
+      )
+
+      deallocate(u_init, v_init, w_init)
+      print*, 'initial conditions are set'
+
+      ! Allocate fields for storing the RHS
+      solver%du => solver%backend%allocator%get_block()
+      solver%dv => solver%backend%allocator%get_block()
+      solver%dw => solver%backend%allocator%get_block()
+
+      nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc
+      dx = globs%dx; dy = globs%dy; dz = globs%dz
+
+      ! Allocate and set the tdsops
+      call solver%backend%alloc_tdsops(solver%xdirps%der1st, nx, dx, &
+                                       'first-deriv', 'compact6')
+      call solver%backend%alloc_tdsops(solver%ydirps%der1st, ny, dy, &
+                                       'first-deriv', 'compact6')
+      call solver%backend%alloc_tdsops(solver%zdirps%der1st, nz, dz, &
+                                       'first-deriv', 'compact6')
+      call solver%backend%alloc_tdsops(solver%xdirps%der1st_sym, nx, dx, &
+                                       'first-deriv', 'compact6')
+      call solver%backend%alloc_tdsops(solver%ydirps%der1st_sym, ny, dy, &
+                                       'first-deriv', 'compact6')
+      call solver%backend%alloc_tdsops(solver%zdirps%der1st_sym, nz, dz, &
+                                       'first-deriv', 'compact6')
+      call solver%backend%alloc_tdsops(solver%xdirps%der2nd, nx, dx, &
+                                       'second-deriv', 'compact6')
+      call solver%backend%alloc_tdsops(solver%ydirps%der2nd, ny, dy, &
+                                       'second-deriv', 'compact6')
+      call solver%backend%alloc_tdsops(solver%zdirps%der2nd, nz, dz, &
+                                       'second-deriv', 'compact6')
+      call solver%backend%alloc_tdsops(solver%xdirps%der2nd_sym, nx, dx, &
+                                       'second-deriv', 'compact6')
+      call solver%backend%alloc_tdsops(solver%ydirps%der2nd_sym, ny, dy, &
+                                       'second-deriv', 'compact6')
+      call solver%backend%alloc_tdsops(solver%zdirps%der2nd_sym, nz, dz, &
+                                       'second-deriv', 'compact6')
+
+   end function init
+
+   subroutine transeq(self, du, dv, dw, u, v, w)
+      implicit none
+
+      class(solver_t) :: self
+      class(field_t), intent(inout) :: du, dv, dw
+      class(field_t), intent(in) :: u, v, w
+
+      class(field_t), pointer :: u_y, v_y, w_y, u_z, v_z, w_z, &
+                                 du_y, dv_y, dw_y, du_z, dv_z, dw_z
+
+      ! -1/2(nabla u curl u + u nabla u) + nu nablasq u
+
+      ! call derivatives in x direction. Based on the run time arguments this
+      ! executes a distributed algorithm or the Thomas algorithm.
+      call self%backend%transeq_x(du, dv, dw, u, v, w, self%xdirps)
+
+      ! request fields from the allocator
+      u_y => self%backend%allocator%get_block()
+      v_y => self%backend%allocator%get_block()
+      w_y => self%backend%allocator%get_block()
+      du_y => self%backend%allocator%get_block()
+      dv_y => self%backend%allocator%get_block()
+      dw_y => self%backend%allocator%get_block()
+
+      ! reorder data from x orientation to y orientation
+      call self%backend%trans_x2y(u_y, v_y, w_y, u, v, w)
+      ! similar to the x direction, obtain derivatives in y.
+      call self%backend%transeq_y(du_y, dv_y, dw_y, u_y, v_y, w_y, self%ydirps)
+
+      ! we don't need the velocities in y orientation any more, so release
+      ! them to open up space.
+      ! It is important that this doesn't actually deallocate any memory,
+      ! it just makes the corresponding memory space available for use.
+      call self%backend%allocator%release_block(u_y)
+      call self%backend%allocator%release_block(v_y)
+      call self%backend%allocator%release_block(w_y)
+
+      ! just like in y direction, get some fields for the z derivatives.
+      u_z => self%backend%allocator%get_block()
+      v_z => self%backend%allocator%get_block()
+      w_z => self%backend%allocator%get_block()
+      du_z => self%backend%allocator%get_block()
+      dv_z => self%backend%allocator%get_block()
+      dw_z => self%backend%allocator%get_block()
+
+      ! reorder from x to z
+      call self%backend%trans_x2z(u_z, v_z, w_z, u, v, w)
+      ! get the derivatives in z
+      call self%backend%transeq_z(du_z, dv_z, dw_z, u_z, v_z, w_z, self%zdirps)
+
+      ! there is no need to keep velocities in z orientation around, so release
+      call self%backend%allocator%release_block(u_z)
+      call self%backend%allocator%release_block(v_z)
+      call self%backend%allocator%release_block(w_z)
+
+      ! gather all the contributions into the x result array
+      ! this function does the data reordering and summations at once.
+      call self%backend%sum_yzintox(du, dv, dw, &
+                                    du_y, dv_y, dw_y, du_z, dv_z, dw_z)
+
+      ! release all the unnecessary blocks.
+      call self%backend%allocator%release_block(du_y)
+      call self%backend%allocator%release_block(dv_y)
+      call self%backend%allocator%release_block(dw_y)
+      call self%backend%allocator%release_block(du_z)
+      call self%backend%allocator%release_block(dv_z)
+      call self%backend%allocator%release_block(dw_z)
+
+   end subroutine transeq
+
+   subroutine run(self, n_iter, u_out, v_out, w_out)
+      implicit none
+
+      class(solver_t), intent(in) :: self
+      integer, intent(in) :: n_iter
+      real(dp), dimension(:, :, :), intent(inout) :: u_out, v_out, w_out
+
+      integer :: i
+
+      print*, 'start run'
+
+      do i = 1, n_iter
+         call self%transeq(self%du, self%dv, self%dw, self%u, self%v, self%w)
+
+         ! time integration
+         call self%time_integrator%step(self%u, self%v, self%w, &
+                                        self%du, self%dv, self%dw, self%dt)
+
+         !! pressure stuff
+         !call self%divergence(udiv, u, v, w)
+         !call self%poisson(p, udiv)
+         !call self%gradient(px, py, pz, p)
+
+         !! velocity correction
+         !call self%backend%vec3add(u, v, w, px, py, pz)
+      end do
+
+      print*, 'run end'
+
+      call self%backend%get_fields( &
+         u_out, v_out, w_out, self%du, self%dv, self%dw &
+      )
+
+   end subroutine run
+
+end module m_solver
diff --git a/src/tdsops.f90 b/src/tdsops.f90
@@ -36,6 +36,14 @@ module m_tdsops
       module procedure tdsops_init
    end interface tdsops_t
 
+   type :: dirps_t
+      class(tdsops_t), allocatable :: der1st, der1st_sym, &
+                                      der2nd, der2nd_sym, &
+                                      stagder_v2p, stagder_p2v, &
+                                      interpl_v2p, interpl_p2v
+      integer :: nrank, nproc, pnext, pprev, n
+   end type dirps_t
+
 contains
 
    function tdsops_init(n, delta, operation, scheme, n_halo, from_to, &

diff --git a/src/time_integrator.f90 b/src/time_integrator.f90
@@ -0,0 +1,73 @@
+module m_time_integrator
+   use m_allocator, only: allocator_t, field_t, flist_t
+   use m_base_backend, only: base_backend_t
+   use m_common, only: dp
+
+   implicit none
+
+   type :: time_intg_t
+      integer :: istep, nsteps, nsubsteps, order, nvars, nolds
+      type(flist_t), allocatable :: olds(:,:)
+      class(base_backend_t), pointer :: backend
+      class(allocator_t), pointer :: allocator
+   contains
+      procedure :: step
+   end type time_intg_t
+
+   interface time_intg_t
+      module procedure constructor
+   end interface time_intg_t
+
+contains
+
+   function constructor(backend, allocator, nvars)
+      implicit none
+
+      type(time_intg_t) :: constructor
+      class(base_backend_t), pointer :: backend
+      class(allocator_t), pointer :: allocator
+      integer, intent(in), optional :: nvars
+
+      integer :: i, j
+
+      constructor%backend => backend
+      constructor%allocator => allocator
+
+      if (present(nvars)) then
+         constructor%nvars = nvars
+      else
+         constructor%nvars = 3
+      end if
+
+      constructor%nolds = 0
+
+      allocate(constructor%olds(constructor%nvars, constructor%nolds))
+
+      ! Request all the storage for old timesteps
+      do i = 1, constructor%nvars
+         do j = 1, constructor%nolds
+            constructor%olds(i, j)%ptr => allocator%get_block()
+         end do
+      end do
+
+   end function constructor
+
+   subroutine step(self, u, v, w, du, dv, dw, dt)
+      implicit none
+
+      class(time_intg_t), intent(in) :: self
+      class(field_t), intent(inout) :: u, v, w
+      class(field_t), intent(in) :: du, dv, dw
+
+      real(dp), intent(in) :: dt
+
+   end subroutine step
+
+   subroutine adams_bashford_1st(vels, olds, coeffs)
+      type(flist_t) :: vels(:), olds(:)
+      real :: coeffs(:)
+
+      !call vec_add(vels, olds, coeffs)
+   end subroutine adams_bashford_1st
+
+end module m_time_integrator
diff --git a/src/xcompact.f90 b/src/xcompact.f90
@@ -0,0 +1,136 @@
+program xcompact
+   use mpi
+
+   use m_allocator
+   use m_base_backend
+   use m_common, only: pi, globs_t, set_pprev_pnext
+   use m_solver, only: solver_t
+   use m_time_integrator, only: time_intg_t
+   use m_tdsops, only: tdsops_t
+
+#ifdef CUDA
+   use m_cuda_allocator
+   use m_cuda_backend
+   use m_cuda_common, only: SZ
+   use m_cuda_tdsops, only: cuda_tdsops_t
+#else
+   use m_omp_backend
+   use m_omp_common, only: SZ
+#endif
+
+   implicit none
+
+   type(globs_t) :: globs
+   class(base_backend_t), pointer :: backend
+   class(allocator_t), pointer :: allocator
+   type(solver_t) :: solver
+   type(time_intg_t) :: time_integrator
+   type(dirps_t) :: xdirps, ydirps, zdirps
+
+#ifdef CUDA
+   type(cuda_backend_t), target :: cuda_backend
+   type(cuda_allocator_t), target :: cuda_allocator
+#else
+   type(omp_backend_t), target :: omp_backend
+   type(allocator_t), target :: omp_allocator
+#endif
+
+   real(dp), allocatable, dimension(:, :, :) :: u, v, w
+
+   real(dp) :: t_start, t_end
+   integer :: nrank, nproc, ierr, ndevs, devnum
+
+   call MPI_Init(ierr)
+   call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr)
+   call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr)
+
+   if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks'
+
+#ifdef CUDA
+   ierr = cudaGetDeviceCount(ndevs)
+   ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin
+   ierr = cudaGetDevice(devnum)
+#endif
+
+   ! read L_x/y/z from the input file
+   globs%Lx = 2*pi; globs%Ly = 2*pi; globs%Lz = 2*pi
+
+   ! read ns from the input file
+   globs%nx = 512; globs%ny = 512; globs%nz = 512
+
+   ! set nprocs based on run time arguments
+   globs%nproc_x = 1; globs%nproc_y = 1; globs%nproc_z = 1
+
+   ! Lets allow a 1D decomposition for the moment
+   !globs%nproc_x = nproc
+
+   xdirps%nproc = globs%nproc_x
+   ydirps%nproc = globs%nproc_y
+   zdirps%nproc = globs%nproc_z
+
+   ! Better if we move this somewhere else
+   ! Set the pprev and pnext for each rank
+   call set_pprev_pnext( &
+      xdirps%pprev, xdirps%pnext, &
+      ydirps%pprev, ydirps%pnext, &
+      zdirps%pprev, zdirps%pnext, &
+      xdirps%nproc, ydirps%nproc, zdirps%nproc, nrank &
+   )
+
+   ! lets assume simple cases for now
+   globs%nx_loc = globs%nx/globs%nproc_x
+   globs%ny_loc = globs%ny/globs%nproc_y
+   globs%nz_loc = globs%nz/globs%nproc_z
+
+   globs%n_groups_x = globs%ny_loc*globs%nz_loc/SZ
+   globs%n_groups_y = globs%nx_loc*globs%nz_loc/SZ
+   globs%n_groups_z = globs%nx_loc*globs%ny_loc/SZ
+
+   globs%dx = globs%Lx/globs%nx
+   globs%dy = globs%Ly/globs%ny
+   globs%dz = globs%Lz/globs%nz
+
+   xdirps%n = globs%nx_loc
+   ydirps%n = globs%ny_loc
+   zdirps%n = globs%nz_loc
+
+#ifdef CUDA
+   cuda_allocator = cuda_allocator_t([SZ, globs%nx_loc, globs%n_groups_x])
+   allocator => cuda_allocator
+   print*, 'CUDA allocator instantiated'
+
+   cuda_backend = cuda_backend_t(globs, allocator)
+   backend => cuda_backend
+   print*, 'CUDA backend instantiated'
+#else
+   omp_allocator = allocator_t([SZ, globs%nx_loc, globs%n_groups_x])
+   allocator => omp_allocator
+   print*, 'OpenMP allocator instantiated'
+
+   omp_backend = omp_backend_t(globs, allocator)
+   backend => omp_backend
+   print*, 'OpenMP backend instantiated'
+#endif
+
+   backend%nu = 1._dp
+
+   allocate(u(SZ, globs%nx_loc, globs%n_groups_x))
+   allocate(v(SZ, globs%nx_loc, globs%n_groups_x))
+   allocate(w(SZ, globs%nx_loc, globs%n_groups_x))
+
+   time_integrator = time_intg_t(allocator=allocator, backend=backend)
+   print*, 'time integrator instantiated'
+   solver = solver_t(backend, time_integrator, xdirps, ydirps, zdirps, globs)
+   print*, 'solver instantiated'
+
+   call cpu_time(t_start)
+
+   call solver%run(100, u, v, w)
+
+   call cpu_time(t_end)
+
+   print*, 'Time: ', t_end - t_start
+
+   print*, 'norms', norm2(u), norm2(v), norm2(w)
+
+end program xcompact