diff --git a/src/allocator.f90 b/src/allocator.f90 index 1134d4db..8dc504c8 100644 --- a/src/allocator.f90 +++ b/src/allocator.f90 @@ -40,7 +40,8 @@ module m_allocator !> Padded dimensions for x, y, and z oriented fields integer :: xdims_padded(3), ydims_padded(3), zdims_padded(3) !> Padded dimensions for natural Cartesian ordering - integer :: cdims_padded(3) + integer :: cdims_padded(3), cdims(3) + integer :: n_groups_x, n_groups_y, n_groups_z !> The pointer to the first block on the list. Non associated if !> the list is empty ! TODO: Rename first to head @@ -66,9 +67,13 @@ module m_allocator class(field_t), pointer :: next real(dp), pointer, private :: p_data(:) real(dp), pointer, contiguous :: data(:, :, :) - integer :: dir integer :: refcount = 0 integer :: id !! An integer identifying the memory block. + integer :: dir + integer :: n !! number of cells in the `dir` direction + integer :: n_padded !! number of cells in the `dir` direction including padding + integer :: SZ + integer :: n_groups contains procedure :: set_shape end type field_t @@ -119,10 +124,15 @@ function allocator_init(nx, ny, nz, sz) result(allocator) allocator%ngrid = nx_padded*ny_padded*nz_padded allocator%sz = sz - allocator%xdims_padded = [sz, nx_padded, ny_padded*nz_padded/sz] - allocator%ydims_padded = [sz, ny_padded, nx_padded*nz_padded/sz] - allocator%zdims_padded = [sz, nz_padded, nx_padded*ny_padded/sz] + allocator%n_groups_x = ny_padded*nz_padded/sz + allocator%n_groups_y = nx_padded*nz_padded/sz + allocator%n_groups_z = nx_padded*ny_padded/sz + + allocator%xdims_padded = [sz, nx_padded, allocator%n_groups_x] + allocator%ydims_padded = [sz, ny_padded, allocator%n_groups_y] + allocator%zdims_padded = [sz, nz_padded, allocator%n_groups_z] allocator%cdims_padded = [nx_padded, ny_padded, nz_padded] + allocator%cdims = [nx, ny, nz] end function allocator_init function create_block(self, next) result(ptr) @@ -170,12 +180,28 @@ function get_block(self, dir) result(handle) select case 
(dir) case (DIR_X) dims = self%xdims_padded + handle%n = self%cdims(1) + handle%SZ = self%xdims_padded(1) + handle%n_padded = self%xdims_padded(2) + handle%n_groups = self%xdims_padded(3) case (DIR_Y) dims = self%ydims_padded + handle%n = self%cdims(2) + handle%SZ = self%ydims_padded(1) + handle%n_padded = self%ydims_padded(2) + handle%n_groups = self%ydims_padded(3) case (DIR_Z) dims = self%zdims_padded + handle%n = self%cdims(3) + handle%SZ = self%zdims_padded(1) + handle%n_padded = self%zdims_padded(2) + handle%n_groups = self%zdims_padded(3) case (DIR_C) dims = self%cdims_padded + handle%n = -1 + handle%SZ = -1 + handle%n_padded = -1 + handle%n_groups = -1 case default error stop 'Undefined direction, allocator cannot provide a shape.' end select diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 8395498a..7194e7e3 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -64,15 +64,14 @@ module m_cuda_backend contains - function init(globs, allocator) result(backend) + function init(allocator) result(backend) implicit none - class(globs_t) :: globs class(allocator_t), target, intent(inout) :: allocator type(cuda_backend_t) :: backend type(cuda_poisson_fft_t) :: cuda_poisson_fft - integer :: n_halo, n_block + integer :: n_halo, n_groups, sz select type (allocator) type is (cuda_allocator_t) @@ -80,46 +79,48 @@ function init(globs, allocator) result(backend) backend%allocator => allocator end select - backend%xthreads = dim3(SZ, 1, 1) - backend%xblocks = dim3(globs%n_groups_x, 1, 1) - backend%ythreads = dim3(SZ, 1, 1) - backend%yblocks = dim3(globs%n_groups_y, 1, 1) - backend%zthreads = dim3(SZ, 1, 1) - backend%zblocks = dim3(globs%n_groups_z, 1, 1) + sz = allocator%sz - backend%nx_loc = globs%nx_loc - backend%ny_loc = globs%ny_loc - backend%nz_loc = globs%nz_loc + backend%xthreads = dim3(sz, 1, 1) + backend%xblocks = dim3(allocator%n_groups_x, 1, 1) + backend%ythreads = dim3(sz, 1, 1) + backend%yblocks = dim3(allocator%n_groups_y, 1, 1) + 
backend%zthreads = dim3(sz, 1, 1) + backend%zblocks = dim3(allocator%n_groups_z, 1, 1) + + backend%nx_loc = allocator%dims(1) + backend%ny_loc = allocator%dims(2) + backend%nz_loc = allocator%dims(3) n_halo = 4 ! Buffer size should be big enough for the largest MPI exchange. - n_block = max(globs%n_groups_x, globs%n_groups_y, globs%n_groups_z) - - allocate (backend%u_send_s_dev(SZ, n_halo, n_block)) - allocate (backend%u_send_e_dev(SZ, n_halo, n_block)) - allocate (backend%u_recv_s_dev(SZ, n_halo, n_block)) - allocate (backend%u_recv_e_dev(SZ, n_halo, n_block)) - allocate (backend%v_send_s_dev(SZ, n_halo, n_block)) - allocate (backend%v_send_e_dev(SZ, n_halo, n_block)) - allocate (backend%v_recv_s_dev(SZ, n_halo, n_block)) - allocate (backend%v_recv_e_dev(SZ, n_halo, n_block)) - allocate (backend%w_send_s_dev(SZ, n_halo, n_block)) - allocate (backend%w_send_e_dev(SZ, n_halo, n_block)) - allocate (backend%w_recv_s_dev(SZ, n_halo, n_block)) - allocate (backend%w_recv_e_dev(SZ, n_halo, n_block)) - - allocate (backend%du_send_s_dev(SZ, 1, n_block)) - allocate (backend%du_send_e_dev(SZ, 1, n_block)) - allocate (backend%du_recv_s_dev(SZ, 1, n_block)) - allocate (backend%du_recv_e_dev(SZ, 1, n_block)) - allocate (backend%dud_send_s_dev(SZ, 1, n_block)) - allocate (backend%dud_send_e_dev(SZ, 1, n_block)) - allocate (backend%dud_recv_s_dev(SZ, 1, n_block)) - allocate (backend%dud_recv_e_dev(SZ, 1, n_block)) - allocate (backend%d2u_send_s_dev(SZ, 1, n_block)) - allocate (backend%d2u_send_e_dev(SZ, 1, n_block)) - allocate (backend%d2u_recv_s_dev(SZ, 1, n_block)) - allocate (backend%d2u_recv_e_dev(SZ, 1, n_block)) + n_groups = max(allocator%n_groups_x, allocator%n_groups_y, allocator%n_groups_z) + + allocate (backend%u_send_s_dev(sz, n_halo, n_groups)) + allocate (backend%u_send_e_dev(sz, n_halo, n_groups)) + allocate (backend%u_recv_s_dev(sz, n_halo, n_groups)) + allocate (backend%u_recv_e_dev(sz, n_halo, n_groups)) + allocate (backend%v_send_s_dev(sz, n_halo, n_groups)) + 
allocate (backend%v_send_e_dev(sz, n_halo, n_groups)) + allocate (backend%v_recv_s_dev(sz, n_halo, n_groups)) + allocate (backend%v_recv_e_dev(sz, n_halo, n_groups)) + allocate (backend%w_send_s_dev(sz, n_halo, n_groups)) + allocate (backend%w_send_e_dev(sz, n_halo, n_groups)) + allocate (backend%w_recv_s_dev(sz, n_halo, n_groups)) + allocate (backend%w_recv_e_dev(sz, n_halo, n_groups)) + + allocate (backend%du_send_s_dev(sz, 1, n_groups)) + allocate (backend%du_send_e_dev(sz, 1, n_groups)) + allocate (backend%du_recv_s_dev(sz, 1, n_groups)) + allocate (backend%du_recv_e_dev(sz, 1, n_groups)) + allocate (backend%dud_send_s_dev(sz, 1, n_groups)) + allocate (backend%dud_send_e_dev(sz, 1, n_groups)) + allocate (backend%dud_recv_s_dev(sz, 1, n_groups)) + allocate (backend%dud_recv_e_dev(sz, 1, n_groups)) + allocate (backend%d2u_send_s_dev(sz, 1, n_groups)) + allocate (backend%d2u_send_e_dev(sz, 1, n_groups)) + allocate (backend%d2u_recv_s_dev(sz, 1, n_groups)) + allocate (backend%d2u_recv_e_dev(sz, 1, n_groups)) end function init @@ -226,7 +227,7 @@ subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, dirps, & type is (cuda_tdsops_t); der2nd_sym => tdsops end select - call transeq_halo_exchange(self, u_dev, v_dev, w_dev, dirps) + call transeq_halo_exchange(self, u, v, w, dirps) call transeq_dist_component(self, du_dev, u_dev, u_dev, & self%u_recv_s_dev, self%u_recv_e_dev, & @@ -246,22 +247,26 @@ subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, dirps, & end subroutine transeq_cuda_dist - subroutine transeq_halo_exchange(self, u_dev, v_dev, w_dev, dirps) + subroutine transeq_halo_exchange(self, u, v, w, dirps) class(cuda_backend_t) :: self - real(dp), device, dimension(:, :, :), intent(in) :: u_dev, v_dev, w_dev + class(field_t), intent(in) :: u, v, w type(dirps_t), intent(in) :: dirps + real(dp), device, dimension(:, :, :) :: u_dev, v_dev, w_dev integer :: n_halo + call resolve_field_t(u_dev, u) + call resolve_field_t(v_dev, v) + call resolve_field_t(w_dev, w) 
! TODO: don't hardcode n_halo n_halo = 4 ! Copy halo data into buffer arrays call copy_into_buffers(self%u_send_s_dev, self%u_send_e_dev, u_dev, & - dirps%n) + u%n, u%sz) call copy_into_buffers(self%v_send_s_dev, self%v_send_e_dev, v_dev, & - dirps%n) + v%n, v%sz) call copy_into_buffers(self%w_send_s_dev, self%w_send_e_dev, w_dev, & - dirps%n) + w%n, w%sz) ! halo exchange call sendrecv_3fields( & @@ -271,7 +276,7 @@ subroutine transeq_halo_exchange(self, u_dev, v_dev, w_dev, dirps) self%u_send_s_dev, self%u_send_e_dev, & self%v_send_s_dev, self%v_send_e_dev, & self%w_send_s_dev, self%w_send_e_dev, & - SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext & + u%sz*n_halo*u%n_groups, dirps%nproc, dirps%pprev, dirps%pnext & ) end subroutine transeq_halo_exchange @@ -360,7 +365,7 @@ subroutine tds_solve_cuda(self, du, u, dirps, tdsops) error stop 'DIR mismatch between fields and dirps in tds_solve.' end if - blocks = dim3(dirps%n_blocks, 1, 1); threads = dim3(SZ, 1, 1) + blocks = dim3(u%n_groups, 1, 1); threads = dim3(u%sz, 1, 1) call tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads) @@ -397,7 +402,7 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads) call sendrecv_fields(self%u_recv_s_dev, self%u_recv_e_dev, & self%u_send_s_dev, self%u_send_e_dev, & - SZ*n_halo*dirps%n_blocks, dirps%nproc, & + u%sz*n_halo*u%n_groups, dirps%nproc, & dirps%pprev, dirps%pnext) ! 
call exec_dist @@ -612,19 +617,19 @@ real(dp) function scalar_product_cuda(self, x, y) result(s) end function scalar_product_cuda - subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) + subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n, sz) implicit none real(dp), device, dimension(:, :, :), intent(out) :: u_send_s_dev, & u_send_e_dev real(dp), device, dimension(:, :, :), intent(in) :: u_dev - integer, intent(in) :: n + integer, intent(in) :: n, sz type(dim3) :: blocks, threads integer :: n_halo = 4 blocks = dim3(size(u_dev, dim=3), 1, 1) - threads = dim3(SZ, 1, 1) + threads = dim3(sz, 1, 1) call buffer_copy<<<blocks, threads>>>(u_send_s_dev, u_send_e_dev, & !& u_dev, n, n_halo) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index dac437fd..9df76f7a 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -8,7 +8,6 @@ module m_omp_backend use m_omp_exec_dist, only: exec_dist_tds_compact, exec_dist_transeq_compact use m_omp_sendrecv, only: sendrecv_fields - use m_omp_common, only: SZ use m_omp_poisson_fft, only: omp_poisson_fft_t implicit none @@ -48,14 +47,13 @@ module m_omp_backend contains - function init(globs, allocator) result(backend) + function init(allocator) result(backend) implicit none - class(globs_t) :: globs class(allocator_t), target, intent(inout) :: allocator type(omp_backend_t) :: backend - integer :: n_halo, n_block + integer :: n_halo, n_groups, sz select type (allocator) type is (allocator_t) @@ -64,33 +62,36 @@ function init(globs, allocator) result(backend) end select n_halo = 4 - n_block = globs%n_groups_x - - allocate (backend%u_send_s(SZ, n_halo, n_block)) - allocate (backend%u_send_e(SZ, n_halo, n_block)) - allocate (backend%u_recv_s(SZ, n_halo, n_block)) - allocate (backend%u_recv_e(SZ, n_halo, n_block)) - allocate (backend%v_send_s(SZ, n_halo, n_block)) - allocate (backend%v_send_e(SZ, n_halo, n_block)) - allocate (backend%v_recv_s(SZ, n_halo, n_block)) - allocate (backend%v_recv_e(SZ, n_halo, n_block)) - 
allocate (backend%w_send_s(SZ, n_halo, n_block)) - allocate (backend%w_send_e(SZ, n_halo, n_block)) - allocate (backend%w_recv_s(SZ, n_halo, n_block)) - allocate (backend%w_recv_e(SZ, n_halo, n_block)) - - allocate (backend%du_send_s(SZ, 1, n_block)) - allocate (backend%du_send_e(SZ, 1, n_block)) - allocate (backend%du_recv_s(SZ, 1, n_block)) - allocate (backend%du_recv_e(SZ, 1, n_block)) - allocate (backend%dud_send_s(SZ, 1, n_block)) - allocate (backend%dud_send_e(SZ, 1, n_block)) - allocate (backend%dud_recv_s(SZ, 1, n_block)) - allocate (backend%dud_recv_e(SZ, 1, n_block)) - allocate (backend%d2u_send_s(SZ, 1, n_block)) - allocate (backend%d2u_send_e(SZ, 1, n_block)) - allocate (backend%d2u_recv_s(SZ, 1, n_block)) - allocate (backend%d2u_recv_e(SZ, 1, n_block)) + ! Buffer size should be big enough for the largest MPI exchange. + n_groups = max(allocator%n_groups_x, allocator%n_groups_y, allocator%n_groups_z) + ! print *, "n_groups new", n_groups  ! leftover debug output; do not ship + sz = allocator%sz + + allocate (backend%u_send_s(sz, n_halo, n_groups)) + allocate (backend%u_send_e(sz, n_halo, n_groups)) + allocate (backend%u_recv_s(sz, n_halo, n_groups)) + allocate (backend%u_recv_e(sz, n_halo, n_groups)) + allocate (backend%v_send_s(sz, n_halo, n_groups)) + allocate (backend%v_send_e(sz, n_halo, n_groups)) + allocate (backend%v_recv_s(sz, n_halo, n_groups)) + allocate (backend%v_recv_e(sz, n_halo, n_groups)) + allocate (backend%w_send_s(sz, n_halo, n_groups)) + allocate (backend%w_send_e(sz, n_halo, n_groups)) + allocate (backend%w_recv_s(sz, n_halo, n_groups)) + allocate (backend%w_recv_e(sz, n_halo, n_groups)) + + allocate (backend%du_send_s(sz, 1, n_groups)) + allocate (backend%du_send_e(sz, 1, n_groups)) + allocate (backend%du_recv_s(sz, 1, n_groups)) + allocate (backend%du_recv_e(sz, 1, n_groups)) + allocate (backend%dud_send_s(sz, 1, n_groups)) + allocate (backend%dud_send_e(sz, 1, n_groups)) + allocate (backend%dud_recv_s(sz, 1, n_groups)) + allocate (backend%dud_recv_e(sz, 1, n_groups)) + 
allocate (backend%d2u_send_s(sz, 1, n_groups)) + allocate (backend%d2u_send_e(sz, 1, n_groups)) + allocate (backend%d2u_recv_s(sz, 1, n_groups)) + allocate (backend%d2u_recv_e(sz, 1, n_groups)) end function init @@ -195,24 +196,21 @@ subroutine transeq_halo_exchange(self, u, v, w, dirps) ! TODO: don't hardcode n_halo n_halo = 4 - call copy_into_buffers(self%u_send_s, self%u_send_e, u%data, & - dirps%n, dirps%n_blocks) - call copy_into_buffers(self%v_send_s, self%v_send_e, v%data, & - dirps%n, dirps%n_blocks) - call copy_into_buffers(self%w_send_s, self%w_send_e, w%data, & - dirps%n, dirps%n_blocks) + call copy_into_buffers(self%u_send_s, self%u_send_e, u) + call copy_into_buffers(self%v_send_s, self%v_send_e, v) + call copy_into_buffers(self%w_send_s, self%w_send_e, w) call sendrecv_fields(self%u_recv_s, self%u_recv_e, & self%u_send_s, self%u_send_e, & - SZ*n_halo*dirps%n_blocks, & + u%sz*n_halo*u%n_groups, & dirps%nproc, dirps%pprev, dirps%pnext) call sendrecv_fields(self%v_recv_s, self%v_recv_e, & self%v_send_s, self%v_send_e, & - SZ*n_halo*dirps%n_blocks, & + v%sz*n_halo*v%n_groups, & dirps%nproc, dirps%pprev, dirps%pnext) call sendrecv_fields(self%w_recv_s, self%w_recv_e, & self%w_send_s, self%w_send_e, & - SZ*n_halo*dirps%n_blocks, & + w%sz*n_halo*w%n_groups, & dirps%nproc, dirps%pprev, dirps%pnext) end subroutine transeq_halo_exchange @@ -247,7 +245,7 @@ subroutine transeq_dist_component(self, rhs, u, conv, & u%data, u_recv_s, u_recv_e, & conv%data, conv_recv_s, conv_recv_e, & tdsops_du, tdsops_dud, tdsops_d2u, self%nu, & - dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks) + dirps%nproc, dirps%pprev, dirps%pnext, u%n_groups, u%sz) call self%allocator%release_block(du) call self%allocator%release_block(dud) @@ -285,20 +283,18 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops) ! 
TODO: don't hardcode n_halo n_halo = 4 - call copy_into_buffers(self%u_send_s, self%u_send_e, u%data, & - dirps%n, dirps%n_blocks) + call copy_into_buffers(self%u_send_s, self%u_send_e, u) ! halo exchange call sendrecv_fields(self%u_recv_s, self%u_recv_e, & self%u_send_s, self%u_send_e, & - SZ*n_halo*dirps%n_blocks, & + u%sz*n_halo*u%n_groups, & dirps%nproc, dirps%pprev, dirps%pnext) call exec_dist_tds_compact( & du%data, u%data, self%u_recv_s, self%u_recv_e, & self%du_send_s, self%du_send_e, self%du_recv_s, self%du_recv_e, & - tdsops, dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks & - ) + tdsops, dirps%nproc, dirps%pprev, dirps%pnext, u%n_groups, u%sz) end subroutine tds_solve_dist @@ -309,43 +305,16 @@ subroutine reorder_omp(self, u_, u, direction) class(field_t), intent(inout) :: u_ class(field_t), intent(in) :: u integer, intent(in) :: direction - integer :: ndir_loc, ndir_groups integer :: i, j, k integer :: out_i, out_j, out_k - select case (direction) - case (RDR_X2Y) - ndir_loc = self%xdirps%n - ndir_groups = self%xdirps%n_blocks - case (RDR_X2Z) - ndir_loc = self%xdirps%n - ndir_groups = self%xdirps%n_blocks - case (RDR_Y2X) - ndir_loc = self%ydirps%n - ndir_groups = self%ydirps%n_blocks - case (RDR_Y2Z) - ndir_loc = self%ydirps%n - ndir_groups = self%ydirps%n_blocks - case (RDR_Z2X) - ndir_loc = self%zdirps%n - ndir_groups = self%zdirps%n_blocks - case (RDR_Z2Y) - ndir_loc = self%zdirps%n - ndir_groups = self%zdirps%n_blocks - case default - ndir_loc = 0 - ndir_groups = 0 - error stop 'unsuported reordering' - end select - !$omp parallel do private(out_i, out_j, out_k) collapse(2) - do k = 1, ndir_groups - do j = 1, ndir_loc - do i = 1, SZ + do k = 1, u%n_groups + do j = 1, u%n + do i = 1, u%sz call get_index_reordering( & - out_i, out_j, out_k, i, j, k, direction, & - SZ, self%xdirps%n, self%ydirps%n, self%zdirps%n & - ) + out_i, out_j, out_k, i, j, k, direction, u%sz, & + self%allocator%cdims(1), self%allocator%cdims(2), self%allocator%cdims(3)) 
u_%data(out_i, out_j, out_k) = u%data(i, j, k) end do end do @@ -393,23 +362,21 @@ real(dp) function scalar_product_omp(self, x, y) result(s) end function scalar_product_omp - subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) + subroutine copy_into_buffers(u_send_s, u_send_e, u) implicit none real(dp), dimension(:, :, :), intent(out) :: u_send_s, u_send_e - real(dp), dimension(:, :, :), intent(in) :: u - integer, intent(in) :: n - integer, intent(in) :: n_blocks + class(field_t), intent(in) :: u integer :: i, j, k integer :: n_halo = 4 !$omp parallel do - do k = 1, n_blocks + do k = 1, u%n_groups do j = 1, n_halo !$omp simd - do i = 1, SZ - u_send_s(i, j, k) = u(i, j, k) - u_send_e(i, j, k) = u(i, n - n_halo + j, k) + do i = 1, u%sz + u_send_s(i, j, k) = u%data(i, j, k) + u_send_e(i, j, k) = u%data(i, u%n - n_halo + j, k) end do !$omp end simd end do @@ -444,7 +411,7 @@ subroutine init_omp_poisson_fft(self, xdirps, ydirps, zdirps) select type (poisson_fft => self%poisson_fft) type is (omp_poisson_fft_t) - poisson_fft = omp_poisson_fft_t(xdirps, ydirps, zdirps) + poisson_fft = omp_poisson_fft_t(xdirps, ydirps, zdirps, self%allocator) end select end subroutine init_omp_poisson_fft diff --git a/src/omp/exec_dist.f90 b/src/omp/exec_dist.f90 index 926ad1db..2daf4a16 100644 --- a/src/omp/exec_dist.f90 +++ b/src/omp/exec_dist.f90 @@ -2,7 +2,6 @@ module m_omp_exec_dist use mpi use m_common, only: dp - use m_omp_common, only: SZ use m_omp_kernels_dist, only: der_univ_dist, der_univ_subs use m_tdsops, only: tdsops_t use m_omp_sendrecv, only: sendrecv_fields @@ -13,7 +12,7 @@ module m_omp_exec_dist subroutine exec_dist_tds_compact( & du, u, u_recv_s, u_recv_e, du_send_s, du_send_e, du_recv_s, du_recv_e, & - tdsops, nproc, pprev, pnext, n_block & + tdsops, nproc, pprev, pnext, n_groups, sz & ) implicit none @@ -29,15 +28,16 @@ subroutine exec_dist_tds_compact( & type(tdsops_t), intent(in) :: tdsops integer, intent(in) :: nproc, pprev, pnext - integer, intent(in) 
:: n_block + integer, intent(in) :: n_groups + integer, intent(in) :: sz integer :: n_data integer :: k - n_data = SZ*n_block + n_data = sz*n_groups !$omp parallel do - do k = 1, n_block + do k = 1, n_groups call der_univ_dist( & du(:, :, k), du_send_s(:, :, k), du_send_e(:, :, k), u(:, :, k), & u_recv_s(:, :, k), u_recv_e(:, :, k), & @@ -52,7 +52,7 @@ subroutine exec_dist_tds_compact( & n_data, nproc, pprev, pnext) !$omp parallel do - do k = 1, n_block + do k = 1, n_groups call der_univ_subs(du(:, :, k), & du_recv_s(:, :, k), du_recv_e(:, :, k), & tdsops%n, tdsops%dist_sa, tdsops%dist_sc) @@ -68,7 +68,7 @@ subroutine exec_dist_transeq_compact( & d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e, & u, u_recv_s, u_recv_e, & v, v_recv_s, v_recv_e, & - tdsops_du, tdsops_dud, tdsops_d2u, nu, nproc, pprev, pnext, n_block) + tdsops_du, tdsops_dud, tdsops_d2u, nu, nproc, pprev, pnext, n_groups, sz) implicit none @@ -93,7 +93,8 @@ subroutine exec_dist_transeq_compact( & real(dp), dimension(:, :), allocatable :: ud, ud_recv_s, ud_recv_e real(dp) :: nu integer, intent(in) :: nproc, pprev, pnext - integer, intent(in) :: n_block + integer, intent(in) :: n_groups + integer, intent(in) :: sz integer :: n_data, n_halo integer :: k, i, j, n @@ -101,14 +102,14 @@ subroutine exec_dist_transeq_compact( & ! TODO: don't hardcode n_halo n_halo = 4 n = tdsops_d2u%n - n_data = SZ*n_block + n_data = sz*n_groups - allocate (ud(SZ, n)) - allocate (ud_recv_e(SZ, n_halo)) - allocate (ud_recv_s(SZ, n_halo)) + allocate (ud(sz, n)) + allocate (ud_recv_e(sz, n_halo)) + allocate (ud_recv_s(sz, n_halo)) !$omp parallel do private(ud, ud_recv_e, ud_recv_s) - do k = 1, n_block + do k = 1, n_groups call der_univ_dist( & du(:, :, k), du_send_s(:, :, k), du_send_e(:, :, k), u(:, :, k), & u_recv_s(:, :, k), u_recv_e(:, :, k), & @@ -127,7 +128,7 @@ subroutine exec_dist_transeq_compact( & ! 
Handle dud by locally generating u*v do j = 1, n !$omp simd - do i = 1, SZ + do i = 1, sz ud(i, j) = u(i, j, k)*v(i, j, k) end do !$omp end simd @@ -135,7 +136,7 @@ subroutine exec_dist_transeq_compact( & do j = 1, n_halo !$omp simd - do i = 1, SZ + do i = 1, sz ud_recv_s(i, j) = u_recv_s(i, j, k)*v_recv_s(i, j, k) ud_recv_e(i, j) = u_recv_e(i, j, k)*v_recv_e(i, j, k) end do @@ -162,7 +163,7 @@ subroutine exec_dist_transeq_compact( & n_data, nproc, pprev, pnext) !$omp parallel do - do k = 1, n_block + do k = 1, n_groups call der_univ_subs(du(:, :, k), & du_recv_s(:, :, k), du_recv_e(:, :, k), & tdsops_du%n, tdsops_du%dist_sa, tdsops_du%dist_sc) @@ -177,7 +178,7 @@ subroutine exec_dist_transeq_compact( & do j = 1, n !$omp simd - do i = 1, SZ + do i = 1, sz rhs(i, j, k) = -0.5_dp*(v(i, j, k)*du(i, j, k) + dud(i, j, k)) & + nu*d2u(i, j, k) end do diff --git a/src/omp/poisson_fft.f90 b/src/omp/poisson_fft.f90 index 835cf75a..928ef90f 100644 --- a/src/omp/poisson_fft.f90 +++ b/src/omp/poisson_fft.f90 @@ -1,5 +1,5 @@ module m_omp_poisson_fft - use m_allocator, only: field_t + use m_allocator, only: field_t, allocator_t use m_common, only: dp use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: dirps_t @@ -26,14 +26,15 @@ module m_omp_poisson_fft contains - function init(xdirps, ydirps, zdirps) result(poisson_fft) + function init(xdirps, ydirps, zdirps, allocator) result(poisson_fft) implicit none class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + class(allocator_t), target, intent(in) :: allocator type(omp_poisson_fft_t) :: poisson_fft - call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) + call poisson_fft%base_init(xdirps, ydirps, zdirps, allocator) end function init diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 index d95f0940..626b3388 100644 --- a/src/poisson_fft.f90 +++ b/src/poisson_fft.f90 @@ -1,5 +1,5 @@ module m_poisson_fft - use m_allocator, only: field_t + use m_allocator, only: field_t, allocator_t use m_common, only: dp, pi use 
m_tdsops, only: dirps_t @@ -48,14 +48,18 @@ end subroutine fft_postprocess contains - subroutine base_init(self, xdirps, ydirps, zdirps, sz) + subroutine base_init(self, xdirps, ydirps, zdirps, allocator) implicit none class(poisson_fft_t) :: self class(dirps_t), intent(in) :: xdirps, ydirps, zdirps - integer, intent(in) :: sz + class(allocator_t), target, intent(in) :: allocator + integer :: sz - self%nx = xdirps%n; self%ny = ydirps%n; self%nz = zdirps%n + sz = allocator%sz + self%nx = allocator%cdims(1) + self%ny = allocator%cdims(2) + self%nz = allocator%cdims(3) allocate (self%ax(self%nx), self%bx(self%nx)) allocate (self%ay(self%nx), self%by(self%nx)) @@ -64,7 +68,7 @@ subroutine base_init(self, xdirps, ydirps, zdirps, sz) allocate (self%waves(sz, self%nx, (self%ny*(self%nz/2 + 1))/sz)) ! waves_set requires some of the preprocessed tdsops variables. - call self%waves_set(xdirps, ydirps, zdirps, sz) + call self%waves_set(xdirps, ydirps, zdirps, allocator%sz) end subroutine base_init @@ -85,7 +89,7 @@ subroutine waves_set(self, xdirps, ydirps, zdirps, sz) integer :: i, j, ka, kb, ix, iy, iz - nx = xdirps%n; ny = ydirps%n; nz = zdirps%n + nx = self%nx; ny = self%ny; nz = self%nz do i = 1, nx self%ax(i) = sin((i - 1)*pi/nx) diff --git a/src/solver.f90 b/src/solver.f90 index d944d120..b25f9f96 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -85,7 +85,7 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & type(solver_t) :: solver real(dp), allocatable, dimension(:, :, :) :: u_init, v_init, w_init - integer :: dims(3) + integer :: dims(3), dims_padded(3) real(dp) :: x, y, z integer :: nx, ny, nz, i, j, k @@ -102,17 +102,18 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & solver%w => solver%backend%allocator%get_block(DIR_X) ! 
Set initial conditions - dims(:) = solver%backend%allocator%cdims_padded(:) - allocate (u_init(dims(1), dims(2), dims(3))) - allocate (v_init(dims(1), dims(2), dims(3))) - allocate (w_init(dims(1), dims(2), dims(3))) + dims_padded(:) = solver%backend%allocator%cdims_padded(:) + allocate (u_init(dims_padded(1), dims_padded(2), dims_padded(3))) + allocate (v_init(dims_padded(1), dims_padded(2), dims_padded(3))) + allocate (w_init(dims_padded(1), dims_padded(2), dims_padded(3))) + dims(:) = solver%backend%allocator%cdims(:) solver%dt = globs%dt solver%backend%nu = globs%nu solver%n_iters = globs%n_iters solver%n_output = globs%n_output - nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc + nx = dims(1); ny = dims(2); nz = dims(3) do k = 1, nz do j = 1, ny do i = 1, nx @@ -565,15 +566,18 @@ subroutine output(self, t, u_out) real(dp), dimension(:, :, :), intent(inout) :: u_out class(field_t), pointer :: du, dv, dw, div_u + integer :: cdims(3) integer :: ngrid - ngrid = self%xdirps%n*self%ydirps%n*self%zdirps%n print *, 'time = ', t du => self%backend%allocator%get_block(DIR_X) dv => self%backend%allocator%get_block(DIR_X) dw => self%backend%allocator%get_block(DIR_X) + cdims = self%backend%allocator%cdims + ngrid = cdims(1)*cdims(2)*cdims(3) + call self%curl(du, dv, dw, self%u, self%v, self%w) print *, 'enstrophy:', 0.5_dp*( & self%backend%scalar_product(du, du) & diff --git a/src/tdsops.f90 b/src/tdsops.f90 index e5421f17..f8cb3135 100644 --- a/src/tdsops.f90 +++ b/src/tdsops.f90 @@ -46,7 +46,7 @@ module m_tdsops der2nd, der2nd_sym, & stagder_v2p, stagder_p2v, & interpl_v2p, interpl_p2v - integer :: nrank, nproc, pnext, pprev, n, n_blocks, dir + integer :: nrank, nproc, pnext, pprev, dir real(dp) :: L, d end type dirps_t diff --git a/src/xcompact.f90 b/src/xcompact.f90 index 3de6a7f4..fbb144cd 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -94,24 +94,12 @@ program xcompact globs%ny_loc = globs%ny/globs%nproc_y globs%nz_loc = globs%nz/globs%nproc_z - 
globs%n_groups_x = globs%ny_loc*globs%nz_loc/SZ - globs%n_groups_y = globs%nx_loc*globs%nz_loc/SZ - globs%n_groups_z = globs%nx_loc*globs%ny_loc/SZ - globs%dx = globs%Lx/globs%nx globs%dy = globs%Ly/globs%ny globs%dz = globs%Lz/globs%nz xdirps%d = globs%dx; ydirps%d = globs%dy; zdirps%d = globs%dz - xdirps%n = globs%nx_loc - ydirps%n = globs%ny_loc - zdirps%n = globs%nz_loc - - xdirps%n_blocks = globs%n_groups_x - ydirps%n_blocks = globs%n_groups_y - zdirps%n_blocks = globs%n_groups_z - xdirps%dir = DIR_X; ydirps%dir = DIR_Y; zdirps%dir = DIR_Z #ifdef CUDA @@ -120,7 +108,7 @@ program xcompact allocator => cuda_allocator print *, 'CUDA allocator instantiated' - cuda_backend = cuda_backend_t(globs, allocator) + cuda_backend = cuda_backend_t(allocator) backend => cuda_backend print *, 'CUDA backend instantiated' #else @@ -128,7 +116,7 @@ program xcompact allocator => omp_allocator print *, 'OpenMP allocator instantiated' - omp_backend = omp_backend_t(globs, allocator) + omp_backend = omp_backend_t(allocator) backend => omp_backend print *, 'OpenMP backend instantiated' #endif diff --git a/tests/omp/test_omp_dist_transeq.f90 b/tests/omp/test_omp_dist_transeq.f90 index fd695a00..aecc6c66 100644 --- a/tests/omp/test_omp_dist_transeq.f90 +++ b/tests/omp/test_omp_dist_transeq.f90 @@ -115,7 +115,7 @@ program test_transeq d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e, & u, u_recv_s, u_recv_e, & v, v_recv_s, v_recv_e, & - der1st, der1st, der2nd, nu, nproc, pprev, pnext, n_block & + der1st, der1st, der2nd, nu, nproc, pprev, pnext, n_block, sz & ) ! 
check error diff --git a/tests/omp/test_omp_transeq.f90 b/tests/omp/test_omp_transeq.f90 index 8fa02ef7..dfa3c78b 100644 --- a/tests/omp/test_omp_transeq.f90 +++ b/tests/omp/test_omp_transeq.f90 @@ -60,23 +60,15 @@ program test_omp_transeq xdirps%nproc, ydirps%nproc, zdirps%nproc, nrank & ) - xdirps%n = globs%nx_loc - ydirps%n = globs%ny_loc - zdirps%n = globs%nz_loc - - xdirps%n_blocks = globs%n_groups_x - ydirps%n_blocks = globs%n_groups_y - zdirps%n_blocks = globs%n_groups_z - xdirps%dir = DIR_X ydirps%dir = DIR_Y zdirps%dir = DIR_Z - omp_allocator = allocator_t(xdirps%n, ydirps%n, zdirps%n, SZ) + omp_allocator = allocator_t(globs%nx_loc, globs%ny_loc, globs%nz_loc, SZ) allocator => omp_allocator print *, 'OpenMP allocator instantiated' - omp_backend = omp_backend_t(globs, allocator) + omp_backend = omp_backend_t(allocator) backend => omp_backend print *, 'OpenMP backend instantiated' @@ -84,7 +76,7 @@ program test_omp_transeq n_glob = globs%nx n = n_glob/nproc - n_block = xdirps%n_blocks + n_block = globs%n_groups_x nu = 1._dp omp_backend%nu = nu diff --git a/tests/omp/test_omp_tridiag.f90 b/tests/omp/test_omp_tridiag.f90 index bc053845..7d6bdf2e 100644 --- a/tests/omp/test_omp_tridiag.f90 +++ b/tests/omp/test_omp_tridiag.f90 @@ -299,7 +299,7 @@ subroutine run_kernel(n_iters, n_block, u, du, tdsops, n, & call exec_dist_tds_compact(du, u, u_recv_s, u_recv_e, & send_s, send_e, recv_s, recv_e, & - tdsops, nproc, pprev, pnext, n_block) + tdsops, nproc, pprev, pnext, n_block, sz) end do end subroutine run_kernel diff --git a/tests/test_reordering.f90 b/tests/test_reordering.f90 index e7b772b4..7e04ee99 100644 --- a/tests/test_reordering.f90 +++ b/tests/test_reordering.f90 @@ -32,6 +32,7 @@ program test_reorder real(dp), allocatable, dimension(:, :, :) :: u_array, temp_1, temp_2 integer :: dims(3) + integer :: cdims(3) integer :: nrank, nproc integer :: ierr, i, j, k @@ -72,25 +73,13 @@ program test_reorder globs%ny_loc = globs%ny/nproc globs%nz_loc = globs%nz/nproc 
- globs%n_groups_x = globs%ny_loc*globs%nz_loc/SZ - globs%n_groups_y = globs%nx_loc*globs%nz_loc/SZ - globs%n_groups_z = globs%nx_loc*globs%ny_loc/SZ - - xdirps%n = globs%nx_loc - ydirps%n = globs%ny_loc - zdirps%n = globs%nz_loc - - xdirps%n_blocks = globs%n_groups_x - ydirps%n_blocks = globs%n_groups_y - zdirps%n_blocks = globs%n_groups_z - #ifdef CUDA cuda_allocator = cuda_allocator_t(globs%nx_loc, globs%ny_loc, globs%nz_loc, & SZ) allocator => cuda_allocator print *, 'CUDA allocator instantiated' - cuda_backend = cuda_backend_t(globs, allocator) + cuda_backend = cuda_backend_t(allocator) backend => cuda_backend print *, 'CUDA backend instantiated' #else @@ -98,7 +87,7 @@ program test_reorder allocator => omp_allocator print *, 'OpenMP allocator instantiated' - omp_backend = omp_backend_t(globs, allocator) + omp_backend = omp_backend_t(allocator) backend => omp_backend print *, 'OpenMP backend instantiated' #endif @@ -112,16 +101,18 @@ program test_reorder pass_Y = .true. pass_Z = .true. + cdims = allocator%cdims + ! Test indexing only - do k = 1, zdirps%n - do j = 1, ydirps%n - do i = 1, xdirps%n + do k = 1, cdims(3) + do j = 1, cdims(2) + do i = 1, cdims(1) call test_index_reversing(pass_X, i, j, k, DIR_X, & - SZ, xdirps%n, ydirps%n, zdirps%n) + allocator%sz, cdims(1), cdims(2), cdims(3)) call test_index_reversing(pass_Y, i, j, k, DIR_Y, & - SZ, xdirps%n, ydirps%n, zdirps%n) + allocator%sz, cdims(1), cdims(2), cdims(3)) call test_index_reversing(pass_Z, i, j, k, DIR_Z, & - SZ, xdirps%n, ydirps%n, zdirps%n) + allocator%sz, cdims(1), cdims(2), cdims(3)) end do end do end do