diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 133f43c5..c1d775c2 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -3,6 +3,8 @@ module m_omp_backend use m_base_backend, only: base_backend_t use m_common, only: dp, globs_t use m_tdsops, only: dirps_t, tdsops_t + use m_omp_exec_dist, only: exec_dist_tds_compact, exec_dist_transeq_compact + use m_omp_sendrecv, only: sendrecv_fields use m_omp_common, only: SZ @@ -30,6 +32,7 @@ module m_omp_backend procedure :: vecadd => vecadd_omp procedure :: set_fields => set_fields_omp procedure :: get_fields => get_fields_omp + procedure :: transeq_omp_dist end type omp_backend_t interface omp_backend_t @@ -118,7 +121,7 @@ subroutine transeq_x_omp(self, du, dv, dw, u, v, w, dirps) class(field_t), intent(in) :: u, v, w type(dirps_t), intent(in) :: dirps - !call self%transeq_omp_dist(du, dv, dw, u, v, w, dirps) + call self%transeq_omp_dist(du, dv, dw, u, v, w, dirps) end subroutine transeq_x_omp @@ -131,7 +134,7 @@ subroutine transeq_y_omp(self, du, dv, dw, u, v, w, dirps) type(dirps_t), intent(in) :: dirps ! u, v, w is reordered so that we pass v, u, w - !call self%transeq_omp_dist(dv, du, dw, v, u, w, dirps) + call self%transeq_omp_dist(dv, du, dw, v, u, w, dirps) end subroutine transeq_y_omp @@ -144,10 +147,86 @@ subroutine transeq_z_omp(self, du, dv, dw, u, v, w, dirps) type(dirps_t), intent(in) :: dirps ! u, v, w is reordered so that we pass w, u, v - !call self%transeq_omp_dist(dw, du, dv, w, u, v, dirps) + call self%transeq_omp_dist(dw, du, dv, w, u, v, dirps) end subroutine transeq_z_omp + subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, dirps) + implicit none + + class(omp_backend_t) :: self + class(field_t), intent(inout) :: du, dv, dw + class(field_t), intent(in) :: u, v, w + type(dirps_t), intent(in) :: dirps + integer :: n_halo + + call transeq_halo_exchange(self, u, v, w, dirps) + + call transeq_dist_component(self, du, u, u, & + dirps%der1st, dirps%der1st_sym, dirps%der2nd, dirps) + call transeq_dist_component(self, dv, v, u, & + dirps%der1st_sym, dirps%der1st, dirps%der2nd_sym, dirps) + call transeq_dist_component(self, dw, w, u, & + dirps%der1st_sym, dirps%der1st, dirps%der2nd_sym, dirps) + + end subroutine transeq_omp_dist + + + subroutine transeq_halo_exchange(self, u, v, w, dirps) + class(omp_backend_t) :: self + class(field_t), intent(in) :: u, v, w + type(dirps_t), intent(in) :: dirps + integer :: n_halo + + ! TODO: don't hardcode n_halo + n_halo = 4 + + call copy_into_buffers(self%u_send_s, self%u_send_e, u%data, dirps%n, dirps%n_blocks) + call copy_into_buffers(self%v_send_s, self%v_send_e, v%data, dirps%n, dirps%n_blocks) + call copy_into_buffers(self%w_send_s, self%w_send_e, w%data, dirps%n, dirps%n_blocks) + + call sendrecv_fields(self%u_recv_s, self%u_recv_e, self%u_send_s, self%u_send_e, & + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) + call sendrecv_fields(self%v_recv_s, self%v_recv_e, self%v_send_s, self%v_send_e, & + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) + call sendrecv_fields(self%w_recv_s, self%w_recv_e, self%w_send_s, self%w_send_e, & + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) + + end subroutine transeq_halo_exchange + + !> Computes RHS_x^v following: + ! rhs_x^v = -0.5*(u*dv/dx + duv/dx) + nu*d2v/dx2 + subroutine transeq_dist_component(self, rhs, v, u, tdsops_du, tdsops_dud, tdsops_d2u, dirps) + + class(omp_backend_t) :: self + class(field_t), intent(inout) :: rhs + class(field_t), intent(in) :: u, v + class(tdsops_t), intent(in) :: tdsops_du + class(tdsops_t), intent(in) :: tdsops_dud + class(tdsops_t), intent(in) :: tdsops_d2u + type(dirps_t), intent(in) :: dirps + class(field_t), pointer :: du, d2u, dud + + du => self%allocator%get_block() + dud => self%allocator%get_block() + d2u => self%allocator%get_block() + + call exec_dist_transeq_compact(& + rhs%data, du%data, dud%data, d2u%data, & + self%du_send_s, self%du_send_e, self%du_recv_s, self%du_recv_e, & + self%dud_send_s, self%dud_send_e, self%dud_recv_s, self%dud_recv_e, & + self%d2u_send_s, self%d2u_send_e, self%d2u_recv_s, self%d2u_recv_e, & + u%data, self%u_recv_s, self%u_recv_e, & + v%data, self%v_recv_s, self%v_recv_e, & + tdsops_du, tdsops_dud, tdsops_d2u, self%nu, & + dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks) + + call self%allocator%release_block(du) + call self%allocator%release_block(dud) + call self%allocator%release_block(d2u) + + end subroutine transeq_dist_component + subroutine tds_solve_omp(self, du, u, dirps, tdsops) implicit none @@ -157,10 +236,36 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops) type(dirps_t), intent(in) :: dirps class(tdsops_t), intent(in) :: tdsops - !call self%tds_solve_dist(self, du, u, dirps, tdsops) + call tds_solve_dist(self, du, u, dirps, tdsops) end subroutine tds_solve_omp + subroutine tds_solve_dist(self, du, u, dirps, tdsops) + implicit none + + class(omp_backend_t) :: self + class(field_t), intent(inout) :: du + class(field_t), intent(in) :: u + type(dirps_t), intent(in) :: dirps + class(tdsops_t), intent(in) :: tdsops + integer :: n_halo + + ! TODO: don't hardcode n_halo + n_halo = 4 + call copy_into_buffers(self%u_send_s, self%u_send_e, u%data, dirps%n, dirps%n_blocks) + + ! halo exchange + call sendrecv_fields(self%u_recv_s, self%u_recv_e, self%u_send_s, self%u_send_e, & + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) + + + call exec_dist_tds_compact( & + du%data, u%data, self%u_recv_s, self%u_recv_e, self%du_send_s, self%du_send_e, & + self%du_recv_s, self%du_recv_e, & + tdsops, dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks) + + end subroutine tds_solve_dist + subroutine reorder_omp(self, u_, u, direction) implicit none @@ -200,15 +305,28 @@ subroutine vecadd_omp(self, a, x, b, y) end subroutine vecadd_omp - subroutine copy_into_buffers(u_send_s, u_send_e, u, n) + subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) implicit none real(dp), dimension(:, :, :), intent(out) :: u_send_s, u_send_e real(dp), dimension(:, :, :), intent(in) :: u integer, intent(in) :: n - - u_send_s(:, :, :) = u(:, 1:4, :) - u_send_e(:, :, :) = u(:, n - 3:n, :) + integer, intent(in) :: n_blocks + integer :: i, j, k + integer :: n_halo = 4 + + !$omp parallel do + do k=1, n_blocks + do j=1, n_halo + !$omp simd + do i=1, SZ + u_send_s(i, j, k) = u(i, j, k) + u_send_e(i, j, k) = u(i, n - n_halo + j, k) + end do + !$omp end simd + end do + end do + !$omp end parallel do end subroutine copy_into_buffers diff --git a/src/omp/exec_dist.f90 b/src/omp/exec_dist.f90 index ac46e1a4..e713c08a 100644 --- a/src/omp/exec_dist.f90 +++ b/src/omp/exec_dist.f90 @@ -61,5 +61,134 @@ subroutine exec_dist_tds_compact( & end subroutine exec_dist_tds_compact + + subroutine exec_dist_transeq_compact(& + rhs, du, dud, d2u, & + du_send_s, du_send_e, du_recv_s, du_recv_e, & + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e, & + u, u_recv_s, u_recv_e, & + v, v_recv_s, v_recv_e, & + tdsops_du, tdsops_dud, tdsops_d2u, nu, nproc, pprev, pnext, n_block) + + implicit none + + ! du = d(u) + real(dp), dimension(:, :, :), intent(out) :: rhs, du, dud, d2u + + ! The ones below are intent(out) just so that we can write data in them, + ! not because we actually need the data they store later where this + ! subroutine is called. We absolutely don't care about the data they pass back + real(dp), dimension(:, :, :), intent(out) :: & + du_send_s, du_send_e, du_recv_s, du_recv_e + real(dp), dimension(:, :, :), intent(out) :: & + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e + real(dp), dimension(:, :, :), intent(out) :: & + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e + + real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e + real(dp), dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e + + type(tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u + + real(dp), dimension(:, :), allocatable :: ud, ud_recv_s, ud_recv_e + real(dp) :: nu + integer, intent(in) :: nproc, pprev, pnext + integer, intent(in) :: n_block + + integer :: n_data, n_halo + integer :: k, i, j, n + + ! TODO: don't hardcode n_halo + n_halo = 4 + n = tdsops_d2u%n + n_data = SZ*n_block + + allocate(ud(SZ, n)) + allocate(ud_recv_e(SZ, n_halo)) + allocate(ud_recv_s(SZ, n_halo)) + + !$omp parallel do private(ud, ud_recv_e, ud_recv_s) + do k = 1, n_block + call der_univ_dist( & + du(:, :, k), du_send_s(:, :, k), du_send_e(:, :, k), u(:, :, k), & + u_recv_s(:, :, k), u_recv_e(:, :, k), & + tdsops_du%coeffs_s, tdsops_du%coeffs_e, tdsops_du%coeffs, tdsops_du%n, & + tdsops_du%dist_fw, tdsops_du%dist_bw, tdsops_du%dist_af & + ) + + call der_univ_dist( & + d2u(:, :, k), d2u_send_s(:, :, k), d2u_send_e(:, :, k), u(:, :, k), & + u_recv_s(:, :, k), u_recv_e(:, :, k), & + tdsops_d2u%coeffs_s, tdsops_d2u%coeffs_e, tdsops_d2u%coeffs, tdsops_d2u%n, & + tdsops_d2u%dist_fw, tdsops_d2u%dist_bw, tdsops_d2u%dist_af & + ) + + ! Handle dud by locally generating u*v + do j = 1, n + !$omp simd + do i = 1, SZ + ud(i, j) = u(i, j, k) * v(i, j, k) + end do + !$omp end simd + end do + + do j = 1, n_halo + !$omp simd + do i = 1, SZ + ud_recv_s(i, j) = u_recv_s(i, j, k) * v_recv_s(i, j, k) + ud_recv_e(i, j) = u_recv_e(i, j, k) * v_recv_e(i, j, k) + end do + !$omp end simd + end do + + call der_univ_dist( & + dud(:, :, k), dud_send_s(:, :, k), dud_send_e(:, :, k), ud(:, :), & + ud_recv_s(:, :), ud_recv_e(:, :), & + tdsops_dud%coeffs_s, tdsops_dud%coeffs_e, tdsops_dud%coeffs, tdsops_dud%n, & + tdsops_dud%dist_fw, tdsops_dud%dist_bw, tdsops_dud%dist_af & + ) + + end do + !$omp end parallel do + + ! halo exchange for 2x2 systems + call sendrecv_fields(du_recv_s, du_recv_e, du_send_s, du_send_e, & + n_data, nproc, pprev, pnext) + call sendrecv_fields(dud_recv_s, dud_recv_e, dud_send_s, dud_send_e, & + n_data, nproc, pprev, pnext) + call sendrecv_fields(d2u_recv_s, d2u_recv_e, d2u_send_s, d2u_send_e, & + n_data, nproc, pprev, pnext) + + !$omp parallel do + do k = 1, n_block + call der_univ_subs(du(:, :, k), & + du_recv_s(:, :, k), du_recv_e(:, :, k), & + tdsops_du%n, tdsops_du%dist_sa, tdsops_du%dist_sc) + + call der_univ_subs(dud(:, :, k), & + dud_recv_s(:, :, k), dud_recv_e(:, :, k), & + tdsops_dud%n, tdsops_dud%dist_sa, tdsops_dud%dist_sc) + + call der_univ_subs(d2u(:, :, k), & + d2u_recv_s(:, :, k), d2u_recv_e(:, :, k), & + tdsops_d2u%n, tdsops_d2u%dist_sa, tdsops_d2u%dist_sc) + + do j = 1, n + !$omp simd + do i = 1, SZ + rhs(i, j, k) = -0.5_dp*(v(i, j, k)*du(i, j, k) + dud(i, j, k)) + nu*d2u(i, j, k) + end do + !$omp end simd + end do + + end do + !$omp end parallel do + + + end subroutine exec_dist_transeq_compact + + + end module m_omp_exec_dist diff --git a/src/omp/kernels_dist.f90 b/src/omp/kernels_dist.f90 index 35c883ec..bf706904 100644 --- a/src/omp/kernels_dist.f90 +++ b/src/omp/kernels_dist.f90 @@ -37,29 +37,45 @@ subroutine der_univ_dist( & !$omp simd do i = 1, SZ - du(i, 1) = coeffs_s(1, 1)*u_s(i, 1) + coeffs_s(2, 1)*u_s(i, 2) & - + coeffs_s(3, 1)*u_s(i, 3) + coeffs_s(4, 1)*u_s(i, 4) & + du(i, 1) = coeffs_s(1, 1)*u_s(i, 1) & + + coeffs_s(2, 1)*u_s(i, 2) & + + coeffs_s(3, 1)*u_s(i, 3) & + + coeffs_s(4, 1)*u_s(i, 4) & + coeffs_s(5, 1)*u(i, 1) & - + coeffs_s(6, 1)*u(i, 2) + coeffs_s(7, 1)*u(i, 3) & - + coeffs_s(8, 1)*u(i, 4) + coeffs_s(9, 1)*u(i, 5) + + coeffs_s(6, 1)*u(i, 2) & + + coeffs_s(7, 1)*u(i, 3) & + + coeffs_s(8, 1)*u(i, 4) & + + coeffs_s(9, 1)*u(i, 5) du(i, 1) = du(i, 1)*faf(1) - du(i, 2) = coeffs_s(1, 2)*u_s(i, 2) + coeffs_s(2, 2)*u_s(i, 3) & - + coeffs_s(3, 2)*u_s(i, 4) + coeffs_s(4, 2)*u(i, 1) & + du(i, 2) = coeffs_s(1, 2)*u_s(i, 2) & + + coeffs_s(2, 2)*u_s(i, 3) & + + coeffs_s(3, 2)*u_s(i, 4) & + + coeffs_s(4, 2)*u(i, 1) & + coeffs_s(5, 2)*u(i, 2) & - + coeffs_s(6, 2)*u(i, 3) + coeffs_s(7, 2)*u(i, 4) & - + coeffs_s(8, 2)*u(i, 5) + coeffs_s(9, 2)*u(i, 6) + + coeffs_s(6, 2)*u(i, 3) & + + coeffs_s(7, 2)*u(i, 4) & + + coeffs_s(8, 2)*u(i, 5) & + + coeffs_s(9, 2)*u(i, 6) du(i, 2) = du(i, 2)*faf(2) - du(i, 3) = coeffs_s(1, 3)*u_s(i, 3) + coeffs_s(2, 3)*u_s(i, 4) & - + coeffs_s(3, 3)*u(i, 1) + coeffs_s(4, 3)*u(i, 2) & + du(i, 3) = coeffs_s(1, 3)*u_s(i, 3) & + + coeffs_s(2, 3)*u_s(i, 4) & + + coeffs_s(3, 3)*u(i, 1) & + + coeffs_s(4, 3)*u(i, 2) & + coeffs_s(5, 3)*u(i, 3) & - + coeffs_s(6, 3)*u(i, 4) + coeffs_s(7, 3)*u(i, 5) & - + coeffs_s(8, 3)*u(i, 6) + coeffs_s(9, 3)*u(i, 7) + + coeffs_s(6, 3)*u(i, 4) & + + coeffs_s(7, 3)*u(i, 5) & + + coeffs_s(8, 3)*u(i, 6) & + + coeffs_s(9, 3)*u(i, 7) du(i, 3) = ffr(3)*(du(i, 3) - faf(3)*du(i, 2)) - du(i, 4) = coeffs_s(1, 4)*u_s(i, 4) + coeffs_s(2, 4)*u(i, 1) & - + coeffs_s(3, 4)*u(i, 2) + coeffs_s(4, 4)*u(i, 3) & + du(i, 4) = coeffs_s(1, 4)*u_s(i, 4) & + + coeffs_s(2, 4)*u(i, 1) & + + coeffs_s(3, 4)*u(i, 2) & + + coeffs_s(4, 4)*u(i, 3) & + coeffs_s(5, 4)*u(i, 4) & - + coeffs_s(6, 4)*u(i, 5) + coeffs_s(7, 4)*u(i, 6) & - + coeffs_s(8, 4)*u(i, 7) + coeffs_s(9, 4)*u(i, 8) + + coeffs_s(6, 4)*u(i, 5) & + + coeffs_s(7, 4)*u(i, 6) & + + coeffs_s(8, 4)*u(i, 7) & + + coeffs_s(9, 4)*u(i, 8) du(i, 4) = ffr(4)*(du(i, 4) - faf(4)*du(i, 3)) end do !$omp end simd @@ -83,32 +99,48 @@ subroutine der_univ_dist( & !$omp simd do i = 1, SZ j = n - 3 - du(i, j) = coeffs_e(1, 1)*u(i, j - 4) + coeffs_e(2, 1)*u(i, j - 3) & - + coeffs_e(3, 1)*u(i, j - 2) + coeffs_e(4, 1)*u(i, j - 1) & + du(i, j) = coeffs_e(1, 1)*u(i, j - 4) & + + coeffs_e(2, 1)*u(i, j - 3) & + + coeffs_e(3, 1)*u(i, j - 2) & + + coeffs_e(4, 1)*u(i, j - 1) & + coeffs_e(5, 1)*u(i, j) & - + coeffs_e(6, 1)*u(i, j + 1) + coeffs_e(7, 1)*u(i, j + 2) & - + coeffs_e(8, 1)*u(i, j + 3) + coeffs_e(9, 1)*u_e(i, 1) + + coeffs_e(6, 1)*u(i, j + 1) & + + coeffs_e(7, 1)*u(i, j + 2) & + + coeffs_e(8, 1)*u(i, j + 3) & + + coeffs_e(9, 1)*u_e(i, 1) du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) j = n - 2 - du(i, j) = coeffs_e(1, 2)*u(i, j - 4) + coeffs_e(2, 2)*u(i, j - 3) & - + coeffs_e(3, 2)*u(i, j - 2) + coeffs_e(4, 2)*u(i, j - 1) & + du(i, j) = coeffs_e(1, 2)*u(i, j - 4) & + + coeffs_e(2, 2)*u(i, j - 3) & + + coeffs_e(3, 2)*u(i, j - 2) & + + coeffs_e(4, 2)*u(i, j - 1) & + coeffs_e(5, 2)*u(i, j) & - + coeffs_e(6, 2)*u(i, j + 1) + coeffs_e(7, 2)*u(i, j + 2) & - + coeffs_e(8, 2)*u_e(i, 1) + coeffs_e(9, 2)*u_e(i, 2) + + coeffs_e(6, 2)*u(i, j + 1) & + + coeffs_e(7, 2)*u(i, j + 2) & + + coeffs_e(8, 2)*u_e(i, 1) & + + coeffs_e(9, 2)*u_e(i, 2) du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) j = n - 1 - du(i, j) = coeffs_e(1, 3)*u(i, j - 4) + coeffs_e(2, 3)*u(i, j - 3) & - + coeffs_e(3, 3)*u(i, j - 2) + coeffs_e(4, 3)*u(i, j - 1) & + du(i, j) = coeffs_e(1, 3)*u(i, j - 4) & + + coeffs_e(2, 3)*u(i, j - 3) & + + coeffs_e(3, 3)*u(i, j - 2) & + + coeffs_e(4, 3)*u(i, j - 1) & + coeffs_e(5, 3)*u(i, j) & - + coeffs_e(6, 3)*u(i, j + 1) + coeffs_e(7, 3)*u_e(i, 1) & - + coeffs_e(8, 3)*u_e(i, 2) + coeffs_e(9, 3)*u_e(i, 3) + + coeffs_e(6, 3)*u(i, j + 1) & + + coeffs_e(7, 3)*u_e(i, 1) & + + coeffs_e(8, 3)*u_e(i, 2) & + + coeffs_e(9, 3)*u_e(i, 3) du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) j = n - du(i, j) = coeffs_e(1, 4)*u(i, j - 4) + coeffs_e(2, 4)*u(i, j - 3) & - + coeffs_e(3, 4)*u(i, j - 2) + coeffs_e(4, 4)*u(i, j - 1) & + du(i, j) = coeffs_e(1, 4)*u(i, j - 4) & + + coeffs_e(2, 4)*u(i, j - 3) & + + coeffs_e(3, 4)*u(i, j - 2) & + + coeffs_e(4, 4)*u(i, j - 1) & + coeffs_e(5, 4)*u(i, j) & - + coeffs_e(6, 4)*u_e(i, 1) + coeffs_e(7, 4)*u_e(i, 2) & - + coeffs_e(8, 4)*u_e(i, 3) + coeffs_e(9, 4)*u_e(i, 4) + + coeffs_e(6, 4)*u_e(i, 1) & + + coeffs_e(7, 4)*u_e(i, 2) & + + coeffs_e(8, 4)*u_e(i, 3) & + + coeffs_e(9, 4)*u_e(i, 4) du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) end do !$omp end simd diff --git a/src/solver.f90 b/src/solver.f90 index e4855781..adaa4066 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -102,59 +102,38 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & dx = globs%dx; dy = globs%dy; dz = globs%dz ! Allocate and set the tdsops - call solver%backend%alloc_tdsops(solver%xdirps%der1st, nx, dx, & - 'first-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%ydirps%der1st, ny, dy, & - 'first-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%zdirps%der1st, nz, dz, & - 'first-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%xdirps%der1st_sym, nx, dx, & - 'first-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%ydirps%der1st_sym, ny, dy, & - 'first-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%zdirps%der1st_sym, nz, dz, & - 'first-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%xdirps%der2nd, nx, dx, & - 'second-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%ydirps%der2nd, ny, dy, & - 'second-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%zdirps%der2nd, nz, dz, & - 'second-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%xdirps%der2nd_sym, nx, dx, & - 'second-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%ydirps%der2nd_sym, ny, dy, & - 'second-deriv', 'compact6') - call solver%backend%alloc_tdsops(solver%zdirps%der2nd_sym, nz, dz, & - 'second-deriv', 'compact6') - - call solver%backend%alloc_tdsops(solver%xdirps%interpl_v2p, nx, dx, & - 'interpolate', 'classic', from_to='v2p') - call solver%backend%alloc_tdsops(solver%ydirps%interpl_v2p, ny, dy, & - 'interpolate', 'classic', from_to='v2p') - call solver%backend%alloc_tdsops(solver%zdirps%interpl_v2p, nz, dz, & - 'interpolate', 'classic', from_to='v2p') - call solver%backend%alloc_tdsops(solver%xdirps%interpl_p2v, nx, dx, & - 'interpolate', 'classic', from_to='p2v') - call solver%backend%alloc_tdsops(solver%ydirps%interpl_p2v, ny, dy, & - 'interpolate', 'classic', from_to='p2v') - call solver%backend%alloc_tdsops(solver%zdirps%interpl_p2v, nz, dz, & - 'interpolate', 'classic', from_to='p2v') - - call solver%backend%alloc_tdsops(solver%xdirps%stagder_v2p, nx, dx, & - 'stag-deriv', 'compact6', from_to='v2p') - call solver%backend%alloc_tdsops(solver%ydirps%stagder_v2p, ny, dy, & - 'stag-deriv', 'compact6', from_to='v2p') - call solver%backend%alloc_tdsops(solver%zdirps%stagder_v2p, nz, dz, & - 'stag-deriv', 'compact6', from_to='v2p') - call solver%backend%alloc_tdsops(solver%xdirps%stagder_p2v, nx, dx, & - 'stag-deriv', 'compact6', from_to='p2v') - call solver%backend%alloc_tdsops(solver%ydirps%stagder_p2v, ny, dy, & - 'stag-deriv', 'compact6', from_to='p2v') - call solver%backend%alloc_tdsops(solver%zdirps%stagder_p2v, nz, dz, & - 'stag-deriv', 'compact6', from_to='p2v') + call allocate_tdsops(solver%xdirps, nx, dx, solver%backend) + call allocate_tdsops(solver%ydirps, ny, dy, solver%backend) + call allocate_tdsops(solver%zdirps, nz, dz, solver%backend) end function init + subroutine allocate_tdsops(dirps, nx, dx, backend) + class(dirps_t), intent(inout) :: dirps + real(dp), intent(in) :: dx + integer, intent(in) :: nx + class(base_backend_t), intent(in) :: backend + + call backend%alloc_tdsops(dirps%der1st, nx, dx, & + 'first-deriv', 'compact6') + call backend%alloc_tdsops(dirps%der1st_sym, nx, dx, & + 'first-deriv', 'compact6') + call backend%alloc_tdsops(dirps%der2nd, nx, dx, & + 'second-deriv', 'compact6') + call backend%alloc_tdsops(dirps%der2nd_sym, nx, dx, & + 'second-deriv', 'compact6') + call backend%alloc_tdsops(dirps%interpl_v2p, nx, dx, & + 'interpolate', 'classic', from_to='v2p') + call backend%alloc_tdsops(dirps%interpl_p2v, nx, dx, & + 'interpolate', 'classic', from_to='p2v') + call backend%alloc_tdsops(dirps%stagder_v2p, nx, dx, & + 'stag-deriv', 'compact6', from_to='v2p') + call backend%alloc_tdsops(dirps%stagder_p2v, nx, dx, & + 'stag-deriv', 'compact6', from_to='p2v') + + + end subroutine + subroutine transeq(self, du, dv, dw, u, v, w) implicit none diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 808ada66..cbd27246 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,6 +1,8 @@ set(TESTSRC test_allocator.f90 omp/test_omp_tridiag.f90 + omp/test_omp_transeq.f90 + omp/test_omp_dist_transeq.f90 ) set(CUDATESTSRC cuda/test_cuda_allocator.f90 diff --git a/tests/omp/test_omp_dist_transeq.f90 b/tests/omp/test_omp_dist_transeq.f90 new file mode 100644 index 00000000..1a46df80 --- /dev/null +++ b/tests/omp/test_omp_dist_transeq.f90 @@ -0,0 +1,151 @@ +program test_transeq + use iso_fortran_env, only: stderr => error_unit + use mpi + + use m_common, only: dp, pi + use m_omp_common, only: SZ + use m_omp_exec_dist, only: exec_dist_transeq_compact + use m_omp_sendrecv, only: sendrecv_fields + use m_tdsops, only: tdsops_t + + implicit none + + logical :: allpass = .true. + real(dp), allocatable, dimension(:, :, :) :: u, v, r_u + real(dp), allocatable, dimension(:, :, :) :: du, dud, d2u ! intermediate solution arrays + real(dp), allocatable, dimension(:, :, :) :: & + du_recv_s, du_recv_e, du_send_s, du_send_e, & + dud_recv_s, dud_recv_e, dud_send_s, dud_send_e, & + d2u_recv_s, d2u_recv_e, d2u_send_s, d2u_send_e + + real(dp), allocatable, dimension(:, :, :) :: & + u_send_s, u_send_e, u_recv_s, u_recv_e, & + v_send_s, v_send_e, v_recv_s, v_recv_e + + type(tdsops_t) :: der1st, der2nd + + integer :: n, n_block, i, j, k, n_halo, n_iters + integer :: n_glob + integer :: nrank, nproc, pprev, pnext + integer :: ierr + + real(dp) :: dx, dx_per, nu, norm_du, tol = 1d-8 + + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + + if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' + + pnext = modulo(nrank - nproc + 1, nproc) + pprev = modulo(nrank - 1, nproc) + + n_glob = 32*4 + n = n_glob/nproc + n_block = 32*32/SZ + n_iters = 1 + + nu = 1._dp + + allocate(u(SZ, n, n_block), v(SZ, n, n_block), r_u(SZ, n, n_block)) + + ! main input fields + ! field for storing the result + ! intermediate solution fields + allocate(du(SZ, n, n_block)) + allocate(dud(SZ, n, n_block)) + allocate(d2u(SZ, n, n_block)) + + dx_per = 2*pi/n_glob + dx = 2*pi/(n_glob - 1) + + do k = 1, n_block + do j = 1, n + do i = 1, SZ + u(i, j, k) = sin((j - 1 + nrank*n)*dx_per) + v(i, j, k) = cos((j - 1 + nrank*n)*dx_per) + end do + end do + end do + + n_halo = 4 + + ! arrays for exchanging data between ranks + allocate(u_send_s(SZ, n_halo, n_block)) + allocate(u_send_e(SZ, n_halo, n_block)) + allocate(u_recv_s(SZ, n_halo, n_block)) + allocate(u_recv_e(SZ, n_halo, n_block)) + allocate(v_send_s(SZ, n_halo, n_block)) + allocate(v_send_e(SZ, n_halo, n_block)) + allocate(v_recv_s(SZ, n_halo, n_block)) + allocate(v_recv_e(SZ, n_halo, n_block)) + + allocate(du_send_s(SZ, 1, n_block), du_send_e(SZ, 1, n_block)) + allocate(du_recv_s(SZ, 1, n_block), du_recv_e(SZ, 1, n_block)) + allocate(dud_send_s(SZ, 1, n_block), dud_send_e(SZ, 1, n_block)) + allocate(dud_recv_s(SZ, 1, n_block), dud_recv_e(SZ, 1, n_block)) + allocate(d2u_send_s(SZ, 1, n_block), d2u_send_e(SZ, 1, n_block)) + allocate(d2u_recv_s(SZ, 1, n_block), d2u_recv_e(SZ, 1, n_block)) + + ! preprocess the operator and coefficient arrays + der1st = tdsops_t(n, dx_per, operation='first-deriv', & + scheme='compact6') + der2nd = tdsops_t(n, dx_per, operation='second-deriv', & + scheme='compact6') + + + u_send_s(:, :, :) = u(:, 1:4, :) + u_send_e(:, :, :) = u(:, n - n_halo + 1:n, :) + v_send_s(:, :, :) = v(:, 1:4, :) + v_send_e(:, :, :) = v(:, n - n_halo + 1:n, :) + + + ! halo exchange + call sendrecv_fields(u_recv_s, u_recv_e, & + u_send_s, u_send_e, & + SZ*4*n_block, nproc, pprev, pnext) + + call sendrecv_fields(v_recv_s, v_recv_e, & + v_send_s, v_send_e, & + SZ*4*n_block, nproc, pprev, pnext) + + call exec_dist_transeq_compact( & + r_u, & + du, dud, d2u, & + du_send_s, du_send_e, du_recv_s, du_recv_e, & + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e, & + u, u_recv_s, u_recv_e, & + v, v_recv_s, v_recv_e, & + der1st, der1st, der2nd, nu, nproc, pprev, pnext, n_block & + ) + + ! check error + r_u = r_u - (-v*v + 0.5_dp*u*u - nu*u) + norm_du = norm2(r_u) + norm_du = norm_du*norm_du/n_glob/n_block/SZ + call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & + MPI_SUM, MPI_COMM_WORLD, ierr) + norm_du = sqrt(norm_du) + + if (nrank == 0) print*, 'error norm', norm_du + + if (nrank == 0) then + if ( norm_du > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check second derivatives... failed' + else + write(stderr, '(a)') 'Check second derivatives... passed' + end if + end if + + if (allpass) then + if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' + else + error stop 'SOME TESTS FAILED.' + end if + + call MPI_Finalize(ierr) + +end program test_transeq + diff --git a/tests/omp/test_omp_transeq.f90 b/tests/omp/test_omp_transeq.f90 new file mode 100644 index 00000000..b793c52a --- /dev/null +++ b/tests/omp/test_omp_transeq.f90 @@ -0,0 +1,150 @@ +program test_omp_transeq + use iso_fortran_env, only: stderr => error_unit + use mpi + + use m_allocator, only: allocator_t, field_t + use m_common, only: dp, pi, globs_t, set_pprev_pnext + use m_omp_common, only: SZ + use m_omp_sendrecv, only: sendrecv_fields + use m_omp_backend, only: omp_backend_t, transeq_x_omp, base_backend_t + use m_tdsops, only: dirps_t, tdsops_t + use m_solver, only: allocate_tdsops + + implicit none + + logical :: allpass = .true. + class(field_t), pointer :: u, v, w + class(field_t), pointer :: du, dv, dw + real(dp), dimension(:, :, :), allocatable :: r_u + + integer :: n, n_block, i, j, k + integer :: n_glob + integer :: nrank, nproc + integer :: ierr + + real(dp) :: dx, dx_per, nu, norm_du, tol = 1d-8, tstart, tend + + type(globs_t) :: globs + class(base_backend_t), pointer :: backend + class(allocator_t), pointer :: allocator + + type(omp_backend_t), target :: omp_backend + type(allocator_t), target :: omp_allocator + type(dirps_t) :: xdirps, ydirps, zdirps + + ! Initialise variables and arrays + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + + globs%nx = 96 + globs%ny = 96 + globs%nz = 96 + + globs%nx_loc = globs%nx/nproc + globs%ny_loc = globs%ny/nproc + globs%nz_loc = globs%nz/nproc + + globs%n_groups_x = globs%ny_loc*globs%nz_loc/SZ + globs%n_groups_y = globs%nx_loc*globs%nz_loc/SZ + globs%n_groups_z = globs%nx_loc*globs%ny_loc/SZ + + xdirps%nproc = nproc + ydirps%nproc = nproc + zdirps%nproc = nproc + + call set_pprev_pnext( & + xdirps%pprev, xdirps%pnext, & + ydirps%pprev, ydirps%pnext, & + zdirps%pprev, zdirps%pnext, & + xdirps%nproc, ydirps%nproc, zdirps%nproc, nrank & + ) + + xdirps%n = globs%nx_loc + ydirps%n = globs%ny_loc + zdirps%n = globs%nz_loc + + xdirps%n_blocks = globs%n_groups_x + ydirps%n_blocks = globs%n_groups_y + zdirps%n_blocks = globs%n_groups_z + + omp_allocator = allocator_t([SZ, globs%nx_loc, globs%n_groups_x]) + allocator => omp_allocator + print*, 'OpenMP allocator instantiated' + + omp_backend = omp_backend_t(globs, allocator) + backend => omp_backend + print*, 'OpenMP backend instantiated' + + + if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' + + n_glob = globs%nx + n = n_glob/nproc + n_block = xdirps%n_blocks + + nu = 1._dp + omp_backend%nu = nu + + + u => allocator%get_block() + v => allocator%get_block() + w => allocator%get_block() + + du => allocator%get_block() + dv => allocator%get_block() + dw => allocator%get_block() + + dx_per = 2*pi/n_glob + dx = 2*pi/(n_glob - 1) + globs%dx = dx + + + do k = 1, n_block + do j = 1, n + do i = 1, SZ + u%data(i, j, k) = sin((j - 1 + nrank*n)*dx_per) + v%data(i, j, k) = cos((j - 1 + nrank*n)*dx_per) + end do + end do + end do + w%data(:, :, :) = 0.d0 + + call allocate_tdsops(xdirps, globs%nx_loc, dx_per, omp_backend) + + call cpu_time(tstart) + call transeq_x_omp(omp_backend, du, dv, dw, u, v, w, xdirps) + call cpu_time(tend) + + if (nrank == 0) print*, 'Total time', tend - tstart + + allocate(r_u(SZ, n, n_block)) + + ! check error + r_u = dv%data - (-v%data*v%data + 0.5_dp*u%data*u%data - nu*u%data) + norm_du = norm2(r_u) + norm_du = norm_du*norm_du/n_glob/n_block/SZ + call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & + MPI_SUM, MPI_COMM_WORLD, ierr) + norm_du = sqrt(norm_du) + + if (nrank == 0) print*, 'error norm', norm_du + if (nrank == 0) then + if ( norm_du > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check second derivatives... failed' + else + write(stderr, '(a)') 'Check second derivatives... passed' + end if + end if + + if (allpass) then + if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' + else + error stop 'SOME TESTS FAILED.' + end if + + call MPI_Finalize(ierr) + +end program test_omp_transeq + diff --git a/tests/omp/test_omp_tridiag.f90 b/tests/omp/test_omp_tridiag.f90 index da747250..41b706e9 100644 --- a/tests/omp/test_omp_tridiag.f90 +++ b/tests/omp/test_omp_tridiag.f90 @@ -48,7 +48,7 @@ program test_omp_tridiag n_glob = 1024 n = n_glob/nproc - n_block = 512*512/SZ + n_block = 64*64/SZ n_iters = 1 allocate (u(SZ, n, n_block), du(SZ, n, n_block))