Skip to content

Commit

Permalink
Use reshapes to carry out local transposes.
Browse files Browse the repository at this point in the history
  • Loading branch information
semi-h committed Mar 1, 2024
1 parent 03a8a34 commit 6399546
Showing 1 changed file with 28 additions and 7 deletions.
35 changes: 28 additions & 7 deletions src/cuda/poisson_fft.f90
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ module m_cuda_poisson_fft
real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, &
ay_dev, by_dev, az_dev, bz_dev

real(dp), device, allocatable, dimension(:, :, :) :: f_tmp

integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy
contains
procedure :: fft_forward => fft_forward_cuda
Expand Down Expand Up @@ -67,6 +69,11 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft)
allocate (poisson_fft%c_y_dev(nx*ny*(nz/2 + 1)))
allocate (poisson_fft%c_z_dev(nx*ny*(nz/2 + 1)))

! We can't currently ask allocator to pass us an array with
! exact shape we want, so we allocate an extra array here.
! This will be removed when allocator is fixed.
allocate (poisson_fft%f_tmp(nz, SZ, nx*ny/SZ))

! set cufft plans
ierrfft = cufftCreate(poisson_fft%planD2Zz)
ierrfft = cufftMakePlanMany(poisson_fft%planD2Zz, 1, nz, &
Expand Down Expand Up @@ -110,9 +117,12 @@ subroutine fft_forward_cuda(self, f)
select type(f); type is (cuda_field_t); f_dev => f%data_d; end select

! First reorder f into cartesian-like data structure
blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1)
threads = dim3(SZ, SZ, 1)
call reshapeDSF<<<blocks, threads>>>(self%f_tmp, f_dev)

! Forward FFT transform in z from real to complex
ierrfft = cufftExecD2Z(self%planD2Zz, f_dev, self%c_z_dev)
ierrfft = cufftExecD2Z(self%planD2Zz, self%f_tmp, self%c_z_dev)

! Reorder from z to y
blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx)
Expand Down Expand Up @@ -184,9 +194,12 @@ subroutine fft_backward_cuda(self, f)
self%nx, self%nz/2 + 1)

! Backward FFT transform in z from complex to real
ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, f_dev)
ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, self%f_tmp)

! Finally reorder f back into our specialist data structure
blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1)
threads = dim3(SZ, SZ, 1)
call reshapeDSB<<<blocks, threads>>>(f_dev, self%f_tmp)

end subroutine fft_backward_cuda

Expand All @@ -195,23 +208,31 @@ subroutine fft_postprocess_cuda(self)

class(cuda_poisson_fft_t) :: self

complex(dp), device, dimension(:, :, :), pointer :: c_dev
complex(dp), device, dimension(:, :, :), pointer :: c_dev, c_x_ptr
type(dim3) :: blocks, threads

! Get some complex array pointers with right shape
c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev
c_dev(1:SZ, 1:self%nx, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_y_dev

! Reshape from cartesian-like to our specialist data structure
blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1)
threads = dim3(SZ, SZ, 1)
call reshapeCDSB<<<blocks, threads>>>(c_dev, c_x_ptr)

blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1 , 1)
! Postprocess
blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1, 1)
threads = dim3(SZ, 1, 1)

c_dev(1:SZ, 1:self%nx, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_y_dev

call processfftdiv<<<blocks, threads>>>( &
c_dev, self%waves_dev, self%nx, self%ny, self%nz, &
self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, &
self%az_dev, self%bz_dev &
)

! Reshape from our specialist data structure to cartesian-like
blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1)
threads = dim3(SZ, SZ, 1)
call reshapeCDSF<<<blocks, threads>>>(c_x_ptr, c_dev)

end subroutine fft_postprocess_cuda

Expand Down

0 comments on commit 6399546

Please sign in to comment.