@@ -40,7 +40,7 @@ __host__ void testCombine2DRun(fftDir dir, cpx *in, cpx **dev_in, cpx **dev_out,

__host__ int tsCombine2D_Validate(int n)
{
char *image_name = "splash";
char *image_name = "shore";
cpx *in, *ref, *dev_in, *dev_out;
size_t size;
fft2DSetup(&in, &ref, &dev_in, &dev_out, &size, image_name, 0, n);
@@ -137,6 +137,8 @@ __host__ void tsCombine(fftDir dir, cpx **dev_in, cpx **dev_out, int n)
cudaDeviceSynchronize();
}

#define ROW_COL_KERNEL(rw, kr, kc) ((rw) ? (kr) : (kc))

__host__ void tsCombine2D_help(fftDir dir, cpx **dev_in, cpx **dev_out, int rowWise, int n)
{
dim3 blocks;
@@ -150,24 +152,18 @@ __host__ void tsCombine2D_help(fftDir dir, cpx **dev_in, cpx **dev_out, int rowW
const float w_angle = dir * (M_2_PI / n);
const float w_bangle = dir * (M_2_PI / nBlock);
int bSize = n;

if (blocks.y > 1) {

// Calculate sequence until parts fit into a block, syncronize on CPU until then.
--depth;
int steps = 0;
int dist = (n / 2);
if (rowWise)
_kernelAll2DRow KERNEL_ARGS2(blocks, threads)(*dev_in, *dev_out, w_angle, 0xFFFFFFFF << depth, steps, dist);
else
_kernelAll2DCol KERNEL_ARGS2(blocks, threads)(*dev_in, *dev_out, w_angle, 0xFFFFFFFF << depth, steps, dist);
ROW_COL_KERNEL(rowWise, _kernelAll2DRow, _kernelAll2DCol) KERNEL_ARGS2(blocks, threads)(*dev_in, *dev_out, w_angle, 0xFFFFFFFF << depth, steps, dist);
cudaDeviceSynchronize();
while (--depth > breakSize) {
dist >>= 1;
++steps;
if (rowWise)
_kernelAll2DRow KERNEL_ARGS2(blocks, threads)(*dev_out, *dev_out, w_angle, 0xFFFFFFFF << depth, steps, dist);
else
_kernelAll2DCol KERNEL_ARGS2(blocks, threads)(*dev_out, *dev_out, w_angle, 0xFFFFFFFF << depth, steps, dist);
ROW_COL_KERNEL(rowWise, _kernelAll2DRow, _kernelAll2DCol) KERNEL_ARGS2(blocks, threads)(*dev_out, *dev_out, w_angle, 0xFFFFFFFF << depth, steps, dist);
cudaDeviceSynchronize();
}
swap(dev_in, dev_out);
@@ -176,17 +172,15 @@ __host__ void tsCombine2D_help(fftDir dir, cpx **dev_in, cpx **dev_out, int rowW
}

// Calculate complete sequence in one launch and syncronize on GPU
if (rowWise)
_kernelGPUS2DRow KERNEL_ARGS3(blocks, threads, sizeof(cpx) * nBlock) (*dev_in, *dev_out, w_angle, w_bangle, depth, scaleCpx, bSize);
else
_kernelGPUS2DCol KERNEL_ARGS3(blocks, threads, sizeof(cpx) * nBlock) (*dev_in, *dev_out, w_angle, w_bangle, depth, scaleCpx, bSize);
ROW_COL_KERNEL(rowWise, _kernelGPUS2DRow, _kernelGPUS2DCol) KERNEL_ARGS3(blocks, threads, sizeof(cpx) * nBlock) (*dev_in, *dev_out, w_angle, w_bangle, depth, scaleCpx, bSize);
cudaDeviceSynchronize();
}

__host__ void tsCombine2D(fftDir dir, cpx **dev_in, cpx **dev_out, int n)
{
dim3 blocks, threads;
set_block_and_threads_transpose(&blocks, &threads, n);

if (n > 256) {
tsCombine2D_help(dir, dev_in, dev_out, 1, n);
_kernelTranspose KERNEL_ARGS2(blocks, threads) (*dev_out, *dev_in, n);
@@ -255,11 +249,12 @@ __global__ void _kernelAll(cpx *in, cpx *out, float angle, unsigned int lmask, i
// Butterfly step for the 2D global-memory FFT kernels.
// Reads the low/high pair (dist elements apart within the row selected by y),
// computes the twiddle factor for this thread, and writes sum / scaled
// difference back through cpx_add_sub_mul.
__device__ static __inline__ void inner_k2D(cpx *in, cpx *out, int x, int y, float angle, int steps, int dist)
{
    cpx w;
    int tid = (blockIdx.y * blockDim.x + threadIdx.x);
    int in_low = x + y * gridDim.x;
    int in_high = in_low + dist;
    cpx in_lower = in[in_low];
    cpx in_upper = in[in_high];
    // Twiddle index: thread id shifted by the stage count, masked to the
    // butterfly group. (A stale pre-refactor duplicate of this line, which
    // recomputed the same expression inline, has been removed.)
    SIN_COS_F(angle * ((tid << steps) & ((dist - 1) << steps)), &w.y, &w.x);
    cpx_add_sub_mul(&(out[in_low]), &(out[in_high]), in_lower, in_upper, w);
}

@@ -289,33 +284,220 @@ __global__ void _kernelGPUS(cpx *in, cpx *out, float angle, float bAngle, int de
int offset = blockIdx.x * blockDim.x * 2;
in_high += threadIdx.x;
mem_gtos(threadIdx.x, in_high, offset, shared, in);
SYNC_THREADS;
algorithm_p(shared, in_high, bAngle, bit);
mem_stog_db(threadIdx.x, in_high, offset, lead, scale, shared, out);
}

// Row-wise block-local FFT stage over global memory.
// One (blockIdx.x, blockIdx.y) pair handles one nBlock-point segment of a row;
// the segment is staged in shared memory, transformed by algorithm_p, and
// written back in bit-reversed (and scaled) order.
// Merge artifact fixed: the old signature line taking `int n` duplicated the
// current one; the body uses nBlock, so only that signature is kept.
__global__ void _kernelGPUS2DRow(cpx *in, cpx *out, float angle, float bAngle, int depth, cpx scale, int nBlock)
{
    extern __shared__ cpx shared[];
    int rowStart = gridDim.x * blockIdx.x;      // first element of this row
    int in_high = (nBlock >> 1) + threadIdx.x;  // this thread's high pair index
    int rowOffset = blockIdx.y * blockDim.x * 2;
    mem_gtos(threadIdx.x, in_high, rowOffset, shared, &(in[rowStart]));
    SYNC_THREADS;
    algorithm_p(shared, in_high, bAngle, depth);
    mem_stog_db(threadIdx.x, in_high, rowOffset, (32 - log2((int)gridDim.x)), scale, shared, &(out[rowStart]));
}

// Column-wise block-local FFT stage over global memory.
// Merge artifact fixed: the body contained both the old version (global_offset /
// mem_gtos / mem_stog_db) and the new column-addressed version, plus a stray
// dangling re-declaration of the signature; only the new version is kept.
// Loads a column segment into shared memory with stride-n addressing,
// transforms it, and stores it back bit-reversed and scaled.
__global__ void _kernelGPUS2DCol(cpx *in, cpx *out, float angle, float bAngle, int depth, cpx scale, int n)
{
    extern __shared__ cpx shared[];
    int in_high = (n >> 1) + threadIdx.x;       // this thread's high pair index
    int colOffset = blockIdx.y * blockDim.x * 2;
    // Column gather: element (row, col) lives at row * n + col; blockIdx.x
    // selects the column, (threadIdx.x + colOffset) the starting row.
    mem_gtos_col(threadIdx.x, in_high, (threadIdx.x + colOffset) * n + blockIdx.x, (n >> 1) * n, shared, in);
    SYNC_THREADS;
    algorithm_p(shared, in_high, bAngle, depth);
    mem_stog_db_col(threadIdx.x, in_high, colOffset, 32 - log2((int)gridDim.x), scale, shared, out, n);
}
// ---------------------------------------------
//
// Surface / Texture object
//
// Experimental so see if it scales up better then using global memory
//
// ---------------------------------------------

// Row butterfly over surface memory: the pair elements sit dist columns
// apart in row y. Writes sum to the low slot and twiddle-scaled difference
// to the high slot of the output surface.
__device__ static __inline__ void inner_k2DRowSurf(cuSurf in, cuSurf out, int x, int y, float angle, int steps, int dist)
{
    cpx lo, hi, w;
    SURF2D_READ(&lo, in, x, y);
    SURF2D_READ(&hi, in, x + dist, y);
    int tid = blockIdx.y * blockDim.x + threadIdx.x;
    SIN_COS_F(angle * ((tid << steps) & ((dist - 1) << steps)), &w.y, &w.x);
    SURF2D_WRITE(cuCaddf(lo, hi), out, x, y);
    SURF2D_WRITE(cuCmulf(cuCsubf(lo, hi), w), out, x + dist, y);
}
// Column butterfly over surface memory: the pair elements sit dist rows
// apart in column x. Mirrors inner_k2DRowSurf with x/y roles exchanged.
__device__ static __inline__ void inner_k2DColSurf(cuSurf in, cuSurf out, int x, int y, float angle, int steps, int dist)
{
    cpx lo, hi, w;
    SURF2D_READ(&lo, in, x, y);
    SURF2D_READ(&hi, in, x, y + dist);
    int tid = blockIdx.y * blockDim.x + threadIdx.x;
    SIN_COS_F(angle * ((tid << steps) & ((dist - 1) << steps)), &w.y, &w.x);
    SURF2D_WRITE(cuCaddf(lo, hi), out, x, y);
    SURF2D_WRITE(cuCmulf(cuCsubf(lo, hi), w), out, x, y + dist);
}

// One FFT stage across all rows of the surface; blockIdx.x selects the row.
__global__ void _kernelAll2DRowSurf(cuSurf in, cuSurf out, float angle, unsigned int lmask, int steps, int dist)
{
    // Column index of this thread within its row; (id + (id & lmask))
    // maps it onto the low element of its butterfly pair.
    int col = blockDim.x * blockIdx.y + threadIdx.x;
    inner_k2DRowSurf(in, out, col + (col & lmask), blockIdx.x, angle, steps, dist);
}

// One FFT stage across all columns of the surface; blockIdx.x selects the column.
__global__ void _kernelAll2DColSurf(cuSurf in, cuSurf out, float angle, unsigned int lmask, int steps, int dist)
{
    // Row index of this thread within its column; (id + (id & lmask))
    // maps it onto the low element of its butterfly pair.
    int row = blockDim.x * blockIdx.y + threadIdx.x;
    inner_k2DColSurf(in, out, blockIdx.x, row + (row & lmask), angle, steps, dist);
}

// Row-wise block-local FFT stage over surface memory: stage an nBlock-point
// row segment into shared memory, run the remaining depth steps there, and
// store the result back bit-reversed and scaled.
__global__ void _kernelGPUS2DRowSurf(cuSurf in, cuSurf out, float angle, float bAngle, int depth, cpx scale, int nBlock)
{
    extern __shared__ cpx shared[];
    int segOffset = blockIdx.y * blockDim.x * 2;    // segment start within the row
    int pairHigh = threadIdx.x + (nBlock >> 1);     // this thread's high pair index
    mem_gtos_row(threadIdx.x, pairHigh, segOffset, shared, in);
    SYNC_THREADS;
    algorithm_p(shared, pairHigh, bAngle, depth);
    mem_stog_db_row(threadIdx.x, pairHigh, segOffset, (32 - log2((int)gridDim.x)), scale, shared, out);
}

// Column-wise block-local FFT stage over surface memory.
// Merge artifact fixed: the body contained both the old global-memory style
// lines (bit/offset, 8-argument mem_gtos_col / mem_stog_db_col) and the new
// surface-addressed lines; only the surface version is kept, matching the
// surf overloads mem_gtos_col(low, high, offset, shared, surf) and
// mem_stog_db_col(low, high, offset, lead, scale, shared, surf).
__global__ void _kernelGPUS2DColSurf(cuSurf in, cuSurf out, float angle, float bAngle, int depth, cpx scale, int n)
{
    extern __shared__ cpx shared[];
    int in_high = (n >> 1) + threadIdx.x;       // this thread's high pair index
    int colOffset = blockIdx.y * blockDim.x * 2;
    mem_gtos_col((int)threadIdx.x, in_high, colOffset, shared, in);
    SYNC_THREADS;
    algorithm_p(shared, in_high, bAngle, depth);
    mem_stog_db_col(threadIdx.x, in_high, colOffset, 32 - log2((int)gridDim.x), scale, shared, out);
}

// Runs one dimension of the 2D surface-memory FFT: rows when rowWise != 0,
// columns otherwise. dir selects forward/inverse; n is the side length.
// Large transforms are split: global stages run one kernel per FFT step with
// CPU-side synchronization until the remaining sub-sequences fit in a block,
// then a single block-local kernel finishes using shared memory.
__host__ void tsCombine2DSurf_help(fftDir dir, cuSurf *surfaceIn, cuSurf *surfaceOut, int rowWise, int n)
{
    dim3 blocks;
    int threads;
    int depth = log2_32(n);
    const int breakSize = log2_32(MAX_BLOCK_SIZE);
    // Inverse transform scales by 1/n; forward leaves values unscaled.
    cpx scaleCpx = make_cuFloatComplex((dir == FFT_FORWARD ? 1.f : 1.f / n), 0.f);
    set_block_and_threads2D(&blocks, &threads, n);

    const int nBlock = n / blocks.y;                 // points per block
    const float w_angle = dir * (M_2_PI / n);        // twiddle angle, full length
    const float w_bangle = dir * (M_2_PI / nBlock);  // twiddle angle, block length
    int bSize = n;

    if (blocks.y > 1) {

        // Calculate sequence until parts fit into a block, synchronize on CPU until then.
        --depth;
        int steps = 0;
        int dist = (n / 2);
        ROW_COL_KERNEL(rowWise, _kernelAll2DRowSurf, _kernelAll2DColSurf) KERNEL_ARGS2(blocks, threads)(*surfaceIn, *surfaceOut, w_angle, 0xFFFFFFFF << depth, steps, dist);
        cudaDeviceSynchronize();
        while (--depth > breakSize) {
            dist >>= 1;
            ++steps;
            // Remaining global stages operate in place on the output surface.
            ROW_COL_KERNEL(rowWise, _kernelAll2DRowSurf, _kernelAll2DColSurf) KERNEL_ARGS2(blocks, threads)(*surfaceOut, *surfaceOut, w_angle, 0xFFFFFFFF << depth, steps, dist);
            cudaDeviceSynchronize();
        }
        swap(surfaceIn, surfaceOut);
        ++depth;
        bSize = nBlock;
    }
    // Calculate complete sequence in one launch and synchronize on GPU
    ROW_COL_KERNEL(rowWise, _kernelGPUS2DRowSurf, _kernelGPUS2DColSurf) KERNEL_ARGS3(blocks, threads, sizeof(cpx) * nBlock) (*surfaceIn, *surfaceOut, w_angle, w_bangle, depth, scaleCpx, bSize);
    cudaDeviceSynchronize();
}

// Full 2D FFT over surface memory: row-wise pass, then column-wise pass.
// The surface kernels address columns directly, so no transpose kernel is
// launched; the unused transpose block/thread setup that remained from the
// global-memory variant has been removed as dead code.
__host__ void tsCombine2DSurf(fftDir dir, cuSurf *surfaceIn, cuSurf *surfaceOut, int n)
{
    tsCombine2DSurf_help(dir, surfaceIn, surfaceOut, 1, n);
    tsCombine2DSurf_help(dir, surfaceOut, surfaceIn, 0, n);

    // Keep the caller's in/out convention: the final result ends up behind
    // *surfaceOut after this swap.
    swap(surfaceIn, surfaceOut);
}

// Frees the host buffers and destroys the surface objects / CUDA arrays
// created by fft2DSurfSetup. The output surface and output array are
// independently optional (fft2DSurfSetup accepts NULL for either), so each
// is guarded separately — the original freed *cuAOut under the sObjOut
// guard, which dereferences a NULL array pointer when only the surface
// was requested.
__host__ void _testTex2DShakedown(cpx **in, cpx **ref, cuSurf *sObjIn, cuSurf *sObjOut, cudaArray **cuAIn, cudaArray **cuAOut)
{
    free(*in);
    free(*ref);
    cudaDestroySurfaceObject(*sObjIn);
    cudaFreeArray(*cuAIn);
    if (sObjOut != NULL) {
        cudaDestroySurfaceObject(*sObjOut);
    }
    if (cuAOut != NULL) {
        cudaFreeArray(*cuAOut);
    }
}

// Runs one surface-memory 2D FFT pass and, when write != 0, copies the
// result to the host and writes it as an image; norm != 0 additionally
// normalizes and fft-shifts the data before writing.
__host__ void _testTex2DRun(fftDir dir, cpx *in, cudaArray *dev, cuSurf *surfIn, cuSurf *surfOut, char *image_name, char *type, size_t size, int write, int norm, int n)
{
    tsCombine2DSurf(dir, surfIn, surfOut, n);
    if (!write)
        return;
    cudaMemcpyFromArray(in, dev, 0, 0, size, cudaMemcpyDeviceToHost);
    if (norm) {
        normalized_image(in, n);
        cpx *shifted = fftShift(in, n);
        write_image(image_name, type, shifted, n);
        free(shifted);
    }
    else {
        write_image(image_name, type, in, n);
    }
}

// Copies len complex values from the device array into `in` and compares
// them elementwise against `ref`. Returns 1 when every absolute difference
// is within 0.0001, otherwise 0 at the first mismatch.
__host__ int _testTex2DCompare(cpx *in, cpx *ref, cudaArray *dev, size_t size, int len)
{
    cudaMemcpyFromArray(in, dev, 0, 0, size, cudaMemcpyDeviceToHost);
    int i = 0;
    while (i < len) {
        if (cuCabsf(cuCsubf(in[i], ref[i])) > 0.0001)
            return 0;
        ++i;
    }
    return 1;
}

#define YES 1
#define NO 0

// Validates the surface-memory 2D FFT: round-trips an n x n image through a
// forward and an inverse transform and compares the result against the
// untouched reference copy. Returns 1 on success, 0 on mismatch.
__host__ int tsCombine2DSurf_Validate(int n)
{
    int res;
    char *image_name = "shore";
    cpx *in, *ref;
    size_t size;
    cudaArray *inArr, *outArr;
    cuSurf inSurf, outSurf;
    fft2DSurfSetup(&in, &ref, &size, image_name, NO, n, &inArr, &outArr, &inSurf, &outSurf);
    cudaMemcpyToArray(inArr, 0, 0, in, size, cudaMemcpyHostToDevice);
    // Forward pass writes the frequency-domain image (normalized/shifted);
    // the inverse pass restores the spatial-domain image.
    _testTex2DRun(FFT_FORWARD, in, inArr, &inSurf, &outSurf, image_name, "surf-frequency-domain", size, YES, YES, n);
    _testTex2DRun(FFT_INVERSE, in, inArr, &outSurf, &inSurf, image_name, "surf-spatial-domain", size, YES, NO, n);
    res = _testTex2DCompare(in, ref, inArr, size, n * n);
    _testTex2DShakedown(&in, &ref, &inSurf, &outSurf, &inArr, &outArr);
    return res;
}

// Times NUM_PERFORMANCE forward surface-memory 2D FFTs of size n x n and
// returns the average of the measurements (unit as reported by
// startTimer/stopTimer).
__host__ double tsCombine2DSurf_Performance(int n)
{
    double measures[NUM_PERFORMANCE];
    char *image_name = "shore";
    cpx *in, *ref;
    size_t size;
    cudaArray *inArr, *outArr;
    cuSurf inSurf, outSurf;
    fft2DSurfSetup(&in, &ref, &size, image_name, NO, n, &inArr, &outArr, &inSurf, &outSurf);

    for (int i = 0; i < NUM_PERFORMANCE; ++i) {
        startTimer();
        tsCombine2DSurf(FFT_FORWARD, &inSurf, &outSurf, n);
        // Surface the launch/runtime error (if any) before reading the timer.
        cudaCheckError();
        measures[i] = stopTimer();
    }
    _testTex2DShakedown(&in, &ref, &inSurf, &outSurf, &inArr, &outArr);
    return avg(measures, NUM_PERFORMANCE);
}
@@ -12,10 +12,13 @@

__host__ void tsCombine(fftDir dir, cpx **dev_in, cpx **dev_out, int n);
__host__ void tsCombine2D(fftDir dir, cpx **dev_in, cpx **dev_out, int n);
__host__ void tsCombine2DSurf(fftDir dir, cuSurf *surfaceIn, cuSurf *surfaceOut, int n);

__host__ int tsCombine_Validate(int n);
__host__ int tsCombine2D_Validate(int n);
__host__ int tsCombine2DSurf_Validate(int n);
__host__ double tsCombine_Performance(int n);
__host__ double tsCombine2D_Performance(int n);
__host__ double tsCombine2DSurf_Performance(int n);

#endif
@@ -3,37 +3,6 @@
__global__ void _kernelGPUSyncTex2DRow(cuSurf surfIn, cuSurf surfOut, float angle, float bAngle, int depth, cpx scale, int n);
__global__ void _kernelGPUSyncTex2DCol(cuSurf surfIn, cuSurf surfOut, float angle, float bAngle, int depth, cpx scale, int n);

// Prepares host and device data for the texture/surface 2D FFT tests:
// fills *in and *ref with either a synthetic sinusoid or an image loaded
// from "<image_name>/<n>.ppm" (ref keeps the untouched copy), then creates
// two n x n float2 CUDA arrays and wraps each in a surface object.
__host__ void testTex2DSetup(cpx **in, cpx **ref, size_t *size, char *image_name, int sinus, int n, cudaArray **cuInputArray, cudaArray **cuOutputArray, cuSurf *inputSurfObj, cuSurf *outputSurfObj)
{
    if (sinus) {
        *in = get_sin_img(n);
        *ref = get_sin_img(n);
    }
    else {
        char input_file[40];
        int sz;
        sprintf_s(input_file, 40, "%s/%u.ppm", image_name, n);
        *in = read_image(input_file, &sz);
        *ref = read_image(input_file, &sz);
    }
    *size = n * n * sizeof(cpx);
    // Device-side storage: two CUDA arrays usable as surfaces.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float2>();
    cudaMallocArray(cuInputArray, &channelDesc, n, n, cudaArraySurfaceLoadStore);
    cudaMallocArray(cuOutputArray, &channelDesc, n, n, cudaArraySurfaceLoadStore);
    // One resource descriptor, re-pointed at each array in turn.
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    *inputSurfObj = 0;
    resDesc.res.array.array = *cuInputArray;
    cudaCreateSurfaceObject(inputSurfObj, &resDesc);
    *outputSurfObj = 0;
    resDesc.res.array.array = *cuOutputArray;
    cudaCreateSurfaceObject(outputSurfObj, &resDesc);
}

__host__ void testTex2DShakedown(cpx **in, cpx **ref, cuSurf *sObjIn, cuSurf *sObjOut, cudaArray **cuAIn, cudaArray **cuAOut)
{
free(*in);
@@ -83,7 +52,7 @@ __host__ int tsCombineGPUSyncTex2D_Test(int n)
size_t size;
cudaArray *cuInputArray, *cuOutputArray;
cuSurf inputSurfObj, outputSurfObj;
testTex2DSetup(&in, &ref, &size, image_name, NO, n, &cuInputArray, &cuOutputArray, &inputSurfObj, &outputSurfObj);
fft2DSurfSetup(&in, &ref, &size, image_name, NO, n, &cuInputArray, &cuOutputArray, &inputSurfObj, &outputSurfObj);

for (int i = 0; i < NUM_PERFORMANCE; ++i) {
startTimer();
@@ -135,34 +104,6 @@ __device__ static __inline__ void algorithm_partial(cpx *shared, int in_high, fl
}
}

// Surface -> shared (row layout): loads this thread's low/high butterfly
// pair from row blockIdx.x of the surface into shared memory.
__device__ static __inline__ void mem_gtos_row(int low, int high, int offset, cpx *shared, cuSurf surf)
{
    SURF2D_READ(&(shared[low]), surf, low + offset, blockIdx.x);
    SURF2D_READ(&(shared[high]), surf, high + offset, blockIdx.x);
}

// Surface -> shared (column layout): loads this thread's low/high butterfly
// pair from column blockIdx.x of the surface into shared memory.
__device__ static __inline__ void mem_gtos_col(int low, int high, int offset, cpx *shared, cuSurf surf)
{
    SURF2D_READ(&(shared[low]), surf, blockIdx.x, low + offset);
    SURF2D_READ(&(shared[high]), surf, blockIdx.x, high + offset);
}

// Shared -> surface (row layout), bit-reversed: writes the scaled pair back
// to row blockIdx.x, with the column index bit-reversed to undo the FFT's
// scrambled output order. "lead" is the number of leading bits discarded by
// BIT_REVERSE (32 - log2 of the row length).
__device__ static __inline__ void mem_stog_db_row(int low, int high, int offset, unsigned int lead, cpx scale, cpx *shared, cuSurf surf)
{
    int row_low = BIT_REVERSE(low + offset, lead);
    int row_high = BIT_REVERSE(high + offset, lead);
    SURF2D_WRITE(cuCmulf(shared[low], scale), surf, row_low, blockIdx.x);
    SURF2D_WRITE(cuCmulf(shared[high], scale), surf, row_high, blockIdx.x);
}

// Shared -> surface (column layout), bit-reversed: writes the scaled pair
// back to column blockIdx.x, with the row index bit-reversed to undo the
// FFT's scrambled output order.
__device__ static __inline__ void mem_stog_db_col(int low, int high, int offset, unsigned int lead, cpx scale, cpx *shared, cuSurf surf)
{
    int col_low = BIT_REVERSE(low + offset, lead);
    int col_high = BIT_REVERSE(high + offset, lead);
    SURF2D_WRITE(cuCmulf(shared[low], scale), surf, blockIdx.x, col_low);
    SURF2D_WRITE(cuCmulf(shared[high], scale), surf, blockIdx.x, col_high);
}

__global__ void _kernelGPUSyncTex2DRow(cuSurf surfIn, cuSurf surfOut, float angle, float bAngle, int depth, cpx scale, int n)
{
extern __shared__ cpx shared[];
@@ -66,6 +66,13 @@ __host__ __device__ static __inline__ void swap(cpx **in, cpx **out)
*out = tmp;
}

// Exchanges two surface-object handles (pointer-swap overload for cuSurf).
__host__ __device__ static __inline__ void swap(cuSurf *in, cuSurf *out)
{
    cuSurf held = *out;
    *out = *in;
    *in = held;
}

__host__ __device__ static __inline__ unsigned int bitReverse32(unsigned int x, int l)
{
x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1));
@@ -131,6 +138,35 @@ __device__ static __inline__ void mem_stog_tb(int low, int high, int offset, uns
global[high + offset] = cuCmulf(shared[BIT_REVERSE(high, lead)], scale);
}


// Surface -> shared (row layout): loads this thread's low/high butterfly
// pair from row blockIdx.x of the surface into shared memory.
__device__ static __inline__ void mem_gtos_row(int low, int high, int offset, cpx *shared, cuSurf surf)
{
    SURF2D_READ(&(shared[low]), surf, low + offset, blockIdx.x);
    SURF2D_READ(&(shared[high]), surf, high + offset, blockIdx.x);
}

// Surface -> shared (column layout): loads this thread's low/high butterfly
// pair from column blockIdx.x of the surface into shared memory.
__device__ static __inline__ void mem_gtos_col(int low, int high, int offset, cpx *shared, cuSurf surf)
{
    SURF2D_READ(&(shared[low]), surf, blockIdx.x, low + offset);
    SURF2D_READ(&(shared[high]), surf, blockIdx.x, high + offset);
}

// Shared -> surface (row layout), bit-reversed: writes the scaled pair back
// to row blockIdx.x, with the column index bit-reversed to undo the FFT's
// scrambled output order. "lead" is the number of leading bits discarded by
// BIT_REVERSE (32 - log2 of the row length).
__device__ static __inline__ void mem_stog_db_row(int low, int high, int offset, unsigned int lead, cpx scale, cpx *shared, cuSurf surf)
{
    int row_low = BIT_REVERSE(low + offset, lead);
    int row_high = BIT_REVERSE(high + offset, lead);
    SURF2D_WRITE(cuCmulf(shared[low], scale), surf, row_low, blockIdx.x);
    SURF2D_WRITE(cuCmulf(shared[high], scale), surf, row_high, blockIdx.x);
}

// Shared -> surface (column layout), bit-reversed: writes the scaled pair
// back to column blockIdx.x, with the row index bit-reversed to undo the
// FFT's scrambled output order.
__device__ static __inline__ void mem_stog_db_col(int low, int high, int offset, unsigned int lead, cpx scale, cpx *shared, cuSurf surf)
{
    int col_low = BIT_REVERSE(low + offset, lead);
    int col_high = BIT_REVERSE(high + offset, lead);
    SURF2D_WRITE(cuCmulf(shared[low], scale), surf, blockIdx.x, col_low);
    SURF2D_WRITE(cuCmulf(shared[high], scale), surf, blockIdx.x, col_high);
}

__host__ static __inline void set_block_and_threads(int *numBlocks, int *threadsPerBlock, int size)
{
if (size > MAX_BLOCK_SIZE) {
@@ -240,4 +240,70 @@ int fft2DCompare(cpx *in, cpx *ref, cpx *dev, size_t size, int len)
}
}
return 1;
}

// Copies the device result to host and computes the largest elementwise
// absolute difference relative to the largest magnitude found in either
// signal, written to *relDiff. Returns 1 when the relative error is below
// 1e-5. Guard added: with len == 0 the old code divided by the -1 sentinel,
// and with two all-zero signals it divided 0/0 -> NaN; both now report a
// clean match with *relDiff = 0.
int fft2DCompare(cpx *in, cpx *ref, cpx *dev, size_t size, int len, double *relDiff)
{
    double mDiff = 0.0;
    double mVal = -1;
    cudaMemcpy(in, dev, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < len; ++i) {
        mVal = max(mVal, max(cuCabsf(in[i]), cuCabsf(ref[i])));
        double tmp = cuCabsf(cuCsubf(in[i], ref[i]));
        mDiff = tmp > mDiff ? tmp : mDiff;
    }
    if (mVal <= 0.0) {
        *relDiff = 0.0;
        return 1;
    }
    *relDiff = (mDiff / mVal);
    return *relDiff < 0.00001;
}

// Prints the CUDA error name and description, waits for a keypress, and
// terminates the process with the error code. No-op on cudaSuccess.
__host__ void cudaCheckError(cudaError_t err)
{
    if (err == cudaSuccess)
        return;
    printf("\n%s: %s\n", cudaGetErrorName(err), cudaGetErrorString(err));
    getchar();
    exit(err);
}

// Convenience overload: checks (and clears) the most recent CUDA runtime
// error, reporting and exiting on failure.
__host__ void cudaCheckError()
{
    cudaCheckError(cudaGetLastError());
}

// Prepares host and device data for the surface-memory 2D FFT tests:
// fills *in and *ref with either a synthetic sinusoid or an image loaded
// from "<image_name>/<n>.ppm" (ref keeps the untouched copy), allocates
// n x n float2 CUDA arrays, and wraps them in surface objects. The output
// array and output surface are optional: pass NULL to skip either.
// Consistency fix: the output-array allocation is now followed by
// cudaCheckError() like every other CUDA call in this function.
__host__ void fft2DSurfSetup(cpx **in, cpx **ref, size_t *size, char *image_name, int sinus, int n, cudaArray **cuInputArray, cudaArray **cuOutputArray, cuSurf *inputSurfObj, cuSurf *outputSurfObj)
{
    if (sinus) {
        *in = get_sin_img(n);
        *ref = get_sin_img(n);
    }
    else {
        char input_file[40];
        sprintf_s(input_file, 40, "%s/%u.ppm", image_name, n);
        int sz;
        *in = read_image(input_file, &sz);
        *ref = read_image(input_file, &sz);
    }
    *size = n * n * sizeof(cpx);
    // Allocate CUDA arrays in device memory
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float2>();
    cudaMallocArray(cuInputArray, &channelDesc, n, n, cudaArraySurfaceLoadStore);
    cudaCheckError();
    if (cuOutputArray != NULL) {
        cudaMallocArray(cuOutputArray, &channelDesc, n, n, cudaArraySurfaceLoadStore);
        cudaCheckError(); // was unchecked, unlike the input-array allocation
    }
    // Specify surface
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    // Create the surface objects
    resDesc.res.array.array = *cuInputArray;
    *inputSurfObj = 0;
    cudaCreateSurfaceObject(inputSurfObj, &resDesc);
    cudaCheckError();
    if (outputSurfObj != NULL) {
        resDesc.res.array.array = *cuOutputArray;
        *outputSurfObj = 0;
        cudaCreateSurfaceObject(outputSurfObj, &resDesc);
        cudaCheckError();
    }
}
@@ -1,8 +1,10 @@
#ifndef TSTEST_CUH
#define TSTEST_CUH

#include "tsDefinitions.cuh"
#include <Windows.h>
#include <cuda_texture_types.h>

#include "tsDefinitions.cuh"
#include "tsHelper.cuh"

/* Performance measure on Windows, result in micro seconds */
@@ -32,5 +34,9 @@ int fftResultAndFree(int n, cpx **dev_in, cpx **dev_out, cpx **dev_W, cpx **in,
void fft2DSetup(cpx **in, cpx **ref, cpx **dev_i, cpx **dev_o, size_t *size, char *image_name, int sinus, int n);
void fft2DShakedown(cpx **in, cpx **ref, cpx **dev_i, cpx **dev_o);
int fft2DCompare(cpx *in, cpx *ref, cpx *dev, size_t size, int len);
int fft2DCompare(cpx *in, cpx *ref, cpx *dev, size_t size, int len, double *diff);
void cudaCheckError(cudaError_t err);
void cudaCheckError();
__host__ void fft2DSurfSetup(cpx **in, cpx **ref, size_t *size, char *image_name, int sinus, int n, cudaArray **cuInputArray, cudaArray **cuOutputArray, cuSurf *inputSurfObj, cuSurf *outputSurfObj);

#endif