## FFT benchmarks
Perform 2D FFT benchmarks using the CUDA and OpenCL backends of pyvkfft,
and compare with scikit-cuda (cuFFT) and gpyfft (clFFT) if they are present.

In [None]:
%matplotlib notebook
# Case-insensitive substring used to pick the CUDA device by name;
# when set to None, pycuda.autoinit is imported to choose the device instead.
gpu_name = "GTX"

import os
from itertools import permutations

# Probe the optional GPU FFT backends. Each has_* flag records whether the
# corresponding backend could be imported, and later cells use these flags to
# decide which benchmarks to run.

# pyvkfft, CUDA backend.
try:
    import pycuda.driver as cu_drv
    import pycuda.gpuarray as cua
    from pycuda import curandom
    import pyvkfft.cuda
    from pyvkfft.cuda import primes, VkFFTApp as  cuVkFFTApp
    has_pyvkfft_cuda = True
except ImportError:
    has_pyvkfft_cuda = False

# pyvkfft, OpenCL backend. When both pyvkfft backends import, `primes` ends up
# bound to the OpenCL module's version (this import runs last).
try:
    import pyopencl as cl
    import pyopencl.array as cla
    from pyopencl import clrandom
    import pyvkfft.opencl
    from pyvkfft.opencl import primes, VkFFTApp as clVkFFTApp
    has_pyvkfft_opencl = True
except ImportError:
    has_pyvkfft_opencl = False

# scikit-cuda (cuFFT).
try:
    import pycuda.autoinit
    import pycuda.driver as cu_drv
    import pycuda.gpuarray as cua
    from pycuda import curandom
    import skcuda.fft as cu_fft
    has_skcuda = True
except Exception:
    # Deliberately broader than ImportError: importing pycuda.autoinit or
    # skcuda may presumably fail with driver/library errors on machines
    # without a usable CUDA setup. Either way the backend is unavailable.
    has_skcuda = False

# gpyfft (clFFT).
try:
    import pyopencl as cl
    import pyopencl.array as cla
    from pyopencl import clrandom
    import gpyfft
    has_gpyfft = True
except Exception:
    # Broad on purpose, as above: any failure to load means "not available".
    has_gpyfft = False

import matplotlib.pyplot as plt
import numpy as np
import timeit



In [None]:
# Create the OpenCL and/or CUDA contexts used by the benchmarks.
# gpu_name_real holds the name of the selected device (used in the plot
# title); if both an OpenCL and a CUDA device are selected, the CUDA name wins
# because it is assigned last.
gpu_name_real = None
if has_pyvkfft_opencl or has_gpyfft:
    if 'PYOPENCL_CTX' in os.environ:
        # An explicit device selection was given through the environment.
        cl_ctx = cl.create_some_context()
    else:
        cl_ctx = None
        # Find the first OpenCL GPU available and use it
        for platform in cl.get_platforms():
            for device in platform.get_devices():
                if device.type & cl.device_type.GPU == 0:
                    continue
                gpu_name_real = device.name
                print("Selected OpenCL device: ", device.name)
                cl_ctx = cl.Context(devices=(device,))
                break
            if cl_ctx is not None:
                break
        if cl_ctx is None:
            # Fail here with a clear message rather than inside CommandQueue.
            raise RuntimeError("No OpenCL GPU device found")
    cq = cl.CommandQueue(cl_ctx)

if has_pyvkfft_cuda or has_skcuda:
    cu_ctx = None
    if gpu_name is None:
        # Let pycuda pick the device, and keep a handle on the context it
        # created, since the benchmark loop synchronizes through cu_ctx.
        import pycuda.autoinit
        cu_ctx = pycuda.autoinit.context
        gpu_name_real = pycuda.autoinit.device.name()
    else:
        # Use the first CUDA device whose name contains gpu_name
        # (case-insensitive).
        for i in range(cu_drv.Device.count()):
            device = cu_drv.Device(i)
            if gpu_name.lower() in device.name().lower():
                gpu_name_real = device.name()
                print("Selected  CUDA  device: ", device.name())
                cu_ctx = device.make_context()
                break
    if cu_ctx is None:
        # Without this check the failure would only surface later as a
        # NameError on cu_ctx in the benchmark loop.
        raise RuntimeError("No CUDA device matching gpu_name=%r was found" % gpu_name)


In [None]:
# Benchmark parameters: each measurement is repeated nb_repeat times and the
# best (smallest) time is kept; every transform runs on a stack of nz layers.
nb_repeat = 3
nz = 8

plt.figure(figsize=(9.5, 8))

# results maps "n" to the list of tested sizes, and each available backend
# name to its throughput list, with a companion "<name>-dt" list of timings.
# Insertion order matters: it fixes the column order of the text output.
results = {"n": []}
for backend, available in (
    ("vkFFT.opencl", has_pyvkfft_opencl),
    ("gpyfft[clFFT]", has_gpyfft),
    ("vkFFT.cuda", has_pyvkfft_cuda),
    ("skcuda[cuFFT]", has_skcuda),
):
    if available and backend not in results:
        results[backend] = []
        results[backend + "-dt"] = []

# Table header: one right-aligned column per benchmarked backend.
backend_columns = [k for k in results if k != "n" and "-dt" not in k]
header_results = "%2d %4s x%4s 2D FFT" % (nz, "N", "N")
header_results += "".join("%19s" % k for k in backend_columns)

print("Gbytes/s and time given for a couple (FFT, iFFT) on a stack of %d 2D transforms" % nz)
print()
print(header_results)


# Main benchmark loop. For every tested size n, run each available backend on
# a (nz, n, n) complex64 array, record the best throughput and timing in
# `results`, print one text row, and redraw the plot with all points so far.

# (results key, colour, marker) for each backend, in plotting order.
plot_styles = (
    ("gpyfft[clFFT]", '#00A000', 'v'),
    ("skcuda[cuFFT]", '#A00000', '^'),
    ("vkFFT.opencl", '#00FF00', 'o'),
    ("vkFFT.cuda", '#FF0000', 'o'),
)

# Only test up to prime factors equal to 7 (cuFFT)
for n in range(16, 512 + 1):
    if max(primes(n)) > 7:
        continue
    results["n"].append(n)
    # Estimate number of repeats to last 0.1s with at least 50 GB/s.
    # The 8 * 2 * 2 * 2 factor mirrors the throughput formula used below
    # (presumably 8 bytes per complex64, read+write, 2 axes, FFT+iFFT).
    nb = int(round(0.1 * 50 / (nz * n ** 2 * 8 * 2 * 2 * 2 / 1024 ** 3)))
    nb = min(max(nb, 1), 1000)

    # OpenCL backends
    if has_pyvkfft_opencl or has_gpyfft:
        d = clrandom.rand(cq, shape=(nz, n, n), dtype=np.float32).astype(np.complex64)

        if has_pyvkfft_opencl:
            app = clVkFFTApp(d.shape, d.dtype, queue=cq, ndim=2)
            dt = float("inf")
            for _ in range(nb_repeat):
                cq.finish()
                t0 = timeit.default_timer()
                for _ in range(nb):
                    d = app.ifft(d)
                    d = app.fft(d)
                cq.finish()
                # Keep the best (smallest) time over the repeats
                dt = min(dt, timeit.default_timer() - t0)
            gbps = d.nbytes * nb * 2 * 2 * 2 / dt / 1024 ** 3
            del app
            results["vkFFT.opencl"].append(gbps)
            results["vkFFT.opencl-dt"].append(dt)

        if has_gpyfft:
            dt = float("inf")
            # Shuffle axes order to find fastest transform
            for axes in permutations([-1, -2]):
                gpyfft_plan = gpyfft.FFT(cl_ctx, cq, d, None, axes=axes)
                for _ in range(nb_repeat):
                    cq.finish()
                    t0 = timeit.default_timer()
                    for _ in range(nb):
                        gpyfft_plan.enqueue(forward=True)
                        gpyfft_plan.enqueue(forward=False)
                    cq.finish()
                    dt = min(dt, timeit.default_timer() - t0)
                del gpyfft_plan
            gbps = d.nbytes * nb * 2 * 2 * 2 / dt / 1024 ** 3
            results["gpyfft[clFFT]"].append(gbps)
            results["gpyfft[clFFT]-dt"].append(dt)

        # Release the OpenCL buffer; only reached when it was allocated above.
        d.data.release()
        del d

    # CUDA backends
    if has_pyvkfft_cuda or has_skcuda:
        d = curandom.rand(shape=(nz, n, n), dtype=np.float32).astype(np.complex64)

        if has_pyvkfft_cuda:
            app = cuVkFFTApp(d.shape, d.dtype, ndim=2)
            dt = float("inf")
            for _ in range(nb_repeat):
                cu_ctx.synchronize()
                t0 = timeit.default_timer()
                for _ in range(nb):
                    d = app.ifft(d)
                    d = app.fft(d)
                cu_ctx.synchronize()
                dt = min(dt, timeit.default_timer() - t0)
            gbps = d.nbytes * nb * 2 * 2 * 2 / dt / 1024 ** 3
            del app
            results["vkFFT.cuda"].append(gbps)
            results["vkFFT.cuda-dt"].append(dt)

        if has_skcuda:
            plan = cu_fft.Plan((n, n), np.complex64, np.complex64, batch=nz)
            dt = float("inf")
            for _ in range(nb_repeat):
                cu_ctx.synchronize()
                t0 = timeit.default_timer()
                for _ in range(nb):
                    cu_fft.fft(d, d, plan)
                    cu_fft.ifft(d, d, plan)
                cu_ctx.synchronize()
                dt = min(dt, timeit.default_timer() - t0)
            gbps = d.nbytes * nb * 2 * 2 * 2 / dt / 1024 ** 3
            del plan
            results["skcuda[cuFFT]"].append(gbps)
            results["skcuda[cuFFT]-dt"].append(dt)

        # Free the CUDA buffer; only reached when it was allocated above.
        d.gpudata.free()
        del d

    # text results
    r = "%2d %4d x %4d      " % (nz, n, n)
    for b in results:
        if b != "n" and "-dt" not in b:
            # The stored dt is the best total time in seconds for nb
            # (FFT, iFFT) pairs; convert to milliseconds per pair to match
            # the "ms" label and the printed header.
            r += "%7.2f [%6.2f ms]" % (results[b][-1], results[b + '-dt'][-1] / nb * 1000)
    print(r)

    # Redraw the whole figure with every point measured so far
    plt.clf()
    x = results['n']
    for key, color, marker in plot_styles:
        if key in results:
            plt.plot(x, results[key], color=color, marker=marker, markersize=3,
                     linestyle='', label=key)

    plt.legend(loc='lower right')
    plt.xlabel("FFT size")
    plt.ylabel("idealised throughput [Gbytes/s]")
    plt.title("2D FFT speed [%s]" % gpu_name_real)
    plt.grid(which='both', alpha=0.3)

    plt.draw()
    plt.gcf().canvas.draw()
    plt.pause(.001)
    plt.tight_layout()

