In [None]:
# %load ~/soft/_pyenvs/obddp/lib/python3.8/site-packages/pycuda/autoinit.py


In [None]:
import pycuda.driver as cuda_driver

# Initialise the CUDA driver API up front so that a driver/GPU problem is
# reported as a one-line message rather than a traceback.
try:
    cuda_driver.init()
except cuda_driver.Error as e:
    # Refer to the exception through the alias: `import pycuda.driver as
    # cuda_driver` binds only `cuda_driver`, so the bare name `pycuda` is not
    # defined in this cell and `pycuda.driver.Error` would raise NameError.
    print(f'CUDA problem: {e}')

In [None]:
    from timeit import timeit
    import pycuda.gpuarray as gpuarray
    import pycuda.autoinit
    import numpy as np

In [None]:
 from skcuda.linalg import svd

In [None]:
    # Import the submodule explicitly. A bare `import skcuda` does not load
    # `skcuda.misc`; the attribute lookup would only succeed if another cell
    # (e.g. the `skcuda.linalg` import) had already pulled it in.
    import skcuda.misc
    # Initialise the libraries scikit-cuda relies on before calling into it.
    skcuda.misc.init()

In [None]:
    # Benchmark one SVD of a random N x N complex matrix on the GPU using
    # scikit-cuda's `svd` with the cuSOLVER backend.
    N = 6400
    # randn produces float64, so Y is complex128; X is the complex64 version
    # that is actually uploaded to the device.
    Y = np.random.randn(N, N) + 1j*np.random.randn(N, N)
    X = np.asarray(Y, np.complex64)
    a_gpu = gpuarray.to_gpu(X)

    # number=1 -> time a single decomposition; timeit returns seconds.
    tm = timeit("svd(a_gpu, jobu='A', jobvt='A', lib='cusolver')",
                globals={'a_gpu': a_gpu, 'svd': svd},
                number=1)
    # Report the measurement; without this the cell ends on an assignment and
    # the benchmark result is never displayed.
    print(f'SVD of a {N}x{N} complex64 matrix took {tm:.3f} s')


In [None]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy
# Host data: a 4x4 matrix of random values, converted to single precision.
a = numpy.random.randn(4, 4).astype(numpy.float32)

# Reserve a device buffer with the same number of bytes and upload the matrix.
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

# Compile the CUDA source. Each thread doubles the element at the flat index
# derived from its (x, y) thread coordinates with a row width of 4.
mod = SourceModule("""
    __global__ void doublify(float *a)
    {
      int idx = threadIdx.x + threadIdx.y*4;
      a[idx] *= 2;
    }
    """)

# Launch the kernel with a single 4x4x1 block: one thread per matrix element.
func = mod.get_function("doublify")
func(a_gpu, block=(4, 4, 1))

# Download the device buffer into a new host array shaped and typed like `a`.
a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)

In [None]:
# Show the host matrix next to the result computed by the explicit
# mem_alloc / memcpy version of the kernel launch.
print("original array:")
print(a)
print("doubled with kernel:")
print(a_doubled)

# alternate kernel invocation -------------------------------------------------

# cuda.InOut transfers the host array to the device before the launch and
# back afterwards, i.e. it overwrites the array in place. Run the kernel on a
# copy so that `a` keeps the values printed as "original array" above and
# re-running this cell does not double `a` again.
a_inout = a.copy()
func(cuda.InOut(a_inout), block=(4, 4, 1))
print("doubled with InOut:")
print(a_inout)

# part 2 ----------------------------------------------------------------------

# Same doubling expressed with the high-level GPUArray interface.
# NOTE: this rebinds a_gpu and a_doubled from the kernel cell, so re-run that
# cell before re-running this one.
import pycuda.gpuarray as gpuarray
a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32))
a_doubled = (2*a_gpu).get()

print("original array:")
print(a_gpu)
print("doubled with gpuarray:")
print(a_doubled)

In [None]:

import numpy as np
import cupy as cp
# The core of CuPy is the cupy.ndarray class, a GPU-compatible alternative to
# numpy.ndarray.

x_gpu = cp.array([1, 2, 3])
# x_gpu above is an instance of cupy.ndarray. It is created exactly as in
# NumPy, except that numpy is replaced with cupy. The main difference between
# cupy.ndarray and numpy.ndarray is that the content is allocated in device
# memory. Its data is allocated on the current device, which will be
# explained later.

# Most array manipulations are also done in a way similar to NumPy. Take the
# Euclidean norm (a.k.a. the L2 norm) as an example: NumPy has
# numpy.linalg.norm() to calculate it on the CPU.

x_cpu = np.array([1, 2, 3])
l2_cpu = np.linalg.norm(x_cpu)

# We can calculate it on the GPU with CuPy in a similar way. The x_gpu array
# created above is reused rather than uploading the same values a second time.
l2_gpu = cp.linalg.norm(x_gpu)