# Matrix Dot Product - CPU vs. GPU

## Import Required Packages (only works on systems with a GPU)

In [None]:
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import time
import skcuda.linalg as culinalg
import skcuda
import ctypes

# Load the GNU OpenMP runtime with RTLD_GLOBAL so its symbols are visible to
# shared libraries that are loaded afterwards.
# NOTE(review): presumably required because libcusolver references OpenMP
# symbols it does not resolve on its own -- confirm on the target system.
ctypes.CDLL('libgomp.so.1', mode=ctypes.RTLD_GLOBAL)
# Pre-load cuSOLVER from a hard-coded CUDA 8.0 install location; adjust the
# path if CUDA lives elsewhere. The returned handle is not used again in this
# notebook.
_libcusolver = ctypes.cdll.LoadLibrary('/usr/local/cuda-8.0/lib64/libcusolver.so')

# Initialize scikit-cuda's linalg module before any culinalg function is used.
culinalg.init()

## Generate a random matrix

In [None]:
import numpy as np

# Side length of the square test matrix. The float32 result occupies
# dim * dim * 4 bytes (about 1.6 GB at 20000), and rnd.rand() first produces a
# float64 array twice that size. If you get a MemoryError, reduce dim
# (for example to 10000).
dim = 20000

start = time.time()
rnd = np.random.RandomState(0)  # fixed seed: every run builds the same matrix
a = rnd.rand(dim, dim).astype(np.float32)
# Sample the clock once so the printed and the stored durations agree and the
# cost of print() is not included in the measurement.
generation_time = time.time() - start
# ".2f" = two decimal places (the previous "2f" only set a minimum width).
print("Generation Time: {0:.2f} seconds".format(generation_time))

## Compute Dot Product using CPU and system memory

In [None]:
# Time the matrix product on the CPU; np.dot on two 2-D arrays performs a
# matrix multiplication.
start = time.time()
cpu_result = np.dot(a, a)
# cpu_result_sum = np.sum(cpu_result)
# Sample the clock once, before printing, so cpu_time is exactly the value
# that is reported and does not include the print() call.
cpu_time = time.time() - start
# ".2f" = two decimal places (the previous "2f" only set a minimum width).
print("CPU Time: {0:.2f} seconds".format(cpu_time))

## Copy over the matrix to GPU memory

In [None]:
# Time the transfer of the host matrix into GPU memory.
start = time.time()
a_gpu = gpuarray.to_gpu(a)
# ".2f" = two decimal places (the previous "2f" only set a minimum width).
print("Copy in to GPU memory: {0:.2f} seconds".format(time.time() - start))

## Compute Dot Product using GPU and GPU memory.


In [None]:
# Time the matrix product on the GPU, operating on data already in GPU memory.
start = time.time()
gpu_result = culinalg.dot(a_gpu, a_gpu)
# gpu_result_sum = skcuda.misc.sum(gpu_result)
# GPU work can be queued asynchronously, so the call above may return before
# the multiplication has finished. Block until the device is idle; otherwise
# the timer could capture little more than the launch overhead.
pycuda.autoinit.context.synchronize()
# Sample the clock once, before printing, so gpu_time is exactly the value
# that is reported and does not include the print() call.
gpu_time = time.time() - start
# ".2f" = two decimal places (the previous "2f" only set a minimum width).
print("GPU Time: {0:.2f} seconds".format(gpu_time))

## Speed-up

In [None]:
# Summarize both timings; ".2f" prints two decimal places (the previous "2f"
# only set a minimum field width).
print("CPU Time: {0:.2f} seconds".format(cpu_time))
print("GPU Time: {0:.2f} seconds".format(gpu_time))
# Report the ratio as a float: truncating with int() would show 0 whenever the
# GPU is slower than the CPU and would hide fractional speed-ups.
print("Speed-up: {0:.1f}x".format(cpu_time / gpu_time))

## CPU vs. GPU Architecture

<img src="image/cpu-gpu.png">

## References

NVIDIA P100: https://images.nvidia.com/content/tesla/pdf/nvidia-tesla-p100-PCIe-datasheet.pdf