# Código Ejercicio 2

In [None]:
# Install PyOpenCL into the running environment through a shell escape.
!pip install pyopencl
import sys
# Add an extra site-packages directory to the module search path.
# NOTE(review): this path targets Python 3.6, while the captured pip log
# shows a cp37 wheel being installed — confirm this line is still needed.
sys.path.append('/usr/local/lib/python3.6/site-packages')
import pyopencl as cl
# NOTE(review): cl_array is not referenced by the cells visible here.
import pyopencl.array as cl_array
import numpy as np

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyopencl
  Downloading pyopencl-2022.2.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (934 kB)
[K     |████████████████████████████████| 934 kB 15.2 MB/s 
Collecting platformdirs>=2.2.0
  Downloading platformdirs-2.5.2-py3-none-any.whl (14 kB)
Collecting pytools>=2021.2.7
  Downloading pytools-2022.1.12.tar.gz (70 kB)
[K     |████████████████████████████████| 70 kB 10.4 MB/s 
Building wheels for collected packages: pytools
  Building wheel for pytools (setup.py) ... [?25l[?25hdone
  Created wheel for pytools: filename=pytools-2022.1.12-py2.py3-none-any.whl size=65034 sha256=854f9ec0b761e175d5a75c0697b84e80964c680e32abbad6de605e38f410882b
  Stored in directory: /root/.cache/pip/wheels/37/5e/9e/76d7430e116b7cab0016fbabb26b896daae1946a3f7dea9915
Successfully built pytools
Installing collected packages: platformdirs, pytools, pyopencl
Successfully installed platfor

In [None]:
# Print the CPU description of the machine running the notebook.
!lscpu
# Run nvidia-smi to report the NVIDIA GPU(s) visible to the runtime, if any.
!nvidia-smi

Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              2
On-line CPU(s) list: 0,1
Thread(s) per core:  2
Core(s) per socket:  1
Socket(s):           1
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               85
Model name:          Intel(R) Xeon(R) CPU @ 2.00GHz
Stepping:            3
CPU MHz:             2000.152
BogoMIPS:            4000.30
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            1024K
L3 cache:            39424K
NUMA node0 CPU(s):   0,1
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_si

## Kernel

In [None]:
# OpenCL C source of a naive dense matrix product res = a x b, where a, b and
# res are N x N float matrices stored in row-major order in flat buffers.
# Each work-item of the 2-D index space computes exactly one output element:
# global id 0 selects the row (i) and global id 1 selects the column (j).
KernelSource = """
__kernel void parallelMMult(
  const int N,
  __global const float *a_g,
  __global const float *b_g,
  __global float *res_g)
  {
    // Row (i) and column (j) of the output element owned by this work-item.
    int i = get_global_id(0);
    int j = get_global_id(1);

    // Work-items that fall outside the N x N matrix (this happens when the
    // host rounds the global size up to a multiple of the work-group size)
    // must not touch memory.
    if (i >= N || j >= N) {
      return;
    }

    // Dot product of row i of a with column j of b.
    float sum = 0.0f;
    for(int k = 0; k < N; k++) {
      sum += a_g[i*N + k] * b_g[k*N + j];
    }

    // No barrier is required: every work-item writes a distinct element.
    res_g[i*N + j] = sum;
  }
"""

## Host

In [None]:
# Host side of the exercise: picks an OpenCL device, uploads two N x N
# matrices, runs the parallelMMult kernel over an (N, N) index space and
# copies the product back into res_np.

# Matrix dimension (the kernel multiplies two N x N matrices).
N = 2500

# Get the available OpenCL platforms.
plataform_list = cl.get_platforms()
if not plataform_list:
    raise RuntimeError("No OpenCL platform is available")

# Get the devices: use the GPUs of the first platform that exposes any.
# get_devices() returns an empty list when a platform has no device of the
# requested type, and an empty list would make cl.Context fail with an
# opaque error, so fall back to every device of the first platform.
devices = []
for _platform in plataform_list:
    devices = _platform.get_devices(device_type=cl.device_type.GPU)
    if devices:
        break
if not devices:
    devices = plataform_list[0].get_devices()
if not devices:
    raise RuntimeError("No OpenCL device is available")

# Create the context.
context = cl.Context(devices=devices)

# Create the command queue.
queue = cl.CommandQueue(context)

# Create and initialise the input matrices as flat row-major float32 arrays
# holding 0, 1, ..., N*N - 1.
a_np = np.arange(N*N).astype(np.float32)
b_np = np.arange(N*N).astype(np.float32)

# Allocate the host array that receives the result (same shape and dtype).
res_np = np.empty_like(a_np)

# Create the read-only device buffers, initialised from the host arrays.
a_g = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=a_np)
b_g = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=b_np)

# Create the write-only device buffer for the output.
res_g = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, res_np.nbytes)

# Build the program.
program = cl.Program(context, KernelSource).build()

# Get the kernel.
kernel = program.parallelMMult

# Set the arguments: N is passed as a 32-bit int to match `const int N`,
# the remaining three arguments are buffers.
kernel.set_scalar_arg_dtypes([np.int32, None, None, None])
kernel.set_args(N, a_g, b_g, res_g)

# Define the index space: one work-item per output element. A local size of
# None lets the OpenCL implementation choose the work-group size.
globalRange = (N, N)
localRange = None

# Run the kernel.
ev = cl.enqueue_nd_range_kernel(queue, kernel, globalRange, localRange)

# Copy the result from the device to the host. The copy explicitly waits on
# the kernel event, and finish() drains the queue so res_np is complete when
# this cell returns (it also keeps the event repr out of the cell output).
cl.enqueue_copy(queue, res_np, res_g, wait_for=[ev])
queue.finish()

<pyopencl._cl.NannyEvent at 0x7fdec81bbfb0>

In [None]:
# Compare the OpenCL result against a NumPy reference computed on the CPU.
np_matmul = np.matmul(a_np.reshape(N,N),b_np.reshape(N,N))
res_matrix = res_np.reshape(N, N)

print(res_matrix)

print(np_matmul)

# Both products are accumulated in float32 but in a different order, so the
# low digits differ; check agreement within a relative tolerance instead of
# comparing the printed (truncated) matrices by eye.
np.testing.assert_allclose(res_matrix, np_matmul, rtol=1e-3)

[[1.3013013e+13 1.3013016e+13 1.3013018e+13 ... 1.3020832e+13
  1.3020833e+13 1.3020835e+13]
 [3.2536474e+13 3.2536479e+13 3.2536493e+13 ... 3.2559845e+13
  3.2559853e+13 3.2559866e+13]
 [5.2059865e+13 5.2059877e+13 5.2059894e+13 ... 5.2098918e+13
  5.2098926e+13 5.2098939e+13]
 ...
 [4.8762993e+16 4.8763014e+16 4.8763027e+16 ... 4.8801974e+16
  4.8801987e+16 4.8802004e+16]
 [4.8782556e+16 4.8782574e+16 4.8782591e+16 ... 4.8821542e+16
  4.8821559e+16 4.8821576e+16]
 [4.8802081e+16 4.8802098e+16 4.8802111e+16 ... 4.8841092e+16
  4.8841105e+16 4.8841118e+16]]
[[1.3013021e+13 1.3013023e+13 1.3013025e+13 ... 1.3020823e+13
  1.3020825e+13 1.3020826e+13]
 [3.2536458e+13 3.2536466e+13 3.2536474e+13 ... 3.2559866e+13
  3.2559877e+13 3.2559887e+13]
 [5.2059894e+13 5.2059915e+13 5.2059932e+13 ... 5.2098910e+13
  5.2098926e+13 5.2098943e+13]
 ...
 [4.8763036e+16 4.8763053e+16 4.8763066e+16 ... 4.8802013e+16
  4.8802025e+16 4.8802043e+16]
 [4.8782561e+16 4.8782574e+16 4.8782595e+16 ... 4.8821555e+

In [None]:
# Display the flat input array a_np laid out as an N x N matrix.
print(np.reshape(a_np, (N, N)))


[[ 0.  1.  2.  3.  4.]
 [ 5.  6.  7.  8.  9.]
 [10. 11. 12. 13. 14.]
 [15. 16. 17. 18. 19.]
 [20. 21. 22. 23. 24.]]
