<a href="https://colab.research.google.com/github/vagnerhenrique/Arduino/blob/master/GPUTeste.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi

Fri Sep  4 20:34:02 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git


Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-w8jf7itq
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-w8jf7itq
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=a4f5e1514a51060ecda49dc6701674a71b43c07111773e8e51f75fcba02acac4
  Stored in directory: /tmp/pip-ephem-wheel-cache-p0sbt755/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [None]:
%load_ext nvcc_plugin


created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cu
#include <stdio.h>
#define N  2000
// Reports a failed CUDA runtime status on stderr and passes the status through.
//   err: status code returned by a CUDA runtime call.
//   msg: caller-chosen label that is printed as part of the diagnostic.
// Returns err unchanged, so a call can be wrapped inline and still inspected.
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
  // Nothing to report on success.
  if (err == cudaSuccess) {
    return err;
  }
  fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
  return err;
}
/*
 * Computes one element of c = a * b per thread, for N x N int matrices
 * stored row-major in flat arrays.
 * Expects a 2D launch: the x dimension indexes columns, the y dimension
 * indexes rows. Threads that land outside the N x N range return without
 * touching memory, so the grid is allowed to overshoot N.
 * The dot product is accumulated in a plain int.
 */
__global__ void matrixMulGPU( int * a, int * b, int * c )
{
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int k = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail threads of the grid have no output element to produce.
    if (r >= N || k >= N) return;

    int acc = 0;
    for (int i = 0; i != N; i++)
      acc += a[r * N + i] * b[i * N + k];

    c[r * N + k] = acc;
}
/*
 * Host reference implementation: c = a * b for N x N int matrices stored
 * row-major in flat arrays, computed with a plain triple loop.
 * main() uses its output to verify what matrixMulGPU produced.
 */
void matrixMulCPU( int * a, int * b, int * c )
{
  for (int r = 0; r < N; ++r)
  {
    const int * lhsRow = a + r * N;   // row r of the left operand
    int * outRow = c + r * N;         // row r of the result
    for (int col = 0; col < N; ++col)
    {
      int acc = 0;
      for (int k = 0; k < N; ++k)
        acc += lhsRow[k] * b[k * N + col];
      outRow[col] = acc;
    }
  }
}
int main()
{
  int *a, *b, *c_cpu, *c_gpu; // Allocate a solution matrix for both the CPU and the GPU operations
int size = N * N * sizeof (int); // Number of bytes of an N x N matrix
// Allocate memory
  cudaMallocManaged (&a, size);
  cudaMallocManaged (&b, size);
  cudaMallocManaged (&c_cpu, size);
  cudaMallocManaged (&c_gpu, size);
// Initialize memory; create 2D matrices
  for( int row = 0; row < N; ++row )
    for( int col = 0; col < N; ++col )
    {
      a[row*N + col] = row;
      b[row*N + col] = col+2;
      c_cpu[row*N + col] = 0;
      c_gpu[row*N + col] = 0;
    }
/*
   * Assign `threads_per_block` and `number_of_blocks` 2D values
   * that can be used in matrixMulGPU above.
   */
dim3 threads_per_block(32, 32, 1);
  dim3 number_of_blocks(N / threads_per_block.x + 1, N / threads_per_block.y + 1, 1);
matrixMulGPU <<< number_of_blocks, threads_per_block >>> ( a, b, c_gpu );
checkCudaErr(cudaDeviceSynchronize(), "Syncronization");
checkCudaErr(cudaGetLastError(), "GPU");
// Call the CPU version to check our work
  matrixMulCPU( a, b, c_cpu );
// Compare the two answers to make sure they are equal
  bool error = false;
  for( int row = 0; row < N && !error; ++row )
    for( int col = 0; col < N && !error; ++col )
      if (c_cpu[row * N + col] != c_gpu[row * N + col])
      {
        printf("FOUND ERROR at c[%d][%d]\n", row, col);
        error = true;
        break;
      }
if (!error)
    printf("Success!\n");
// Free all our allocated memory
  cudaFree(a); cudaFree(b);
  cudaFree( c_cpu ); cudaFree( c_gpu );
}

Success!



In [None]:
!git clone https://github.com/alexminnaar/cublas_vs_numpy.git

In [None]:
!ls -la

In [None]:
!apt-get install nvidia-cuda-toolkit

In [None]:
import numpy
import ctypes
import time
import matplotlib.pyplot as plt

# Shared library whose `run` entry point is called by cublas_mm below.
# The absolute path expects the cublas_vs_numpy checkout under /content
# with cublas_test.so already present in it.
E = ctypes.cdll.LoadLibrary("/content/cublas_vs_numpy/cublas_test.so")

# Mean time per matrix dimension; the three lists are appended to in lockstep
# by the benchmark loop, so index k in each refers to the same dimension.
cublas_times = []
numpy_times = []
dims = []

def cublas_mm(matrix_dim):
    """Time one call to the shared library's `run` entry point.

    Three flat float32 buffers of matrix_dim * matrix_dim ones are created and
    their raw addresses are passed to `E.run` together with matrix_dim.
    Presumably `run` multiplies the first two buffers as square matrices with
    cuBLAS and writes into the third -- confirm against the library source.

    Args:
        matrix_dim: side length of the square matrices.

    Returns:
        Elapsed seconds spent inside `E.run` (everything that call does is
        included); creating the numpy buffers is not timed.
    """
    N = matrix_dim * matrix_dim

    m1 = numpy.ones((N), dtype=numpy.float32)
    m2 = numpy.ones((N), dtype=numpy.float32)
    output_m = numpy.ones((N), dtype=numpy.float32)

    # perf_counter is monotonic and high resolution; time.time() is wall-clock
    # and can jump, which distorts short interval measurements.
    t0 = time.perf_counter()

    E.run(ctypes.c_void_p(m1.ctypes.data),
          ctypes.c_void_p(m2.ctypes.data),
          ctypes.c_void_p(output_m.ctypes.data),
          ctypes.c_int(matrix_dim))

    t1 = time.perf_counter()
    return t1 - t0


def numpy_mm(matrix_dim):
    """Time one float32 matrix product computed with numpy.dot.

    Args:
        matrix_dim: side length of the two square all-ones operands.

    Returns:
        Elapsed seconds for the numpy.dot call only; building the operands
        is not timed.
    """
    m1 = numpy.ones((matrix_dim, matrix_dim), dtype=numpy.float32)
    m2 = numpy.ones((matrix_dim, matrix_dim), dtype=numpy.float32)

    # perf_counter is monotonic and high resolution; time.time() is wall-clock
    # and can jump, which distorts short interval measurements.
    t0 = time.perf_counter()

    _ = numpy.dot(m1, m2)

    t1 = time.perf_counter()
    return t1 - t0


# Benchmark sweep: matrix dimensions 1, 11, 21, ..., 991.
for i in range(1, 1000, 10):

    # Progress indicator: the dimension currently being measured.
    print(i)
    cur_cublas = []
    cur_numpy = []

    # 100 repetitions per dimension; the two implementations are called
    # alternately within each repetition.
    for _ in range(100):
        cur_cublas.append(cublas_mm(i))
        cur_numpy.append(numpy_mm(i))

    # Keep only the mean of the 100 samples for each implementation.
    cublas_times.append(numpy.mean(cur_cublas))
    numpy_times.append(numpy.mean(cur_numpy))
    dims.append(i)

# Plot mean time against dimension for both implementations; the figure is
# written to cublas_vs_numpy.png in the working directory and then displayed.
plt.plot(dims, cublas_times, label='cublas')
plt.plot(dims, numpy_times, label='numpy')
plt.legend()
plt.xlabel("Input Matrix Dimension")
plt.ylabel("Computation Time")
plt.savefig('cublas_vs_numpy.png')
plt.show()

# Nova seção

In [None]:
%%cu
//Example 1. Application Using C and cuBLAS: 1-based indexing
//-----------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#define M 6
#define N 5
#define IDX2F(i,j,ld) ((((j)-1)*(ld))+((i)-1))
/*
 * Scales part of a column-major matrix m (leading dimension ldm, 1-based
 * indices via IDX2F) in place using cuBLAS:
 *   - row p, columns q..n: multiplied by alpha (elements are ldm apart)
 *   - column q, rows p..ldm: multiplied by beta (elements are contiguous)
 * Element (p,q) belongs to both ranges and is therefore scaled by both.
 * m is handed straight to cublasSscal; main passes a device pointer.
 * The function returns void, so a failing cuBLAS call is reported on stderr
 * instead of being silently dropped; the second scaling is still attempted.
 */
static __inline__ void modify (cublasHandle_t handle, float *m, int ldm, int
 n, int p, int q, float alpha, float beta){
 cublasStatus_t stat;
 stat = cublasSscal (handle, n-q+1, &alpha, &m[IDX2F(p,q,ldm)], ldm);
 if (stat != CUBLAS_STATUS_SUCCESS) {
 fprintf (stderr, "cublasSscal (row scaling) failed with status %d\n", (int)stat);
 }
 stat = cublasSscal (handle, ldm-p+1, &beta, &m[IDX2F(p,q,ldm)], 1);
 if (stat != CUBLAS_STATUS_SUCCESS) {
 fprintf (stderr, "cublasSscal (column scaling) failed with status %d\n", (int)stat);
 }
}
/*
 * cuBLAS round trip on an M x N column-major float matrix:
 * fill it on the host, copy it to the device, scale part of it with
 * modify(), copy it back and print it.
 * Every failure path releases whatever was acquired up to that point
 * (host buffer, device buffer, cuBLAS handle) before returning.
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE otherwise.
 */
int main (void){
 cudaError_t cudaStat;
 cublasStatus_t stat;
 cublasHandle_t handle;
 int i, j;
 float* devPtrA;
 float* a = 0;
 a = (float *)malloc (M * N * sizeof (*a));
 if (!a) {
 printf ("host memory allocation failed");
 return EXIT_FAILURE;
 }
 /* Fill the host matrix using 1-based (i,j) indices. */
 for (j = 1; j <= N; j++) {
 for (i = 1; i <= M; i++) {
 a[IDX2F(i,j,M)] = (float)((i-1) * M + j);
 }
 }
 cudaStat = cudaMalloc ((void**)&devPtrA, M*N*sizeof(*a));
 if (cudaStat != cudaSuccess) {
 printf ("device memory allocation failed");
 free(a);
 return EXIT_FAILURE;
 }
 stat = cublasCreate(&handle);
 if (stat != CUBLAS_STATUS_SUCCESS) {
 printf ("CUBLAS initialization failed\n");
 cudaFree (devPtrA);
 free(a);
 return EXIT_FAILURE;
 }
 /* Host -> device copy of the whole matrix. */
 stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M);
 if (stat != CUBLAS_STATUS_SUCCESS) {
 printf ("data download failed");
 cudaFree (devPtrA);
 cublasDestroy(handle);
 free(a);
 return EXIT_FAILURE;
 }
 modify (handle, devPtrA, M, N, 2, 3, 16.0f, 12.0f);
 /* Device -> host copy of the modified matrix. */
 stat = cublasGetMatrix (M, N, sizeof(*a), devPtrA, M, a, M);
 if (stat != CUBLAS_STATUS_SUCCESS) {
 printf ("data upload failed");
 cudaFree (devPtrA);
 cublasDestroy(handle);
 free(a);
 return EXIT_FAILURE;
 }
 cudaFree (devPtrA);
 cublasDestroy(handle);
 /* Each output line holds one column j of the matrix (rows i = 1..M). */
 for (j = 1; j <= N; j++) {
 for (i = 1; i <= M; i++) {
 printf ("%7.0f", a[IDX2F(i,j,M)]);
 }
 printf ("\n");
 }
 free(a);
 return EXIT_SUCCESS;
}