In [7]:
! pip install git+git://github.com/frehseg/nvcc4jupyter.git

Collecting git+git://github.com/frehseg/nvcc4jupyter.git
  Cloning git://github.com/frehseg/nvcc4jupyter.git to /tmp/pip-req-build-frr2rpug
  Running command git clone -q git://github.com/frehseg/nvcc4jupyter.git /tmp/pip-req-build-frr2rpug


In [8]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [9]:
%%writefile cuda_stuff.cuh
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

#ifndef cuda_stuff_H
#define cuda_stuff_H
 
//MACRO TO DEBUG CUDA FUNCTIONS
/** Error checking,
 *  taken from https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
 */
// Wraps a CUDA runtime call (any expression yielding cudaError_t) and reports
// a failure together with the file and line of the call site.
// The do { } while (0) wrapper makes the expansion a single statement, so
// `if (c) gpuErrchk(x); else ...` parses as intended.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/**
 * Reports a CUDA runtime error and optionally terminates the program.
 *
 * @param code  status returned by a CUDA runtime call
 * @param file  source file of the call site (used only in the message)
 * @param line  source line of the call site (used only in the message)
 * @param abort when true (the default), exit() is called with `code` as the
 *              process status after the message has been printed
 *
 * Does nothing when `code` is cudaSuccess.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess)
      return;

   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort) exit(code);
}

#endif


Overwriting cuda_stuff.cuh


In [10]:
%%writefile saxpy.cu
/*
 * SAXPY application on the GPU
 * y = a*x + y
 */

#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>

#include "cuda_stuff.cuh"

////////////////////////////////////////////////////////////////
//     Initialisation des vecteurs     
////////////////////////////////////////////////////////////////
/**
 * Fills the first `len` entries of `tab` with `val`.
 * Nothing is written when `len` is zero or negative.
 */
void init_tab(float *tab, int len, float val) {
    float *cursor = tab;
    int remaining = len;
    while (remaining > 0) {
        *cursor = val;
        ++cursor;
        --remaining;
    }
}

/**
 * Prints the leading and trailing values of a host array to stdout.
 *
 * @param chaine label inserted in the first header line
 * @param tab    host array holding at least `len` floats
 * @param len    number of valid elements in `tab`
 *
 * At most ten values are printed in each section. When `len` is smaller than
 * ten, only the `len` existing values are printed, so no element outside
 * [0, len) is ever read. The header text is unchanged in that case.
 */
void print_tab(const char *chaine, float *tab, int len){
   int k;
   // Number of values per section, clamped to what the array actually holds.
   int shown = (len < 10) ? len : 10;
   if (shown < 0)
      shown = 0;

   printf("\nLes 10 premiers de %s: \n",chaine);
   for (k=0; k<shown; k++)
      printf("%.2f ",tab[k]);
   printf("\nLes 10 derniers: \n");
   for (k=len-shown; k<len; k++)
      printf("%.2f ",tab[k]);
   printf("\n");
}

/**
 * SAXPY kernel: tabY[i] += a * tabX[i] for every i in [0, len).
 *
 * @param tabX device pointer to the input vector (read only by this kernel)
 * @param tabY device pointer to the vector updated in place
 * @param len  number of elements in both vectors
 * @param a    scalar multiplier
 *
 * Launch layout: 1D; only the x component of the grid and block is used.
 * Each thread starts at its global index and advances by the total number of
 * launched threads, so every element is processed whatever the grid and block
 * sizes are. No shared memory is used.
 */
__global__ void saxpy(float *tabX, float *tabY, int len, float a){
   // 64-bit arithmetic so neither the start index nor the increment can wrap.
   const long long stride = (long long) gridDim.x * blockDim.x;
   for (long long idx = (long long) blockIdx.x * blockDim.x + threadIdx.x;
        idx < len;
        idx += stride)
      tabY[idx] += a*tabX[idx];
}

/**
 * Host driver for the SAXPY example.
 *
 * Allocates two host vectors (X filled with 5, Y filled with 0), copies them
 * to the device, runs the saxpy kernel with a = 2, copies Y back and prints
 * its first and last values before and after the computation.
 *
 * @return EXIT_SUCCESS on success, EXIT_FAILURE if a host allocation fails.
 *         CUDA errors are reported through gpuErrchk.
 */
int main( int argc, char** argv){
    (void) argc;
    (void) argv;

    float *tabX_d = NULL, *tabX_h = NULL;
    float *tabY_d = NULL, *tabY_h = NULL;
    const int len = 100;
    const float a = 2.0f;
    const size_t bytes = sizeof(float) * len;

    /** Launch configuration: fixed block size, enough blocks to cover len.
     *  A single block of len threads would be rejected as soon as len
     *  exceeds the per-block thread limit of the device. **/
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock;
    dim3 grid(blocksPerGrid);
    dim3 block(threadsPerBlock);

    printf("SAXPY - tableau de %d éléments \n", len);

    /** Host (CPU) allocations **/
    tabX_h = (float *) malloc(bytes);
    tabY_h = (float *) malloc(bytes);
    if (tabX_h == NULL || tabY_h == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(tabX_h);   // free(NULL) is a no-op
        free(tabY_h);
        return EXIT_FAILURE;
    }
    init_tab(tabX_h, len, 5.0f);
    init_tab(tabY_h, len, 0.0f);

    /** Initial display **/
    // Mutable label buffer: valid for a char* as well as a const char* parameter.
    char label[] = "tabY_h";
    printf("Affichage initial\n");
    print_tab(label, tabY_h, len);

    /** Device (GPU) allocations **/
    gpuErrchk(cudaMalloc((void**) &tabX_d, bytes));
    gpuErrchk(cudaMalloc((void**) &tabY_d, bytes));

    /** Host -> device transfers **/
    gpuErrchk(cudaMemcpy(tabX_d, tabX_h, bytes, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(tabY_d, tabY_h, bytes, cudaMemcpyHostToDevice));

    /** Kernel launch **/
    saxpy<<<grid, block>>>(tabX_d, tabY_d, len, a);

    gpuErrchk( cudaGetLastError() );        // launch-configuration errors
    gpuErrchk( cudaDeviceSynchronize() );   // errors raised while the kernel ran

    /** Device -> host transfer.
     *  Only Y is copied back: the kernel reads X but never writes it. **/
    gpuErrchk(cudaMemcpy(tabY_h, tabY_d, bytes, cudaMemcpyDeviceToHost));

    /** Result display **/
    printf("Affichage du résultat\n");
    print_tab(label, tabY_h, len);

    /** Release memory **/
    gpuErrchk(cudaFree(tabX_d));
    gpuErrchk(cudaFree(tabY_d));
    free(tabX_h);
    free(tabY_h);

    printf("Fin du programme\n");
    return EXIT_SUCCESS;
}

Writing saxpy.cu


In [13]:
! nvcc -arch=sm_37 saxpy.cu







In [14]:
! ./a.out

SAXPY - tableau de 100 éléments 
Affichage initial

Les 10 premiers de tabY_h: 
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
Les 10 derniers: 
0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 
Affichage du résultat

Les 10 premiers de tabY_h: 
10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 
Les 10 derniers: 
10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 10.00 
Fin du programme
