<a href="https://colab.research.google.com/github/vpcano/CAP_P2/blob/main/Ej1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
# Start from an empty cuda/ directory: create it if it is missing, then delete whatever is inside.
!mkdir -p cuda
!rm -rf cuda/*

In [37]:
%%writefile cuda/stencil_1d_cpu.cu
    #include <iostream>
    #include <algorithm>
    using namespace std;

    #define RADIUS 3

    /*
     * Sequential 1D stencil: every out[i], for i in [0, N), becomes the sum of
     * the 2*RADIUS + 1 input values centred on in[i].
     *
     * in  - input array; the loop reads in[-RADIUS] .. in[N - 1 + RADIUS], so
     *       the caller must pass a pointer with RADIUS readable elements on
     *       each side of the N-element range.
     * out - output array; out[0] .. out[N - 1] are overwritten.
     * N   - number of output elements to compute.
     */
    void stencil_1D(int *in, int *out, int N) {
        const int window = 2*RADIUS + 1;
        int i = 0;
        while (i < N) {
            int *cell = out + i;
            /* Leftmost element of the window centred on in[i]. */
            int *first = in + i - RADIUS;
            /* Accumulate straight into the output cell, left to right. */
            *cell = 0;
            for (int k = 0; k < window; ++k) {
                *cell += first[k];
            }
            ++i;
        }
    }


    /*
     * Sets the first n elements of x to 1.
     * Does nothing when n is zero or negative.
     */
    void fill_ints(int *x, int n) {
        for (int *p = x; n > 0; --n, ++p) {
            *p = 1;
        }
    }

    /*
     * Entry point of the CPU stencil demo.
     *
     * Expects the array length N as the first command-line argument. Two
     * buffers of N + 2*RADIUS ints are allocated and filled with 1; the
     * stencil is then applied to the N interior elements, and the whole output
     * buffer (halo included) is printed.
     *
     * Returns 0 on success, 1 when the argument is missing or not a positive
     * integer, or when a host allocation fails.
     */
    int main(int argc, char *argv[]) {
        int *in, *out;
        int N;
        size_t size;

        if (argc < 2) {
            printf("Error: you must indicate the length of the array\n");
            return 1;
        }

        N = atoi(argv[1]);
        /* atoi yields 0 for non-numeric text; a negative N would make the
           allocation size below nonsensical. Reject both up front. */
        if (N <= 0) {
            printf("Error: the length of the array must be a positive integer\n");
            return 1;
        }

        /* Compute the byte count in size_t so it cannot overflow an int. */
        size = ((size_t) N + 2*RADIUS) * sizeof(int);

        in = (int*) malloc(size);
        out = (int*) malloc(size);
        if (in == NULL || out == NULL) {
            printf("Error: could not allocate %zu bytes per buffer\n", size);
            free(in);   /* free(NULL) is a no-op */
            free(out);
            return 1;
        }
        fill_ints(in, N + 2*RADIUS);
        fill_ints(out, N + 2*RADIUS);

        /* Skip the left halo so the stencil may read RADIUS elements on either
           side of every interior position. */
        stencil_1D(in + RADIUS, out + RADIUS, N);

        printf("Output: \n");
        for (int i=0; i<N+2*RADIUS; i++) {
            printf("%d ", out[i]);
        }
        printf("\n");

        free(in);
        free(out);
        return 0;
    }

Writing cuda/stencil_1d_cpu.cu


In [38]:
# Compile the CPU version with nvcc, then run it on an array of length 32.
!nvcc cuda/stencil_1d_cpu.cu -o cuda/stencil_1d_cpu
!./cuda/stencil_1d_cpu 32


Output: 
1 1 1 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 1 1 


In [39]:
%%writefile cuda/stencil_1d_gpu.cu
    #include <iostream>
    #include <algorithm>
    #include "cuda.h"
    #include "cuda_runtime.h"
    using namespace std;

    #define RADIUS 3
    #define BLOCK_SIZE 16

    /*
     * 1D stencil kernel: for every g in [0, N), out[g] becomes the sum of
     * in[g - RADIUS] .. in[g + RADIUS].
     *
     * Launch layout: 1D grid of 1D blocks. The shared tile is sized and
     * indexed with BLOCK_SIZE, so the launch must use blockDim.x == BLOCK_SIZE,
     * and BLOCK_SIZE must be at least RADIUS (the first RADIUS threads of each
     * block load both halos). The grid may cover more than N threads; the
     * surplus threads still help load the tile but write nothing.
     *
     * Shared memory: BLOCK_SIZE + 2*RADIUS ints, statically allocated.
     *
     * in  - device pointer readable on [-RADIUS, N + RADIUS).
     * out - device pointer writable on [0, N).
     * N   - number of output elements.
     */
    __global__ void stencil_1D(int *in, int *out, int N) {
        __shared__ int temp[BLOCK_SIZE + 2*RADIUS];
        int gindex = threadIdx.x + blockIdx.x*blockDim.x;
        int lindex = threadIdx.x + RADIUS;
        /* One past the last readable index of in (right halo included). */
        const int limit = N + RADIUS;

        /* Centre element. Positions past the right halo are filled with 0 so
           a partial tail block never reads outside the input range. */
        temp[lindex] = (gindex < limit) ? in[gindex] : 0;
        if (threadIdx.x < RADIUS) {
            int left = gindex - RADIUS;        /* >= -RADIUS, inside the left halo */
            int right = gindex + BLOCK_SIZE;
            temp[lindex - RADIUS] = (left < limit) ? in[left] : 0;
            temp[lindex + BLOCK_SIZE] = (right < limit) ? in[right] : 0;
        }

        /* Every thread of the block must reach the barrier, so the bounds
           check on the output happens only after it. */
        __syncthreads();

        if (gindex >= N) {
            return;
        }

        int result = 0;
        for (int offset=-RADIUS; offset<=RADIUS; offset++) {
            result += temp[lindex + offset];
        }

        out[gindex] = result;
    }


    /*
     * Writes the value 1 into x[0] .. x[n - 1].
     * A zero or negative n leaves the array untouched.
     */
    void fill_ints(int *x, int n) {
        int idx = 0;
        while (idx < n) {
            x[idx] = 1;
            ++idx;
        }
    }

    /* Prints a diagnostic and terminates the process if a CUDA runtime call
       did not return cudaSuccess. `what` names the failing step. */
    static void check_cuda(cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
            exit(1);
        }
    }

    /*
     * Entry point of the GPU stencil demo.
     *
     * Expects the array length N as the first command-line argument. Host
     * buffers of N + 2*RADIUS ints are filled with 1; the input is copied to
     * the device, the stencil kernel computes the N interior elements, and the
     * whole host output buffer (halo included, still holding 1) is printed.
     *
     * Returns 0 on success and 1 on a missing/invalid argument or a failed
     * host allocation; any CUDA failure terminates through check_cuda.
     */
    int main(int argc, char *argv[]) {
        int *h_in, *h_out;
        int *d_in = NULL, *d_out = NULL;
        int N;

        if (argc < 2) {
            printf("Error: you must indicate the length of the array\n");
            return 1;
        }

        N = atoi(argv[1]);
        /* atoi yields 0 for non-numeric text, and a grid of zero blocks is an
           invalid launch configuration. Reject non-positive lengths. */
        if (N <= 0) {
            printf("Error: the length of the array must be a positive integer\n");
            return 1;
        }

        /* Round the grid up so a length that is not a multiple of BLOCK_SIZE
           is still fully covered. Written this way to avoid N + BLOCK_SIZE
           overflowing. */
        int blocks = (N - 1) / BLOCK_SIZE + 1;

        /* Byte counts are computed in size_t so they cannot overflow an int. */
        size_t host_bytes = ((size_t) N + 2*RADIUS) * sizeof(int);
        /* Device buffers are padded to a whole number of blocks, so every
           thread of the last block stays inside the allocation even if the
           kernel does not bounds-check its accesses. */
        size_t dev_bytes = ((size_t) blocks * BLOCK_SIZE + 2*RADIUS) * sizeof(int);

        h_in = (int*) malloc(host_bytes);
        h_out = (int*) malloc(host_bytes);
        if (h_in == NULL || h_out == NULL) {
            printf("Error: could not allocate %zu bytes per host buffer\n", host_bytes);
            free(h_in);   /* free(NULL) is a no-op */
            free(h_out);
            return 1;
        }
        fill_ints(h_in, N + 2*RADIUS);
        fill_ints(h_out, N + 2*RADIUS);

        check_cuda(cudaMalloc((void**) &d_in, dev_bytes), "cudaMalloc(d_in)");
        check_cuda(cudaMalloc((void**) &d_out, dev_bytes), "cudaMalloc(d_out)");
        /* Zero the input first so the padding beyond the real data holds a
           defined value, then upload the real data over the front of it. */
        check_cuda(cudaMemset(d_in, 0, dev_bytes), "cudaMemset(d_in)");
        check_cuda(cudaMemcpy(d_in, h_in, host_bytes, cudaMemcpyHostToDevice),
                   "cudaMemcpy(h_in -> d_in)");

        stencil_1D<<<blocks,BLOCK_SIZE>>>(d_in + RADIUS, d_out + RADIUS, N);
        /* A launch returns no status itself; configuration errors show up here. */
        check_cuda(cudaGetLastError(), "stencil_1D launch");

        /* Copy back only the N computed elements. The halo of h_out keeps the
           1s written by fill_ints, so nothing a tail-block thread may have
           stored past element N can reach the printed output. This blocking
           copy also reports any error raised while the kernel was running. */
        check_cuda(cudaMemcpy(h_out + RADIUS, d_out + RADIUS, (size_t) N * sizeof(int),
                              cudaMemcpyDeviceToHost),
                   "cudaMemcpy(d_out -> h_out)");

        printf("Output: \n");
        for (int i=0; i<N+2*RADIUS; i++) {
            printf("%d ", h_out[i]);
        }
        printf("\n");

        free(h_in);
        free(h_out);
        check_cuda(cudaFree(d_in), "cudaFree(d_in)");
        check_cuda(cudaFree(d_out), "cudaFree(d_out)");
        return 0;
    }

Writing cuda/stencil_1d_gpu.cu


In [40]:
# Compile the GPU version with nvcc, then run it on an array of length 128.
!nvcc cuda/stencil_1d_gpu.cu -o cuda/stencil_1d_gpu
!./cuda/stencil_1d_gpu 128


Output: 
1 1 1 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 1 1 
