# CUDA Convolution (Colab)

This notebook recreates the CUDA convolution code from the `cuda/` folder and runs it in Google Colab.

In [None]:
# Sanity check before building CUDA code: ask the NVIDIA driver which GPU(s)
# this runtime can see.
!nvidia-smi

In [None]:
%%writefile funcs.h
#ifndef FUNCS_H
#define FUNCS_H

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

/*
 * Evaluate a CUDA runtime call and terminate the process with a diagnostic
 * (file, line and cudaGetErrorString text) if it does not return cudaSuccess.
 *
 * The body is wrapped in do { ... } while (0) so that `CUDA_SAFE_CALL(x);`
 * is exactly one statement and stays correct inside an un-braced if/else.
 * The argument is parenthesized and the temporary has a macro-private name
 * so it cannot shadow a variable that appears inside `call`.
 * Only expand this macro in translation units that include the CUDA runtime.
 */
#define CUDA_SAFE_CALL(call) do {                                          \
    cudaError_t cuda_safe_call_err_ = (call);                              \
    if (cudaSuccess != cuda_safe_call_err_) {                              \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",      \
                __FILE__, __LINE__,                                        \
                cudaGetErrorString(cuda_safe_call_err_));                  \
        exit(EXIT_FAILURE);                                                \
    }                                                                      \
} while (0)

/*
 * Integer ceiling of numerator / denominator for non-negative numerator and
 * positive denominator. Both arguments are fully parenthesized so compound
 * expressions (e.g. `a | b`, `c ? x : y`) expand with the intended precedence.
 * Note: `denominator` is evaluated twice, so avoid arguments with side effects.
 */
#define FRACTION_CEILING(numerator, denominator) (((numerator) + (denominator) - 1) / (denominator))

/* Pixel layout of a raw image buffer. */
typedef enum {
    RGB,    /* three bytes per pixel (buffer is width * height * 3 bytes) */
    GREY    /* one byte per pixel (buffer is width * height bytes) */
} color_t;

/* Give the helpers C linkage when this header is included from a C++
 * translation unit (the .cu file is compiled by nvcc). */
#ifdef __cplusplus
extern "C" {
#endif

/* Write `size` bytes from `buff` to descriptor `fd`, repeating write() after
 * partial writes. Returns the number of bytes written, or -1 if write() fails. */
int write_all(int fd, uint8_t *buff, int size);
/* Read bytes from descriptor `fd` into `buff`, repeating read() after partial
 * reads until `size` bytes are stored. Returns the number of bytes read, or -1
 * if read() fails. */
int read_all(int fd, uint8_t *buff, int size);
/* Parse `image_name width height loops rgb|grey` from argv into the output
 * pointers. `*image` receives a malloc'ed copy of argv[1]. Prints a usage
 * message and exits with EXIT_FAILURE when the arguments are not accepted. */
void Usage(int argc, char **argv, char **image, int *width, int *height, int *loops, color_t *imageType);
/* Current time from gettimeofday(), expressed in microseconds. */
uint64_t micro_time(void);

#ifdef __cplusplus
}
#endif

#endif

In [None]:
%%writefile funcs.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include <assert.h>
#include <sys/time.h>
#include "funcs.h"

/*
 * Parse the command line `prog image_name width height loops rgb|grey`.
 *
 * On success stores a malloc'ed copy of argv[1] in *image (the caller owns
 * it), the numeric arguments in *width / *height / *loops and the pixel
 * layout in *imageType.
 *
 * Prints the usage line to stderr and exits with EXIT_FAILURE when the
 * argument count is wrong, the mode is neither "rgb" nor "grey", width or
 * height is not positive, or loops is negative. Also exits with
 * EXIT_FAILURE if the file name cannot be duplicated.
 */
void Usage(int argc, char **argv, char **image, int *width, int *height, int *loops, color_t *imageType) {
    /* argv[0] may legally be missing; never hand NULL to %s. */
    const char *prog = (argc > 0 && argv[0] != NULL) ? argv[0] : "cuda_conv";
    color_t type = RGB;
    int w = 0, h = 0, n = 0;
    int ok = (argc == 6);

    if (ok) {
        if (!strcmp(argv[5], "grey"))
            type = GREY;
        else if (!strcmp(argv[5], "rgb"))
            type = RGB;
        else
            ok = 0;
    }

    if (ok) {
        w = atoi(argv[2]);
        h = atoi(argv[3]);
        n = atoi(argv[4]);
        /* atoi() yields 0 for non-numeric text, so this also rejects garbage
         * sizes; zero loops is allowed and simply means "no blur pass". */
        ok = (w > 0 && h > 0 && n >= 0);
    }

    if (!ok) {
        fprintf(stderr, "Error Input!\n%s image_name width height loops [rgb/grey].\n", prog);
        exit(EXIT_FAILURE);
    }

    size_t name_len = strlen(argv[1]) + 1;  /* include the terminating NUL */
    char *name = (char *)malloc(name_len * sizeof(char));
    if (name == NULL) {
        fprintf(stderr, "%s: out of memory\n", prog);
        exit(EXIT_FAILURE);
    }
    memcpy(name, argv[1], name_len);

    *image = name;
    *width = w;
    *height = h;
    *loops = n;
    *imageType = type;
}

/*
 * Push `size` bytes from `buff` to descriptor `fd`, issuing write() again
 * for whatever a partial write left over.
 * Returns the total number of bytes written, or -1 as soon as write() fails.
 */
int write_all(int fd, uint8_t* buff, int size) {
    int done = 0;

    while (done < size) {
        int chunk = write(fd, buff + done, size - done);
        if (chunk == -1)
            return -1;
        done += chunk;
    }
    return done;
}

/*
 * Read up to `size` bytes from descriptor `fd` into `buff`, calling read()
 * again after partial reads.
 *
 * Returns the number of bytes stored in `buff`. This equals `size` unless
 * end-of-file is reached first, in which case the shorter count is returned
 * so the caller can detect a truncated input. Returns -1 if read() fails.
 */
int read_all(int fd, uint8_t* buff, int size) {
    int total = 0;

    while (total < size) {
        int n = (int)read(fd, buff + total, (size_t)(size - total));
        if (n == -1)
            return -1;
        if (n == 0)
            break;  /* EOF: read() keeps returning 0, so looping on would never end */
        total += n;
    }
    return total;
}

/*
 * Return the current time reported by gettimeofday() in microseconds.
 *
 * The call is made outside assert() on purpose: assert() expands to nothing
 * when NDEBUG is defined, which would otherwise skip gettimeofday() entirely
 * and leave `tv` uninitialized.
 */
uint64_t micro_time(void) {
    struct timeval tv;
    int rc = gettimeofday(&tv, NULL);
    assert(rc == 0);
    (void)rc;  /* silence the unused-variable warning in NDEBUG builds */
    return (uint64_t)tv.tv_sec * 1000 * 1000 + (uint64_t)tv.tv_usec;
}

In [None]:
%%writefile cuda_convolute.h
#ifndef CUDA_CONVOLUTE_H
#define CUDA_CONVOLUTE_H

#include "funcs.h"

/* C linkage so the C driver (main.c) can call the function defined in the
 * CUDA translation unit. */
#ifdef __cplusplus
extern "C" {
#endif

/* Blur the `width` x `height` image held in host buffer `src` on the GPU by
 * launching the 3x3 convolution kernel `loops` times. `src` is both input and
 * output: it is uploaded to the device and device data is copied back into it.
 * The buffer holds width*height bytes for GREY and width*height*3 for RGB.
 * CUDA failures terminate the process through CUDA_SAFE_CALL. */
void gpuConvolute(uint8_t *src, int width, int height, int loops, color_t imageType);

#ifdef __cplusplus
}
#endif

#endif

In [None]:
%%writefile cuda_convolute.cu
#include <stdio.h>
#include <stdlib.h>
#include "cuda_convolute.h"
#include "funcs.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// One thread per pixel of a single-channel image: the thread's x index selects
// the image row and its y index the column. Every interior pixel of `dst`
// receives the 3x3 weighted average (weights sum to 16) of the matching `src`
// neighbourhood; border pixels and threads that fall outside the image write
// nothing.
__global__ void kernel_conv_grey(uint8_t *src, uint8_t *dst, int width, int height) {
    const int weights[3][3] = { {1, 2, 1}, {2, 4, 2}, {1, 2, 1} };
    const size_t row = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t col = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard clause: skip the one-pixel frame and any out-of-range thread.
    if (row == 0 || row >= height - 1 || col == 0 || col >= width - 1)
        return;

    float acc = 0;
    for (int kr = 0; kr < 3; kr++) {
        const int i = (int)row - 1 + kr;        // source row for this weight row
        for (int kc = 0; kc < 3; kc++) {
            const int j = (int)col - 1 + kc;    // source column for this weight column
            acc += src[width * i + j] * weights[kr][kc] / 16.0f;
        }
    }
    dst[width * row + col] = (uint8_t)acc;
}

// One thread per pixel of an interleaved 3-bytes-per-pixel image: the thread's
// x index selects the image row and its y index the pixel column. Each of the
// three channels of every interior pixel is blurred independently with the
// 3x3 weights (sum 16); border pixels and out-of-range threads write nothing.
__global__ void kernel_conv_rgb(uint8_t *src, uint8_t *dst, int width, int height) {
    const int weights[3][3] = { {1, 2, 1}, {2, 4, 2}, {1, 2, 1} };
    const size_t row = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t col = blockIdx.y * blockDim.y + threadIdx.y;  // pixel column, not byte offset

    // Guard clause: skip the one-pixel frame and any out-of-range thread.
    if (row == 0 || row >= height - 1 || col == 0 || col >= width - 1)
        return;

    float red = 0, green = 0, blue = 0;
    for (int kr = 0; kr < 3; kr++) {
        const int i = (int)row - 1 + kr;                    // source row
        for (int kc = 0; kc < 3; kc++) {
            const int j = (int)(col * 3) - 3 + 3 * kc;      // byte offset of the neighbour's first channel
            const int at = (width * 3) * i + j;
            red   += src[at]     * weights[kr][kc] / 16.0f;
            green += src[at + 1] * weights[kr][kc] / 16.0f;
            blue  += src[at + 2] * weights[kr][kc] / 16.0f;
        }
    }

    const size_t out = (width * 3) * row + (col * 3);
    dst[out]     = (uint8_t)red;
    dst[out + 1] = (uint8_t)green;
    dst[out + 2] = (uint8_t)blue;
}

// Blur a host image in place by running the 3x3 convolution kernel `loops`
// times on the GPU.
//
//   src       host buffer, width*height bytes (GREY) or width*height*3 (RGB);
//             overwritten with the blurred image
//   width     pixels per row
//   height    number of rows
//   loops     number of blur passes
//   imageType GREY or RGB pixel layout
//
// Returns without touching `src` when there is nothing to do (NULL buffer,
// non-positive dimensions or non-positive loop count). Any CUDA failure ends
// the process via CUDA_SAFE_CALL.
extern "C" void gpuConvolute(uint8_t *src, int width, int height, int loops, color_t imageType) {
    // A zero-sized grid is an invalid launch configuration, so bail out early.
    if (src == NULL || width <= 0 || height <= 0 || loops <= 0)
        return;

    uint8_t *d_src = NULL, *d_dst = NULL, *tmp = NULL;
    const size_t bytes = (imageType == GREY) ? (size_t)height * width : (size_t)height * width * 3;

    CUDA_SAFE_CALL(cudaMalloc(&d_src, bytes * sizeof(uint8_t)));
    CUDA_SAFE_CALL(cudaMalloc(&d_dst, bytes * sizeof(uint8_t)));

    CUDA_SAFE_CALL(cudaMemcpy(d_src, src, bytes, cudaMemcpyHostToDevice));
    // The kernels never write border pixels. Seed the second buffer with the
    // image as well so both ping-pong buffers carry the original border;
    // a zero-filled buffer would feed a black frame into every other pass.
    CUDA_SAFE_CALL(cudaMemcpy(d_dst, d_src, bytes, cudaMemcpyDeviceToDevice));

    // Launch geometry is identical for both kernels: grid x spans the rows,
    // grid y spans the columns, matching the kernels' x=row / y=column indexing.
    const int blockSize = 16;
    const dim3 block(blockSize, blockSize);
    const dim3 grid(FRACTION_CEILING(height, blockSize), FRACTION_CEILING(width, blockSize));

    for (int t = 0; t < loops; t++) {
        if (imageType == GREY) {
            kernel_conv_grey<<<grid, block>>>(d_src, d_dst, width, height);
        } else if (imageType == RGB) {
            kernel_conv_rgb<<<grid, block>>>(d_src, d_dst, width, height);
        }
        // Launch errors are reported asynchronously; pick them up per pass.
        CUDA_SAFE_CALL(cudaGetLastError());

        // Ping-pong: this pass's output becomes the next pass's input.
        tmp = d_src;
        d_src = d_dst;
        d_dst = tmp;
    }

    CUDA_SAFE_CALL(cudaDeviceSynchronize());

    // Because of the swap at the end of every pass, the most recent output is
    // always in d_src regardless of whether `loops` is odd or even.
    CUDA_SAFE_CALL(cudaMemcpy(src, d_src, bytes, cudaMemcpyDeviceToHost));

    CUDA_SAFE_CALL(cudaFree(d_src));
    CUDA_SAFE_CALL(cudaFree(d_dst));
}

In [None]:
%%writefile main.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdint.h>
#include "cuda_convolute.h"
#include "funcs.h"

/*
 * Command-line driver: read a raw image, blur it on the GPU and write the
 * result to "blur_<image_name>".
 *
 * Arguments are parsed by Usage(). The reported execution time covers file
 * input, the GPU work and file output.
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the image size is unusable, a
 * file cannot be opened, memory runs out, or the input/output transfer is
 * incomplete.
 */
int main(int argc, char** argv) {
    int fd = -1, fd_out = -1, width, height, loops;
    int status = EXIT_FAILURE;
    char *image = NULL;
    char *outImage = NULL;
    uint8_t *src = NULL;
    color_t imageType;

    Usage(argc, argv, &image, &width, &height, &loops, &imageType);

    uint64_t c = micro_time();

    if (width <= 0 || height <= 0) {
        fprintf(stderr, "invalid image size %dx%d\n", width, height);
        goto done;
    }
    size_t bytes = (imageType == GREY) ? (size_t)height * width : (size_t)height * width * 3;

    /* read_all()/write_all() take an int length; refuse sizes that would be
     * silently truncated by the cast. (~0u >> 1 is the largest positive int,
     * spelled without needing <limits.h>.) */
    const size_t max_io = (size_t)(~0u >> 1);
    if (bytes > max_io) {
        fprintf(stderr, "image of %zu bytes is too large\n", bytes);
        goto done;
    }

    if ((fd = open(image, O_RDONLY)) < 0) {
        fprintf(stderr, "cannot open %s\n", argv[1]);
        goto done;
    }
    src = (uint8_t *)calloc(bytes, sizeof(uint8_t));
    if (src == NULL) {
        fprintf(stderr, "cannot allocate %zu bytes\n", bytes);
        goto done;
    }
    /* A short or failed read means the file does not match width/height/mode. */
    if (read_all(fd, src, (int)bytes) != (int)bytes) {
        fprintf(stderr, "cannot read %zu bytes from %s\n", bytes, image);
        goto done;
    }
    close(fd);
    fd = -1;

    gpuConvolute(src, width, height, loops, imageType);

    static const char prefix[] = "blur_";
    outImage = (char *)malloc(sizeof(prefix) + strlen(image));  /* sizeof counts the NUL */
    if (outImage == NULL) {
        fprintf(stderr, "cannot allocate output file name\n");
        goto done;
    }
    strcpy(outImage, prefix);
    strcat(outImage, image);

    /* O_TRUNC: drop any stale bytes left from a previous, larger output. */
    if ((fd_out = open(outImage, O_CREAT | O_WRONLY | O_TRUNC, 0644)) == -1) {
        fprintf(stderr, "cannot open-create %s\n", outImage);
        goto done;
    }
    if (write_all(fd_out, src, (int)bytes) != (int)bytes) {
        fprintf(stderr, "cannot write %zu bytes to %s\n", bytes, outImage);
        goto done;
    }
    close(fd_out);
    fd_out = -1;

    c = micro_time() - c;
    double million = 1000 * 1000;
    fprintf(stdout, "Execution time: %.3f sec\n", c / million);
    status = EXIT_SUCCESS;

done:
    /* Single exit path: release whatever was acquired before the failure. */
    if (fd >= 0)
        close(fd);
    if (fd_out >= 0)
        close(fd_out);
    free(outImage);
    free(src);
    free(image);
    return status;
}

In [None]:
# Compile
!gcc -O3 -c main.c
!gcc -O3 -c funcs.c
!nvcc -O3 -c cuda_convolute.cu
!nvcc -O3 -o cuda_conv main.o funcs.o cuda_convolute.o

In [None]:
# Upload a .raw file from your machine
from google.colab import files

# Keep the returned {filename: bytes} mapping in a variable: as the cell's last
# expression it would be echoed, dumping the entire image into the output.
uploaded = files.upload()
for uploaded_name, uploaded_data in uploaded.items():
    print(f"{uploaded_name}: {len(uploaded_data)} bytes")

In [None]:
# Run (edit these variables to match your input)
image = "waterfall_grey_1920_2520.raw"
width = 1920
height = 2520
loops = 50
mode = "grey"  # "grey" or "rgb"

!./cuda_conv $image $width $height $loops $mode

In [None]:
# Download the output: the program writes its result next to the input,
# under the input's name prefixed with "blur_".
from google.colab import files

blurred_name = f"blur_{image}"
files.download(blurred_name)