In [2]:
!nvidia-smi

Mon May  5 18:50:56 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [4]:
!pip install nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1


In [5]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpsg8e8n9o".


In [11]:
code = r"""
#include <iostream>
#include <climits>
#include <chrono>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256

__global__ void reduceMin(int* input, int* output, int size) {
    __shared__ int sdata[BLOCK_SIZE];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (i < size) ? input[i] : INT_MAX;
    __syncthreads();

    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            sdata[tid] = min(sdata[tid], sdata[tid + stride]);
        }
        __syncthreads();
    }

    if (tid == 0) output[blockIdx.x] = sdata[0];
}

__global__ void reduceMax(int* input, int* output, int size) {
    __shared__ int sdata[BLOCK_SIZE];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (i < size) ? input[i] : INT_MIN;
    __syncthreads();

    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            sdata[tid] = max(sdata[tid], sdata[tid + stride]);
        }
        __syncthreads();
    }

    if (tid == 0) output[blockIdx.x] = sdata[0];
}

__global__ void reduceSum(int* input, int* output, int size) {
    __shared__ int sdata[BLOCK_SIZE];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (i < size) ? input[i] : 0;
    __syncthreads();

    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            sdata[tid] += sdata[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) output[blockIdx.x] = sdata[0];
}

inline cudaError_t checkCudaError(cudaError_t err, const char* msg) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA Error: " << msg << " - " << cudaGetErrorString(err) << std::endl;
        exit(EXIT_FAILURE);
    }
    return err;
}

int main() {
    const int size = 1 << 10;
    int* h_input = (int*)malloc(size * sizeof(int));
    for (int i = 0; i < size; i++) {
        h_input[i] = rand() % 100 + 1;
    }

    int cpu_min = INT_MAX, cpu_max = INT_MIN, cpu_sum = 0;
    auto start_cpu = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < size; i++) {
        cpu_min = std::min(cpu_min, h_input[i]);
        cpu_max = std::max(cpu_max, h_input[i]);
        cpu_sum += h_input[i];
    }
    auto end_cpu = std::chrono::high_resolution_clock::now();
    auto cpu_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_cpu - start_cpu).count();
    float cpu_avg = (float)cpu_sum / size;

    int *d_input, *d_output_min, *d_output_max, *d_output_sum;
    int gridSize = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;

    checkCudaError(cudaMalloc((void**)&d_input, size * sizeof(int)), "Malloc d_input");
    checkCudaError(cudaMalloc((void**)&d_output_min, gridSize * sizeof(int)), "Malloc d_output_min");
    checkCudaError(cudaMalloc((void**)&d_output_max, gridSize * sizeof(int)), "Malloc d_output_max");
    checkCudaError(cudaMalloc((void**)&d_output_sum, gridSize * sizeof(int)), "Malloc d_output_sum");

    checkCudaError(cudaMemcpy(d_input, h_input, size * sizeof(int), cudaMemcpyHostToDevice), "Memcpy input");

    cudaEvent_t start_gpu, stop_gpu;
    checkCudaError(cudaEventCreate(&start_gpu), "Create event start");
    checkCudaError(cudaEventCreate(&stop_gpu), "Create event stop");

    cudaEventRecord(start_gpu);

    reduceMin<<<gridSize, BLOCK_SIZE>>>(d_input, d_output_min, size);
    reduceMax<<<gridSize, BLOCK_SIZE>>>(d_input, d_output_max, size);
    reduceSum<<<gridSize, BLOCK_SIZE>>>(d_input, d_output_sum, size);

    checkCudaError(cudaGetLastError(), "Kernel launch failed");
    cudaEventRecord(stop_gpu);
    cudaEventSynchronize(stop_gpu);

    float gpu_time = 0.0f;
    cudaEventElapsedTime(&gpu_time, start_gpu, stop_gpu);

    int* h_output_min = (int*)malloc(gridSize * sizeof(int));
    int* h_output_max = (int*)malloc(gridSize * sizeof(int));
    int* h_output_sum = (int*)malloc(gridSize * sizeof(int));

    checkCudaError(cudaMemcpy(h_output_min, d_output_min, gridSize * sizeof(int), cudaMemcpyDeviceToHost), "Memcpy min output");
    checkCudaError(cudaMemcpy(h_output_max, d_output_max, gridSize * sizeof(int), cudaMemcpyDeviceToHost), "Memcpy max output");
    checkCudaError(cudaMemcpy(h_output_sum, d_output_sum, gridSize * sizeof(int), cudaMemcpyDeviceToHost), "Memcpy sum output");

    int gpu_min = INT_MAX, gpu_max = INT_MIN, gpu_sum = 0;
    for (int i = 0; i < gridSize; i++) {
        gpu_min = std::min(gpu_min, h_output_min[i]);
        gpu_max = std::max(gpu_max, h_output_max[i]);
        gpu_sum += h_output_sum[i];
    }
    float gpu_avg = (float)gpu_sum / size;

    std::cout << "\nCPU Results:\n";
    std::cout << "Min: " << cpu_min << " | Max: " << cpu_max << " | Sum: " << cpu_sum
              << " | Avg: " << cpu_avg << " | Time: " << cpu_time << " ms\n";

    std::cout << "\nGPU Results:\n";
    std::cout << "Min: " << gpu_min << " | Max: " << gpu_max << " | Sum: " << gpu_sum
              << " | Avg: " << gpu_avg << " | Time: " << gpu_time << " ms\n";

    free(h_input);
    free(h_output_min);
    free(h_output_max);
    free(h_output_sum);
    cudaFree(d_input);
    cudaFree(d_output_min);
    cudaFree(d_output_max);
    cudaFree(d_output_sum);
    cudaEventDestroy(start_gpu);
    cudaEventDestroy(stop_gpu);

    return 0;
}

"""


with open("main.cu", "w") as f:
    f.write(code)

In [12]:
!nvcc -arch=sm_75 main.cu -o main

In [13]:

!./main


CPU Results:
Min: 1 | Max: 100 | Sum: 52557 | Avg: 51.3252 | Time: 0 ms

GPU Results:
Min: 1 | Max: 100 | Sum: 52557 | Avg: 51.3252 | Time: 0.165888 ms
