In [None]:
# NOTE: This is a workaround to run CUDA in Kaggle / Google Colab. Leaving it here for future reference.

In [15]:
# Install the pinned Nsight Systems package so the `nsys profile` cell further
# down has its profiler available. `-y` answers apt's confirmation prompt.
!apt-get install -y nsight-systems-2025.5.2

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libxcb-cursor0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1
  libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xinput0 libxcb-xkb1
  libxkbcommon-x11-0 libxtst6
The following NEW packages will be installed:
  libxcb-cursor0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1
  libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xinput0 libxcb-xkb1
  libxkbcommon-x11-0 libxtst6 nsight-systems-2025.5.2
0 upgraded, 12 newly installed, 0 to remove and 147 not upgraded.
Need to get 411 MB of archives.
After this operation, 752 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxcb-xinerama0 amd64 1.14-3ubuntu3 [5,414 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  nsight-systems-2025.5.2 2025.5.2.266-255236693005v0 [411 MB]
Get:3 http://archive.ubuntu.

In [8]:
%%writefile add.cu
#include <iostream>
#include <math.h>

// Element-wise vector add on the GPU: y[i] = x[i] + y[i] for every i in [0, n).
//
// Parameters:
//   n - number of elements to process
//   x - input array of at least n floats (read only)
//   y - input/output array of at least n floats (updated in place)
//
// Each thread starts at its own global index and advances by the total number
// of launched threads, so the work is partitioned without overlap for any
// launch configuration. With a <<<1, 1>>> launch the start index is 0 and the
// step is 1, i.e. one thread walks the whole array exactly as before.
__global__ void add(int n, float *x, float *y) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;  // this thread's first element
  int stride = blockDim.x * gridDim.x;                // total threads in the grid
  for (int i = index; i < n; i += stride)
    y[i] = x[i] + y[i];
}

// Adds two 1M-element float arrays on the GPU and prints the maximum deviation
// from the expected result (3.0f) to stdout.
//
// Returns 0 on success, 1 if a CUDA allocation, the kernel launch, or the
// kernel execution fails (the CUDA error string is written to stderr).
int main(void) {
  int N = 1 << 20; // 1M elements

  // Start from nullptr so the cleanup paths below can free unconditionally:
  // cudaFree on a null pointer performs no operation.
  float *x = nullptr, *y = nullptr;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaError_t err = cudaMallocManaged(&x, N * sizeof(float));
  if (err == cudaSuccess)
    err = cudaMallocManaged(&y, N * sizeof(float));
  if (err != cudaSuccess) {
    std::cerr << "cudaMallocManaged failed: " << cudaGetErrorString(err)
              << std::endl;
    cudaFree(x);
    cudaFree(y);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 1>>>(N, x, y);

  // A kernel launch returns no status itself: cudaGetLastError reports launch
  // failures, and cudaDeviceSynchronize both waits for the GPU to finish and
  // reports errors raised while the kernel was running. Without these checks
  // a kernel that never ran would only show up as a wrong "Max error" value.
  err = cudaGetLastError();
  if (err == cudaSuccess)
    err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    std::cerr << "add kernel failed: " << cudaGetErrorString(err) << std::endl;
    cudaFree(x);
    cudaFree(y);
    return 1;
  }

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmaxf(maxError, fabsf(y[i] - 3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}

Overwriting add.cu


In [9]:
# Compile add.cu (written by the %%writefile cell) with nvcc into an executable named `add`.
!nvcc add.cu -o add

In [10]:
 # Run the compiled binary; it prints "Max error: <value>", the largest deviation from 3.0f.
 !./add

Max error: 0


In [14]:
# Profile the binary with nvprof; the report lists GPU kernel time and per-call CUDA API timings.
# NOTE(review): nvprof is a legacy profiler and may be unavailable on newer GPUs/toolkits —
# confirm it is supported on the target runtime, otherwise rely on the nsys cell.
!nvprof ./add

==998== NVPROF is profiling process 998, command: ./add
Max error: 0
==998== Profiling application: ./add
==998== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  99.345ms         1  99.345ms  99.345ms  99.345ms  add(int, float*, float*)
      API calls:   66.60%  212.28ms         2  106.14ms  70.892us  212.21ms  cudaMallocManaged
                   31.18%  99.366ms         1  99.366ms  99.366ms  99.366ms  cudaDeviceSynchronize
                    1.55%  4.9489ms       228  21.705us     114ns  1.4120ms  cuDeviceGetAttribute
                    0.45%  1.4277ms         1  1.4277ms  1.4277ms  1.4277ms  cudaLaunchKernel
                    0.21%  660.31us         2  330.15us  311.97us  348.33us  cudaFree
                    0.01%  20.002us         2  10.001us  7.6280us  12.374us  cuDeviceGetName
                    0.00%  7.6630us         2  3.8310us  2.7750us  4.8880us  cuDeviceGetPCIBusId
                    0.

In [17]:
# Profile the binary with Nsight Systems; --stats=true prints the summary reports
# (e.g. osrt_sum) after the data has been collected.
!nsys profile --stats=true ./add

Collecting data...
Max error: 0
Generating '/tmp/nsys-report-5887.qdstrm'
[2KProcessing 1256 events:        ] report1.sqlite
[3/8] Executing 'nvtx_sum' stats report
SKIPPED: /kaggle/working/report1.sqlite does not contain NV Tools Extension (NVTX) data.
[4/8] Executing 'osrt_sum' stats report

 Time (%)  Total Time (ns)  Num Calls    Avg (ns)      Med (ns)    Min (ns)   Max (ns)    StdDev (ns)            Name         
 --------  ---------------  ---------  ------------  ------------  --------  -----------  ------------  ----------------------
     79.8      875,359,381         28  31,262,835.0  10,101,358.0     2,463  443,595,422  83,755,921.6  poll                  
     19.6      214,768,341        667     321,991.5      18,406.0     1,179   19,850,016   1,132,833.5  ioctl                 
      0.2        2,391,766         31      77,153.7      13,044.0     7,570    1,667,597     296,141.1  mmap64                
      0.1        1,111,113         10     111,111.3      76,891.0    