In [1]:
# Install NVIDIA Nsight Systems, pinned to 2025.5.2, for the `nsys profile` cells further down.
!apt-get install -y nsight-systems-2025.5.2

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libxcb-cursor0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1
  libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xinput0 libxcb-xkb1
  libxkbcommon-x11-0 libxtst6
The following NEW packages will be installed:
  libxcb-cursor0 libxcb-icccm4 libxcb-image0 libxcb-keysyms1
  libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xinput0 libxcb-xkb1
  libxkbcommon-x11-0 libxtst6 nsight-systems-2025.5.2
0 upgraded, 12 newly installed, 0 to remove and 123 not upgraded.
Need to get 411 MB of archives.
After this operation, 752 kB of additional disk space will be used.
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  nsight-systems-2025.5.2 2025.5.2.266-255236693005v0 [411 MB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libxcb-xinerama0 amd64 1.14-3ubuntu3 [5,414 B]
Get:3 http://archive.ubuntu.

# Simple Start

In [2]:
%%writefile add.cu
#include <iostream>
#include <math.h>

// Element-wise y[i] = x[i] + y[i] over the first n entries.
// The loop uses no thread or block index, so every launched thread walks the
// whole range on its own; main launches this kernel as <<<1, 1>>>.
__global__ void add(int n, float *x, float *y) {
  int idx = 0;
  while (idx < n) {
    y[idx] = x[idx] + y[idx];
    ++idx;
  }
}

// Print a diagnostic on stderr for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess)
    return true;
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two 1M-element float arrays on the GPU with a single thread and prints
// the largest deviation from the expected value 3.0f.
// Returns 0 on success and 1 if an allocation, the launch, or the kernel fails.
int main(void) {
  const int N = 1 << 20; // 1M elements
  const size_t bytes = N * sizeof(float);

  // Start from null so the error path can free unconditionally.
  float *x = nullptr, *y = nullptr;
  // Allocate Unified Memory – accessible from CPU or GPU
  if (!cudaOk(cudaMallocManaged(&x, bytes), "cudaMallocManaged(x)") ||
      !cudaOk(cudaMallocManaged(&y, bytes), "cudaMallocManaged(y)")) {
    cudaFree(x); // freeing a null pointer is a no-op
    cudaFree(y);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 1>>>(N, x, y);

  // A rejected launch is reported by cudaGetLastError(); faults raised while
  // the kernel runs only surface once the host waits for the GPU to finish.
  bool ok = cudaOk(cudaGetLastError(), "add kernel launch");
  ok = cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;

  if (ok) {
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;
  }

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return ok ? 0 : 1;
}

Writing add.cu


In [3]:
# Compile the single-thread example with nvcc into the `add` executable.
!nvcc add.cu -o add

In [4]:
# Run the single-thread binary; it prints the maximum deviation from 3.0f.
!./add

Max error: 0


In [5]:
# Profile the single-thread binary with nvprof (GPU activity and CUDA API call timings).
!nvprof ./add

==285== NVPROF is profiling process 285, command: ./add
Max error: 0
==285== Profiling application: ./add
==285== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  81.851ms         1  81.851ms  81.851ms  81.851ms  add(int, float*, float*)
      API calls:   69.41%  198.07ms         2  99.037ms  46.502us  198.03ms  cudaMallocManaged
                   28.69%  81.857ms         1  81.857ms  81.857ms  81.857ms  cudaDeviceSynchronize
                    1.37%  3.9012ms       228  17.110us     109ns  1.3801ms  cuDeviceGetAttribute
                    0.33%  937.81us         1  937.81us  937.81us  937.81us  cudaLaunchKernel
                    0.19%  549.88us         2  274.94us  229.17us  320.70us  cudaFree
                    0.01%  19.819us         2  9.9090us  6.0350us  13.784us  cuDeviceGetName
                    0.00%  6.1670us         4  1.5410us     214ns  5.4480us  cuDeviceGet
                    0.00%  4.2

In [6]:
# Profile the single-thread binary with Nsight Systems; --stats=true prints the summary reports after collection.
!nsys profile --stats=true ./add

Collecting data...
Max error: 0
Generating '/tmp/nsys-report-556a.qdstrm'
[2KProcessing 1238 events:        ] report1.sqlite
[3/8] Executing 'nvtx_sum' stats report
SKIPPED: /kaggle/working/report1.sqlite does not contain NV Tools Extension (NVTX) data.
[4/8] Executing 'osrt_sum' stats report

 Time (%)  Total Time (ns)  Num Calls    Avg (ns)      Med (ns)    Min (ns)   Max (ns)    StdDev (ns)            Name         
 --------  ---------------  ---------  ------------  ------------  --------  -----------  ------------  ----------------------
     82.0      829,826,688         28  29,636,667.4  10,079,365.5     4,239  398,278,699  75,552,314.4  poll                  
     17.4      176,308,935        667     264,331.2      17,286.0     1,175   18,743,713     914,830.0  ioctl                 
      0.2        2,081,493         31      67,144.9      12,285.0     6,907    1,487,137     264,174.1  mmap64                
      0.1          929,135         10      92,913.5      57,138.5    

# Now Making it More Parallel



In [7]:
%%writefile add_parallel.cu
#include <iostream>
#include <math.h>

// Element-wise y[i] = x[i] + y[i] over the first n entries, with the work
// divided among all threads of the launch.
//
// Each thread starts at its global index and advances by the total number of
// launched threads (grid-stride loop), so any <<<blocks, threads>>> shape
// visits each element exactly once. For a single-block launch this reduces to
// starting at threadIdx.x and stepping by blockDim.x.
__global__ void add(int n, float *x, float *y) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = index; i < n; i += stride)
    y[i] = x[i] + y[i];
}

// Print a diagnostic on stderr for a failed CUDA runtime call.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess)
    return true;
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two 1M-element float arrays on the GPU using one block of 256 threads
// and prints the largest deviation from the expected value 3.0f.
// Returns 0 on success and 1 if an allocation, the launch, or the kernel fails.
int main(void) {
  const int N = 1 << 20; // 1M elements
  const size_t bytes = N * sizeof(float);
  const int threadsPerBlock = 256;

  // Start from null so the error path can free unconditionally.
  float *x = nullptr, *y = nullptr;
  // Allocate Unified Memory – accessible from CPU or GPU
  if (!cudaOk(cudaMallocManaged(&x, bytes), "cudaMallocManaged(x)") ||
      !cudaOk(cudaMallocManaged(&y, bytes), "cudaMallocManaged(y)")) {
    cudaFree(x); // freeing a null pointer is a no-op
    cudaFree(y);
    return 1;
  }

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, threadsPerBlock>>>(N, x, y);

  // A rejected launch is reported by cudaGetLastError(); faults raised while
  // the kernel runs only surface once the host waits for the GPU to finish.
  bool ok = cudaOk(cudaGetLastError(), "add kernel launch");
  ok = cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;

  if (ok) {
    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++)
      maxError = fmax(maxError, fabs(y[i] - 3.0f));
    std::cout << "Max error: " << maxError << std::endl;
  }

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return ok ? 0 : 1;
}

Writing add_parallel.cu


In [8]:
# Compile the 256-thread version with nvcc into the `add_parallel` executable.
!nvcc add_parallel.cu -o add_parallel

In [9]:
# Run the 256-thread binary; it prints the maximum deviation from 3.0f.
!./add_parallel

Max error: 0


In [10]:
# Profile the 256-thread binary with nvprof to compare kernel time against the single-thread run.
!nvprof ./add_parallel

==405== NVPROF is profiling process 405, command: ./add_parallel
Max error: 0
==405== Profiling application: ./add_parallel
==405== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  3.5100ms         1  3.5100ms  3.5100ms  3.5100ms  add(int, float*, float*)
      API calls:   95.68%  202.55ms         2  101.27ms  44.086us  202.50ms  cudaMallocManaged
                    2.28%  4.8255ms       228  21.164us     105ns  1.3946ms  cuDeviceGetAttribute
                    1.66%  3.5154ms         1  3.5154ms  3.5154ms  3.5154ms  cudaDeviceSynchronize
                    0.24%  516.02us         2  258.01us  237.78us  278.24us  cudaFree
                    0.12%  259.96us         1  259.96us  259.96us  259.96us  cudaLaunchKernel
                    0.01%  16.481us         2  8.2400us  5.2670us  11.214us  cuDeviceGetName
                    0.00%  6.2150us         4  1.5530us     195ns  5.5340us  cuDeviceGet
            

In [11]:
# Profile the 256-thread binary built in this section (not the single-thread ./add) with Nsight Systems.
!nsys profile --stats=true ./add_parallel

Collecting data...
Max error: 0
Generating '/tmp/nsys-report-9362.qdstrm'
[2KProcessing 1241 events:        ] report2.sqlite
[3/8] Executing 'nvtx_sum' stats report
SKIPPED: /kaggle/working/report2.sqlite does not contain NV Tools Extension (NVTX) data.
[4/8] Executing 'osrt_sum' stats report

 Time (%)  Total Time (ns)  Num Calls    Avg (ns)      Med (ns)    Min (ns)   Max (ns)    StdDev (ns)            Name         
 --------  ---------------  ---------  ------------  ------------  --------  -----------  ------------  ----------------------
     81.0      826,181,608         28  29,506,486.0  10,094,323.0     2,260  394,937,338  74,913,405.3  poll                  
     18.5      188,242,038        667     282,221.9      15,535.0     1,115   18,402,176     960,524.0  ioctl                 
      0.2        2,073,951         31      66,901.6      12,183.0     6,781    1,445,110     256,538.9  mmap64                
      0.1        1,016,500         10     101,650.0      49,643.0    

In [12]:
# List the GPUs visible to this session together with the driver and CUDA versions.
!nvidia-smi

Wed Feb  4 07:17:02 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.105.08             Driver Version: 580.105.08     CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   52C    P8             15W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00