In [1]:
# Check whether the CUDA toolkit is installed under /usr/local
!ls /usr/local

bin  etc  games  include  lib  man  sbin  share  src


In [2]:
# Check whether the nvcc command is available on the PATH
!which nvcc

/home/student_15030/cudaProj/cuda-12.4/bin/nvcc


In [3]:
# Show the properties of the NVIDIA card (this machine has an RTX A6000)
!nvidia-smi

Wed Jun  4 13:04:47 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.120                Driver Version: 550.120        CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A6000               Off |   00000000:11:00.0 Off |                  Off |
| 30%   32C    P8             24W /  300W |     398MiB /  49140MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
%%writefile coordinate.cu

#include <cuda_runtime.h>
#include <stdio.h>

/*
 * Debug kernel: each thread prints its thread index, its block index, the 2D
 * coordinate (ix,iy) it maps to, the flattened row-major index iy*nx+ix, and
 * the value stored in A at that index.
 *
 * Launch layout: 2D grid of 2D blocks. The grid may be rounded up past the
 * nx-by-ny extent; threads that fall outside it return without touching A.
 *
 * A  : device-accessible array holding at least nx*ny ints, nx ints per row
 * nx : number of columns (extent in x)
 * ny : number of rows (extent in y)
 */
__global__ void printCoordinate(int *A,const int nx,const int ny)
{
  const int ix=threadIdx.x+blockIdx.x*blockDim.x;
  const int iy=threadIdx.y+blockIdx.y*blockDim.y;

  // A ceil-divided grid can cover more threads than elements; without this
  // guard those extra threads would read A out of range.
  if(ix>=nx || iy>=ny)
  {
    return;
  }

  // Signed index so that it matches the %d conversion used below.
  const int idx=iy*nx+ix;

  // The two adjacent literals are concatenated by the compiler; there is no
  // separator between "coordinate(..)" and "global index" in the output.
  printf("thread_id(%d,%d) block_id(%d,%d) coordinate(%d,%d)"
          "global index %2d ival %2d\n",threadIdx.x,threadIdx.y,
          blockIdx.x,blockIdx.y,ix,iy,idx,A[idx]);
}

/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns true when `status` is cudaSuccess, false otherwise.
 */
static bool checkCuda(cudaError_t status,const char *what)
{
  if(status==cudaSuccess)
  {
    return true;
  }
  fprintf(stderr,"CUDA error in %s: %s\n",what,cudaGetErrorString(status));
  return false;
}

/*
 * Allocates an 8x6 int matrix filled with zeros on the host, copies it to
 * device 0, and launches printCoordinate with 4x2 blocks so that every thread
 * prints the coordinate it is responsible for.
 *
 * Returns 0 on success and 1 if the host allocation or any CUDA call fails.
 */
int main(int argc,char ** argv)
{
  (void)argc;
  (void)argv;

  if(!checkCuda(cudaSetDevice(0),"cudaSetDevice"))
  {
    return 1;
  }

  const int nx = 8;
  const int ny = 6;
  const size_t bytes = (size_t)nx*ny*sizeof(int);

  // Host buffer, zero-initialised
  int* A_host = (int*)malloc(bytes);
  if(A_host==NULL)
  {
    fprintf(stderr,"host allocation of %zu bytes failed\n",bytes);
    return 1;
  }
  for(int i = 0; i<nx*ny; ++i){
      A_host[i] = 0;
  }

  // Device buffer; stays NULL if cudaMalloc fails so the cleanup below is safe
  int *A_dev=NULL;
  int exitCode = 1;

  if(checkCuda(cudaMalloc((void**)&A_dev, bytes),"cudaMalloc") &&
     checkCuda(cudaMemcpy(A_dev, A_host, bytes, cudaMemcpyHostToDevice),"cudaMemcpy"))
  {
    // Ceil-division so the grid covers the whole nx-by-ny extent
    dim3 block(4,2);
    dim3 grid((nx-1)/block.x+1,(ny-1)/block.y+1);

    printCoordinate<<<grid,block>>>(A_dev,nx,ny);

    // A launch does not return a status: bad launch configurations show up in
    // cudaGetLastError(), faults during execution at the next synchronisation.
    // The synchronisation also flushes the device-side printf output.
    if(checkCuda(cudaGetLastError(),"printCoordinate launch") &&
       checkCuda(cudaDeviceSynchronize(),"cudaDeviceSynchronize"))
    {
      exitCode = 0;
    }
  }

  // Free (single cleanup path for both success and failure)
  cudaFree(A_dev);
  free(A_host);

  cudaDeviceReset();
  return exitCode;
}

Overwriting coordinate.cu


In [6]:
# Compile the code. -arch sets the target GPU architecture; adjust it to match your card (e.g. sm_86 for the RTX A6000 shown above).
!nvcc -arch=sm_75 coordinate.cu -o coordinate

In [7]:
!./coordinate

thread_id(0,0) block_id(1,1) coordinate(4,2)global index 20 ival  0
thread_id(1,0) block_id(1,1) coordinate(5,2)global index 21 ival  0
thread_id(2,0) block_id(1,1) coordinate(6,2)global index 22 ival  0
thread_id(3,0) block_id(1,1) coordinate(7,2)global index 23 ival  0
thread_id(0,1) block_id(1,1) coordinate(4,3)global index 28 ival  0
thread_id(1,1) block_id(1,1) coordinate(5,3)global index 29 ival  0
thread_id(2,1) block_id(1,1) coordinate(6,3)global index 30 ival  0
thread_id(3,1) block_id(1,1) coordinate(7,3)global index 31 ival  0
thread_id(0,0) block_id(0,2) coordinate(0,4)global index 32 ival  0
thread_id(1,0) block_id(0,2) coordinate(1,4)global index 33 ival  0
thread_id(2,0) block_id(0,2) coordinate(2,4)global index 34 ival  0
thread_id(3,0) block_id(0,2) coordinate(3,4)global index 35 ival  0
thread_id(0,1) block_id(0,2) coordinate(0,5)global index 40 ival  0
thread_id(1,1) block_id(0,2) coordinate(1,5)global index 41 ival  0
thread_id(2,1) block_id(0,2) coordinate(2,5)glob