<a href="https://colab.research.google.com/github/ziwon/learning-cuda/blob/main/ch03/ch03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Show which CUDA compiler (nvcc) release is installed in this runtime.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [2]:
# Sanity check: ask PyTorch whether it can see a CUDA device in this runtime.
import torch
print(torch.cuda.is_available())

True


In [3]:
# Record the Linux distribution and release of the runtime.
!lsb_release -a

No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 22.04.3 LTS
Release:	22.04
Codename:	jammy


In [4]:
# Summary table: driver/CUDA versions, attached GPU, power and memory usage.
!nvidia-smi

Sun Mar 10 13:29:35 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8              10W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
# List each GPU's index and name as CSV, without a header row.
!nvidia-smi --query-gpu=index,name --format=csv,noheader

0, Tesla T4


In [6]:
# Query name, PCI bus id, driver version and performance state as CSV (with header).
!nvidia-smi --query-gpu=gpu_name,gpu_bus_id,driver_version,pstate --format=csv

name, pci.bus_id, driver_version, pstate
Tesla T4, 00000000:00:04.0, 535.104.05, P8


In [7]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-fekwkdy9
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-fekwkdy9
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 781ff5b76ba6c4c2d80dcbbec9983e147613cc71
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [8]:
# Load the nvcc4jupyter extension; the %%cuda cells below rely on it.
%load_ext nvcc4jupyter

Source files will be saved in "/tmp/tmp_kq2snfo".


In [9]:
%%cuda
#include<stdio.h>
// Kernel: every launched thread prints one greeting line via device-side printf.
__global__ void hello()
{
    printf("GPU: Hello!\n");
}
// Host entry point: prints a greeting on the CPU, then launches `hello` with
// one block of 10 threads so the greeting is printed once per GPU thread.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main(int argc,char **argv)
{
    printf("CPU: Hello!\n");
    hello<<<1,10>>>();

    // A kernel launch returns nothing, so configuration errors must be
    // fetched explicitly; execution errors surface when we wait for the device.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
        printf("CUDA error: %s\n", cudaGetErrorString(status));

    // Tear down the context explicitly before exiting.
    cudaDeviceReset();
    return status == cudaSuccess ? 0 : 1;
}

CPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!
GPU: Hello!



In [10]:
%%cuda
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Prints the free and total device memory, in bytes, as reported by
// cudaMemGetInfo. If the query itself fails, the error name is printed instead
// so that stale or uninitialised numbers are never shown.
void checkDeviceMemory(void) {
    // Named *Bytes rather than `free` to avoid shadowing the C library function.
    size_t freeBytes = 0, totalBytes = 0;

    cudaError_t status = cudaMemGetInfo(&freeBytes, &totalBytes);
    if (status != cudaSuccess) {
        printf("cudaMemGetInfo - %s\n", cudaGetErrorName(status));
        return;
    }

    // %zu is the matching conversion for size_t (the previous %lld expected a
    // signed long long).
    printf("Device memory (free/total) = %zu/%zu bytes\n", freeBytes, totalBytes);
}

// Demonstrates how the CUDA runtime reports errors: it requests an allocation
// of sizeof(int) * 12 Gi elements, then prints the status of the allocation,
// of a memset on the resulting pointer, and of the free, showing the device
// memory figures before and after.
int main(void) {
    // Start from NULL: cudaMalloc does not have to write the pointer when it
    // fails, and passing an indeterminate pointer to cudaMemset/cudaFree below
    // would be undefined. With NULL the follow-up calls are well defined.
    int *dDataPtr = NULL;
    cudaError_t errorCode;

    checkDeviceMemory();
    // sizeof yields size_t, so the whole product is evaluated in size_t and
    // does not overflow int arithmetic.
    errorCode = cudaMalloc(&dDataPtr, sizeof(int) * 1024 * 1024 * 1024 * 12);
    printf("cudaMalloc - %s\n", cudaGetErrorName(errorCode));
    checkDeviceMemory();

    // Intentionally attempted even if the allocation failed, to show the
    // error the runtime reports for an invalid device pointer.
    errorCode = cudaMemset(dDataPtr, 0, sizeof(int) * 1024 * 1024);
    printf("cudaMemset - %s\n", cudaGetErrorName(errorCode));

    errorCode = cudaFree(dDataPtr);
    printf("cudaFree - %s\n", cudaGetErrorName(errorCode));
    checkDeviceMemory();

    return 0;
}

Device memory (free/total) = 15727656960/15835660288 bytes
cudaMalloc - cudaErrorMemoryAllocation
Device memory (free/total) = 15727656960/15835660288 bytes
cudaMemset - cudaErrorInvalidValue
cudaFree - cudaSuccess
Device memory (free/total) = 15727656960/15835660288 bytes



In [11]:
# Query NVLink status. The recorded run printed nothing for this command
# (presumably no NVLink links on this GPU; verify on your runtime).
!nvidia-smi nvlink --status

In [12]:
# Detailed memory report for each attached GPU (framebuffer, BAR1, ...).
!nvidia-smi -q -d MEMORY



Timestamp                                 : Sun Mar 10 13:29:51 2024
Driver Version                            : 535.104.05
CUDA Version                              : 12.2

Attached GPUs                             : 1
GPU 00000000:00:04.0
    FB Memory Usage
        Total                             : 15360 MiB
        Reserved                          : 257 MiB
        Used                              : 3 MiB
        Free                              : 15099 MiB
    BAR1 Memory Usage
        Total                             : 256 MiB
        Used                              : 3 MiB
        Free                              : 253 MiB
    Conf Compute Protected Memory Usage
        Total                             : 0 MiB
        Used                              : 0 MiB
        Free                              : 0 MiB



In [13]:
from IPython.core.magic import register_cell_magic
import subprocess
import shlex
import os

@register_cell_magic
def cu(line: str, cell: str) -> None:
    """Jupyter Notebook cell magic to compile and run CUDA C code.

    The cell body is written to ``temp.cu`` in the current working directory,
    compiled with ``nvcc`` into ``temp.out`` and executed. Compiler
    diagnostics and the program's stdout/stderr are shown in the cell output.
    Both temporary files are removed afterwards, including when compilation
    or execution fails.

    Args:
        line: Text following ``%%cu`` on the magic line (unused).
        cell: CUDA C source code to compile and run.

    Raises:
        subprocess.CalledProcessError: If ``nvcc`` exits with a non-zero status.
    """
    import sys  # local import: only needed to forward stderr

    source_file_name = 'temp.cu'
    executable_name = 'temp.out'

    try:
        with open(source_file_name, 'w') as f:
            f.write(cell)

        print("---------------------------------------")
        executable_file_path = os.path.abspath(executable_name)
        print("Path:", executable_file_path)

        # Capture the compiler output so warnings and errors appear in the
        # notebook cell instead of only on the kernel's own stdout/stderr.
        compile_command = f"nvcc {source_file_name} -o {executable_name}"
        compiled = subprocess.run(
            shlex.split(compile_command), text=True, capture_output=True
        )
        if compiled.stdout:
            print(compiled.stdout, end="")
        if compiled.stderr:
            print(compiled.stderr, end="", file=sys.stderr)
        # Same exception type as the previous check=True behaviour.
        compiled.check_returncode()

        run_command = f"./{executable_name}"
        result = subprocess.run(shlex.split(run_command), text=True, capture_output=True)
        print(result.stdout)
        # Do not silently drop diagnostics or a crash of the compiled program.
        if result.stderr:
            print(result.stderr, end="", file=sys.stderr)
        if result.returncode != 0:
            print(
                f"{executable_name} exited with status {result.returncode}",
                file=sys.stderr,
            )
    finally:
        # Always clean up; either file may be missing if an earlier step failed.
        for leftover in (source_file_name, executable_name):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass

In [14]:
%%cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Kernel: each thread prints the array element at its own threadIdx.x,
// without any separator.
__global__ void printData(int* _dDataPtr)
{
    const int value = _dDataPtr[threadIdx.x];
    printf("%d", value);
}

// Kernel: each thread overwrites the array element at its own threadIdx.x with 2.
__global__ void setData(int* _dDataPtr)
{
    int* slot = _dDataPtr + threadIdx.x;
    *slot = 2;
}

// Returns the launch error of the most recent kernel if there is one;
// otherwise waits for the device to finish and returns that result.
static cudaError_t waitForKernel(void) {
    cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess) return launchStatus;
    return cudaDeviceSynchronize();
}

// Walks a 10-int buffer through the host/device round trip:
//   1. zero it on the device and print it from a kernel,
//   2. copy host data (all 1s) to the device and print it from a kernel,
//   3. overwrite it on the device (all 2s), copy it back and print it on the host.
// Every runtime call is checked; the first failure is reported and the
// remaining steps are skipped. Returns 0 on success, 1 on any CUDA error.
int main(void) {
    int  data[10] = { 0 }; // host memory area
    for (int i = 0; i < 10; i++) data[i] = 1;

    int* dDataPtr = NULL; // device memory area (NULL so cudaFree is safe on failure)
    cudaError_t status = cudaMalloc(&dDataPtr, sizeof(int) * 10);
    if (status == cudaSuccess)
        status = cudaMemset(dDataPtr, 0, sizeof(int) * 10);

    if (status == cudaSuccess) {
        printf("Data in device: ");
        printData <<<1, 10>>> (dDataPtr);
        // Wait explicitly so the kernel's output is complete before moving on,
        // rather than relying on the next memcpy to synchronise.
        status = waitForKernel();
    }

    if (status == cudaSuccess)
        status = cudaMemcpy(dDataPtr, data, sizeof(int) * 10, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        printf("\nHost -> Device: ");
        printData <<<1, 10>>> (dDataPtr);
        status = waitForKernel();
    }

    if (status == cudaSuccess) {
        setData <<<1, 10>>> (dDataPtr);
        status = waitForKernel();
    }

    if (status == cudaSuccess)
        status = cudaMemcpy(data, dDataPtr, sizeof(int) * 10, cudaMemcpyDeviceToHost);

    if (status == cudaSuccess) {
        printf("\nDevice -> Host: ");
        for (int i = 0; i < 10; i++) printf("%d", data[i]);
    } else {
        printf("\nCUDA error: %s\n", cudaGetErrorString(status));
    }

    cudaFree(dDataPtr);
    return status == cudaSuccess ? 0 : 1;
}

---------------------------------------
Path: /content/temp.out
Data in device: 0000000000
Host -> Device: 1111111111
Device -> Host: 2222222222


In [15]:
%%cu

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NUM_DATA 1024

// CPU-only reference: fills two NUM_DATA-element vectors with pseudo-random
// digits and computes their element-wise sum. Nothing is printed.
int main(void) {
    const int byteCount = sizeof(int) * NUM_DATA;

    // Two operand vectors and one result vector, all zero-filled up front.
    int* lhs = new int[NUM_DATA];
    int* rhs = new int[NUM_DATA];
    int* sum = new int[NUM_DATA];
    memset(lhs, 0, byteCount);
    memset(rhs, 0, byteCount);
    memset(sum, 0, byteCount);

    // Draw values in 0..9, alternating lhs then rhs for each index.
    int idx = 0;
    while (idx < NUM_DATA) {
        lhs[idx] = rand() % 10;
        rhs[idx] = rand() % 10;
        ++idx;
    }

    // Element-wise addition on the host.
    for (idx = 0; idx < NUM_DATA; ++idx) {
        sum[idx] = lhs[idx] + rhs[idx];
    }

    delete[] lhs;
    delete[] rhs;
    delete[] sum;
}

---------------------------------------
Path: /content/temp.out



In [16]:
%%cu

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NUM_DATA 1024
#define BLOCK_SIZE 256 // The size of Thread Block

// Kernel (three-pointer overload): each thread adds the elements at its own
// threadIdx.x. The index ignores blockIdx and there is no bounds check.
__global__ void vecAdd(int* _a, int* _b, int* _c)
{
    const int lane = threadIdx.x;
    const int total = _a[lane] + _b[lane];
    _c[lane] = total;
}

// Kernel (overload with element count): each thread handles the element at its
// global index blockIdx.x * blockDim.x + threadIdx.x. Threads whose index is
// n or greater do nothing.
__global__ void vecAdd(int* _a, int* _b, int* _c, int n)
{
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
    if (globalIdx >= n)
        return;

    _c[globalIdx] = _a[globalIdx] + _b[globalIdx];
}

// Prints the warp size reported for device 0. If the property query fails,
// the error is printed instead of an uninitialised warpSize value.
void getWarpSize(void){
    cudaDeviceProp prop;
    cudaError_t status = cudaGetDeviceProperties(&prop, 0); // Assuming device 0 for simplicity
    if (status != cudaSuccess) {
        printf("cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(status));
        return;
    }
    printf("Warp size: %d\n", prop.warpSize);
}

// Adds two NUM_DATA-element vectors on the GPU and verifies the result against
// a host-side reference sum. Prints the warp size, the problem size, any
// mismatching elements or CUDA error, and "GPU works well!" on success.
// Returns 0 when the GPU result matches the host result, 1 otherwise.
int main(void) {

    // The warp size for NVIDIA GPUs is typically 32 threads
    getWarpSize();

    int* a, * b, * c, * hc; // Vectors on the host
    // Vectors on the device; NULL so the cudaFree calls below are safe even
    // if an allocation fails part-way through.
    int* da = NULL, * db = NULL, * dc = NULL;

    int memSize = sizeof(int) * NUM_DATA;
    printf("%d elements, memSize = %d bytes\n", NUM_DATA, memSize);

    // Memory allocation on the host-side
    a = new int[NUM_DATA]; memset(a, 0, memSize);
    b = new int[NUM_DATA]; memset(b, 0, memSize);
    c = new int[NUM_DATA]; memset(c, 0, memSize);
    hc = new int[NUM_DATA]; memset(hc, 0, memSize);

    // Data generation
    for (int i = 0; i < NUM_DATA; i++) {
        a[i] = rand() % 10;
        b[i] = rand() % 10;
    }

    // Vector sum on host (reference result for the comparison below)
    for (int i = 0; i < NUM_DATA; i++)
        hc[i] = a[i] + b[i];

    // Memory allocation on the device-side. Only dc is zeroed: da and db are
    // fully overwritten by the copies that follow.
    cudaError_t status = cudaMalloc(&da, memSize);
    if (status == cudaSuccess) status = cudaMalloc(&db, memSize);
    if (status == cudaSuccess) status = cudaMalloc(&dc, memSize);
    if (status == cudaSuccess) status = cudaMemset(dc, 0, memSize);

    // Data copy : Host -> Device
    if (status == cudaSuccess) status = cudaMemcpy(da, a, memSize, cudaMemcpyHostToDevice);
    if (status == cudaSuccess) status = cudaMemcpy(db, b, memSize, cudaMemcpyHostToDevice);

    // Kernel call: enough BLOCK_SIZE-thread blocks to cover NUM_DATA elements
    // (rounded up). The single-block alternative would be
    // vecAdd <<<1, NUM_DATA>>> (da, db, dc).
    if (status == cudaSuccess) {
        int numBlocks = (NUM_DATA + BLOCK_SIZE - 1) / BLOCK_SIZE;
        vecAdd<<<numBlocks, BLOCK_SIZE>>>(da, db, dc, NUM_DATA);
        // A launch returns nothing; fetch configuration errors explicitly.
        status = cudaGetLastError();
    }

    // Copy results: device -> host (blocking, so it also reports kernel
    // execution errors)
    if (status == cudaSuccess) status = cudaMemcpy(c, dc, memSize, cudaMemcpyDeviceToHost);

    // Release device memory
    cudaFree(da); cudaFree(db); cudaFree(dc);

    // Check results
    bool result = (status == cudaSuccess);
    if (!result) {
        printf("CUDA error: %s\n", cudaGetErrorString(status));
    } else {
        for (int i = 0; i < NUM_DATA; i++) {
            if (hc[i] != c[i]) {
                printf("[%d] The result is not matched! (%d, %d)\n", i, hc[i], c[i]);
                result = false;
            }
        }
    }

    if (result)
        printf("GPU works well!\n");

    // Release host memory
    delete[] a; delete[] b; delete[] c; delete[] hc;

    return result ? 0 : 1;
}

---------------------------------------
Path: /content/temp.out
Warp size: 32
1024 elements, memSize = 4096 bytes
GPU works well!



In [17]:
# Profile the compiled binary with nvprof and write the result to results.nvprof.
# NOTE(review): the %%cu magic defined in this notebook calls
# os.remove(executable_name) after every run, so /content/temp.out will not
# exist here unless the binary is rebuilt and kept; confirm before relying on
# this cell.
!nvprof -s -o results.nvprof /content/temp.out

