
# 1. GPU Scheduling 


Enter your name and student ID.

 * Name:
 * Student ID:



This notebook demonstrates how an NVIDIA GPU schedules threads.

# 2. Compilers
* This exercise is CUDA-specific, so we use NVIDIA CUDA

* Execute this before you use CUDA

In [None]:
export PATH=/usr/local/cuda/bin:$PATH

* Check if it works (check if full paths of nvcc are shown)
* We do not recommend nvc/nvc++ for this exercise, but you might give them a try if you like

In [None]:
which nvcc

# 3. Check host and GPU
* Check if you are using the right host, tauleg000, <font color="red">not taulec</font>

In [None]:
hostname
hostname | grep tauleg || echo "Oh, you are not on the right host, access https://tauleg000.zapto.org/ instead"

* Check if GPU is alive by nvidia-smi
* Do `nvidia-smi --help` or see manual (`man nvidia-smi` on terminal) for more info

In [None]:
nvidia-smi


# 4. Basics
When you call a kernel (function f) with
```
f<<<nb,bs>>>();
```
it creates (_nb * bs_) CUDA threads.

More precisely, it creates _nb_ thread blocks, each of which has _bs_ CUDA threads.

The following code is a tool to record how threads are executed on GPU.

It creates many threads repeating a trivial (useless) computation x = a * x + b many times.
Each thread occasionally records the clock to record when and where these threads progress over time.

Specifically,

```
./cuda_sched_rec NTHREADS THREAD_BLOCK_SZ N M 
```

creates approximately NTHREADS threads, with THREAD_BLOCK_SZ threads in each thread block (the number of threads is not exactly NTHREADS when it is not a multiple of THREAD_BLOCK_SZ).

* Each thread repeats x = A x + B, (N * M) times.
* Each thread records clock N times (every M iterations).

* At the end of execution, it dumps the results in the following format for each line.

```
thread=<idx> x=<ans> sm0=<starting SM> sm1=<ending SM> t0 t1 t2 ... t_{n-1}
```


In [None]:
BEGIN SOLUTION
END SOLUTION
%%writefile cuda_sched_rec.cu
#include <assert.h>
#include <stdio.h>

// error check utility (check_api_error and check_launch_error)
#include "cuda_util.h"

// 64-bit integer type used for clock samples and iteration counts
typedef long long int llint;

// Per-thread execution record, filled in by the kernel and printed by main.
typedef struct {
  // final value of the (meaningless) computation x = a x + b
  double x;
  // SM the thread was on when it started
  unsigned int sm0;
  // SM the thread was on when it finished (expected to equal sm0;
  // recorded only so that this can be verified)
  unsigned int sm1;
} record_t;

/* Kernel executed by every CUDA thread.

   Each thread with a global index below nthreads
     - takes n clock samples (clock64), one before each batch of
       m iterations of x = a x + b, and stores them in its own
       n-element slice of T (starting at T[idx * n]);
     - stores in R[idx] the value returned by get_smid() before and
       after the loop (sm0 / sm1) and the final value of x.

   Threads whose index is >= nthreads (the surplus threads of the
   last thread block) return immediately without touching R or T. */
__global__ void cuda_thread_fun(double a, double b, record_t * R,
                                llint * T, llint n, llint m,
                                int nthreads) {
  // global index of this thread across all thread blocks
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= nthreads) return;

  // this thread's record and its private slice of the clock array
  record_t * const rec = &R[idx];
  llint * const clocks = T + idx * n;

  // starting value of the computation (the value itself is irrelevant)
  double x = idx;
  // SM at the beginning of execution
  rec->sm0 = get_smid();

  // n rounds: sample the clock, then do m steps of x = a x + b
  for (long i = 0; i < n; i++) {
    clocks[i] = clock64();
    long j = 0;
    while (j < m) {
      x = a * x + b;
      j++;
    }
  }

  // SM at the end of execution (expected to be the same as sm0)
  rec->sm1 = get_smid();
  // store x so that the compiler cannot eliminate the computation
  rec->x = x;
}

/* usage
   ./cuda_sched_rec NTHREADS THREAD_BLOCK_SZ N M S A B

   creates about NTHREADS threads, with THREAD_BLOCK_SZ
   threads in each thread block.
   each thread repeats x = A x + B (N * M) times and records
   the clock N times (every M iterations).

   S is the shared memory (in bytes) allocated for each thread block
   (just to control the number of thread blocks simultaneously
   scheduled on an SM). shared memory is not actually used at all.

   all arguments are optional; defaults are
   NTHREADS=100 THREAD_BLOCK_SZ=64 N=100 M=100 S=0 A=0.99 B=1.00

   output: a header line followed by one line per thread

   thread=<idx> x=<ans> sm0=<starting SM> sm1=<ending SM> t0 t1 ... t_{N-1}

   returns 0 on success and 1 on invalid arguments or host
   allocation failure.
 */
int main(int argc, char ** argv) {
  // positional arguments, in the order documented above.
  // note: there is intentionally no D argument here; this kernel does
  // not take one, and parsing one would shift S, A and B by one position.
  int i = 1;
  int nthreads        = (argc > i ? atoi(argv[i])  : 100);  i++;
  int thread_block_sz = (argc > i ? atoi(argv[i])  : 64);   i++;
  llint n             = (argc > i ? atoll(argv[i]) : 100);  i++;
  llint m             = (argc > i ? atoll(argv[i]) : 100);  i++;
  int shm_sz          = (argc > i ? atoi(argv[i])  : 0);    i++;
  double a            = (argc > i ? atof(argv[i])  : 0.99); i++;
  double b            = (argc > i ? atof(argv[i])  : 1.00);

  // reject values that would divide by zero below or yield negative sizes
  if (nthreads <= 0 || thread_block_sz <= 0 || n < 0 || m < 0 || shm_sz < 0) {
    fprintf(stderr,
            "usage: %s NTHREADS THREAD_BLOCK_SZ N M S A B\n"
            "  NTHREADS and THREAD_BLOCK_SZ must be > 0; N, M and S must be >= 0\n",
            argv[0]);
    return 1;
  }

  // get the required number of thread blocks (rounded up)
  int n_thread_blocks = (nthreads + thread_block_sz - 1) / thread_block_sz;
  printf("%d threads/block * %d blocks\n", thread_block_sz, n_thread_blocks);

  // allocate record_t array (both on host and device);
  // the host copy is zero-filled and copied over to initialize the device copy
  size_t R_sz = sizeof(record_t) * (size_t)nthreads;
  record_t * R = (record_t *)calloc(R_sz, 1);
  if (!R) {
    fprintf(stderr, "could not allocate %zu bytes for records\n", R_sz);
    return 1;
  }
  record_t * R_dev;
  check_api_error(cudaMalloc(&R_dev, R_sz));
  check_api_error(cudaMemcpy(R_dev, R, R_sz, cudaMemcpyHostToDevice));

  // allocate clock array (both on host and device), n entries per thread
  size_t T_sz = sizeof(llint) * (size_t)n * (size_t)nthreads;
  llint * T = (llint *)calloc(T_sz, 1);
  if (T_sz > 0 && !T) {
    fprintf(stderr, "could not allocate %zu bytes for clocks\n", T_sz);
    return 1;
  }
  llint * T_dev;
  check_api_error(cudaMalloc(&T_dev, T_sz));
  check_api_error(cudaMemcpy(T_dev, T, T_sz, cudaMemcpyHostToDevice));

  // shared memory size per thread block, rounded down to a multiple
  // of sizeof(double)
  int shm_elems = shm_sz / sizeof(double);
  int shm_size = shm_elems * sizeof(double);
  // more than 48KB of dynamic shared memory per block requires an
  // explicit opt-in; without it the launch fails
  if (shm_size > 48 * 1024) {
    check_api_error(cudaFuncSetAttribute(cuda_thread_fun,
                                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                                         shm_size));
  }

  // call the kernel
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz,shm_size>>>
                      (a, b, R_dev, T_dev, n, m, nthreads)));
  // wait for the kernel; errors raised during execution surface here
  check_api_error(cudaDeviceSynchronize());

  // get back the results and clocks
  check_api_error(cudaMemcpy(R, R_dev, R_sz, cudaMemcpyDeviceToHost));
  check_api_error(cudaMemcpy(T, T_dev, T_sz, cudaMemcpyDeviceToHost));

  // dump them for visualization; thread idx owns T[idx * n .. idx * n + n - 1]
  for (long idx = 0; idx < nthreads; idx++) {
    printf("thread=%ld x=%f sm0=%u sm1=%u", idx, R[idx].x, R[idx].sm0, R[idx].sm1);
    const llint * clocks = &T[idx * n];
    for (llint s = 0; s < n; s++) {
      printf(" %lld", clocks[s]);
    }
    printf("\n");
  }

  // release device and host buffers
  check_api_error(cudaFree(R_dev));
  check_api_error(cudaFree(T_dev));
  free(R);
  free(T);
  return 0;
}

Read the code carefully and understand what it is doing.  

In [None]:
BEGIN SOLUTION
END SOLUTION
nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_sched_rec cuda_sched_rec.cu

In [None]:
BEGIN SOLUTION
END SOLUTION
./cuda_sched_rec 64 32 10 100 | head -10

# 5. Visualization
* The following python code parses and visualizes the output of cuda_sched_rec.
* The code is shown below for your information; you don't have to understand how it works.

In [None]:
BEGIN SOLUTION
END SOLUTION
#!/usr/bin/python3
import re
# from matplotlib import collections  as mc
import matplotlib.collections as mc
import matplotlib.pyplot as plt
import numpy as np

def read_dat(files_dat):
    """Parse output files of cuda_sched_rec into per-SM thread records.

    Args:
        files_dat: iterable of paths to files whose lines look like
            thread=<idx> x=<ans> sm0=<SM> sm1=<SM> t0 t1 ... t_{n-1}
            Lines that do not match (e.g. the
            "<bs> threads/block * <nb> blocks" header) are skipped.

    Returns:
        dict mapping an SM id to a list of (thread index, [clock values])
        tuples, in the order the lines were read.

    Raises:
        AssertionError: if a line reports different starting and ending SMs.
    """
    # x is printed with printf("%f"), so it may be negative or inf/nan
    # (e.g. when |A| > 1); accept all of those so that such lines are not
    # silently dropped.  Raw string: \d must reach the regex engine as is.
    pat = re.compile(r"thread=(?P<thread>\d+)"
                     r" x=(?P<x>[-+]?(?:\d+\.\d+|inf|nan))"
                     r" sm0=(?P<sm0>\d+) sm1=(?P<sm1>\d+)(?P<t>( \d+)*)")
    log = {}
    for file_dat in files_dat:
        with open(file_dat) as fp:
            for line in fp:
                # e.g. thread=1 x=100.000000 sm0=0 sm1=0 20524414966449 20524423007875
                m = pat.match(line)
                if not m:
                    continue
                thread = int(m.group("thread"))
                # converted only to validate the field; the value is not used
                float(m.group("x"))
                sm0    = int(m.group("sm0"))
                sm1    = int(m.group("sm1"))
                t      = [int(s) for s in m.group("t").split()]
                # a thread must stay on the SM it started on
                assert(sm0 == sm1), (sm0, sm1)
                log.setdefault(sm0, []).append((thread, t))
    return log

def cuda_sched_plt(files_dat, start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf")):
    """Plot when (x axis, cycles) each thread (y axis) recorded its clocks.

    One dot is drawn per recorded clock sample, colored by the SM the
    thread ran on.  Clock values are shown relative to the earliest first
    sample among the threads of the same SM.  The figure is saved to
    "sched.svg" and then shown.

    Args:
        files_dat: list of output files of cuda_sched_rec (see read_dat).
        start_t, end_t: only samples whose relative time is within
            [start_t, end_t] are drawn.
        start_thread, end_thread: only threads whose index is within
            [start_thread, end_thread) are drawn.

    Raises:
        ValueError: if no thread record could be read from files_dat.
    """
    log = read_dat(files_dat)
    if not log:
        raise ValueError("no thread records found in %s" % (files_dat,))
    n_sms = max(log) + 1
    # plt.cm.get_cmap was removed in matplotlib 3.9;
    # pyplot.get_cmap accepts the same (name, lut) arguments
    cmap = plt.get_cmap('RdYlGn', n_sms)
    fig, ax = plt.subplots()
    ax.set_xlabel("cycles")
    ax.set_ylabel("thread idx")
    for sm, records in sorted(log.items()):
        # time origin of this SM; threads without any sample (N = 0) are ignored
        first_clocks = [T[0] for _, T in records if T]
        if not first_clocks:
            continue
        T0 = min(first_clocks)
        X = []
        Y = []
        sm_color = cmap(sm)
        for thread, T in records:
            if start_thread <= thread < end_thread:
                for t in T:
                    if start_t <= t - T0 <= end_t:
                        X.append(t - T0)
                        Y.append(thread)
        ax.plot(X, Y, 'o', markersize=0.5, color=sm_color)
    ax.autoscale()
    plt.savefig("sched.svg")
    plt.show()
    


Let's visualize a few configurations.

## 5-1. one thread

In [None]:
BEGIN SOLUTION
END SOLUTION
./cuda_sched_rec 1 1 100 1000 > cs_1_1.dat

In [None]:
BEGIN SOLUTION
END SOLUTION
cuda_sched_plt(["cs_1_1.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))

* you can change `start_t` and `end_t` to zoom into a narrower time interval and change `start_thread` and `end_thread` to zoom into a range of threads
* or, you can open `sched.svg` generated along with the visualization and magnify anywhere you want to look into, either by the browser or any SVG viewer on your PC

## 5-2. many threads with 1 thread/block
* play with changing N to other values

In [None]:
BEGIN SOLUTION
END SOLUTION
N=150
./cuda_sched_rec ${N} 1 100 1000 > cs_N_1.dat

In [None]:
BEGIN SOLUTION
END SOLUTION
cuda_sched_plt(["cs_N_1.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))

* Increase N and observe when the execution time (the time at the right end of the graph) starts to increase.
* Even in that case, all N threads appear to be executing simultaneously (not one after another).
* That is, _hardware_ interleaves execution of these threads, rapidly switching from one to another.

## 5-3. many threads in 1 thread block
* play with changing N to other values

In [None]:
BEGIN SOLUTION
END SOLUTION
N=150
./cuda_sched_rec ${N} ${N} 100 1000 > cs_N_N.dat

In [None]:
BEGIN SOLUTION
END SOLUTION
cuda_sched_plt(["cs_N_N.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))

* Observe that they are always executed on the same SM. You are not utilizing multiple SMs at all.

* There is a hardwired limit on the number of threads per block. Try to find it and then confirm it with Google.

* When increasing N, observe when the execution time starts to increase. Why do you think it doesn't immediately increase with N&gt;1?

* With a modest value of N (say 100), zoom in at either end of the execution and observe whether there is _any_ difference on when they started or finished execution.  If you look carefully, you will notice that a number of consecutive threads start and end _exactly the same clock_.  Those threads are called a _warp_ and they share an instruction pointer.  It is very analogous to SIMD instruction found in CPUs that apply the same operation on multiple operands.  Guess the number of threads of a warp from the experimental results and confirm it by Google.

## 5-4. many threads in many threads/block
* play with changing N and B to other values

In [None]:
BEGIN SOLUTION
END SOLUTION
N=150
B=64
./cuda_sched_rec ${N} ${B} 100 1000 > cs_N_B.dat

In [None]:
BEGIN SOLUTION
END SOLUTION
cuda_sched_plt(["cs_N_B.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))

* Try to find the maximum number of threads that does not increase the execution time.

# 6. Thread blocks
A thread block is the unit of dispatching to a streaming multiprocessor (SM), which is like a physical core of a CPU.  Threads within a thread block are always dispatched together to the same SM and once dispatched stay on the same SM until finished.

* see [CUDA C++ Programming Guide: A Scalable Programming Model](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#scalable-programming-model)

An SM is a highly multithreaded processor, which can accommodate many threads at the same time and interleave them by hardware.  For example, it can easily hold, say, 500 threads and interleave their execution without involving software.  In terms of hardware capability, it is somewhat similar to simultaneous multithreading (SMT) of CPUs.  The degree of multithreading is very different, however; Intel CPUs normally support only two hardware threads (virtual cores) on each physical core.  Moreover, software (either operating system or user-level software) needs to designate which virtual core you want to run a thread on.  In a sense, CPU exposes each virtual core as a single-threaded machine.  If you put more than one OpenMP (OS-level) thread on the same virtual core, software should switch between them from time to time.  A streaming multiprocessor of a GPU, in contrast, is a machine that literally takes many threads and concurrently executes them by hardware.  Determining the SM a thread block executes on is done by hardware.

How many thread blocks are scheduled on an SM at the same time?  It depends; it depends on how much "resources" a single thread block requires.  Here, "resources" mean two things.

1. registers
1. shared memory (see below)


_Registers_ are used for holding local variables and intermediate results of computation.  How many registers a thread block uses is not something you can reliably determine by looking at your code; it depends on the code generated by the compiler.  You can know it by passing `-Xptxas -v` to nvcc and looking at the compiler message.

_Shared memory_ is a scratch-pad memory only shared within a single thread block.  Physically, you can consider it to be a small fast memory attached to each SM.  The name "shared memory" is clearly a misnomer; ordinary memory you get by `cudaMalloc` _is_ shared by all threads (called "global memory").  In contrast, shared memory is, contrary to its name, shared only among threads within a single thread block.  "Local memory" (as opposed to global memory) would have been a better name for it, IMO.

Both registers and shared memory for a thread block are kept on physical registers/memory of an SM throughout the lifetime of the thread block.  Thus, accommodating a larger number of thread blocks at the same time requires a proportionally larger amount of registers/shared memory, which is subject to the physical resource limit of an SM.

Each SM has the following physical resources.

|       | registers      |  shared memory  |
|-------|----------------|-----------------|
|Pascal | 32 bit x 65536 |  64KB           |
|Volta  | 32 bit x 65536 |  up to 96KB (*) |
|Ampere | 32 bit x 65536 |  up to 163KB    |

(*) configurable subject to L1 cache + shared memory <= 128KB and shared memory <= 96KB

* [Pascal Tuning Guide: Occupancy](https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#sm-occupancy)
* [Volta Tuning Guide: Occupancy](https://docs.nvidia.com/cuda/volta-tuning-guide/index.html#sm-occupancy)
* [NVIDIA Ampere GPU Architecture Tuning Guide: Occupancy](https://docs.nvidia.com/cuda/ampere-tuning-guide/index.html#sm-occupancy)

By default, a thread does not use shared memory at all.

Let's observe how many registers a thread uses.

In [None]:
BEGIN SOLUTION
END SOLUTION
nvcc --generate-code arch=compute_80,code=sm_80 -Xptxas -v -o cuda_sched_rec cuda_sched_rec.cu

Since the computation is very simple, register usage will not be a limiting factor for this computation.
Also, since it does not use shared memory at all, it won't be a limiting factor either.
Only the hardwired limit is the limiting factor.

# 7. Shared memory
* Let's use shared memory to observe how it affects the number of thread blocks simultaneously executed.
You specify the size of shared memory per thread block via the third parameter of kernel call, like this.

```
f<<<nb,bs,S>>>();
```

The above kernel launch statement specifies that $S$ bytes of shared memory should be allocated to _each thread block_.  Each SM can therefore execute only up to (SHARED_MEMORY_SIZE_PER_SM / $S$) thread blocks simultaneously.

You can get a pointer to the part of the shared memory allocated to each thread block via the following strange syntax within your kernel function, though it is not necessary in our current experiment.

```
extern __shared__ T shmem[];
```

With that, `shmem` points to the start of the shared memory for the thread block.  The name can be arbitrary.

`cuda_sched_rec.cu` is already written to take the size of the shared memory per thread block as a parameter.

Let's allocate 32KB for each thread block; then, on Ampere, only about five thread blocks (163KB/32KB ≈ 5) can be executed simultaneously on each SM.

The following creates N (= 150) thread blocks (in order to avoid creating too many threads, it sets the number of threads per block to an unusual value of one).

In [None]:
BEGIN SOLUTION
END SOLUTION
N=150
S=$((32 * 1024))
./cuda_sched_rec ${N} 1 100 1000 ${S} > cs_N_1_S.dat

Before visualizing it, imagine what it is like.

In [None]:
BEGIN SOLUTION
END SOLUTION
cuda_sched_plt(["cs_N_1_S.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))

* Play with changing $N$ above; predict when thread blocks start executing not simultaneously (one after another) and confirm it by the experiment (hint: Ampere has 108 streaming multiprocessors).
* Change $S$ and see how it affects the above threshold value.

# 8. Warp
* 32 consecutively numbered threads within a thread block make up a _warp_, and they can execute only one and the same instruction at a time.
* That is, it's not possible, within a single cycle, for some threads to execute an instruction A while others in the same warp execute another instruction B.  All the GPU can do is simply to keep some threads from executing instructions that they should not execute.
* A typical example is an "if" statement. e.g.,
```
if (thread_idx % 2 == 0) {
  A;
} else {
  B;
}
```
If there is _any_ thread executing A and _any_ thread executing B within a warp, the time the warp takes is the time to execute A _plus_ the time to execute B.
* An important performance implication is you'd better not have threads branching differently within the same warp.

* Change the following code as follows.
  * it takes an additional command line parameter D
  * each thread executes the loop
```
      for (long j = 0; j < m; j++) {
        x = a * x + b;
      }
```
when and only when (idx / D) is an even number.
  * for example, if D is 1, then all even-numbered threads execute the loop and all odd-numbered threads do not execute it
  * if D is 32, for example, (idx / D) is essentially the "warp index"; even-numbered warps execute the loop and odd-numbered warps skip it

In [None]:
BEGIN SOLUTION
END SOLUTION
%%writefile cuda_sched_rec_warp.cu
#include <assert.h>
#include <stdio.h>

// error check utility (check_api_error and check_launch_error)
#include "cuda_util.h"

// 64-bit integer type used for clock samples and iteration counts
typedef long long int llint;

// Per-thread execution record, filled in by the kernel and printed by main.
typedef struct {
  // final value of the (meaningless) computation x = a x + b
  double x;
  // SM the thread was on when it started
  unsigned int sm0;
  // SM the thread was on when it finished (expected to equal sm0;
  // recorded only so that this can be verified)
  unsigned int sm1;
} record_t;

/* Kernel executed by every CUDA thread (variant with a branch that
   depends on the thread index).

   Each thread with a global index below nthreads takes n clock samples
   (clock64) and stores them in its own n-element slice of T (starting
   at T[idx * n]).  After each sample, a thread runs m iterations of
   x = a x + b if and only if (idx / D) is even; the other threads skip
   that inner loop.  The caller must pass D != 0 (idx is divided by D).

   R[idx] receives the value returned by get_smid() before and after
   the loop (sm0 / sm1) and the final value of x.

   Threads whose index is >= nthreads (the surplus threads of the
   last thread block) return immediately without touching R or T. */
__global__ void cuda_thread_fun(double a, double b, record_t * R,
                                llint * T, llint n, llint m,
                                int D,
                                int nthreads) {
  // global index of this thread across all thread blocks
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= nthreads) return;

  // this thread's record and its private slice of the clock array
  record_t * const rec = &R[idx];
  llint * const clocks = T + idx * n;

  // starting value of the computation (the value itself is irrelevant)
  double x = idx;
  // SM at the beginning of execution
  rec->sm0 = get_smid();

  // n rounds: sample the clock, then (for selected threads only)
  // do m steps of x = a x + b
  for (long i = 0; i < n; i++) {
    clocks[i] = clock64();
    // threads in odd-numbered groups of D consecutive threads skip the work
    if ((idx / D) % 2 != 0) continue;
    for (long j = 0; j < m; j++) {
      x = a * x + b;
    }
  }

  // SM at the end of execution (expected to be the same as sm0)
  rec->sm1 = get_smid();
  // store x so that the compiler cannot eliminate the computation
  rec->x = x;
}

/* usage
   ./cuda_sched_rec_warp NTHREADS THREAD_BLOCK_SZ N M D S A B

   creates about NTHREADS threads, with THREAD_BLOCK_SZ
   threads in each thread block.
   each thread records the clock N times; after each recording,
   threads for which (idx / D) is even repeat x = A x + B
   M times and the other threads do nothing.

   S is the shared memory (in bytes) allocated for each thread block
   (just to control the number of thread blocks simultaneously
   scheduled on an SM). shared memory is not actually used at all.

   all arguments are optional; defaults are
   NTHREADS=100 THREAD_BLOCK_SZ=64 N=100 M=100 D=1 S=0 A=0.99 B=1.00

   output: a header line followed by one line per thread

   thread=<idx> x=<ans> sm0=<starting SM> sm1=<ending SM> t0 t1 ... t_{N-1}

   returns 0 on success and 1 on invalid arguments or host
   allocation failure.
 */
int main(int argc, char ** argv) {
  // positional arguments, in the order documented above
  int i = 1;
  int nthreads        = (argc > i ? atoi(argv[i])  : 100);  i++;
  int thread_block_sz = (argc > i ? atoi(argv[i])  : 64);   i++;
  llint n             = (argc > i ? atoll(argv[i]) : 100);  i++;
  llint m             = (argc > i ? atoll(argv[i]) : 100);  i++;
  int D               = (argc > i ? atoi(argv[i])  : 1);    i++;
  int shm_sz          = (argc > i ? atoi(argv[i])  : 0);    i++;
  double a            = (argc > i ? atof(argv[i])  : 0.99); i++;
  double b            = (argc > i ? atof(argv[i])  : 1.00);

  // reject values that would divide by zero (on the host below, or by D
  // in the kernel) or yield negative sizes
  if (nthreads <= 0 || thread_block_sz <= 0 || D <= 0
      || n < 0 || m < 0 || shm_sz < 0) {
    fprintf(stderr,
            "usage: %s NTHREADS THREAD_BLOCK_SZ N M D S A B\n"
            "  NTHREADS, THREAD_BLOCK_SZ and D must be > 0; N, M and S must be >= 0\n",
            argv[0]);
    return 1;
  }

  // get the required number of thread blocks (rounded up)
  int n_thread_blocks = (nthreads + thread_block_sz - 1) / thread_block_sz;
  printf("%d threads/block * %d blocks\n", thread_block_sz, n_thread_blocks);

  // allocate record_t array (both on host and device);
  // the host copy is zero-filled and copied over to initialize the device copy
  size_t R_sz = sizeof(record_t) * (size_t)nthreads;
  record_t * R = (record_t *)calloc(R_sz, 1);
  if (!R) {
    fprintf(stderr, "could not allocate %zu bytes for records\n", R_sz);
    return 1;
  }
  record_t * R_dev;
  check_api_error(cudaMalloc(&R_dev, R_sz));
  check_api_error(cudaMemcpy(R_dev, R, R_sz, cudaMemcpyHostToDevice));

  // allocate clock array (both on host and device), n entries per thread
  size_t T_sz = sizeof(llint) * (size_t)n * (size_t)nthreads;
  llint * T = (llint *)calloc(T_sz, 1);
  if (T_sz > 0 && !T) {
    fprintf(stderr, "could not allocate %zu bytes for clocks\n", T_sz);
    return 1;
  }
  llint * T_dev;
  check_api_error(cudaMalloc(&T_dev, T_sz));
  check_api_error(cudaMemcpy(T_dev, T, T_sz, cudaMemcpyHostToDevice));

  // shared memory size per thread block, rounded down to a multiple
  // of sizeof(double)
  int shm_elems = shm_sz / sizeof(double);
  int shm_size = shm_elems * sizeof(double);
  // more than 48KB of dynamic shared memory per block requires an
  // explicit opt-in; without it the launch fails
  if (shm_size > 48 * 1024) {
    check_api_error(cudaFuncSetAttribute(cuda_thread_fun,
                                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                                         shm_size));
  }

  // call the kernel
  check_launch_error((cuda_thread_fun<<<n_thread_blocks,thread_block_sz,shm_size>>>
                      (a, b, R_dev, T_dev, n, m, D, nthreads)));
  // wait for the kernel; errors raised during execution surface here
  check_api_error(cudaDeviceSynchronize());

  // get back the results and clocks
  check_api_error(cudaMemcpy(R, R_dev, R_sz, cudaMemcpyDeviceToHost));
  check_api_error(cudaMemcpy(T, T_dev, T_sz, cudaMemcpyDeviceToHost));

  // dump them for visualization; thread idx owns T[idx * n .. idx * n + n - 1]
  for (long idx = 0; idx < nthreads; idx++) {
    printf("thread=%ld x=%f sm0=%u sm1=%u", idx, R[idx].x, R[idx].sm0, R[idx].sm1);
    const llint * clocks = &T[idx * n];
    for (llint s = 0; s < n; s++) {
      printf(" %lld", clocks[s]);
    }
    printf("\n");
  }

  // release device and host buffers
  check_api_error(cudaFree(R_dev));
  check_api_error(cudaFree(T_dev));
  free(R);
  free(T);
  return 0;
}

In [None]:
BEGIN SOLUTION
END SOLUTION
nvcc --generate-code arch=compute_80,code=sm_80 -o cuda_sched_rec_warp cuda_sched_rec_warp.cu

Execute the code with various D's (and perhaps other parameters) to visualize the effect of warps and its performance implication

In [None]:
BEGIN SOLUTION
END SOLUTION
N=256
./cuda_sched_rec_warp ${N} 32 100 1000 1 > cs_warp.dat

In [None]:
BEGIN SOLUTION
END SOLUTION
cuda_sched_plt(["cs_warp.dat"], start_t=0, end_t=float("inf"), start_thread=0, end_thread=float("inf"))