#  Cost of Data Access (Caches and Memory Performance)
# 1. Introduction

# 2. Compilers
## 2-1. Set up NVIDIA CUDA and HPC SDK
Execute this before you use NVIDIA HPC SDK

In [None]:
export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/24.9/compilers/bin:$PATH
export PATH=/usr/local/cuda/bin:$PATH

* Check if it works
  * make sure the full path of nvcc is shown as `/usr/local/...`, not `/opt/nvidia/...`
* We do not recommend nvc/nvc++ for this exercise, but you might give them a try if you like

In [None]:
which nvcc
which nvc
which nvc++
nvcc --version
nvc --version

## 2-2. Set up LLVM
Execute this before you use LLVM


In [None]:
export PATH=/home/share/llvm/bin:$PATH
export LD_LIBRARY_PATH=/home/share/llvm/lib:/home/share/llvm/lib/x86_64-unknown-linux-gnu:$LD_LIBRARY_PATH

Check if it works (check if full paths of clang/clang++ are shown)

In [None]:
which clang
which clang++

## 2-3. GCC

Check if it works (check if full paths of gcc/g++ are shown)

In [None]:
which gcc
which g++

# 3. Measuring Latency
* We do a simple experiment to measure the _latency_ of data access when the data comes from various levels of caches
* We want to execute many ($n$) load instructions each of which is _dependent_ on the previous load instruction, measure the execution time ($T$), and divide it by $n$ (to get $T/n$)
* To make a load instruction dependent on the previous load instruction, we determine which address it accesses based on the result of the previous load instruction, like this
```
k = 0;
for (i = 0; i < n; i++) {
  k = a[k];
}
```
* A similar access behavior happens when the processor _chases pointers_ like this
```
p = ...;
for (i = 0; i < n; i++) {
  p = p->next;
}
```
so we call this kind of code _pointer chasing_ code, although we do not explicitly use pointers ($k$ serves as a "pseudo pointer" that specifies the next element that should be accessed)
* We change the size of array $a$ and make sure the above loop repeatedly touches every element of $a$
* Here is an example of $a$ (with 16 elements) and (part of) the resulting access order (`a[0] -> a[3] -> a[14] -> a[10] -> a[7] -> a[15] -> a[1] -> ... -> a[4] -> a[0] -> ..`)
* confirm that the resulting chain comes back to `a[0]` after accessing _all_ 16 elements of the array $a$

<img src="svg/latency_measurement_L1.svg" />

* We also make sure the resulting access order is essentially random, to avoid the effect of prefetching or any other smartness the processor might implement to run the above loop faster than one iteration per load latency.


* We use only a single thread for now,
* Although it is meant to be a single-thread experiment, we still use OpenMP so that it can run on GPUs too (with a single source code)
* For readability, we split the program into two files
* Here is the main function

In [None]:
%%writefile main.cc
#include <assert.h>
#include <err.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#if __NVCOMPILER                // NVIDIA nvc++
#else  // Clang
#define __host__
#define __device__
#define __global__
#endif

/* Return a timestamp in SECONDS as a double (the fractional part carries
   the nanosecond field of the underlying timespec).
   The monotonic clock is used because callers only take differences of two
   readings to measure elapsed time; a wall clock may be stepped while a
   measurement is running.
   Terminates the process via err() if clock_gettime fails. */
double cur_time() {
  struct timespec ts[1];
  int ok = clock_gettime(CLOCK_MONOTONIC, ts);
  if (ok == -1) err(1, "clock_gettime");
  return ts->tv_sec + ts->tv_nsec * 1e-9;
}

/* Small deterministic pseudo random number generator.
   State is a 48-bit linear congruential sequence
       sd <- (0x5DEECE66D * sd + 0xB) mod 2^48
   and each draw returns the upper 31 bits of the new state (sd >> 17),
   i.e. a value in [0, 2^31). */
struct prng {
  long sd;
  /* set the generator state */
  void seed(long sd) {
    this->sd = sd;
  }
  /* advance the state and return the next value in [0, 2^31) */
  long gen_randint() {
    /* the product exceeds 63 bits for most states, so it is computed in
       unsigned arithmetic (well-defined wrap-around) instead of signed
       long (undefined on overflow); only the low 48 bits are kept */
    const unsigned long long a = 0x5DEECE66Dull;
    const unsigned long long c = 0xBull;
    const unsigned long long mask = (1ull << 48) - 1;
    sd = (long)((a * (unsigned long long)sd + c) & mask);
    long y = sd >> 17;
    return y;
  }
};

/* Allocate an uninitialized array of n_elems elements of type T, aligned
   to 4096 bytes.
   Despite the name, the memory comes from posix_memalign on the host; the
   arrays reach the offload target through the map clauses of the target
   regions that use them.
   Terminates the process with a diagnostic if the allocation fails.
   Release the result with dealloc_dev(). */
template<typename T>
T * alloc_dev(size_t n_elems) {
  size_t sz = sizeof(T) * n_elems;
  void * p = NULL;
  /* posix_memalign reports failure through a non-zero return value (an
     errno code); it never returns -1 and does not set errno */
  int rc = posix_memalign(&p, 4096, sz);
  if (rc != 0) errx(1, "posix_memalign: %s", strerror(rc));
  return (T *)p;
}

/* Release an array obtained from alloc_dev (hands the pointer to free). */
template<typename T>
void dealloc_dev(T * b) {
  void * raw = (void *)b;
  free(raw);
}

/* exchange the values stored at positions i and j of a */
void swap(long * a, long i, long j) {
  long tmp = a[i];
  a[i] = a[j];
  a[j] = tmp;
}

/* Fill seq with 0, 1, ..., n_cycles*len_cycle-1 and, when seed >= 0,
   permute it pseudo-randomly in units of coalese_size consecutive elements.
   Guarantees:
   (1) every group of coalese_size consecutive elements stays together and
       in order;
   (2) the first n_cycles entries are left untouched, so
       seq[i] == i for 0 <= i < n_cycles.
   A negative seed leaves seq as the identity sequence.
   Requires n_cycles to be a multiple of coalese_size (asserted). */
void shuffle(long * seq, long coalese_size,
             long n_cycles, long len_cycle, long seed) {
  assert(n_cycles % coalese_size == 0);
  const long total = n_cycles * len_cycle;
  for (long pos = 0; pos < total; pos++) {
    seq[pos] = pos;
  }
  if (seed < 0) return;

  prng rg;
  rg.seed(seed);
  const long n_blocks = total / coalese_size;
  const long first_block = n_cycles / coalese_size;
  /* for each block from first_block on, pick a partner among the blocks
     at or after it and exchange the two blocks element by element */
  for (long blk = first_block; blk < n_blocks; blk++) {
    const long partner = blk + rg.gen_randint() % (n_blocks - blk);
    for (long off = 0; off < coalese_size; off++) {
      swap(seq, blk * coalese_size + off, partner * coalese_size + off);
    }
  }
}

/* Link one chain inside a: walking seq with stride n_cycles from position
   start_idx, store in each visited cell the index of the cell visited next
       a[seq[p]] = seq[(p + n_cycles) % m],  p = start_idx + i * n_cycles
   for i = 0 .. len_cycle-1, where m = n_cycles * len_cycle.  The modulo
   makes the last cell of the chain point back to its first one. */
__host__ __device__
void make_cycle(long * a, long * seq,
                long start_idx, long n_cycles, long len_cycle) {
  const long total = n_cycles * len_cycle;
  long pos = start_idx;
  for (long step = 0; step < len_cycle; step++) {
    const long next_pos = pos + n_cycles;
    const long from = seq[pos];
    const long to = seq[next_pos % total];
    a[from] = to;
    pos = next_pos;
  }
}


/* Build all n_cycles chains in a (m cells) from the ordering in seq by
   calling make_cycle once per chain start 0 .. n_cycles-1.  The loop is an
   OpenMP target region run with n_teams teams of n_threads_per_team
   threads; a and seq are mapped to and from the target. */
void make_cycles(long * a, long * seq, long m,
                 long n_cycles, long len_cycle,
                 long n_teams, long n_threads_per_team) {
#pragma omp target teams distribute parallel for num_teams(n_teams) num_threads(n_threads_per_team) map(tofrom: a[0:m], seq[0:m])
  for (long start = 0; start < n_cycles; start++) {
    make_cycle(a, seq, start, n_cycles, len_cycle);
  }
}

/* defined in a separate file (latency.cc or latency_ilp.cc)
   a             : array of m cells holding the pointer chains
   n             : number of accesses performed along each chain
   end           : output, one entry per chain (n_cycles entries are
                   allocated by main and checked after the call)
   n_cycles      : number of chains in a
   n_conc_cycles : number of chains traversed simultaneously by one
                   loop iteration of the implementation
   n_teams, n_threads_per_team : OpenMP launch configuration
   thread_idx    : output, one int per chain */
void cycles(long * a, long m, long n, long * end, long n_cycles,
            long n_conc_cycles,
            long n_teams, long n_threads_per_team, int * thread_idx);

/* Command line settings; the constructor installs the defaults that
   parse_opts starts from and that usage prints. */
struct opts {
  long n_elements;      /* number of elements */
  double min_scans;     /* minimum number of scans */
  long min_accesses;    /* minimum number of accesses */
  long coalese_size;    /* number of consecutive elements guaranteed
                           to be contiguous */
  long n_cycles;        /* number of chains in the array */
  long n_conc_cycles;   /* number of chains traversed simultaneously */
  long seed;            /* seed of the shuffle */
  opts()
    : n_elements(1L << 24),
      min_scans(5.3),
      min_accesses(1 << 20),
      coalese_size(1),
      n_cycles(1),
      n_conc_cycles(1),
      seed(123456789012345L) {
  }
};

/* Print the list of command line options to stderr; the value shown in
   parentheses after each option is its default (taken from a freshly
   constructed opts).  prog is the program name to show, normally argv[0]. */
void usage(char * prog) {
  opts o;
  fprintf(stderr, "usage:\n");
  fprintf(stderr, "  %s [options]\n", prog);
  fprintf(stderr, "options:\n");
  fprintf(stderr, "  -m,--n-elements N (%ld)\n", o.n_elements);
  fprintf(stderr, "  --min-scans N (%.3f)\n", o.min_scans);
  fprintf(stderr, "  --min-accesses N (%ld)\n", o.min_accesses);
  fprintf(stderr, "  -c,--coalese-size N (%ld)\n", o.coalese_size);
  fprintf(stderr, "  --n-cycles N (%ld)\n", o.n_cycles);
  /* accepted by parse_opts but previously missing from this list */
  fprintf(stderr, "  --n-conc-cycles N (%ld)\n", o.n_conc_cycles);
  fprintf(stderr, "  --seed N (%ld)\n", o.seed);
}

/* Parse the command line into an opts value, starting from the defaults.
   Short options: -m N (number of elements), -c N (coalese size).
   Long options : --n-elements, --min-scans, --min-accesses,
                  --coalese-size, --n-cycles, --n-conc-cycles, --seed.
   On an unknown option or a missing argument, prints the usage and exits
   with status 1. */
opts parse_opts(int argc, char ** argv) {
  static struct option long_options[] = {
    {"n-elements",          required_argument, 0, 'm' },
    {"min-scans",           required_argument, 0, 0 },
    {"min-accesses",        required_argument, 0, 0 },
    {"coalese-size",        required_argument, 0, 0 },
    {"n-cycles",            required_argument, 0, 0 },
    {"n-conc-cycles",       required_argument, 0, 0 },
    {"seed",                required_argument, 0, 0 },
    {0,         0,                 0,  0 }
  };
  opts o;
  while (1) {
    int option_index = 0;
    int c = getopt_long(argc, argv, "m:c:",
                        long_options, &option_index);
    if (c == -1) break;

    switch (c) {
    case 'm':
      o.n_elements = atol(optarg);
      break;
    case 'c':
      /* -c is listed in the option string and in the usage text, so it
         must be handled here; otherwise it falls into the error branch */
      o.coalese_size = atol(optarg);
      break;
    case 0:
      {
        /* long option without a short equivalent: dispatch on its name */
        const char * opt_name = long_options[option_index].name;
        if (strcmp(opt_name, "seed") == 0) {
          o.seed = atol(optarg);
        } else if (strcmp(opt_name, "min-scans") == 0) {
          o.min_scans = atof(optarg);
        } else if (strcmp(opt_name, "min-accesses") == 0) {
          o.min_accesses = atol(optarg);
        } else if (strcmp(opt_name, "coalese-size") == 0) {
          o.coalese_size = atol(optarg);
        } else if (strcmp(opt_name, "n-cycles") == 0) {
          o.n_cycles = atol(optarg);
        } else if (strcmp(opt_name, "n-conc-cycles") == 0) {
          o.n_conc_cycles = atol(optarg);
        } else {
          usage(argv[0]);
          exit(1);
        }
        break;
      }
    default:
      usage(argv[0]);
      exit(1);
    }
  }
  return o;
}

/* Return the value of environment variable s converted with atol.
   If the variable is not set, print a hint to stderr and terminate with a
   failure status, so that scripts driving the program notice the
   misconfiguration. */
long getenv_long(const char * s) {
  char * vs = getenv(s);
  if (!vs) {
    fprintf(stderr, "set environment variable %s\n", s);
    exit(1);
  }
  return atol(vs);
}

typedef long longv;

/* Driver of the latency experiment.
   1. reads the OpenMP launch configuration from OMP_NUM_TEAMS and
      OMP_NUM_THREADS and the experiment parameters from the command line;
   2. rounds the number of elements up to a multiple of n_cycles;
   3. builds a shuffled ordering (shuffle) and the pointer chains
      (make_cycles), reporting how long the latter took;
   4. runs and times the traversal (cycles), then prints the elapsed time,
      the time per access and the resulting bandwidth;
   5. checks that every chain ended on the expected cell. */
int main(int argc, char ** argv) {
  const long L = sizeof(longv) / sizeof(long);
  long n_teams = getenv_long("OMP_NUM_TEAMS");
  long n_threads_per_team = getenv_long("OMP_NUM_THREADS");
  opts opt = parse_opts(argc, argv);
  long m = opt.n_elements;
  long n_cycles = opt.n_cycles;
  long coalese_size = opt.coalese_size;
  assert(n_cycles % coalese_size == 0);
  assert(coalese_size % L == 0);
  long len_cycle = (m + n_cycles - 1) / n_cycles;
  if (m % n_cycles) {
    fprintf(stderr,
            "WARNING : m (%ld) not divisible by n_cycles (%ld),"
            " rounded up to %ld\n",
            m, n_cycles, len_cycle * n_cycles);
    m = len_cycle * n_cycles;
  }
  printf("n_elements : %ld\n", m);
  size_t sz = sizeof(long) * m;
  /* sz is a size_t, so it is printed with %zu */
  printf("sz : %zu bytes\n", sz);
  printf("n_cycles : %ld\n", n_cycles);
  printf("len_cycle : %ld\n", len_cycle);
  /* accesses per chain: at least min_scans passes over the chain, raised
     if necessary so that the total reaches min_accesses */
  double s = opt.min_scans;
  long n = len_cycle * s;
  if (n * n_cycles < opt.min_accesses) {
    n = (opt.min_accesses + n_cycles - 1) / n_cycles;
  }
  printf("n_accesses_per_cycle : %ld\n", n);
  printf("total_accesses : %ld\n", n * n_cycles);
  long n_conc_cycles = opt.n_conc_cycles;
  printf("n_conc_cycles : %ld\n", n_conc_cycles);
  assert(n_cycles % n_conc_cycles == 0);
  printf("coalese_size : %ld\n", coalese_size);
  printf("seed : %ld\n", opt.seed);

  long * seq = alloc_dev<long>(m);
  shuffle(seq, coalese_size, n_cycles, len_cycle, opt.seed);

  long * a = alloc_dev<long>(m);
  double t0 = cur_time();
  make_cycles(a, seq, m, n_cycles, len_cycle,
              n_teams, n_threads_per_team);
  double t1 = cur_time();
  double dt0 = t1 - t0;
  printf("make_cycles_total : %f sec\n", dt0);
  printf("make_cycles_per_elem : %.1f nsec\n", 1.0e9 * dt0 / m);
  long * end = alloc_dev<long>(n_cycles);
  int * thread_idx = alloc_dev<int>(n_cycles);
  double t2 = cur_time();
  cycles(a, m, n, end, n_cycles, n_conc_cycles,
         n_teams, n_threads_per_team, thread_idx);
  double t3 = cur_time();
  double dt1 = t3 - t2;
  long bytes = sizeof(long) * n * n_cycles;
  double bw = bytes / dt1;
  printf("bytes accessed : %ld bytes\n", bytes);
  printf("time_total : %f sec\n", dt1);
  printf("time_per_access : %.1f nsec/access\n", 1.0e9 * dt1 / (n * n_cycles));
  printf("bw : %.3f GB/sec\n", bw * 1.e-9);
  printf("checking results ... "); fflush(stdout);
  /* chain idx starts at seq[idx] and advances n_cycles positions of seq
     per access, so after n accesses it must be at seq[(idx + n*n_cycles) % m] */
  for (long idx = 0; idx < n_cycles; idx++) {
    assert(end[idx] == seq[(idx + n * n_cycles) % m]);
  }
  printf("OK\n");
#if 0
  for (long idx = 0; idx < n_cycles; idx++) {
    printf("idx = %ld by thread %d\n", idx, thread_idx[idx]);
  }
#endif
  dealloc_dev(seq);
  dealloc_dev(end);
  dealloc_dev(thread_idx);
  dealloc_dev(a);
  return 0;
}


* Here is the core part of the program that accesses the array

In [None]:
%%writefile latency.cc
#include <assert.h>
#include <stdio.h>
#include <omp.h>
#if __NVCOMPILER                // NVIDIA nvc++
#include <nv/target>
/* Return a flat index identifying the calling thread (nvc++ build).
   Device path: reads the PTX special registers %ntid.x, %ctaid.x and
   %tid.x (stored in block_dim, block_idx and thread_idx) and returns
   thread_idx + block_idx * block_dim.
   Otherwise: returns omp_get_thread_num(). */
__device__ int get_thread_index() {
  if target(nv::target::is_device) {
    unsigned int thread_idx;
    unsigned int block_idx;
    unsigned int block_dim;
    asm volatile ("mov.u32 %0, %%ntid.x;"  : "=r"(block_dim));
    asm volatile ("mov.u32 %0, %%ctaid.x;" : "=r"(block_idx));
    asm volatile ("mov.u32 %0, %%tid.x;"   : "=r"(thread_idx));
    int global_idx = thread_idx + block_idx * block_dim;
    return global_idx;
  } else {
    return omp_get_thread_num();
  }
}
/* Return the total number of threads (nvc++ build).
   Device path: reads the PTX special registers %ntid.x and %nctaid.x
   (stored in block_dim and grid_dim) and returns their product.
   Otherwise: returns omp_get_num_threads(). */
__device__ int get_n_threads() {
  if target(nv::target::is_device) {
    unsigned int grid_dim;
    unsigned int block_dim;
    asm volatile ("mov.u32 %0, %%ntid.x;"  : "=r"(block_dim));
    asm volatile ("mov.u32 %0, %%nctaid.x;" : "=r"(grid_dim));
    return grid_dim * block_dim;
  } else {
    return omp_get_num_threads();
  }
}
#else  // Clang
#define __host__
#define __device__
#define __global__
/* Return a flat index identifying the calling thread (non-nvc++ build,
   where __device__ is defined as an empty macro just above).
   When __CUDA_ARCH__ is defined and non-zero: reads the PTX special
   registers %ntid.x, %ctaid.x and %tid.x (stored in block_dim, block_idx
   and thread_idx) and returns thread_idx + block_idx * block_dim.
   Otherwise: returns omp_get_thread_num(). */
__device__ int get_thread_index() {
#if __CUDA_ARCH__
    unsigned int thread_idx;
    unsigned int block_idx;
    unsigned int block_dim;
    asm volatile ("mov.u32 %0, %%ntid.x;"  : "=r"(block_dim));
    asm volatile ("mov.u32 %0, %%ctaid.x;" : "=r"(block_idx));
    asm volatile ("mov.u32 %0, %%tid.x;"   : "=r"(thread_idx));
    int global_idx = thread_idx + block_idx * block_dim;
    return global_idx;
#else
    return omp_get_thread_num();
#endif
}
/* Return the total number of threads (non-nvc++ build).
   When __CUDA_ARCH__ is defined and non-zero: reads the PTX special
   registers %ntid.x and %nctaid.x (stored in block_dim and grid_dim) and
   returns their product.
   Otherwise: returns omp_get_num_threads(). */
__device__ int get_n_threads() {
#if __CUDA_ARCH__
    unsigned int grid_dim;
    unsigned int block_dim;
    asm volatile ("mov.u32 %0, %%ntid.x;"  : "=r"(block_dim));
    asm volatile ("mov.u32 %0, %%nctaid.x;" : "=r"(grid_dim));
    return grid_dim * block_dim;
#else
    return omp_get_num_threads();
#endif
}
#endif

/* Starting from cell a[idx], follow the chain (k = a[k]) n times.
   Outputs:
     end[idx]        : index of the cell reached after n steps
     thread_idx[idx] : index of the thread that traversed this chain, as
                       reported by get_thread_index()
   Every load depends on the result of the previous one, so the loop time
   is governed by the load latency.  The two asm statements only emit
   marker comments around the loop in the generated assembly. */
__host__ __device__
void cycle(long * a, long idx, long n, long * end, int * thread_idx) {
  long k = idx;
  asm volatile("// ========== loop begins ========== ");
#pragma unroll(8)
  for (long i = 0; i < n; i++) {
    k = a[k];
  }
  asm volatile("// ---------- loop ends ---------- ");
  end[idx] = k;
  /* record who ran the chain (not the chain position k, which would also
     be truncated from long to int) */
  thread_idx[idx] = get_thread_index();
}


/* Traverse every chain of a (an array of m cells): for each start index in
   0 .. n_cycles-1, follow the chain n times and record where it ends in
   end[start] (see cycle).  The loop is an OpenMP target region run with
   n_teams teams of n_threads_per_team threads; a, end and thread_idx are
   mapped to and from the target.  n_conc_cycles is not used by this
   one-chain-per-iteration variant. */
void cycles(long * a, long m, long n, long * end, long n_cycles,
            long n_conc_cycles,
            long n_teams, long n_threads_per_team, int * thread_idx) {
#pragma omp target teams distribute parallel for num_teams(n_teams) num_threads(n_threads_per_team) map(tofrom: a[0:m], end[0:n_cycles], thread_idx[0:n_cycles])
  for (long start = 0; start < n_cycles; start++) {
    cycle(a, start, n, end, thread_idx);
  }
}


* Compile them together

In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -DDBG=0 -Wall -O3 -mavx512f -mfma -fopenmp -fopenmp-targets=nvptx64 -o latency latency.cc main.cc
#nvc++   -Wall -mavx512f -mfma -mp=gpu -cuda -o latency latency.cc main.cc

* run it on a CPU with a single thread (remember `OMP_TARGET_OFFLOAD=DISABLED` disables GPU execution)
* let's run it with $m = 2^{24}$ elements $= 8 \times m = $ 128MB, sufficiently above its last level cache (57MB)
* the parameter $n$ below specifies how many accesses we perform (`n` below)
```
k = 0;
for (i = 0; i < n; i++) {
  k = a[k];
}
```

* the following command will take something like 15 seconds (be patient)

In [None]:
BEGIN SOLUTION
END SOLUTION
# single thread execution on CPU
# most data accesses will miss all caches
export OMP_TARGET_OFFLOAD=DISABLED
export OMP_NUM_TEAMS=1
export OMP_NUM_THREADS=1
export m=$((1 << 24))
export n=$((1 << 27))
./latency --n-elements ${m} --min-accesses ${n}

* look at the number shown as
```
time_per_access : XYZ nsec/access
```
which gives you the latency imposed by _main memory_ access (when the access misses the caches at every level)

* observe the latency to main memory is very large (e.g., on 2.4 GHz processor, 1 nanosecond = 2.4 cycles, thus 80 nanoseconds is as large as 200 cycles) compared to typical latency of simple arithmetic (a few cycles)

* now look at the latency of the L1 (fastest/smallest level) cache, by making $a$ smaller than the L1 cache size (64KB)

* we set $m = 2^{12}$ so that $a$ occupies 32KB, sufficiently smaller than L1 cache

* note that we set $n$ to the same value as before, so this program executes exactly the same number of iterations, with the only difference being how large an area the array $a$ spans

In [None]:
BEGIN SOLUTION
END SOLUTION
# single thread execution on CPU
# most data accesses will hit L1 cache
export OMP_TARGET_OFFLOAD=DISABLED
export OMP_NUM_TEAMS=1
export OMP_NUM_THREADS=1
export m=$((1 << 12))
export n=$((1 << 27))
./latency --n-elements ${m} --min-accesses ${n}

* now run the same program on a GPU, again with a single (CUDA) thread (remember `OMP_TARGET_OFFLOAD=MANDATORY` makes sure the target region runs on GPU)
* all other parameters are set equal to CPU

In [None]:
BEGIN SOLUTION
END SOLUTION
# single thread execution on GPU
# most data accesses will miss all caches
export OMP_TARGET_OFFLOAD=MANDATORY
export OMP_NUM_TEAMS=1
export OMP_NUM_THREADS=1
export m=$((1 << 24))
export n=$((1 << 27))
./latency --n-elements ${m} --min-accesses ${n}

* observe the latency difference between CPU and GPU
* GPU has a few times larger latency to the main memory

* let's see what happens for array $a$ smaller than the L1 cache (192KB)
* to make a comparison with the CPU, we set $m$ to the same value as in the CPU experiment ($2^{12}$)

In [None]:
BEGIN SOLUTION
END SOLUTION
# single thread execution on GPU
# most data accesses will hit L1 cache
export OMP_TARGET_OFFLOAD=MANDATORY
export OMP_NUM_TEAMS=1
export OMP_NUM_THREADS=1
export m=$((1 << 12))
export n=$((1 << 27))
./latency --n-elements ${m} --min-accesses ${n}

* it is interesting to see the huge difference in L1 cache latency between CPU and GPU
* GPU imposes several dozens of nanoseconds even when an access hits the fastest cache, whereas the L1 latency of CPU caches is as small as a few nanoseconds (a few cycles)

# 4. Plotting the Latencies

* let's see how the latency is affected by the cache level data are coming from
* to see this, we plot the relationship between the size of $a$ and the latency per access

In [None]:
BEGIN SOLUTION
END SOLUTION
export OMP_TARGET_OFFLOAD=DISABLED
export OMP_NUM_TEAMS=1
export OMP_NUM_THREADS=1
export min_m=1000
export max_m=$((1 << 25))
export n=$((1 << 27))

m=${min_m}
while [ ${m} -lt ${max_m} ]; do
    echo "==== m=${m} ===="
    ./latency --n-elements ${m} --min-accesses ${n}
    m=$((m * 5 / 4))
done | tee cpu.txt
echo "done"

In [None]:
import vis_mem
vis_mem.vis_latency(["cpu.txt"])

In [None]:
BEGIN SOLUTION
END SOLUTION
export OMP_TARGET_OFFLOAD=MANDATORY
export OMP_NUM_TEAMS=1
export OMP_NUM_THREADS=1
export min_m=1000
export max_m=$((1 << 25))
export n=$((1 << 27))

m=${min_m}
while [ ${m} -lt ${max_m} ]; do
    echo "==== m=${m} ===="
    ./latency --n-elements ${m} --min-accesses ${n}
    m=$((m * 5 / 4))
done | tee gpu.txt
echo "done"

In [None]:
import vis_mem
vis_mem.vis_latency(["gpu.txt"])

* Compare the CPU and the GPU

In [None]:
import vis_mem
vis_mem.vis_latency(["cpu.txt", "gpu.txt"])

# 5. Increasing the Bandwidth
* The bandwidth reported above is

$$ \frac{\mbox{sizeof(long)}}{\mbox{latency per element}} $$

and it is essentially the reciprocal (inverse) of the latency

* When we look at the values when $a$ is much larger than the last level cache (so data are all coming from the main memory), the observed value is very small (e.g., $\approx 0.08$ GB/sec on CPU and $\approx 0.02$ GB/sec on GPU)
* They are much smaller than the advertised hardware bandwidth ($>$ 50 GB/sec for the CPU we are using and $\approx$ 1.5 TB/sec for the A100 GPU we are using)
* Just as we cannot reduce the latency of arithmetic, there is no way to reduce the latency between the main memory and the processor
* We can only increase the _bandwidth_ (the amount of data we can move per unit time) by _increasing the parallelism_

* There are multiple ways to do that
  * On CPUs, it is essential to exploit instruction level parallelism _in a single thread_, which can be done by performing several loops like this for different regions of $a$ (different `start` index below)
```
k = start;
for (i = 0; i < n; i++) {
  k = a[k];
}
```
  * On GPUs, the principle is the same, but parallelism can be easily and most naturally extracted by having many CUDA threads performing a loop like the above

* Either way, we make multiple chains of pointers, each of which covers a disjoint region of the entire array
* Here is a simple example having _two_ 8-element chains of pointers
* Confirm that the chain starting from $a[1]$ makes another 8-element chain (a[1], a[8], a[9], ...)

<img src="svg/latency_measurement_L2.svg" />


## 5-1. Traversing Multiple Pointer Chains by A Single Thread
* We can chase two chains simultaneously by essentially doing something like this
```
k0 = 0;
k1 = 1;
for (i = 0; i < n; i++) {
  k0 = a[k0];
  k1 = a[k1];
}
```

* The code generalizes this idea so that we can chase an arbitrary number of ($C$) chains simultaneously

In [None]:
%%writefile latency_ilp.cc
#include <assert.h>
#include <stdio.h>
#include <omp.h>
#if __NVCOMPILER                // NVIDIA nvc++
#include <nv/target>
/* Return a flat index identifying the calling thread (nvc++ build).
   Device path: reads the PTX special registers %ntid.x, %ctaid.x and
   %tid.x (stored in block_dim, block_idx and thread_idx) and returns
   thread_idx + block_idx * block_dim.
   Otherwise: returns omp_get_thread_num(). */
__device__ int get_thread_index() {
  if target(nv::target::is_device) {
    unsigned int thread_idx;
    unsigned int block_idx;
    unsigned int block_dim;
    asm volatile ("mov.u32 %0, %%ntid.x;"  : "=r"(block_dim));
    asm volatile ("mov.u32 %0, %%ctaid.x;" : "=r"(block_idx));
    asm volatile ("mov.u32 %0, %%tid.x;"   : "=r"(thread_idx));
    int global_idx = thread_idx + block_idx * block_dim;
    return global_idx;
  } else {
    return omp_get_thread_num();
  }
}
/* Return the total number of threads (nvc++ build).
   Device path: reads the PTX special registers %ntid.x and %nctaid.x
   (stored in block_dim and grid_dim) and returns their product.
   Otherwise: returns omp_get_num_threads(). */
__device__ int get_n_threads() {
  if target(nv::target::is_device) {
    unsigned int grid_dim;
    unsigned int block_dim;
    asm volatile ("mov.u32 %0, %%ntid.x;"  : "=r"(block_dim));
    asm volatile ("mov.u32 %0, %%nctaid.x;" : "=r"(grid_dim));
    return grid_dim * block_dim;
  } else {
    return omp_get_num_threads();
  }
}
#else  // Clang
#define __host__
#define __device__
#define __global__
/* Return a flat index identifying the calling thread (non-nvc++ build,
   where __device__ is defined as an empty macro just above).
   When __CUDA_ARCH__ is defined and non-zero: reads the PTX special
   registers %ntid.x, %ctaid.x and %tid.x (stored in block_dim, block_idx
   and thread_idx) and returns thread_idx + block_idx * block_dim.
   Otherwise: returns omp_get_thread_num(). */
__device__ int get_thread_index() {
#if __CUDA_ARCH__
    unsigned int thread_idx;
    unsigned int block_idx;
    unsigned int block_dim;
    asm volatile ("mov.u32 %0, %%ntid.x;"  : "=r"(block_dim));
    asm volatile ("mov.u32 %0, %%ctaid.x;" : "=r"(block_idx));
    asm volatile ("mov.u32 %0, %%tid.x;"   : "=r"(thread_idx));
    int global_idx = thread_idx + block_idx * block_dim;
    return global_idx;
#else
    return omp_get_thread_num();
#endif
}
/* Return the total number of threads (non-nvc++ build).
   When __CUDA_ARCH__ is defined and non-zero: reads the PTX special
   registers %ntid.x and %nctaid.x (stored in block_dim and grid_dim) and
   returns their product.
   Otherwise: returns omp_get_num_threads(). */
__device__ int get_n_threads() {
#if __CUDA_ARCH__
    unsigned int grid_dim;
    unsigned int block_dim;
    asm volatile ("mov.u32 %0, %%ntid.x;"  : "=r"(block_dim));
    asm volatile ("mov.u32 %0, %%nctaid.x;" : "=r"(grid_dim));
    return grid_dim * block_dim;
#else
    return omp_get_num_threads();
#endif
}
#endif

/* Traverse C chains of a in an interleaved fashion: the chains start at
   cells idx, idx+1, ..., idx+C-1, and each of the n outer iterations
   advances every chain by one step.  The loads of different chains do not
   depend on each other, so they can overlap.
   Outputs, for j in 0 .. C-1:
     end[idx + j]        : cell reached by chain idx + j after n steps
     thread_idx[idx + j] : value of get_thread_index() for the caller
   The current positions live in a variable-length array of C longs.  The
   asm statements only emit marker comments around the loop. */
void cycle_conc(long * a, long idx, long C, long n, long * end, int * thread_idx) {
  long cur[C];
  for (long j = 0; j < C; j++) {
    cur[j] = idx + j;
  }
  asm volatile("// ========== loop begins ========== ");
#pragma unroll(8)
  for (long step = 0; step < n; step++) {
    for (long j = 0; j < C; j++) {
      cur[j] = a[cur[j]];
    }
  }
  asm volatile("// ---------- loop ends ---------- ");
  for (long j = 0; j < C; j++) {
    const long slot = idx + j;
    end[slot] = cur[j];
    thread_idx[slot] = get_thread_index();
  }
}


/* Traverse every chain of a (an array of m cells), n_conc_cycles chains at
   a time: each loop iteration hands the chains starting at cells
   first .. first+n_conc_cycles-1 to cycle_conc, which follows each of them
   n times and records the end points in end.  The loop is an OpenMP target
   region run with n_teams teams of n_threads_per_team threads; a, end and
   thread_idx are mapped to and from the target. */
void cycles(long * a, long m, long n, long * end, long n_cycles,
            long n_conc_cycles,
            long n_teams, long n_threads_per_team, int * thread_idx) {
#pragma omp target teams distribute parallel for num_teams(n_teams) num_threads(n_threads_per_team) map(tofrom: a[0:m], end[0:n_cycles], thread_idx[0:n_cycles])
  for (long first = 0; first < n_cycles; first += n_conc_cycles) {
    cycle_conc(a, first, n_conc_cycles, n, end, thread_idx);
  }
}


* This code uses a variable-length array to have a number of ($C$) variables which is determined by command line
* It is not supported by `nvc++`, 
* `clang++` supports this, but only on CPUs
* Here is the trick to avoid variable length arrays by using templates up to a constant number

In [None]:
%%writefile latency_ilp_c.cc
#include <assert.h>
#include <stdio.h>
#include <omp.h>
#if __NVCOMPILER                // NVIDIA nvc++
#include <nv/target>
/* Return a flat index identifying the calling thread (nvc++ build).
   Device path: reads the PTX special registers %ntid.x, %ctaid.x and
   %tid.x (stored in block_dim, block_idx and thread_idx) and returns
   thread_idx + block_idx * block_dim.
   Otherwise: returns omp_get_thread_num(). */
__device__ int get_thread_index() {
  if target(nv::target::is_device) {
    unsigned int thread_idx;
    unsigned int block_idx;
    unsigned int block_dim;
    asm volatile ("mov.u32 %0, %%ntid.x;"  : "=r"(block_dim));
    asm volatile ("mov.u32 %0, %%ctaid.x;" : "=r"(block_idx));
    asm volatile ("mov.u32 %0, %%tid.x;"   : "=r"(thread_idx));
    int global_idx = thread_idx + block_idx * block_dim;
    return global_idx;
  } else {
    return omp_get_thread_num();
  }
}
/* Return the total number of threads (nvc++ build).
   Device path: reads the PTX special registers %ntid.x and %nctaid.x
   (stored in block_dim and grid_dim) and returns their product.
   Otherwise: returns omp_get_num_threads(). */
__device__ int get_n_threads() {
  if target(nv::target::is_device) {
    unsigned int grid_dim;
    unsigned int block_dim;
    asm volatile ("mov.u32 %0, %%ntid.x;"  : "=r"(block_dim));
    asm volatile ("mov.u32 %0, %%nctaid.x;" : "=r"(grid_dim));
    return grid_dim * block_dim;
  } else {
    return omp_get_num_threads();
  }
}
#else  // Clang
#define __host__
#define __device__
#define __global__
/* Return a flat index identifying the calling thread (non-nvc++ build,
   where __device__ is defined as an empty macro just above).
   When __CUDA_ARCH__ is defined and non-zero: reads the PTX special
   registers %ntid.x, %ctaid.x and %tid.x (stored in block_dim, block_idx
   and thread_idx) and returns thread_idx + block_idx * block_dim.
   Otherwise: returns omp_get_thread_num(). */
__device__ int get_thread_index() {
#if __CUDA_ARCH__
    unsigned int thread_idx;
    unsigned int block_idx;
    unsigned int block_dim;
    asm volatile ("mov.u32 %0, %%ntid.x;"  : "=r"(block_dim));
    asm volatile ("mov.u32 %0, %%ctaid.x;" : "=r"(block_idx));
    asm volatile ("mov.u32 %0, %%tid.x;"   : "=r"(thread_idx));
    int global_idx = thread_idx + block_idx * block_dim;
    return global_idx;
#else
    return omp_get_thread_num();
#endif
}
/* Return the total number of threads (non-nvc++ build).
   When __CUDA_ARCH__ is defined and non-zero: reads the PTX special
   registers %ntid.x and %nctaid.x (stored in block_dim and grid_dim) and
   returns their product.
   Otherwise: returns omp_get_num_threads(). */
__device__ int get_n_threads() {
#if __CUDA_ARCH__
    unsigned int grid_dim;
    unsigned int block_dim;
    asm volatile ("mov.u32 %0, %%ntid.x;"  : "=r"(block_dim));
    asm volatile ("mov.u32 %0, %%nctaid.x;" : "=r"(grid_dim));
    return grid_dim * block_dim;
#else
    return omp_get_num_threads();
#endif
}
#endif

/* Compile-time-sized variant of the interleaved traversal: C chains
   starting at cells idx, idx+1, ..., idx+C-1 are each advanced by one step
   in every one of the n outer iterations.  Because C is a template
   parameter, the array of current positions has a constant size.
   Outputs, for j in 0 .. C-1:
     end[idx + j]        : cell reached by chain idx + j after n steps
     thread_idx[idx + j] : value of get_thread_index() for the caller
   The asm statements only emit marker comments (including the value of C)
   around the loop. */
template<long C>
void cycle_conc_t(long * a, long idx, long n, long * end, int * thread_idx) {
  long cur[C];
  for (long j = 0; j < C; j++) {
    cur[j] = idx + j;
  }
  asm volatile("// ========== loop begins C = %0 ========== " : : "i" (C));
#pragma unroll(8)
  for (long step = 0; step < n; step++) {
    for (long j = 0; j < C; j++) {
      cur[j] = a[cur[j]];
    }
  }
  asm volatile("// ---------- loop ends C = %0 ---------- " : : "i" (C));
  for (long j = 0; j < C; j++) {
    const long slot = idx + j;
    end[slot] = cur[j];
    thread_idx[slot] = get_thread_index();
  }
}

/* Traverse the C chains starting at cells idx .. idx+C-1 without a
   variable-length array, by dispatching to the compile-time-sized
   cycle_conc_t<>: full groups of max_const_c chains are processed first,
   then the remaining C % max_const_c chains through the switch below.
   Groups are processed one after another, so at most max_const_c chains
   are interleaved at a time.

   cycle_conc_t indexes both end and thread_idx with the absolute chain
   index (idx + c + j), so both arrays are passed through un-offset; only
   the start index is advanced by c. */
void cycle_conc(long * a, long idx, long C, long n, long * end, int * thread_idx) {
  const long max_const_c = 12;
  long c;
  for (c = 0; c + max_const_c <= C; c += max_const_c) {
    cycle_conc_t<max_const_c>(a, idx + c, n, end, thread_idx);
  }
  switch (C - c) {
  case 0:
    break;
  case 1:
    cycle_conc_t<1>(a, idx + c, n, end, thread_idx);
    break;
  case 2:
    cycle_conc_t<2>(a, idx + c, n, end, thread_idx);
    break;
  case 3:
    cycle_conc_t<3>(a, idx + c, n, end, thread_idx);
    break;
  case 4:
    cycle_conc_t<4>(a, idx + c, n, end, thread_idx);
    break;
  case 5:
    cycle_conc_t<5>(a, idx + c, n, end, thread_idx);
    break;
  case 6:
    cycle_conc_t<6>(a, idx + c, n, end, thread_idx);
    break;
  case 7:
    cycle_conc_t<7>(a, idx + c, n, end, thread_idx);
    break;
  case 8:
    cycle_conc_t<8>(a, idx + c, n, end, thread_idx);
    break;
  case 9:
    cycle_conc_t<9>(a, idx + c, n, end, thread_idx);
    break;
  case 10:
    cycle_conc_t<10>(a, idx + c, n, end, thread_idx);
    break;
  case 11:
    cycle_conc_t<11>(a, idx + c, n, end, thread_idx);
    break;
  default:
    /* only reachable when C - c falls outside 0 .. 11 */
    assert(C < max_const_c);
    break;
  }
}
  


/* Traverse every chain of a (an array of m cells), n_conc_cycles chains at
   a time: each loop iteration hands the chains starting at cells
   first .. first+n_conc_cycles-1 to cycle_conc, which follows each of them
   n times and records the end points in end.  The loop is an OpenMP target
   region run with n_teams teams of n_threads_per_team threads; a, end and
   thread_idx are mapped to and from the target. */
void cycles(long * a, long m, long n, long * end, long n_cycles,
            long n_conc_cycles,
            long n_teams, long n_threads_per_team, int * thread_idx) {
#pragma omp target teams distribute parallel for num_teams(n_teams) num_threads(n_threads_per_team) map(tofrom: a[0:m], end[0:n_cycles], thread_idx[0:n_cycles])
  for (long first = 0; first < n_cycles; first += n_conc_cycles) {
    cycle_conc(a, first, n_conc_cycles, n, end, thread_idx);
  }
}


In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -DDBG=0 -Wall -O3 -mavx512f -mfma -fopenmp -fopenmp-targets=nvptx64 -o latency_ilp_c latency_ilp_c.cc main.cc
#nvc++   -Wall -O3 -mavx512f -mfma -mp=gpu -cuda -o latency_ilp_c latency_ilp_c.cc main.cc

* To explore the effect of chasing multiple pointer chains simultaneously, the program has two parameters
  * `--n-cycles` : the number of disjoint chain of pointers in the array
  * `--n-conc-cycles` : the number of chains of pointers we traverse simultaneously (2 for the code shown just above)
* Obviously we need to set the former at least as large as the latter (the program additionally requires it to be a multiple of the latter)
* Below we simply set them to the same number ($C$)

* First let's set $C$ to 1

In [None]:
BEGIN SOLUTION
END SOLUTION
export OMP_TARGET_OFFLOAD=DISABLED
export OMP_NUM_TEAMS=1
export OMP_NUM_THREADS=1
export m=$((1 << 24))
export n=$((1 << 27))
export C=1
./latency_ilp_c --n-elements ${m} --min-accesses ${n} --n-cycles ${C} --n-conc-cycles ${C}

* Observe that this case shows performance similar to that of the previous version
* Now let's set $C$ to 2 and see what happens

In [None]:
BEGIN SOLUTION
END SOLUTION
export OMP_TARGET_OFFLOAD=DISABLED
export OMP_NUM_TEAMS=1
export OMP_NUM_THREADS=1
export m=$((1 << 24))
export n=$((1 << 27))
export C=2
./latency_ilp_c --n-elements ${m} --min-accesses ${n} --n-cycles ${C} --n-conc-cycles ${C}

* Observe that `total_accesses` is the same and the execution time is almost halved (i.e., the bandwidth (`bw`) almost doubled)
* Play with larger values of `C`

* Make sure that this is not an unintended side effect of changing the way cycles are formed, by setting `--n-cycles 2` and comparing the two cases `--n-conc-cycles 1` and `--n-conc-cycles 2`

In [None]:
BEGIN SOLUTION
END SOLUTION
export OMP_TARGET_OFFLOAD=DISABLED
export OMP_NUM_TEAMS=1
export OMP_NUM_THREADS=1
export m=$((1 << 24))
export n=$((1 << 27))
export C=1
./latency_ilp_c --n-elements ${m} --min-accesses ${n} --n-cycles 2 --n-conc-cycles ${C}

## 5-2. Plotting C vs. Bandwidth (CPU)

In [None]:
BEGIN SOLUTION
END SOLUTION
# single thread execution on CPU; sweep the number of chains traversed
# simultaneously (C) and record the program output for plotting
export OMP_TARGET_OFFLOAD=DISABLED
export OMP_NUM_TEAMS=1
export OMP_NUM_THREADS=1
m=$((1 << 24))
n=$((1 << 27))

for C in 1 2 3 4 6 8 10 12 14 16 ; do
    echo "==== C=${C} ===="
    # run the measurement itself (not just print the command line), so that
    # cpu_bw.txt contains the results
    ./latency_ilp_c --n-elements ${m} --min-accesses ${n} --n-cycles ${C} --n-conc-cycles ${C}
done | tee cpu_bw.txt
echo "done"

In [None]:
import vis_mem
vis_mem.vis_bw(["cpu_bw.txt"])

## 5-3. Plotting C vs. Bandwidth (GPU)
* Let's see whether a similar thing happens on the GPU
* Set the value of `C` to 1, 2, 3, ... and see the effect

In [None]:
BEGIN SOLUTION
END SOLUTION
export OMP_TARGET_OFFLOAD=MANDATORY
export OMP_NUM_TEAMS=1
export OMP_NUM_THREADS=1
export m=$((1 << 24))
export n=$((1 << 27))
export C=1
./latency_ilp_c --n-elements ${m} --min-accesses ${n} --n-cycles ${C} --n-conc-cycles ${C}

## 5-4. Traversing Multiple Pointer Chains by Multiple Threads on GPU (OpenMP)

* For GPU, a much more straightforward way to increase the bandwidth is, of course, increasing the number of threads
* We simply set the number of cycles (`--n-cycles`) and the number of threads to the same number
* It lets each thread follow a single pointer chain

In [None]:
BEGIN SOLUTION
END SOLUTION
export OMP_TARGET_OFFLOAD=MANDATORY
export OMP_NUM_TEAMS=1
export m=$((1 << 24))
export n=$((1 << 27))
export C=1
export OMP_NUM_THREADS=${C}
./latency --n-elements ${m} --min-accesses ${n} --n-cycles ${C}

* Increase C to 32, 64, ... (a multiple of 32) and see how far you can go

* Let's visualize how the bandwidth increases with the number of threads

In [None]:
BEGIN SOLUTION
END SOLUTION
export OMP_TARGET_OFFLOAD=MANDATORY
export OMP_NUM_TEAMS=1
m=$((1 << 24))
n=$((1 << 27))

for C in 32 64 96 128 192 256 384 512 768 1024 ; do
    echo "==== C=${C} ===="
    OMP_NUM_THREADS=${C} ./latency --n-elements ${m} --min-accesses ${n} --n-cycles ${C}
done | tee gpu_bw_threads.txt

In [None]:
import vis_mem
# Visualize the output that the loop above saved in gpu_bw_threads.txt
vis_mem.vis_bw(["gpu_bw_threads.txt"])

* <font color="red">NOTE:</font> I found a number of issues and mysteries with `clang++` and `nvc++` in this experiment
* `clang++`
  * performance is very poor (the bandwidth hardly increases at all as the number of threads increases)
  * I haven't had the time to look into it
* `nvc++`
  * bandwidth scales much better than clang++
  * but when I give the `-O2` option or above on the command line, the compilation succeeds, but executing the command fails with the following error

```
$ ./latency --n-elements 16777216 --min-accesses 134217728 --n-cycles 32
n_elements : 16777216
sz : 134217728 bytes
n_cycles : 32
len_cycle : 524288
n_accesses_per_cycle : 4194304
total_accesses : 134217728
n_conc_cycles : 1
coalese_size : 1
seed : 123456789012345
Module function not found, error 500
Accelerator Fatal Error: Failed to find device function 'nvkernel__Z6cyclesPlllS_llllPi_F1L83_2'! File was compiled with: -gpu=cc80
Rebuild this file with -gpu=cc80 to use NVIDIA Tesla GPU 0
 File: /home/pd/parallel-distributed-programming/jupyter/nb/source/pd07_mem_ext/include/ver/main_omp.cc
 Function: _Z11make_cyclesPlS_lllll:101
 Line: 101
```

* The best workaround for now is to **use nvc++ with `-O1` or below**
* Or use CUDA, which I demonstrate below

## 5-5. Traversing Multiple Pointer Chains by Multiple Threads on GPU (CUDA)
* This would be unnecessary if the OpenMP results were more clear-cut

* The main file

In [None]:
%%writefile main_cuda.cc
#include <assert.h>
#include <err.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "cuda_util.h"

/* Return a clock reading in seconds, as a double (tv_sec plus tv_nsec
   scaled by 1e-9).  The value is only meaningful as a difference between
   two readings.  CLOCK_MONOTONIC is used rather than CLOCK_REALTIME so that
   an adjustment of the system clock during a measurement cannot distort the
   interval.  Terminates the program through err() if clock_gettime fails. */
double cur_time() {
  struct timespec ts;
  if (clock_gettime(CLOCK_MONOTONIC, &ts) == -1) err(1, "clock_gettime");
  return ts.tv_sec + ts.tv_nsec * 1e-9;
}

/* random number generator:
   a linear congruential generator with a 48-bit state,
     sd <- (0x5DEECE66D * sd + 0xB) mod 2^48,
   whose output is the state shifted right by 17 bits
   (hence a value in [0, 2^31)). */
struct prng {
  /* current state; below 2^48 after every call to gen_randint */
  long sd;
  /* set the state to sd as is (it is not reduced here) */
  void seed(long sd) {
    this->sd = sd;
  }
  /* advance the state and return the next number */
  long gen_randint() {
    const unsigned long long a = 0x5DEECE66Dull;
    const unsigned long long c = 0xB;
    const unsigned long long mask = (1ull << 48) - 1;
    /* The product does not fit in a signed 64-bit integer, and signed
       overflow is undefined behavior.  Unsigned arithmetic wraps modulo
       2^64 by definition, which leaves the low 48 bits kept by the mask
       unaffected. */
    const unsigned long long next = (a * (unsigned long long)sd + c) & mask;
    sd = (long)next;
    return sd >> 17;
  }
};

/* Allocate room for n_elems elements of type T with cudaMallocManaged and
   return a pointer to it.  In this file the returned memory is accessed
   both by host code (shuffle, the result check in main) and by kernels.
   The CUDA call is passed through check_cuda_api, which is declared outside
   this file (presumably it reports a failed call -- verify in cuda_util.h). */
template<typename T>
T * alloc_dev(size_t n_elems) {
  /* size of the allocation in bytes */
  size_t sz = sizeof(T) * n_elems;
  T * b;
  check_cuda_api(cudaMallocManaged((void **)&b, sz));
  return b;
}

/* Release memory obtained from alloc_dev by calling cudaFree on it;
   the call is passed through check_cuda_api like the allocation is. */
template<typename T>
void dealloc_dev(T * b) {
  check_cuda_api(cudaFree((void *)b));
}

/* exchange the contents of a[i] and a[j] (a no-op when i == j) */
void swap(long * a, long i, long j) {
  const long saved = a[i];
  a[i] = a[j];
  a[j] = saved;
}

/* Fill seq with 0,1,2,...,n_cycles*len_cycle-1 and, unless seed is negative,
   permute it randomly in units of coalese_size consecutive elements.
   The permutation guarantees that
   (1) every group of coalese_size consecutive elements stays together and
       in order, and
   (2) the leading n_cycles elements are never moved, i.e.
       seq[0:n_cycles] = [0,1,...,n_cycles-1].
   A negative seed leaves seq as the identity sequence. */
void shuffle(long * seq, long coalese_size,
             long n_cycles, long len_cycle, long seed) {
  assert(n_cycles % coalese_size == 0);
  const long total = n_cycles * len_cycle;
  for (long pos = 0; pos < total; pos++) {
    seq[pos] = pos;
  }
  if (seed < 0) return;

  prng rg;
  rg.seed(seed);
  const long n_blocks = total / coalese_size;
  /* blocks before first_block hold seq[0:n_cycles] and are left in place */
  const long first_block = n_cycles / coalese_size;
  for (long blk = first_block; blk < n_blocks; blk++) {
    /* pick a partner among blocks blk, blk+1, ..., n_blocks-1 */
    const long partner = blk + rg.gen_randint() % (n_blocks - blk);
    long * p = seq + blk * coalese_size;
    long * q = seq + partner * coalese_size;
    /* exchange the two blocks element by element */
    for (long k = 0; k < coalese_size; k++) {
      const long pk = p[k];
      const long qk = q[k];
      p[k] = qk;
      q[k] = pk;
    }
  }
}

/* Build one cycle in a: for every element k on the cycle, a[k] is set to the
   element to access after k.
   The cycle consists of the elements found at positions
     start_idx, start_idx + n_cycles, start_idx + 2 * n_cycles, ...
   of seq (len_cycle of them); the last one is linked back to the first. */
__host__ __device__
void make_cycle(long * a, long * seq,
                long start_idx, long n_cycles, long len_cycle) {
  const long total = n_cycles * len_cycle;
  long pos = start_idx;
  for (long step = 0; step < len_cycle; step++) {
    const long succ_pos = pos + n_cycles;
    const long from = seq[pos];
    /* only the successor position can reach total, so only it wraps */
    const long to = seq[succ_pos % total];
    a[from] = to;
    pos = succ_pos;
  }
}

/* Kernel building all n_cycles cycles in a.
   A thread starts at the cycle numbered get_thread_index() and advances by
   get_n_threads() until it passes n_cycles, calling make_cycle for each
   cycle it visits (both helpers are declared outside this file). */
__global__
void make_cycles_g(long * a, long * seq, long n_cycles, long len_cycle) {
  const long stride = get_n_threads();
  long chain = get_thread_index();
  while (chain < n_cycles) {
    make_cycle(a, seq, chain, n_cycles, len_cycle);
    chain += stride;
  }
}

/* Host-side entry point that builds the cycles: launches make_cycles_g with
   n_teams blocks of n_threads_per_team threads each.
   The launch is wrapped in check_cuda_launch, which is declared outside this
   file (NOTE(review): whether it also waits for the kernel to finish cannot
   be seen from here).
   The parameter m is not used. */
void make_cycles(long * a, long * seq, long m,
                 long n_cycles, long len_cycle, 
                 long n_teams, long n_threads_per_team) {
  check_cuda_launch((make_cycles_g<<<n_teams,n_threads_per_team>>>(a, seq, n_cycles, len_cycle)));
}

/* Defined in a separate file that is compiled together with this one
   (latency_cuda.cc in the nvcc command of this notebook).
   Called by main() as cycles(a, m, n, end, n_cycles, n_conc_cycles,
   n_teams, n_threads_per_team, thread_idx); main() afterwards expects
   end[idx] to hold the element reached after n accesses starting from
   element idx, for each idx in [0, n_cycles). */
void cycles(long * a, long m, long n, long * end, long n_cycles,
            long n_conc_cycles,
            long n_teams, long n_threads_per_team, int * thread_idx);

/* command line options; the constructor sets the defaults */
struct opts {
  long n_elements;     /* number of elements */
  double min_scans;    /* minimum number of scans */
  long min_accesses;   /* minimum number of accesses */
  long coalese_size;   /* number of consecutive elements guaranteed to be contiguous */
  long n_cycles;       /* number of cycles the elements are divided into */
  long n_conc_cycles;  /* handed to cycles() as its n_conc_cycles argument */
  long seed;           /* seed for shuffle(); a negative value disables shuffling */
  opts()
    : n_elements(1L << 24),
      min_scans(5.3),
      min_accesses(1 << 20),
      coalese_size(1),
      n_cycles(1),
      n_conc_cycles(1),
      seed(123456789012345L) {
  }
};

/* Print a summary of the command line options to stderr.
   prog is the program name shown in the first line; the value in
   parentheses after each option is its default, taken from a freshly
   constructed opts.  Every option accepted by parse_opts is listed,
   including --n-conc-cycles. */
void usage(char * prog) {
  opts o;
  fprintf(stderr, "usage:\n");
  fprintf(stderr, "  %s [options]\n", prog);
  fprintf(stderr, "options:\n");
  fprintf(stderr, "  -m,--n-elements (%ld)\n", o.n_elements);
  fprintf(stderr, "  --min-scans N (%.3f)\n", o.min_scans);
  fprintf(stderr, "  --min-accesses N (%ld)\n", o.min_accesses);
  fprintf(stderr, "  -c,--coalese-size N (%ld)\n", o.coalese_size);
  fprintf(stderr, "  --n-cycles N (%ld)\n", o.n_cycles);
  fprintf(stderr, "  --n-conc-cycles N (%ld)\n", o.n_conc_cycles);
  fprintf(stderr, "  --seed N (%ld)\n", o.seed);
}

/* Parse the command line into an opts record.
   Starts from the defaults set by opts() and overrides what is given:
     -m N, --n-elements N
     -c N, --coalese-size N
     --min-scans X, --min-accesses N, --n-cycles N, --n-conc-cycles N,
     --seed N
   Numbers are converted with atol/atof.  An option that getopt_long does
   not recognize prints the usage and exits with status 1. */
opts parse_opts(int argc, char ** argv) {
  static struct option long_options[] = {
    {"n-elements",          required_argument, 0, 'm' },
    {"min-scans",           required_argument, 0, 0 },
    {"min-accesses",        required_argument, 0, 0 },
    /* val is 'c' so that the long form shares the handler of -c */
    {"coalese-size",        required_argument, 0, 'c' },
    {"n-cycles",            required_argument, 0, 0 },
    {"n-conc-cycles",       required_argument, 0, 0 },
    {"seed",                required_argument, 0, 0 },
    {0,         0,                 0,  0 }
  };
  opts o;
  while (1) {
    int option_index = 0;
    int c = getopt_long(argc, argv, "m:c:",
                        long_options, &option_index);
    if (c == -1) break;

    switch (c) {
    case 'm':
      o.n_elements = atol(optarg);
      break;
    case 'c':
      /* -c is in the option string and in the usage text, so it needs a
         case of its own; without one it ends up in the default branch */
      o.coalese_size = atol(optarg);
      break;
    case 0:
      {
        /* a long option without a short form: tell them apart by name */
        const char * opt_name = long_options[option_index].name;
        if (strcmp(opt_name, "seed") == 0) {
          o.seed = atol(optarg);
        } else if (strcmp(opt_name, "min-scans") == 0) {
          o.min_scans = atof(optarg);
        } else if (strcmp(opt_name, "min-accesses") == 0) {
          o.min_accesses = atol(optarg);
        } else if (strcmp(opt_name, "n-cycles") == 0) {
          o.n_cycles = atol(optarg);
        } else if (strcmp(opt_name, "n-conc-cycles") == 0) {
          o.n_conc_cycles = atol(optarg);
        } else {
          usage(argv[0]);
          exit(1);
        }
        break;
      }
    default:
      usage(argv[0]);
      exit(1);
    }
  }
  return o;
}

/* Return the value of environment variable s converted with atol.
   If the variable is not set, print a message asking to set it and exit
   with a non-zero status, so that a calling script can tell that the run
   did not happen. */
long getenv_long(const char * s) {
  char * vs = getenv(s);
  if (!vs) {
    fprintf(stderr, "set environment variable %s\n", s);
    exit(1);
  }
  return atol(vs);
}

/* type used to derive L in main(); with longv == long, L is 1 */
typedef long longv;

/* Benchmark driver.
   1. reads the launch configuration from the environment variables
      OMP_NUM_TEAMS (number of blocks) and OMP_NUM_THREADS (threads per
      block), and the remaining parameters from the command line
   2. builds n_cycles cycles of len_cycle elements each in the array a
      (shuffle on the host, then make_cycles)
   3. calls cycles() to follow each cycle n times, measures the time it
      takes and prints time per access and bandwidth
   4. checks that each traversal ended at the expected element */
int main(int argc, char ** argv) {
  const long L = sizeof(longv) / sizeof(long);
  long n_teams = getenv_long("OMP_NUM_TEAMS");
  long n_threads_per_team = getenv_long("OMP_NUM_THREADS");
  opts opt = parse_opts(argc, argv);
  long m = opt.n_elements;
  long n_cycles = opt.n_cycles;
  long coalese_size = opt.coalese_size;
  long n_conc_cycles = opt.n_conc_cycles;
  /* these values are used as divisors below (or would give an empty
     array); reject them here rather than dividing by zero */
  if (m <= 0 || n_cycles <= 0 || coalese_size <= 0 || n_conc_cycles <= 0) {
    fprintf(stderr,
            "n-elements, n-cycles, coalese-size and n-conc-cycles"
            " must all be positive\n");
    exit(1);
  }
  assert(n_cycles % coalese_size == 0);
  assert(coalese_size % L == 0);
  /* round the number of elements up to a multiple of n_cycles */
  long len_cycle = (m + n_cycles - 1) / n_cycles;
  if (m % n_cycles) {
    fprintf(stderr,
            "WARNING : m (%ld) not divisible by n_cycles (%ld),"
            " rounded up to %ld\n",
            m, n_cycles, len_cycle * n_cycles);
    m = len_cycle * n_cycles;
  }
  printf("n_elements : %ld\n", m);
  size_t sz = sizeof(long) * m;
  /* sz is a size_t, hence %zu */
  printf("sz : %zu bytes\n", sz);
  printf("n_cycles : %ld\n", n_cycles);
  printf("len_cycle : %ld\n", len_cycle);
  /* n = number of accesses per cycle: min_scans times around the cycle,
     raised if necessary so that the total reaches min_accesses */
  double s = opt.min_scans;
  long n = len_cycle * s;
  if (n * n_cycles < opt.min_accesses) {
    n = (opt.min_accesses + n_cycles - 1) / n_cycles;
  }
  printf("n_accesses_per_cycle : %ld\n", n);
  printf("total_accesses : %ld\n", n * n_cycles);
  printf("n_conc_cycles : %ld\n", n_conc_cycles);
  assert(n_cycles % n_conc_cycles == 0);
  printf("coalese_size : %ld\n", coalese_size);
  printf("seed : %ld\n", opt.seed);

  long * seq = alloc_dev<long>(m);
  shuffle(seq, coalese_size, n_cycles, len_cycle, opt.seed);

  long * a = alloc_dev<long>(m);
  double t0 = cur_time();
  make_cycles(a, seq, m, n_cycles, len_cycle,
              n_teams, n_threads_per_team);
  /* A kernel launch returns before the kernel has finished.  Whether
     check_cuda_launch (declared outside this file) waits for it cannot be
     seen from here, so wait explicitly before taking the time stamp. */
  check_cuda_api(cudaDeviceSynchronize());
  double t1 = cur_time();
  double dt0 = t1 - t0;
  printf("make_cycles_total : %f sec\n", dt0);
  printf("make_cycles_per_elem : %.1f nsec\n", 1.0e9 * dt0 / m);
  long * end = alloc_dev<long>(n_cycles);
  int * thread_idx = alloc_dev<int>(n_cycles);
  double t2 = cur_time();
  cycles(a, m, n, end, n_cycles, n_conc_cycles,
         n_teams, n_threads_per_team, thread_idx);
  /* same as above: the kernel must have finished before the time stamp is
     taken and before end[] is read by the host */
  check_cuda_api(cudaDeviceSynchronize());
  double t3 = cur_time();
  double dt1 = t3 - t2;
  long bytes = sizeof(long) * n * n_cycles;
  double bw = bytes / dt1;
  printf("bytes accessed : %ld bytes\n", bytes);
  printf("time_total : %f sec\n", dt1);
  printf("time_per_access : %.1f nsec/access\n", 1.0e9 * dt1 / (n * n_cycles));
  printf("bw : %.3f GB/sec\n", bw * 1.e-9);
  printf("checking results ... "); fflush(stdout);
  /* cycle idx starts at element idx (= seq[idx], as shuffle keeps
     seq[0:n_cycles] in place) and each access advances n_cycles positions
     in seq, wrapping around at m */
  for (long idx = 0; idx < n_cycles; idx++) {
    assert(end[idx] == seq[(idx + n * n_cycles) % m]);
  }
  printf("OK\n");
#if 0
  for (long idx = 0; idx < n_cycles; idx++) {
    printf("idx = %ld by thread %d\n", idx, thread_idx[idx]);
  }
#endif
  dealloc_dev(seq);
  dealloc_dev(end);
  dealloc_dev(thread_idx);
  dealloc_dev(a);
  return 0;
}


* The core part of the program that accesses the array

In [None]:
%%writefile latency_cuda.cc
#include <assert.h>
#include <stdio.h>
#include <omp.h>
#include "cuda_util.h"

/* starting from cell &a[idx], chase ->next ptr n times
   and put where it ends in end[idx].
   a[k] holds the index of the cell to visit after cell k, so every load
   needs the result of the load before it.
   NOTE(review): thread_idx[idx] is assigned k as well (converted to int),
   i.e. the index of the last cell reached, not the number of a thread;
   the name suggests a thread index was intended -- confirm. */
__host__ __device__
void cycle(long * a, long idx, long n, long * end, int * thread_idx) {
  long k = idx;
  /* the asm statements contain nothing but a comment; presumably they are
     markers for locating the loop in the generated assembly */
  asm volatile("// ========== loop begins ========== ");
#pragma unroll(8)
  for (long i = 0; i < n; i++) {
    /* the address of the next load is the value just loaded */
    k = a[k];
  }
  asm volatile("// ---------- loop ends ---------- ");
  end[idx] = k;
  thread_idx[idx] = k;
}

/* Kernel traversing the n_cycles cycles stored in a.
   A thread starts at the cycle numbered get_thread_index() and advances by
   get_n_threads() until it passes n_cycles, calling cycle() for each cycle
   it visits (both helpers are declared outside this file). */
__global__ void cycles_g(long * a, long n_cycles, long n, long * end, int * thread_idx) {
  const long stride = get_n_threads();
  long chain = get_thread_index();
  while (chain < n_cycles) {
    cycle(a, chain, n, end, thread_idx);
    chain += stride;
  }
}

/* a is an array of m cells;
   starting from &a[idx] for each idx in [0:n_cycles],
   chase ->next ptr n times and put where it ends in end[idx].
   This is done by launching cycles_g with n_teams blocks of
   n_threads_per_team threads each.  The launch is wrapped in
   check_cuda_launch, which is declared outside this file
   (NOTE(review): whether it waits for the kernel to finish cannot be seen
   from here).
   The parameters m and n_conc_cycles are not used in this version. */
void cycles(long * a, long m, long n, long * end, long n_cycles,
            long n_conc_cycles,
            long n_teams, long n_threads_per_team, int * thread_idx) {
  check_cuda_launch((cycles_g<<<n_teams,n_threads_per_team>>>(a, n_cycles, n, end, thread_idx)));
}


* Compile them together

In [None]:
BEGIN SOLUTION
END SOLUTION
# Build latency_cuda from the two files written above.
# -x cu makes nvcc treat the .cc files as CUDA source files.
nvcc -DDBG=0 -O4 -x cu -o latency_cuda latency_cuda.cc main_cuda.cc

In [None]:
BEGIN SOLUTION
END SOLUTION
# latency_cuda reads OMP_NUM_TEAMS and OMP_NUM_THREADS itself (with getenv)
# and uses them as the number of blocks and the number of threads per block.
export OMP_TARGET_OFFLOAD=MANDATORY
export OMP_NUM_TEAMS=1
m=$((1 << 24))
n=$((1 << 27))

# Sweep the number of threads (= number of cycles) C within a single block.
# NOTE: the output goes to gpu_bw_threads.txt, the same file name the OpenMP
# run above writes to, so that file is overwritten.
for C in 32 64 96 128 192 256 384 512 768 1024 ; do
    echo "==== C=${C} ===="
    OMP_NUM_THREADS=${C} ./latency_cuda --n-elements ${m} --min-accesses ${n} --n-cycles ${C}
done | tee gpu_bw_threads.txt

* Visualize

In [None]:
import vis_mem
# Visualize the output that the loop above saved in gpu_bw_threads.txt
vis_mem.vis_bw(["gpu_bw_threads.txt"])