#  Instruction Level Parallelism
# 1. Introduction
* we have learned that CPU has multicore and SIMD parallelism
* the last dimension of parallelism is _Instruction Level Parallelism (ILP)_, the ability to execute many instructions <font color=red>of a single thread</font> concurrently (i.e., execution of many instructions overlap in time)
* in contrast, GPU does not aggressively try to extract ILP from a single thread; parallelism mostly comes from simultaneously executing many threads
* illuminating the difference between the two is both instructive and practically important, especially for optimizing CPU code

# 2. Compilers
## 2-1. Set up NVIDIA HPC SDK
Execute this before you use NVIDIA HPC SDK

In [None]:
export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/24.9/compilers/bin:$PATH

Check if it works (check if full paths of nvc/nvc++ are shown)

In [None]:
which nvc
which nvc++

## 2-2. Set up LLVM
Execute this before you use LLVM


In [None]:
export PATH=/home/share/llvm/bin:$PATH
export LD_LIBRARY_PATH=/home/share/llvm/lib:/home/share/llvm/lib/x86_64-unknown-linux-gnu:$LD_LIBRARY_PATH

Check if it works (check if full paths of clang/clang++ are shown)

In [None]:
which clang
which clang++

## 2-3. GCC

Check if it works (check if full paths of gcc/g++ are shown)

In [None]:
which gcc
which g++

# 3. CPU without ILP
* this is an experiment very similar to what we did on GPUs
* each thread repeats x = a * x + b many times and occasionally records the time
* although the primary focus is a single-thread performance, the program is written with OpenMP so it can be executed on multicore CPUs and GPUs
* the compilation option below builds it for the CPU; the same source is recompiled for the GPU in section 6 by merely changing the compile option

In [None]:
%%writefile no_ilp.cc
#include <assert.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <sched.h>
#include <stdint.h>
#include <x86intrin.h>

/* wall-clock time (CLOCK_REALTIME) in nanoseconds.
   terminates the process via err() when the clock cannot be read. */
long cur_time_ns() {
  struct timespec now;
  const int rc = clock_gettime(CLOCK_REALTIME, &now);
  if (rc == -1) {
    err(1, "clock_gettime");
  }
  const long ns_per_sec = 1000000000L;
  return now.tv_sec * ns_per_sec + now.tv_nsec;
}

#if __NVCOMPILER
#include <nv/target>
/* get the id of the core the caller is running on
   (for NVIDIA HPC SDK compiler).
   on GPU it is the SM id read from %smid;
   on CPU it is the value returned by sched_getcpu(). */
__host__ __device__ static unsigned int get_core(void) {
  if target(nv::target::is_device) {
    unsigned int sm;
    asm("mov.u32 %0, %%smid;" : "=r"(sm));
    return sm;
  } else {
    return sched_getcpu();
  }
}

/* get GPU/CPU clock (for NVIDIA HPC SDK compiler).
   on GPU it reads %clock64; on CPU it reads the
   time stamp counter with rdtsc. */
__attribute__((unused,nothrow))
static long get_clock(void) {
  if target(nv::target::is_device) {
    long clock;
    asm volatile("mov.s64 %0, %%clock64;" : "=l" (clock));
    return clock;
  } else {
    // rdtsc delivers the counter as two 32-bit halves (edx:eax)
    uint32_t low, high;
    asm volatile("rdtsc" : "=a"(low), "=d"(high));
    return ((uint64_t)high << 32) | low;
  }
}

#else  // __clang__ or GCC
/* get the id of the core the caller is running on
   (for Clang LLVM compiler and GCC).
   when compiled for GPU (__CUDA_ARCH__ is set) it is the SM id
   read from %smid; otherwise it is the value returned by
   sched_getcpu(). */
__attribute__((unused))
static unsigned int get_core(void) {
#if __CUDA_ARCH__
  unsigned int sm;
  asm("mov.u32 %0, %%smid;" : "=r"(sm));
  return sm;
#else
  return sched_getcpu();
#endif
}

/* get GPU/CPU clock (for Clang LLVM compiler and GCC).
   when compiled for GPU (__CUDA_ARCH__ is set) it reads %clock64;
   otherwise it reads the time stamp counter via _rdtsc(). */
__attribute__((unused,nothrow))
static long get_clock(void) {
#if __CUDA_ARCH__
  long clock;
  asm volatile("mov.s64 %0, %%clock64;" : "=l" (clock));
  return clock;
#else
  return _rdtsc();
#endif
}
#endif

/* what iter_fun records for one iteration i (stored in R[i]) */
typedef struct {
  double x;     // final value of x (kept so the computation is not optimized away)
  int core[2];  // get_core() before (core[0]) and after (core[1]) the timed loop
} record_t;

/* the function for an iteration i of the parallel loop.
   perform
   x = a x + b
   (M * N) times, as a single chain of dependent updates,
   and record the current clock every N updates to
   T[i * M] ... T[(i+1) * M - 1].
   record the core before/after the loop and the final x to R[i].
 */

void iter_fun(double a, double b, long i, long M, long N,
              record_t * R, long * T) {
  // initial value (not important)
  double x = i;
  // record in T[i * M] ... T[(i+1) * M - 1]
  T = &T[i * M];
  // record starting core (SM id on GPU)
  R[i].core[0] = get_core();
  // repeat a x + b many times.
  // T[j] is the clock taken just BEFORE the j-th batch of N updates
  for (long j = 0; j < M; j++) {
    T[j] = get_clock();
    // the two asm statements emit marker comments so that the
    // innermost loop can be located in the generated assembly
    asm volatile("// ========== loop begins ==========");
    for (long k = 0; k < N; k++) {
      x = a * x + b;
    }
    asm volatile("// ---------- loop ends ----------");
  }
  // record ending core (expected to be = core[0])
  R[i].core[1] = get_core();
  // record result, just so that the computation is not
  // eliminated by the compiler
  R[i].x = x;
}

/* print one line per iteration i (0 <= i < L):
     i=.. x=.. core0=.. core1=.. cycles_per_iter=.. T[0] ... T[M-1]
   T holds M consecutive timestamps per iteration.
   each timestamp is taken BEFORE a batch of N updates, so the
   interval T[M-1] - T[0] covers only (M - 1) batches; the average
   is therefore taken over (M - 1) * N updates.
   when M < 2 no interval has been measured and 0 is reported. */
void dump(record_t * R, long * T, long L, long M, long N) {
  for (long i = 0; i < L; i++) {
    // timestamps of iteration i
    const long * t = &T[i * M];
    double avg = 0.0;
    if (M >= 2) {
      long dt = t[M - 1] - t[0];
      avg = (double)dt / ((double)(M - 1) * (double)N);
    }
    printf("i=%ld x=%f core0=%d core1=%d cycles_per_iter=%f",
           i, R[i].x, R[i].core[0], R[i].core[1], avg);
    for (long j = 0; j < M; j++) {
      printf(" %ld", t[j]);
    }
    printf("\n");
  }
}

/* read environment variable v as an int.
   the value must start with a decimal integer; anything after it
   is ignored (so a list such as "4,2" still yields 4).
   prints a message and exits with status 1 when the variable is
   not set, does not start with an integer, or does not fit in int
   (atoi would silently return 0 or an unspecified value). */
int getenv_int(const char * v) {
  const char * s = getenv(v);
  if (!s) {
    fprintf(stderr, "specify environment variable %s\n", v);
    exit(1);
  }
  char * end = NULL;
  long val = strtol(s, &end, 10);
  // end == s : no digits were converted
  if (end == s || val != (long)(int)val) {
    fprintf(stderr, "environment variable %s must be an integer (got '%s')\n", v, s);
    exit(1);
  }
  return (int)val;
}

/* usage: ./no_ilp [L [M [N [a [b]]]]]
     L    : number of iterations of the parallel loop (default 100)
     M    : number of timestamps recorded per iteration (default 100)
     N    : number of updates between two timestamps (default 100)
     a, b : coefficients of x = a x + b (default 0.99 and 1.00)
   environment variables OMP_NUM_TEAMS and OMP_NUM_THREADS must be set;
   they are passed to num_teams / num_threads of the target region. */
int main(int argc, char ** argv) {
  int idx = 1;
  long L   = (idx < argc ? atol(argv[idx]) : 100);  idx++;
  long M   = (idx < argc ? atol(argv[idx]) : 100);  idx++;
  long N   = (idx < argc ? atol(argv[idx]) : 100);  idx++;
  double a = (idx < argc ? atof(argv[idx]) : 0.99); idx++;
  double b = (idx < argc ? atof(argv[idx]) : 1.00); idx++;
  // non-positive sizes would give bogus allocation sizes and map lengths
  if (L <= 0 || M <= 0 || N <= 0) {
    fprintf(stderr, "L, M and N must be positive (L=%ld M=%ld N=%ld)\n", L, M, N);
    return 1;
  }
  int n_teams = getenv_int("OMP_NUM_TEAMS");
  int n_threads_per_team = getenv_int("OMP_NUM_THREADS");
  record_t * R = (record_t *)calloc(L, sizeof(record_t));
  long * T = (long *)calloc(L * M, sizeof(long));
  if (!R || !T) {
    perror("calloc");
    return 1;
  }
#pragma omp target teams distribute parallel for num_teams(n_teams) num_threads(n_threads_per_team) map(tofrom: R[0:L], T[0:L*M])
  for (long i = 0; i < L; i++) {
    iter_fun(a, b, i, M, N, R, T);
  }
  dump(R, T, L, M, N);
  free(T);
  free(R);
  return 0;
}


In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -Wall -O3 -mavx512f -mfma -fopenmp no_ilp.cc -o no_ilp
#nvc++   -Wall -O3 -mavx512f -mfma -mp=multicore -cuda no_ilp.cc -o no_ilp
#g++     -Wall -O3 -mavx512f -mfma -fopenmp no_ilp.cc -o no_ilp

* run it on a single core

In [None]:
BEGIN SOLUTION
END SOLUTION
OMP_NUM_TEAMS=1 OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./no_ilp 1 100 1000000 > a.dat
cat a.dat

* it shows cycles per iteration (of the innermost loop) by `cycles_per_iter=xxxx`, which can be extracted by

In [None]:
BEGIN SOLUTION
END SOLUTION
awk '{print $5}' a.dat

* this is essentially the _latency_ of an FMA instruction 
* according to [Intel intrinsics guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html), the latency of FMA instruction is 4, but we observe a slightly lower value
* this is due to dynamic frequency scaling (essentially, a core boosts its frequency when the total load is low; the `rdtsc` instruction we use to get the clock is a clock that runs at a constant speed regardless of the actual processor frequency)

* let's witness instruction sequence 
* identify the instruction sequence corresponding to the innermost loop

In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -Wall -O3 -mavx512f -mfma -fopenmp no_ilp.cc -S
#nvc++   -Wall -O3 -mavx512f -mfma -mp=multicore -cuda no_ilp.cc -S
#g++     -Wall -O3 -mavx512f -mfma -fopenmp no_ilp.cc -S

In [None]:
BEGIN SOLUTION
END SOLUTION
cat no_ilp.s

# 4. Double ILP on CPU
* ILP increases when there are multiple instructions that are _independent_
* no processor can execute the following sequence faster than 4 cycles / FMA, because each instruction has to wait for the previous instruction to produce its result (i.e., depends on the previous instruction)
```
x = a * x + b
x = a * x + b
x = a * x + b
  ...
```
* what a CPU _can_ do is to execute something like the following as fast as 4 cycles / (2 FMAs)
```
x0 = a * x0 + b
x1 = a * x1 + b
x0 = a * x0 + b
x1 = a * x1 + b
x0 = a * x0 + b
x1 = a * x1 + b
  ...
```
* this is possible because instructions working on x0 do not depend on those working on x1 and vice versa

In [None]:
%%writefile ilp2.cc
#include <assert.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <sched.h>
#include <stdint.h>
#include <x86intrin.h>

/* wall-clock time (CLOCK_REALTIME) in nanoseconds.
   terminates the process via err() when the clock cannot be read. */
long cur_time_ns() {
  struct timespec now;
  const int rc = clock_gettime(CLOCK_REALTIME, &now);
  if (rc == -1) {
    err(1, "clock_gettime");
  }
  const long ns_per_sec = 1000000000L;
  return now.tv_sec * ns_per_sec + now.tv_nsec;
}

#if __NVCOMPILER
#include <nv/target>
/* get the id of the core the caller is running on
   (for NVIDIA HPC SDK compiler).
   on GPU it is the SM id read from %smid;
   on CPU it is the value returned by sched_getcpu(). */
__host__ __device__ static unsigned int get_core(void) {
  if target(nv::target::is_device) {
    unsigned int sm;
    asm("mov.u32 %0, %%smid;" : "=r"(sm));
    return sm;
  } else {
    return sched_getcpu();
  }
}

/* get GPU/CPU clock (for NVIDIA HPC SDK compiler).
   on GPU it reads %clock64; on CPU it reads the
   time stamp counter with rdtsc. */
__attribute__((unused,nothrow))
static long get_clock(void) {
  if target(nv::target::is_device) {
    long clock;
    asm volatile("mov.s64 %0, %%clock64;" : "=l" (clock));
    return clock;
  } else {
    // rdtsc delivers the counter as two 32-bit halves (edx:eax)
    uint32_t low, high;
    asm volatile("rdtsc" : "=a"(low), "=d"(high));
    return ((uint64_t)high << 32) | low;
  }
}

#else  // __clang__ or GCC
/* get the id of the core the caller is running on
   (for Clang LLVM compiler and GCC).
   when compiled for GPU (__CUDA_ARCH__ is set) it is the SM id
   read from %smid; otherwise it is the value returned by
   sched_getcpu(). */
__attribute__((unused))
static unsigned int get_core(void) {
#if __CUDA_ARCH__
  unsigned int sm;
  asm("mov.u32 %0, %%smid;" : "=r"(sm));
  return sm;
#else
  return sched_getcpu();
#endif
}

/* get GPU/CPU clock (for Clang LLVM compiler and GCC).
   when compiled for GPU (__CUDA_ARCH__ is set) it reads %clock64;
   otherwise it reads the time stamp counter via _rdtsc(). */
__attribute__((unused,nothrow))
static long get_clock(void) {
#if __CUDA_ARCH__
  long clock;
  asm volatile("mov.s64 %0, %%clock64;" : "=l" (clock));
  return clock;
#else
  return _rdtsc();
#endif
}
#endif

/* what iter_fun records for one iteration i (stored in R[i]) */
typedef struct {
  double x;     // mean of the final x0 and x1 (kept so the computation is not optimized away)
  int core[2];  // get_core() before (core[0]) and after (core[1]) the timed loop
} record_t;

/* the function for an iteration i of the parallel loop.
   perform
   x0 = a x0 + b
   x1 = a x1 + b
   (M * N) times each; the two chains do not depend on each other.
   record the current clock every N updates to
   T[i * M] ... T[(i+1) * M - 1].
   record the core before/after the loop and the mean of the
   final x0 and x1 to R[i].
 */

void iter_fun(double a, double b, long i, long M, long N,
              record_t * R, long * T) {
  // initial values (not important)
  double x0 = i, x1 = i + 0.5;
  // record in T[i * M] ... T[(i+1) * M - 1]
  T = &T[i * M];
  // record starting core (SM id on GPU)
  R[i].core[0] = get_core();
  // repeat a x + b many times.
  // T[j] is the clock taken just BEFORE the j-th batch of N updates
  for (long j = 0; j < M; j++) {
    T[j] = get_clock();
    // the two asm statements emit marker comments so that the
    // innermost loop can be located in the generated assembly
    asm volatile("// ========== loop begins ==========");
    for (long k = 0; k < N; k++) {
      x0 = a * x0 + b;
      x1 = a * x1 + b;
    }
    asm volatile("// ---------- loop ends ----------");
  }
  // record ending core (expected to be = core[0])
  R[i].core[1] = get_core();
  // record result, just so that the computation is not
  // eliminated by the compiler
  R[i].x = (x0 + x1) / 2.0;
}

/* print one line per iteration i (0 <= i < L):
     i=.. x=.. core0=.. core1=.. cycles_per_iter=.. T[0] ... T[M-1]
   T holds M consecutive timestamps per iteration.
   each timestamp is taken BEFORE a batch of N updates, so the
   interval T[M-1] - T[0] covers only (M - 1) batches; the average
   is therefore taken over (M - 1) * N updates.
   when M < 2 no interval has been measured and 0 is reported. */
void dump(record_t * R, long * T, long L, long M, long N) {
  for (long i = 0; i < L; i++) {
    // timestamps of iteration i
    const long * t = &T[i * M];
    double avg = 0.0;
    if (M >= 2) {
      long dt = t[M - 1] - t[0];
      avg = (double)dt / ((double)(M - 1) * (double)N);
    }
    printf("i=%ld x=%f core0=%d core1=%d cycles_per_iter=%f",
           i, R[i].x, R[i].core[0], R[i].core[1], avg);
    for (long j = 0; j < M; j++) {
      printf(" %ld", t[j]);
    }
    printf("\n");
  }
}

/* read environment variable v as an int.
   the value must start with a decimal integer; anything after it
   is ignored (so a list such as "4,2" still yields 4).
   prints a message and exits with status 1 when the variable is
   not set, does not start with an integer, or does not fit in int
   (atoi would silently return 0 or an unspecified value). */
int getenv_int(const char * v) {
  const char * s = getenv(v);
  if (!s) {
    fprintf(stderr, "specify environment variable %s\n", v);
    exit(1);
  }
  char * end = NULL;
  long val = strtol(s, &end, 10);
  // end == s : no digits were converted
  if (end == s || val != (long)(int)val) {
    fprintf(stderr, "environment variable %s must be an integer (got '%s')\n", v, s);
    exit(1);
  }
  return (int)val;
}

/* usage: ./ilp2 [L [M [N [a [b]]]]]
     L    : number of iterations of the parallel loop (default 100)
     M    : number of timestamps recorded per iteration (default 100)
     N    : number of updates between two timestamps (default 100)
     a, b : coefficients of x = a x + b (default 0.99 and 1.00)
   environment variables OMP_NUM_TEAMS and OMP_NUM_THREADS must be set;
   they are passed to num_teams / num_threads of the target region. */
int main(int argc, char ** argv) {
  int idx = 1;
  long L   = (idx < argc ? atol(argv[idx]) : 100);  idx++;
  long M   = (idx < argc ? atol(argv[idx]) : 100);  idx++;
  long N   = (idx < argc ? atol(argv[idx]) : 100);  idx++;
  double a = (idx < argc ? atof(argv[idx]) : 0.99); idx++;
  double b = (idx < argc ? atof(argv[idx]) : 1.00); idx++;
  // non-positive sizes would give bogus allocation sizes and map lengths
  if (L <= 0 || M <= 0 || N <= 0) {
    fprintf(stderr, "L, M and N must be positive (L=%ld M=%ld N=%ld)\n", L, M, N);
    return 1;
  }
  int n_teams = getenv_int("OMP_NUM_TEAMS");
  int n_threads_per_team = getenv_int("OMP_NUM_THREADS");
  record_t * R = (record_t *)calloc(L, sizeof(record_t));
  long * T = (long *)calloc(L * M, sizeof(long));
  if (!R || !T) {
    perror("calloc");
    return 1;
  }
#pragma omp target teams distribute parallel for num_teams(n_teams) num_threads(n_threads_per_team) map(tofrom: R[0:L], T[0:L*M])
  for (long i = 0; i < L; i++) {
    iter_fun(a, b, i, M, N, R, T);
  }
  dump(R, T, L, M, N);
  free(T);
  free(R);
  return 0;
}


In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -Wall -O3 -mavx512f -mfma -fopenmp ilp2.cc -o ilp2
#nvc++   -Wall -O3 -mavx512f -mfma -mp=multicore -cuda ilp2.cc -o ilp2
#g++     -Wall -O3 -mavx512f -mfma -fopenmp ilp2.cc -o ilp2

* run it on a single core again

In [None]:
BEGIN SOLUTION
END SOLUTION
OMP_NUM_TEAMS=1 OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./ilp2 1 100 1000000 > a.dat
awk '{print $5}' a.dat

* observe that the number of cycles per iteration hardly changes, even though each iteration now executes two FMAs instead of one
* put differently, it increased the _throughput_, the number of operations executed per cycle (or a unit time)

# 5. Increase ILP further
* let's increase ILP further
* the following code can configure the number of variables to update in the `k` loop (therefore the number of _independent_ FMA instructions) by setting a preprocessor constant `C`

In [None]:
%%writefile ilp.cc
#include <assert.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <sched.h>
#include <stdint.h>
#include <x86intrin.h>

/* wall-clock time (CLOCK_REALTIME) in nanoseconds.
   terminates the process via err() when the clock cannot be read. */
long cur_time_ns() {
  struct timespec now;
  const int rc = clock_gettime(CLOCK_REALTIME, &now);
  if (rc == -1) {
    err(1, "clock_gettime");
  }
  const long ns_per_sec = 1000000000L;
  return now.tv_sec * ns_per_sec + now.tv_nsec;
}

#if __NVCOMPILER
#include <nv/target>
/* get the id of the core the caller is running on
   (for NVIDIA HPC SDK compiler).
   on GPU it is the SM id read from %smid;
   on CPU it is the value returned by sched_getcpu(). */
__host__ __device__ static unsigned int get_core(void) {
  if target(nv::target::is_device) {
    unsigned int sm;
    asm("mov.u32 %0, %%smid;" : "=r"(sm));
    return sm;
  } else {
    return sched_getcpu();
  }
}

/* get GPU/CPU clock (for NVIDIA HPC SDK compiler).
   on GPU it reads %clock64; on CPU it reads the
   time stamp counter with rdtsc. */
__attribute__((unused,nothrow))
static long get_clock(void) {
  if target(nv::target::is_device) {
    long clock;
    asm volatile("mov.s64 %0, %%clock64;" : "=l" (clock));
    return clock;
  } else {
    // rdtsc delivers the counter as two 32-bit halves (edx:eax)
    uint32_t low, high;
    asm volatile("rdtsc" : "=a"(low), "=d"(high));
    return ((uint64_t)high << 32) | low;
  }
}

#else  // __clang__ or GCC
/* get the id of the core the caller is running on
   (for Clang LLVM compiler and GCC).
   when compiled for GPU (__CUDA_ARCH__ is set) it is the SM id
   read from %smid; otherwise it is the value returned by
   sched_getcpu(). */
__attribute__((unused))
static unsigned int get_core(void) {
#if __CUDA_ARCH__
  unsigned int sm;
  asm("mov.u32 %0, %%smid;" : "=r"(sm));
  return sm;
#else
  return sched_getcpu();
#endif
}

/* get GPU/CPU clock (for Clang LLVM compiler and GCC).
   when compiled for GPU (__CUDA_ARCH__ is set) it reads %clock64;
   otherwise it reads the time stamp counter via _rdtsc(). */
__attribute__((unused,nothrow))
static long get_clock(void) {
#if __CUDA_ARCH__
  long clock;
  asm volatile("mov.s64 %0, %%clock64;" : "=l" (clock));
  return clock;
#else
  return _rdtsc();
#endif
}
#endif

/* what iter_fun records for one iteration i (stored in R[i]) */
typedef struct {
  double x;     // mean of the final x[0] ... x[C-1] (kept so the computation is not optimized away)
  int core[2];  // get_core() before (core[0]) and after (core[1]) the timed loop
} record_t;

/* the function for an iteration i of the parallel loop.
   perform
   x[c] = a x[c] + b   (c = 0, ..., C-1)
   (M * N) times each; the C chains do not depend on each other.
   C is a preprocessor constant given by -DC=xxx.
   record the current clock every N rounds to
   T[i * M] ... T[(i+1) * M - 1].
   record the core before/after the loop and the mean of the
   final x[c] to R[i].
 */

void iter_fun(double a, double b, long i, long M, long N,
              record_t * R, long * T) {
  // the number of independent chains must be given at compile time
#ifndef C
#error "give -DC=xxx in the command line"  
#endif  
  // initial values (not important)
  double x[C];
  for (int c = 0; c < C; c++) {
    x[c] = i + c / (double)C;
  }
  // record in T[i * M] ... T[(i+1) * M - 1]
  T = &T[i * M];
  // record starting core (SM id on GPU)
  R[i].core[0] = get_core();
  // repeat a x + b many times.
  // T[j] is the clock taken just BEFORE the j-th batch of N rounds
  for (long j = 0; j < M; j++) {
    T[j] = get_clock();
    // the two asm statements emit marker comments so that the
    // innermost loops can be located in the generated assembly
    asm volatile("// ========== loop begins ==========");
    for (long k = 0; k < N; k++) {
      // one round: update each of the C variables once
      for (int c = 0; c < C; c++) {
        x[c] = a * x[c] + b;
      }
    }
    asm volatile("// ---------- loop ends ----------");
  }
  // record ending core (expected to be = core[0])
  R[i].core[1] = get_core();
  // record result, just so that the computation is not
  // eliminated by the compiler
  double t = 0.0;
  for (int c = 0; c < C; c++) {
    t += x[c];
  }
  R[i].x = t / C;
}

/* print one line per iteration i (0 <= i < L):
     i=.. x=.. core0=.. core1=.. cycles_per_iter=.. T[0] ... T[M-1]
   T holds M consecutive timestamps per iteration.
   each timestamp is taken BEFORE a batch of N rounds, so the
   interval T[M-1] - T[0] covers only (M - 1) batches; the average
   is therefore taken over (M - 1) * N rounds.
   when M < 2 no interval has been measured and 0 is reported. */
void dump(record_t * R, long * T, long L, long M, long N) {
  for (long i = 0; i < L; i++) {
    // timestamps of iteration i
    const long * t = &T[i * M];
    double avg = 0.0;
    if (M >= 2) {
      long dt = t[M - 1] - t[0];
      avg = (double)dt / ((double)(M - 1) * (double)N);
    }
    printf("i=%ld x=%f core0=%d core1=%d cycles_per_iter=%f",
           i, R[i].x, R[i].core[0], R[i].core[1], avg);
    for (long j = 0; j < M; j++) {
      printf(" %ld", t[j]);
    }
    printf("\n");
  }
}

/* read environment variable v as an int.
   the value must start with a decimal integer; anything after it
   is ignored (so a list such as "4,2" still yields 4).
   prints a message and exits with status 1 when the variable is
   not set, does not start with an integer, or does not fit in int
   (atoi would silently return 0 or an unspecified value). */
int getenv_int(const char * v) {
  const char * s = getenv(v);
  if (!s) {
    fprintf(stderr, "specify environment variable %s\n", v);
    exit(1);
  }
  char * end = NULL;
  long val = strtol(s, &end, 10);
  // end == s : no digits were converted
  if (end == s || val != (long)(int)val) {
    fprintf(stderr, "environment variable %s must be an integer (got '%s')\n", v, s);
    exit(1);
  }
  return (int)val;
}

/* usage: ./ilp [L [M [N [a [b]]]]]
     L    : number of iterations of the parallel loop (default 100)
     M    : number of timestamps recorded per iteration (default 100)
     N    : number of rounds between two timestamps (default 100)
     a, b : coefficients of x = a x + b (default 0.99 and 1.00)
   environment variables OMP_NUM_TEAMS and OMP_NUM_THREADS must be set;
   they are passed to num_teams / num_threads of the target region. */
int main(int argc, char ** argv) {
  int idx = 1;
  long L   = (idx < argc ? atol(argv[idx]) : 100);  idx++;
  long M   = (idx < argc ? atol(argv[idx]) : 100);  idx++;
  long N   = (idx < argc ? atol(argv[idx]) : 100);  idx++;
  double a = (idx < argc ? atof(argv[idx]) : 0.99); idx++;
  double b = (idx < argc ? atof(argv[idx]) : 1.00); idx++;
  // non-positive sizes would give bogus allocation sizes and map lengths
  if (L <= 0 || M <= 0 || N <= 0) {
    fprintf(stderr, "L, M and N must be positive (L=%ld M=%ld N=%ld)\n", L, M, N);
    return 1;
  }
  int n_teams = getenv_int("OMP_NUM_TEAMS");
  int n_threads_per_team = getenv_int("OMP_NUM_THREADS");
  record_t * R = (record_t *)calloc(L, sizeof(record_t));
  long * T = (long *)calloc(L * M, sizeof(long));
  if (!R || !T) {
    perror("calloc");
    return 1;
  }
#pragma omp target teams distribute parallel for num_teams(n_teams) num_threads(n_threads_per_team) map(tofrom: R[0:L], T[0:L*M])
  for (long i = 0; i < L; i++) {
    iter_fun(a, b, i, M, N, R, T);
  }
  dump(R, T, L, M, N);
  free(T);
  free(R);
  return 0;
}


* compile it with `C=4`
* we inhibit vectorization to make sure we witness the effect of ILP, not of SIMD instructions

In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -DC=4 -Wall -O3 -mavx512f -mfma -fopenmp -fno-vectorize ilp.cc -o ilp
#nvc++   -DC=4 -Wall -O3 -mavx512f -mfma -mp=multicore -Mnovect -cuda ilp.cc -o ilp
#g++     -DC=4 -Wall -O3 -mavx512f -mfma -fopenmp -fno-tree-vectorize ilp.cc -o ilp

* and run it on a single core

In [None]:
BEGIN SOLUTION
END SOLUTION
OMP_NUM_TEAMS=1 OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./ilp 1 100 1000000 > a.dat
awk '{print $5}' a.dat

* change the `C` above and see how far you can go without noticeably increasing the cycles per iteration
* it is the processor's _throughput_ limit, determined by the number of execution units for floating point operations (two on many CPUs)
* note: this number is the same even if it is SIMD FMA, which we will see below

* change `C` systematically and graph the result
* make sure to increase `C` to observe that the throughput initially rises and eventually plateaus at a certain point

In [None]:
BEGIN SOLUTION
END SOLUTION
for L in 1; do
    for M in 100 ; do
        for N in 1000000; do
            for C in 1 2     ; do  # put values of C you want to experiment with
                # choose the compiler and craft an appropriate command
                # line to produce exe file ilp_${C}
                #nvc++   -DC=${C} -Wall -O3 -mavx512f -mfma -mp=multicore -Mnovect -cuda ilp.cc -o ilp_${C} \
                #g++     -DC=${C} -Wall -O3 -mavx512f -mfma -fopenmp -fno-tree-vectorize ilp.cc -o ilp_${C} \
                clang++ -DC=${C} -Wall -O3 -mavx512f -mfma -fopenmp -fno-vectorize ilp.cc -o ilp_${C} \
                    && (echo -n "L=${L} M=${M} N=${N} C=${C} ";
                        OMP_NUM_TEAMS=1 OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./ilp_${C} ${L} ${M} ${N} | awk '{print $5}')
            done
        done
    done
done 

In [None]:
DATA_STRING_CPU = r"""
L=1 M=100 N=10000000 C=1 cycles_per_iter=2.128242
L=1 M=100 N=10000000 C=2 cycles_per_iter=2.126931
L=1 M=100 N=10000000 C=3 cycles_per_iter=2.125129
    ...
"""

In [None]:
import vis_latency_throughput
vis_latency_throughput.vis(DATA_STRING_CPU)

# 6. GPU 
* what about GPU?
* GPU does not aggressively exploit ILP
* let's first see the case where we have almost no ILP
* the code can be exactly the same as the CPU case; we merely have to change the compile option

In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -Wall -O3 -mavx512f -mfma -fopenmp -fopenmp-targets=nvptx64 no_ilp.cc -o no_ilp_gpu
#nvc++   -Wall -O3 -mavx512f -mfma -mp=gpu -cuda no_ilp.cc -o no_ilp_gpu

* and run it with a single thread

In [None]:
BEGIN SOLUTION
END SOLUTION
OMP_TARGET_OFFLOAD=MANDATORY OMP_NUM_TEAMS=1 OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./no_ilp_gpu 1 100 1000000 > a.dat
awk '{print $5}' a.dat

* this is the _latency_ of FMA instruction on NVIDIA GPU
* it is slightly longer than that of the CPU in terms of the number of cycles; it is even longer in terms of absolute time (as the frequency of A100 GPU, 1.095GHz - 1.41GHz, is lower than that of Intel Xeon Platinum 8368, 2.40GHz - 3.4GHz)

* another good news is that this same executable can actually run on CPUs too
* we merely have to disable offloading by changing the environment variable

In [None]:
BEGIN SOLUTION
END SOLUTION
# execute on CPU
OMP_TARGET_OFFLOAD=DISABLED OMP_NUM_TEAMS=1 OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./no_ilp_gpu 1 100 1000000 > a.dat
awk '{print $5}' a.dat

# 7. GPU with ILP?
* combined, A100 GPU is about 3x slower than Xeon Platinum 8368 CPU when it comes to the latency of an FMA
* but that's not the main point
* the main point is what happens if we expose more independent FMAs to a _single thread_ of GPU

In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -DC=2 -Wall -O3 -mavx512f -mfma -fopenmp -fopenmp-targets=nvptx64 ilp.cc -o ilp_gpu
#nvc++   -DC=2 -Wall -O3 -mavx512f -mfma -mp=gpu -cuda ilp.cc -o ilp_gpu

* and run it with a single thread, on GPU

In [None]:
BEGIN SOLUTION
END SOLUTION
OMP_TARGET_OFFLOAD=MANDATORY OMP_NUM_TEAMS=1 OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./ilp_gpu 1 100 1000000 > a.dat
awk '{print $5}' a.dat

* and on CPU

In [None]:
BEGIN SOLUTION
END SOLUTION
OMP_TARGET_OFFLOAD=DISABLED OMP_NUM_TEAMS=1 OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./ilp_gpu 1 100 1000000 > a.dat
awk '{print $5}' a.dat

* change the value of `C` and see what happens on GPU 

* then change `C` systematically and graph the result
* observe that latency immediately starts increasing and thus the throughput does not increase

In [None]:
BEGIN SOLUTION
END SOLUTION
for L in 1; do
    for M in 100 ; do
        for N in 1000000; do
            for C in 1 2    ; do  # put values of C you want to experiment with
                # choose the compiler and craft an appropriate command
                # line to produce exe file ilp_gpu_${C}
                #nvc++   -DC=${C} -Wall -O3 -mavx512f -mfma -mp=gpu -cuda ilp.cc -o ilp_gpu_${C} \
                clang++ -DC=${C} -Wall -O3 -mavx512f -mfma -fopenmp -fopenmp-targets=nvptx64 ilp.cc -o ilp_gpu_${C} \
                    && (echo -n "L=${L} M=${M} N=${N} C=${C} ";
                        OMP_TARGET_OFFLOAD=MANDATORY OMP_NUM_TEAMS=1 OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./ilp_gpu_${C} ${L} ${M} ${N} | awk '{print $5}')
            done
        done
    done
done 

In [None]:
DATA_STRING_GPU = r"""
L=1 M=100 N=10000000 C=1 cycles_per_iter=2.128242
L=1 M=100 N=10000000 C=2 cycles_per_iter=2.126931
L=1 M=100 N=10000000 C=3 cycles_per_iter=2.125129
    ...
"""

In [None]:
import vis_latency_throughput
vis_latency_throughput.vis(DATA_STRING_GPU)

# 8. CPU with SIMD x ILP
* Intel CPU is able to execute up to two _SIMD_ FMAs per cycle
* therefore, the throughput limit of a single thread is actually even higher than what you saw above

* change the following code if necessary to take advantage of SIMD and get throughput higher than the case without SIMD
* does the compiler successfully vectorize the loop without any code change?

In [None]:
%%writefile simd_ilp.cc
#include <assert.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include <sched.h>
#include <stdint.h>
#include <x86intrin.h>

/* wall-clock time (CLOCK_REALTIME) in nanoseconds.
   terminates the process via err() when the clock cannot be read. */
long cur_time_ns() {
  struct timespec now;
  const int rc = clock_gettime(CLOCK_REALTIME, &now);
  if (rc == -1) {
    err(1, "clock_gettime");
  }
  const long ns_per_sec = 1000000000L;
  return now.tv_sec * ns_per_sec + now.tv_nsec;
}

#if __NVCOMPILER
#include <nv/target>
/* get the id of the core the caller is running on
   (for NVIDIA HPC SDK compiler).
   on GPU it is the SM id read from %smid;
   on CPU it is the value returned by sched_getcpu(). */
__host__ __device__ static unsigned int get_core(void) {
  if target(nv::target::is_device) {
    unsigned int sm;
    asm("mov.u32 %0, %%smid;" : "=r"(sm));
    return sm;
  } else {
    return sched_getcpu();
  }
}

/* get GPU/CPU clock (for NVIDIA HPC SDK compiler).
   on GPU it reads %clock64; on CPU it reads the
   time stamp counter with rdtsc. */
__attribute__((unused,nothrow))
static long get_clock(void) {
  if target(nv::target::is_device) {
    long clock;
    asm volatile("mov.s64 %0, %%clock64;" : "=l" (clock));
    return clock;
  } else {
    // rdtsc delivers the counter as two 32-bit halves (edx:eax)
    uint32_t low, high;
    asm volatile("rdtsc" : "=a"(low), "=d"(high));
    return ((uint64_t)high << 32) | low;
  }
}

#else  // __clang__ or GCC
/* get the id of the core the caller is running on
   (for Clang LLVM compiler and GCC).
   when compiled for GPU (__CUDA_ARCH__ is set) it is the SM id
   read from %smid; otherwise it is the value returned by
   sched_getcpu(). */
__attribute__((unused))
static unsigned int get_core(void) {
#if __CUDA_ARCH__
  unsigned int sm;
  asm("mov.u32 %0, %%smid;" : "=r"(sm));
  return sm;
#else
  return sched_getcpu();
#endif
}

/* get GPU/CPU clock (for Clang LLVM compiler and GCC).
   when compiled for GPU (__CUDA_ARCH__ is set) it reads %clock64;
   otherwise it reads the time stamp counter via _rdtsc(). */
__attribute__((unused,nothrow))
static long get_clock(void) {
#if __CUDA_ARCH__
  long clock;
  asm volatile("mov.s64 %0, %%clock64;" : "=l" (clock));
  return clock;
#else
  return _rdtsc();
#endif
}
#endif

/* what iter_fun records for one iteration i (stored in R[i]) */
typedef struct {
  double x;     // mean of the final x[0] ... x[C-1] (kept so the computation is not optimized away)
  int core[2];  // get_core() before (core[0]) and after (core[1]) the timed loop
} record_t;

/* the function for an iteration i of the parallel loop.
   perform
   x[c] = a x[c] + b   (c = 0, ..., C-1)
   (M * N) times each; the C chains do not depend on each other.
   C is a preprocessor constant given by -DC=xxx.
   record the current clock every N rounds to
   T[i * M] ... T[(i+1) * M - 1].
   record the core before/after the loop and the mean of the
   final x[c] to R[i].
 */

void iter_fun(double a, double b, long i, long M, long N,
              record_t * R, long * T) {
  // the number of independent chains must be given at compile time
#ifndef C
#error "give -DC=xxx in the command line"  
#endif  
  // initial values (not important)
  double x[C];
  for (int c = 0; c < C; c++) {
    x[c] = i + c / (double)C;
  }
  // record in T[i * M] ... T[(i+1) * M - 1]
  T = &T[i * M];
  // record starting core (SM id on GPU)
  R[i].core[0] = get_core();
  // repeat a x + b many times.
  // T[j] is the clock taken just BEFORE the j-th batch of N rounds
  for (long j = 0; j < M; j++) {
    T[j] = get_clock();
    // the two asm statements emit marker comments so that the
    // innermost loops can be located in the generated assembly
    asm volatile("// ========== loop begins ==========");
    for (long k = 0; k < N; k++) {
      // one round: update each of the C variables once
      for (int c = 0; c < C; c++) {
        x[c] = a * x[c] + b;
      }
    }
    asm volatile("// ---------- loop ends ----------");
  }
  // record ending core (expected to be = core[0])
  R[i].core[1] = get_core();
  // record result, just so that the computation is not
  // eliminated by the compiler
  double t = 0.0;
  for (int c = 0; c < C; c++) {
    t += x[c];
  }
  R[i].x = t / C;
}

/* print one line per iteration i (0 <= i < L):
     i=.. x=.. core0=.. core1=.. cycles_per_iter=.. T[0] ... T[M-1]
   T holds M consecutive timestamps per iteration.
   each timestamp is taken BEFORE a batch of N rounds, so the
   interval T[M-1] - T[0] covers only (M - 1) batches; the average
   is therefore taken over (M - 1) * N rounds.
   when M < 2 no interval has been measured and 0 is reported. */
void dump(record_t * R, long * T, long L, long M, long N) {
  for (long i = 0; i < L; i++) {
    // timestamps of iteration i
    const long * t = &T[i * M];
    double avg = 0.0;
    if (M >= 2) {
      long dt = t[M - 1] - t[0];
      avg = (double)dt / ((double)(M - 1) * (double)N);
    }
    printf("i=%ld x=%f core0=%d core1=%d cycles_per_iter=%f",
           i, R[i].x, R[i].core[0], R[i].core[1], avg);
    for (long j = 0; j < M; j++) {
      printf(" %ld", t[j]);
    }
    printf("\n");
  }
}

/* read environment variable v as an int.
   the value must start with a decimal integer; anything after it
   is ignored (so a list such as "4,2" still yields 4).
   prints a message and exits with status 1 when the variable is
   not set, does not start with an integer, or does not fit in int
   (atoi would silently return 0 or an unspecified value). */
int getenv_int(const char * v) {
  const char * s = getenv(v);
  if (!s) {
    fprintf(stderr, "specify environment variable %s\n", v);
    exit(1);
  }
  char * end = NULL;
  long val = strtol(s, &end, 10);
  // end == s : no digits were converted
  if (end == s || val != (long)(int)val) {
    fprintf(stderr, "environment variable %s must be an integer (got '%s')\n", v, s);
    exit(1);
  }
  return (int)val;
}

/* usage: ./simd_ilp [L [M [N [a [b]]]]]
     L    : number of iterations of the parallel loop (default 100)
     M    : number of timestamps recorded per iteration (default 100)
     N    : number of rounds between two timestamps (default 100)
     a, b : coefficients of x = a x + b (default 0.99 and 1.00)
   environment variables OMP_NUM_TEAMS and OMP_NUM_THREADS must be set;
   they are passed to num_teams / num_threads of the target region. */
int main(int argc, char ** argv) {
  int idx = 1;
  long L   = (idx < argc ? atol(argv[idx]) : 100);  idx++;
  long M   = (idx < argc ? atol(argv[idx]) : 100);  idx++;
  long N   = (idx < argc ? atol(argv[idx]) : 100);  idx++;
  double a = (idx < argc ? atof(argv[idx]) : 0.99); idx++;
  double b = (idx < argc ? atof(argv[idx]) : 1.00); idx++;
  // non-positive sizes would give bogus allocation sizes and map lengths
  if (L <= 0 || M <= 0 || N <= 0) {
    fprintf(stderr, "L, M and N must be positive (L=%ld M=%ld N=%ld)\n", L, M, N);
    return 1;
  }
  int n_teams = getenv_int("OMP_NUM_TEAMS");
  int n_threads_per_team = getenv_int("OMP_NUM_THREADS");
  record_t * R = (record_t *)calloc(L, sizeof(record_t));
  long * T = (long *)calloc(L * M, sizeof(long));
  if (!R || !T) {
    perror("calloc");
    return 1;
  }
#pragma omp target teams distribute parallel for num_teams(n_teams) num_threads(n_threads_per_team) map(tofrom: R[0:L], T[0:L*M])
  for (long i = 0; i < L; i++) {
    iter_fun(a, b, i, M, N, R, T);
  }
  dump(R, T, L, M, N);
  free(T);
  free(R);
  return 0;
}


In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -DC=16 -Wall -O3 -mavx512f -mfma -fopenmp simd_ilp.cc -o simd_ilp
#nvc++   -DC=16 -Wall -O3 -mavx512f -mfma -mp=multicore -cuda simd_ilp.cc -o simd_ilp
#g++     -DC=16 -Wall -O3 -mavx512f -mfma -fopenmp simd_ilp.cc -o simd_ilp

* then change `C` systematically and graph the result
* make sure to increase `C` to observe that the throughput initially rises and eventually plateaus at a certain point
* see how far you can go with a single thread on CPUs

In [None]:
BEGIN SOLUTION
END SOLUTION
for L in 1; do
    for M in 100 ; do
        for N in 1000000; do
            for C in 1 2     ; do  # put values of C you want to experiment with
                # choose the compiler and craft an appropriate command
                # line to produce exe file simd_ilp_${C}
                #nvc++   -DC=${C} -Wall -O3 -mavx512f -mfma -mp=multicore -cuda simd_ilp.cc -o simd_ilp_${C} \
                #g++     -DC=${C} -Wall -O3 -mavx512f -mfma -fopenmp simd_ilp.cc -o simd_ilp_${C} \
                clang++ -DC=${C} -Wall -O3 -mavx512f -mfma -fopenmp simd_ilp.cc -o simd_ilp_${C} \
                    && (echo -n "L=${L} M=${M} N=${N} C=${C} ";
                        OMP_NUM_TEAMS=1 OMP_NUM_THREADS=1 OMP_PROC_BIND=true ./simd_ilp_${C} ${L} ${M} ${N} | awk '{print $5}')
            done
        done
    done
done 

In [None]:
DATA_STRING_SIMD = r"""
L=1 M=100 N=10000000 C=1 cycles_per_iter=2.128242
L=1 M=100 N=10000000 C=2 cycles_per_iter=2.126931
L=1 M=100 N=10000000 C=3 cycles_per_iter=2.125129
    ...
"""

In [None]:
import vis_latency_throughput
vis_latency_throughput.vis(DATA_STRING_SIMD)