#  A real exercise --- a fast matrix multiply
# 1. Introduction
* Matrix-matrix multiplication is an important building block for many applications, most notably for deep neural networks
* It is also a good exercise in exploiting SIMD and ILP, as doing so brings large performance benefits

# 2. Compilers
## 2-1. Set up NVIDIA HPC SDK
Execute this before you use NVIDIA HPC SDK

In [None]:
export PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/24.9/compilers/bin:$PATH

Check if it works (check if full paths of nvc/nvc++ are shown)

In [None]:
which nvc
which nvc++

## 2-2. Set up LLVM
Execute this before you use LLVM


In [None]:
export PATH=/home/share/llvm/bin:$PATH
export LD_LIBRARY_PATH=/home/share/llvm/lib:/home/share/llvm/lib/x86_64-unknown-linux-gnu:$LD_LIBRARY_PATH

Check if it works (check if full paths of clang/clang++ are shown)

In [None]:
which clang
which clang++

## 2-3. GCC

Check if it works (check if full paths of gcc/g++ are shown)

In [None]:
which gcc
which g++

# 3. A very basic matrix multiply without SIMD or ILP
* this is a very basic matrix multiply without SIMD or ILP

In [None]:
%%writefile mm_basic.cc
#include <assert.h>
#include "mm_cpu.h"

/* Reference matrix multiply: C += A * B, one scalar multiply-add at a time.
   The loop bounds come from the operands themselves: C is M x N and A has
   K columns. A's row count and B's shape are not checked against them. */
void gemm(matrix A, matrix B, matrix C) {
  idx_t M = C.M;
  idx_t N = C.N;
  idx_t K = A.N;
  for (idx_t i = 0; i < M; i++) {
    for (idx_t j = 0; j < N; j++) {
      /* dot product of row i of A and column j of B, kept in a local */
      real c = 0;
      /* these asm statements only emit a comment into the generated assembly;
         they bracket the innermost loop so it is easy to find in the .s file */
      asm volatile("# loop begins");
      for (idx_t k = 0; k < K; k++) {
        c += A(i,k) * B(k,j);
      }
      asm volatile("# loop ends");
      /* accumulate into C rather than overwrite it */
      C(i,j) += c;
    }
  }
}




* definition of matrix and auxiliary functions

In [None]:
%%writefile mm_cpu.h
/* 
 * mm_cpu.h
 */

/* type definitions:
   real  -- element type of the matrices
   idx_t -- type used for matrix sizes and indices */
typedef float real;
typedef long idx_t;

/* the vector helpers in this header use 512-bit intrinsics (_mm512_*),
   so refuse to compile when AVX-512F is not enabled */
#if ! defined(__AVX512F__)
#error "__AVX512F__ must be defined (forgot to give -mavx512f -mfma?)"
#endif

#include <assert.h>
#include <stdlib.h>
#include <x86intrin.h>
/* size of one SIMD vector in bytes (64 bytes = 512 bits) */
enum { vwidth = 64 };
/* a vector of reals occupying vwidth bytes;
   __may_alias__ allows it to be overlaid on arrays of real, and
   aligned(vwidth) declares that such a vector sits on a vwidth-byte boundary */
typedef real realv __attribute__((vector_size(vwidth),__may_alias__,aligned(vwidth)));
/* number of reals held by one vector */
enum { L = sizeof(realv) / sizeof(real) };

/* U(c) : a vector whose lanes all hold the scalar c */
__attribute__((unused))
static realv U(real c) {
  /* single precision broadcast; the double precision counterpart
     would be _mm512_set1_pd(c) */
  realv lanes = _mm512_set1_ps(c);
  return lanes;
}

/* V(p) : view the reals starting at p as one vector.
   The result is a reference, so it can be read or assigned to.
   realv is declared aligned(vwidth), so p is expected to sit on a
   vwidth-byte boundary. */
__attribute__((unused))
static realv& V(real& p) {
  realv * const as_vector = (realv *)&p;
  return *as_vector;
}

/* set CHECK_IDX to 1 to enable the assert-based index checks
   in matrix::operator() and matrix::V */
#define CHECK_IDX 0

/* A dense row-major matrix of reals.
   The struct is a plain handle: copying it (e.g., passing it by value)
   copies the pointer a, so all copies refer to the same elements.
   The struct has no destructor; whoever created the matrix releases
   the storage with free(a) when it is no longer needed. */
struct matrix {
  idx_t M;                      // number of rows
  idx_t N;                      // number of columns
  idx_t ld;                     // leading dimension (usually = N)
  real * a;                     // array of values (M x ld elements)
  /* allocate an _M x _N matrix (ld = _N) on a vwidth-byte boundary;
     elements are left uninitialized. aborts if allocation fails */
  matrix(idx_t _M, idx_t _N) {
    M = _M;
    N = _N;
    ld = _N;
    /* aligned_alloc expects the size to be a multiple of the alignment,
       so round the byte count up to a whole number of vectors
       (and never ask for zero bytes) */
    size_t bytes = sizeof(real) * M * ld;
    size_t padded = (bytes + vwidth - 1) / vwidth * vwidth;
    if (padded == 0) {
      padded = vwidth;
    }
    a = (real *)aligned_alloc(vwidth, padded);
    if (!a) {
      /* do not let a failed allocation turn into a crash deep inside gemm */
      abort();
    }
  }
  /* return a scalar A(i,j) */
  real& operator() (idx_t i, idx_t j) {
#if CHECK_IDX
    assert(i < M);
    assert(j < N);
    assert(i >= 0);
    assert(j >= 0);
#endif
    return a[i * ld + j];
  }
  /* A.V(i,j) returns a vector at A(i,j) (i.e., A(i,j:j+L)).
     you can put it on lefthand side, e.g., A.V(i,j) = ... */
  realv& V(idx_t i, idx_t j) {
#if CHECK_IDX
    assert(i < M);
    assert(j + L <= N);
    assert(i >= 0);
    assert(j >= 0);
#endif
    return ::V(a[i * ld + j]);
  }
};


* main function

In [None]:
%%writefile mm_main_cpu.cc
/* 
 * mm_main_cpu.cc
 */

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include "mm_cpu.h"
#include "event.h"

void gemm(matrix A, matrix B, matrix C);

/* fill every element of A with a value drawn from erand48,
   advancing the generator state rg; elements are visited row by row */
static void rand_init(matrix A, unsigned short rg[3]) {
  const idx_t n_rows = A.M;
  const idx_t n_cols = A.N;
  idx_t r = 0;
  while (r < n_rows) {
    idx_t c = 0;
    while (c < n_cols) {
      A(r,c) = erand48(rg);
      c++;
    }
    r++;
  }
}

/* set every element of A to the same value c */
static void const_init(matrix A, real c) {
  const idx_t n_rows = A.M;
  const idx_t n_cols = A.N;
  idx_t row = 0;
  while (row < n_rows) {
    idx_t col = 0;
    while (col < n_cols) {
      A(row,col) = c;
      col++;
    }
    row++;
  }
}

/* set every element of A to zero */
static void zero_init(matrix A) {
  const real zero = 0.0;
  const_init(A, zero);
}

/* compute the value C(i,j) is expected to have after `times` calls of
   gemm(A, B, C) on an initially zero C */
static real comp_ij(matrix A, matrix B,
                    idx_t i, idx_t j, long times) {
  /* what a single gemm call adds to C(i,j) */
  real dot = 0.0;
  const idx_t inner = A.N;
  idx_t k = 0;
  while (k < inner) {
    dot += A(i,k) * B(k,j);
    k++;
  }
  /* add it `times` times (rather than multiply), mirroring how repeated
     gemm calls accumulate into C */
  real total = 0.0;
  long remaining = times;
  while (remaining > 0) {
    total += dot;
    remaining--;
  }
  return total;
}

/* Benchmark driver for gemm.

   usage: ./a.out [M [N [K [approx_fmas [chk [seed]]]]]]
     M, N, K     : matrix sizes (C is M x N, A is M x K, B is K x N)
     approx_fmas : gemm is repeated until at least this many
                   multiply-adds have been performed
     chk         : number of randomly picked elements of C to verify
     seed        : seed of the random number generator

   The performance events to count are taken from the environment
   variable EV (default: cycles,ref-cycles,instructions).
   Returns 0 on success and 1 when M, N or K is not positive. */
int main(int argc, char ** argv) {
  int i = 1;
  const long M = (argc > i ? atol(argv[i]) :   8); i++;
  const long N = (argc > i ? atol(argv[i]) :  32); i++;
  const long K = (argc > i ? atol(argv[i]) : 192); i++;
  const long approx_fmas = (argc > i ? atol(argv[i]) : 1L * 1000L * 1000L * 1000L); i++;
  const long chk  = (argc > i ? atol(argv[i]) : 5); i++;
  const long seed = (argc > i ? atol(argv[i]) : 76843802738543); i++;

  /* a zero dimension would make fmas == 0 and divide by zero below
     (both in the repeat count and in nrand48(rg) % M); a negative one
     makes no sense as a matrix size. reject both up front */
  if (M <= 0 || N <= 0 || K <= 0) {
    fprintf(stderr,
            "M, N and K must be positive (M = %ld, N = %ld, K = %ld)\n",
            M, N, K);
    return 1;
  }

  matrix A(M, K);
  matrix B(K, N);
  matrix C(M, N);

  /* NOTE(review): the three 16-bit windows taken at shifts 16/8/0 overlap;
     shifts of 32/16/0 may have been intended. left unchanged so that the
     generated matrices stay the same */
  unsigned short rg[3] = { (unsigned short)((seed >> 16) & 65535),
                           (unsigned short)((seed >> 8)  & 65535),
                           (unsigned short)((seed >> 0)  & 65535) };
  rand_init(A, rg);
  rand_init(B, rg);
  zero_init(C);
  
  /* one gemm call performs M * N * K multiply-adds;
     repeat it enough times to reach approx_fmas (rounded up) */
  const long fmas = (long)M * (long)N * (long)K;
  const long repeat = (approx_fmas + fmas - 1) / fmas;
  const long fmas_total = fmas * repeat;
  printf("M = %ld, N = %ld, K = %ld\n", M, N, K);
  /* sizes involving sizeof(...) are size_t, hence %zu */
  printf("sizeof(real) = %zu\n", sizeof(real));
  printf("L : %u\n", L);
  printf("A : %ld x %ld (ld=%ld) %zu bytes\n",
         M, K, (long)A.ld, M * A.ld * sizeof(real));
  printf("B : %ld x %ld (ld=%ld) %zu bytes\n",
         K, N, (long)B.ld, K * B.ld * sizeof(real));
  printf("C : %ld x %ld (ld=%ld) %zu bytes\n",
         M, N, (long)C.ld, M * C.ld * sizeof(real));
  printf("total = %zu bytes\n",
         (M * A.ld + K * B.ld + M * C.ld) * sizeof(real));
  printf("repeat : %ld times\n", repeat);
  printf("perform %ld fmas ... ", fmas_total); fflush(stdout);

  const char * events = getenv("EV");
  if (!events) events = "cycles,ref-cycles,instructions";
  perf_event_counters_t pc = mk_perf_event_counters(events);
  perf_event_values_t v0 = perf_event_counters_get(pc);
  /* real thing happens here */
  for (long i = 0; i < repeat; i++) {
    gemm(A, B, C);
  }
  /* real thing ends here */
  perf_event_values_t v1 = perf_event_counters_get(pc);
  printf("done\n");

  /* show performance counters */
  for (int i = 0; i < pc.n; i++) {
    printf("%s : %lld\n", pc.events[i], v1.values[i] - v0.values[i]);
  }
  /* derive fmas per cycle / per instruction from the events that were counted.
     NOTE(review): strcmp needs <string.h>, which this file does not include
     directly; presumably event.h brings it in -- confirm */
  for (int i = 0; i < pc.n; i++) {
    if (strcmp(pc.events[i], "cycles") == 0) {
      long dt = v1.values[i] - v0.values[i];
      printf("%f fmas/core-cycle\n", fmas_total / (double)dt);
    }
    if (strcmp(pc.events[i], "ref-cycles") == 0) {
      long dt = v1.values[i] - v0.values[i];
      printf("%f fmas/ref-cycle\n", fmas_total / (double)dt);
    }
    if (strcmp(pc.events[i], "instructions") == 0) {
      long di = v1.values[i] - v0.values[i];
      printf("%f fmas/instruction\n", fmas_total / (double)di);
    }
  }
  /* compare a few randomly picked elements of C with values recomputed
     by the straightforward scalar code */
  printf("=== checking results of randomly picked %ld elements ===\n", chk);
  for (long c = 0; c < chk; c++) {
    long i = nrand48(rg) % M;
    long j = nrand48(rg) % N;
    real s = comp_ij(A, B, i, j, repeat);
    printf("C(%ld,%ld) = %f, ans = %f, |C(%ld,%ld) - s| = %.9f\n",
           i, j, C(i,j), s,
           i, j, fabs(C(i,j) - s));
  }
  perf_event_counters_destroy(pc);
  /* matrix has no destructor, so release the element arrays explicitly */
  free(A.a);
  free(B.a);
  free(C.a);
  return 0;
}

* compile it into an executable

In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -Wall -O3 -mavx512f -mfma mm_main_cpu.cc mm_basic.cc -o mm_basic -lpfm
#nvc++   -Wall -O3 -mavx512f -mfma mm_main_cpu.cc mm_basic.cc -o mm_basic -lpfm
#g++     -Wall -O3 -mavx512f -mfma mm_main_cpu.cc mm_basic.cc -o mm_basic -lpfm

* and execute it

In [None]:
BEGIN SOLUTION
END SOLUTION
./mm_basic

* it performs $C = C + A * B$ for
 * $C$ : $M \times N$ matrix 
 * $A$ : $M \times K$ matrix 
 * $B$ : $K \times N$ matrix 
so many times that it performs FMAs at least a billion ($10^9$) times
* by default $M = 8$, $N = 32$ and $K = 192$, which are small
* the reason of this choice becomes clear later

* you can change $M$, $N$ and $K$, as well as the number of FMAs by giving the first four parameters

In [None]:
BEGIN SOLUTION
END SOLUTION
./mm_basic 100 200 300 $((2 * 1000 * 1000 * 1000))

* it measures
  * the core cycles,
  * the reference cycles, and
  * the number of instructions
using performance counters, and shows
  * the number of fused multiply-adds (fmas) per core cycle (`fmas/core-cycle`)
  * the number of fused multiply-adds (FMAs) per reference cycle (`fmas/ref-cycle`)
  * the number of fused multiply-adds (FMAs) per instruction
* the lines below "=== checking results of randomly picked 5 elements ===" compare the computed results with what they should be
  * make sure the reported differences are all nearly zero
* observe the performance, shown as the number of fused multiply-adds (FMAs) per core cycle (`fmas/core-cycle`)
* compare it with the maximum performance, which is 32 fmas/core-cycle (for single precision floating point numbers)

# <font color="green"> Problem 1 :  Reason about performance of the basic code</font>
* look at the assembly code generated and look for the loop corresponding to the innermost loop (between the `asm volatile("# loop begins")` and `asm volatile("# loop ends")`)
  * did the compiler vectorize it?
* try to predict from the assembly code how many core cycles it will take per iteration and the resulting performance (fmas/cycle)
  * hint: the latency of a scalar fmadd instruction is four
* check if the experimental result matches the prediction 
* further notes:
  * there are a number of reasons why the actual performance does not exactly match the expectation (some are compiler-related and others are processor-related)
  * `nvc++` performs better than the other two compilers (do you see why?)
  * to get a result that is easy to understand, use either `clang++` or `g++` and set $M = N = 1$ and $K = $ large (e.g., 2000), so that the code does almost nothing other than executing the innermost loop just once, with a large trip count

* compile it into assembly code (open the output `mm_basic.s` with editor)

In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -Wall -O3 -mavx512f -mfma mm_basic.cc -S
#nvc++   -Wall -O3 -mavx512f -mfma mm_basic.cc -S -Mkeepasm
#g++     -Wall -O3 -mavx512f -mfma mm_basic.cc -S

* write your findings and thoughts below

BEGIN SOLUTION
END SOLUTION
* did the compiler vectorize it :
* your prediction from the assembly code on the number of cycles/iteration :
* your prediction of fmas/core-cycle based on it :
* does it match the experimental result :
* other finding and/or thoughts

# 4. A vectorized matrix multiply
# <font color="green"> Problem 2 :  Apply SIMD</font>
* apply SIMD to the `gemm` function
* you should achieve roughly $L$ times speedup with $L$-way SIMD instructions ($L = 512/32 = 16$ when using 512 bit SIMD instructions for single precision)
* think about _which_ loop should be vectorized among the three possible choices, and explain your choice
* you can assume matrix sizes are _convenient_; that is, for your convenience, you can assume one (or some) of $M$, $N$ and $K$ is a multiple of a certain number (typically, $L$), so that you don't have to worry about the remainder iterations of the loop you vectorized

In [None]:
BEGIN SOLUTION
END SOLUTION
%%writefile mm_simd.cc
#include <assert.h>
#include "mm_cpu.h"

/* Starting point for the SIMD exercise: C += A * B.
   As written this is still the scalar loop nest (one multiply-add per
   iteration of the k loop); it is the code to be vectorized. */
void gemm(matrix A, matrix B, matrix C) {
  idx_t M = C.M;
  idx_t N = C.N;
  idx_t K = A.N;
  for (idx_t i = 0; i < M; i++) {
    for (idx_t j = 0; j < N; j++) {
      /* dot product of row i of A and column j of B, kept in a local */
      real c = 0;
      /* the asm statements only emit comments into the generated assembly,
         marking where the innermost loop begins and ends */
      asm volatile("# loop begins");
      for (idx_t k = 0; k < K; k++) {
        c += A(i,k) * B(k,j);
      }
      asm volatile("# loop ends");
      /* accumulate into C rather than overwrite it */
      C(i,j) += c;
    }
  }
}




* compile it into an executable

In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -Wall -O3 -mavx512f -mfma mm_main_cpu.cc mm_simd.cc -o mm_simd -lpfm
#nvc++   -Wall -O3 -mavx512f -mfma mm_main_cpu.cc mm_simd.cc -o mm_simd -lpfm
#g++     -Wall -O3 -mavx512f -mfma mm_main_cpu.cc mm_simd.cc -o mm_simd -lpfm

* and run it

In [None]:
BEGIN SOLUTION
END SOLUTION
./mm_simd

* make sure errors reported below `=== checking results of randomly picked 5 elements ===` are all exactly or nearly zero

# <font color="green"> Problem 3 :  Reason about performance of the vectorized code</font>
* do the same for the vectorized version
* look at the assembly code generated and look for the loop corresponding to the innermost loop (between the `asm volatile("# loop begins")` and `asm volatile("# loop ends")`)
* try to predict how many core cycles it will take per iteration and the resulting performance (fmas/cycle); check if the result matches the expectation (hint: the latency of a SIMD (packed) fmadd instruction is four)

* compile it into assembly code

In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -Wall -O3 -mavx512f -mfma mm_simd.cc -S
#nvc++   -Wall -O3 -mavx512f -mfma mm_simd.cc -S -Mkeepasm
#g++     -Wall -O3 -mavx512f -mfma mm_simd.cc -S

BEGIN SOLUTION
END SOLUTION
* your prediction from the assembly code on the number of cycles/iteration :
* your prediction of fmas/core-cycle based on it :
* does it match the experimental result :
* other finding and/or thoughts

# 5. A vectorized + ILP-rich matrix multiply
* SIMD significantly boosts performance of matrix multiply
* but remember that the maximum performance is _two SIMD fmadd instructions per cycle_, far beyond what you achieved just by using SIMD
* the fundamental reason is that the innermost loop has little ILP, as all fma instructions update the same variable c
* in other words, the processor cannot execute one iteration of the innermost loop in fewer cycles than the latency of the fma instruction, because each fma has to wait for the result of the previous one
* to overcome this, we take advantage of the fact that there are plenty of elements (i.e., elements of matrix _C_) we apply this innermost loop to and update several of them in parallel (by a single core, taking advantage of ILP)
* to illustrate, the strategy looks like this

```
  for (idx_t i = 0; i < M; i++) {
    for (idx_t j = 0; j < N; j += L) {
      fetch many elements of C; (*)
      for (idx_t k = 0; k < K; k++) {
        for elements of C fetched at (*) {
          ... += A(i,k) * B.V(k,j);
        }
      }
      store back the elements of C fetched at (*);
    }
  }
```

# <font color="green"> Problem 4 :  Apply SIMD + ILP</font>
* apply ILP to the SIMD code you obtained in the previous problem
* calculate how many elements need to be updated to possibly reach nearly peak performance (two SIMD fmadd instructions per cycle $=$ 32 single-precision fmas/cycle) based on the performance you obtained in the previous problem
* again, you can assume for convenience that some of $M$, $N$ and $K$ are multiples of certain numbers

In [None]:
BEGIN SOLUTION
END SOLUTION
%%writefile mm_simd_ilp.cc
#include <assert.h>
#include "mm_cpu.h"

/* Starting point for the SIMD + ILP exercise: C += A * B.
   As written this is still the scalar loop nest, in which every
   multiply-add of the k loop updates the same variable c. */
void gemm(matrix A, matrix B, matrix C) {
  idx_t M = C.M;
  idx_t N = C.N;
  idx_t K = A.N;
  for (idx_t i = 0; i < M; i++) {
    for (idx_t j = 0; j < N; j++) {
      /* dot product of row i of A and column j of B, kept in a local */
      real c = 0;
      /* the asm statements only emit comments into the generated assembly,
         marking where the innermost loop begins and ends */
      asm volatile("# loop begins");
      for (idx_t k = 0; k < K; k++) {
        c += A(i,k) * B(k,j);
      }
      asm volatile("# loop ends");
      /* accumulate into C rather than overwrite it */
      C(i,j) += c;
    }
  }
}




* compile it into an executable

In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -Wall -O3 -mavx512f -mfma mm_main_cpu.cc mm_simd_ilp.cc -o mm_simd_ilp -lpfm
#nvc++   -Wall -O3 -mavx512f -mfma mm_main_cpu.cc mm_simd_ilp.cc -o mm_simd_ilp -lpfm
#g++     -Wall -O3 -mavx512f -mfma mm_main_cpu.cc mm_simd_ilp.cc -o mm_simd_ilp -lpfm

* and run it

In [None]:
BEGIN SOLUTION
END SOLUTION
./mm_simd_ilp

#  Remarks
* by updating enough variables concurrently inside the innermost ($K$) loop, you can obtain $\sim$26 fmas/cycle, or roughly 80% of the peak performance, which you should set as the goal
* closing the gap is harder but the principle and the tool (`llvm-mca`) are covered in the lecture

# 6. Analyze the loop with llvm-mca
* [llvm-mca (LLVM machine code analyzer)](https://llvm.org/docs/CommandGuide/llvm-mca.html) is a great tool to understand how many cycles a loop will take
* how to use
  * generate assembly with `-S`
  * open the output `.s` file with an editor and find the innermost loop you want to analyze
  * put assembly comment `# LLVM-MCA-BEGIN` at the head of the loop (right before or after the loop header label) and
  * put assembly comment `# LLVM-MCA-END` at the end of the loop (right after the conditional jump instruction that jumps to the loop header label), like this

```
        ...
        ...
        # LLVM-MCA-BEGIN
.LBB1_13:                               #   Parent Loop BB1_9 Depth=1
                                        #     Parent Loop BB1_10 Depth=2
                                        # =>    This Inner Loop Header: Depth=3
        vbroadcastss    -4(%r10,%r8,4), %zmm1
        vfmadd132ps     (%r12,%rbx), %zmm0, %zmm1 # zmm1 = (zmm1 * mem) + zmm0
        vbroadcastss    (%r10,%r8,4), %zmm0
        vfmadd132ps     (%r12,%r9,4), %zmm1, %zmm0 # zmm0 = (zmm0 * mem) + zmm1
        addq    $2, %r8
        addq    %r11, %r12
        cmpq    %r8, %rcx
        jne     .LBB1_13
        # LLVM-MCA-END
```
  * run `llvm-mca filename.s`
  * run `llvm-mca --help` for other useful options
  * running `llvm-mca --timeline filename.s` is particularly instructive
  * see [llvm-mca - LLVM machine code analyzer](https://llvm.org/docs/CommandGuide/llvm-mca.html) for how to read the result

## 6-1. Limitations
* it is a static analyzer that analyzes the code based on instruction latencies and dispatch port of each instruction in the loop
* while very useful, a major limitation is that, being a _static_ tool, it cannot simulate aspects that affect execution time depending on the values inside memory or registers
* most notably, it cannot accurately simulate memory access latencies, which depend significantly on whether the accessed data is in the cache (cache hit) or not; it assumes all memory accesses have the same latency, which appears to be that of an L1 cache hit


# <font color="green"> Problem 5 :  Analyze the loop with llvm-mca</font>
* analyze `mm_simd_ilp.s` as described above
* open it using editor and put the `# LLVM-MCA-BEGIN` and `# LLVM-MCA-END` markers appropriately
* and run the command below to understand the number of cycles an iteration takes

* compile it into assembly code

In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -Wall -O3 -mavx512f -mfma mm_simd_ilp.cc -S
#nvc++   -Wall -O3 -mavx512f -mfma mm_simd_ilp.cc -S -Mkeepasm
#g++     -Wall -O3 -mavx512f -mfma mm_simd_ilp.cc -S

* open `mm_simd_ilp.s` using an editor and put the `# LLVM-MCA-BEGIN` and `# LLVM-MCA-END` markers appropriately

* run the following command

In [None]:
BEGIN SOLUTION
END SOLUTION
llvm-mca mm_simd_ilp.s

* or

In [None]:
BEGIN SOLUTION
END SOLUTION
llvm-mca --timeline mm_simd_ilp.s

# <font color="green"> Problem 6 :  Closer to peak (optional)</font>
* try to reduce the gap to the peak performance, using llvm-mca
* describe what you do and show the performance


In [None]:
BEGIN SOLUTION
END SOLUTION
%%writefile mm_fast.cc
#include <assert.h>
#include "mm_cpu.h"

/* Starting point for the "closer to peak" exercise: C += A * B.
   As written this is still the scalar loop nest; it is the code to be
   replaced by a faster version. */
void gemm(matrix A, matrix B, matrix C) {
  idx_t M = C.M;
  idx_t N = C.N;
  idx_t K = A.N;
  for (idx_t i = 0; i < M; i++) {
    for (idx_t j = 0; j < N; j++) {
      /* dot product of row i of A and column j of B, kept in a local */
      real c = 0;
      /* the asm statements only emit comments into the generated assembly,
         marking where the innermost loop begins and ends */
      asm volatile("# loop begins");
      for (idx_t k = 0; k < K; k++) {
        c += A(i,k) * B(k,j);
      }
      asm volatile("# loop ends");
      /* accumulate into C rather than overwrite it */
      C(i,j) += c;
    }
  }
}




* compile it into an executable

In [None]:
BEGIN SOLUTION
END SOLUTION
clang++ -Wall -O3 -mavx512f -mfma mm_main_cpu.cc mm_fast.cc -o mm_fast -lpfm
#nvc++   -Wall -O3 -mavx512f -mfma mm_main_cpu.cc mm_fast.cc -o mm_fast -lpfm
#g++     -Wall -O3 -mavx512f -mfma mm_main_cpu.cc mm_fast.cc -o mm_fast -lpfm

In [None]:
BEGIN SOLUTION
END SOLUTION
./mm_fast

# 7. Notes on large matrices
* run your code for larger matrices and see how performance changes
* you will see that performance deteriorates when you make $K$ and $N$ larger (e.g., $K = N = 512$)
* the reason for that has something to do with memory subsystem (caches) and will be covered in later weeks
* this is a part of the reason why we have been working on small matrices
