# Vectorization

Big Picture
- What is vectorization?
- When is it useful?
- What can the compiler do?
- How can you do it manually?


First, let's look at an example problem.

# Problem 1

Given a list of data, count the pairs of a given byte, that is, the 16-bit elements whose two bytes are both equal to the given byte.

```c++
// Count the elements of data[0..size) whose two bytes are both equal to target.
//   data:   array of 16-bit values to scan
//   size:   number of 16-bit elements in data
//   target: byte value that must appear in both halves of an element
// Returns the number of matching elements.
uint64_t count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
  uint64_t total = 0;
  // we will want to compare each short to the pair of target bytes
  // (target in the low byte and target again in the high byte)
  uint16_t check = target | (target << 8U);
  for (uint64_t i = 0; i < size; i++) {
    // boolean automatically converts to an integer (true = 1, false = 0)
    total += (data[i] == check);
  }
  return total;
}

```

We will be doing all of our experiments with 1 GiB of data

We compile and run this, and it takes 1 second to run

Let us take a look at what it is doing

```
Dump of assembler code for function count_pairs:
ex1a.c:
7	count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
   0x0000000000001567 <+0>:	f3 0f 1e fa	endbr64 
   0x000000000000156b <+4>:	55	push   %rbp
   0x000000000000156c <+5>:	48 89 e5	mov    %rsp,%rbp
   0x000000000000156f <+8>:	48 89 7d d8	mov    %rdi,-0x28(%rbp)
   0x0000000000001573 <+12>:	48 89 75 d0	mov    %rsi,-0x30(%rbp)
   0x0000000000001577 <+16>:	89 d0	mov    %edx,%eax
   0x0000000000001579 <+18>:	88 45 cc	mov    %al,-0x34(%rbp)

8	  uint64_t total = 0;
   0x000000000000157c <+21>:	48 c7 45 f0 00 00 00 00	movq   $0x0,-0x10(%rbp)

9	  uint16_t check = target | (target << 8U);
   0x0000000000001584 <+29>:	0f b6 55 cc	movzbl -0x34(%rbp),%edx
   0x0000000000001588 <+33>:	0f b6 45 cc	movzbl -0x34(%rbp),%eax
   0x000000000000158c <+37>:	c1 e0 08	shl    $0x8,%eax
   0x000000000000158f <+40>:	09 d0	or     %edx,%eax
   0x0000000000001591 <+42>:	66 89 45 ee	mov    %ax,-0x12(%rbp)

10	  for (uint64_t i = 0; i < size; i++) {
   0x0000000000001595 <+46>:	48 c7 45 f8 00 00 00 00	movq   $0x0,-0x8(%rbp)
   0x000000000000159d <+54>:	eb 25	jmp    0x15c4 <count_pairs+93>

11	    total += (data[i] == check);
   0x000000000000159f <+56>:	48 8b 45 f8	mov    -0x8(%rbp),%rax
   0x00000000000015a3 <+60>:	48 8d 14 00	lea    (%rax,%rax,1),%rdx
   0x00000000000015a7 <+64>:	48 8b 45 d8	mov    -0x28(%rbp),%rax
   0x00000000000015ab <+68>:	48 01 d0	add    %rdx,%rax
   0x00000000000015ae <+71>:	0f b7 00	movzwl (%rax),%eax
   0x00000000000015b1 <+74>:	66 39 45 ee	cmp    %ax,-0x12(%rbp)
   0x00000000000015b5 <+78>:	0f 94 c0	sete   %al
   0x00000000000015b8 <+81>:	0f b6 c0	movzbl %al,%eax
   0x00000000000015bb <+84>:	48 01 45 f0	add    %rax,-0x10(%rbp)

10	  for (uint64_t i = 0; i < size; i++) {
   0x00000000000015bf <+88>:	48 83 45 f8 01	addq   $0x1,-0x8(%rbp)
   0x00000000000015c4 <+93>:	48 8b 45 f8	mov    -0x8(%rbp),%rax
   0x00000000000015c8 <+97>:	48 3b 45 d0	cmp    -0x30(%rbp),%rax
   0x00000000000015cc <+101>:	72 d1	jb     0x159f <count_pairs+56>

12	  }
13	  return total;
   0x00000000000015ce <+103>:	48 8b 45 f0	mov    -0x10(%rbp),%rax

14	}
   0x00000000000015d2 <+107>:	5d	pop    %rbp
   0x00000000000015d3 <+108>:	c3	retq   
End of assembler dump.
```

However, we are running compiled code, and the compiler has the ability to optimize our code for us, so let us tell the compiler to take the same code, but this time optimize it.

Now it only takes 301 milliseconds, which is a speedup of more than 3x.

Let's see what it is doing

```
Dump of assembler code for function count_pairs:
ex1a.c:
7	count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
   0x0000000000001630 <+0>:	f3 0f 1e fa	endbr64 

8	  uint64_t total = 0;
9	  uint16_t check = target | (target << 8U);
   0x0000000000001634 <+4>:	89 d1	mov    %edx,%ecx
   0x0000000000001636 <+6>:	0f b6 d2	movzbl %dl,%edx
   0x0000000000001639 <+9>:	c1 e1 08	shl    $0x8,%ecx
   0x000000000000163c <+12>:	09 d1	or     %edx,%ecx

10	  for (uint64_t i = 0; i < size; i++) {
   0x000000000000163e <+14>:	48 85 f6	test   %rsi,%rsi
   0x0000000000001641 <+17>:	74 25	je     0x1668 <count_pairs+56>
   0x0000000000001643 <+19>:	48 8d 34 77	lea    (%rdi,%rsi,2),%rsi
   0x0000000000001647 <+23>:	31 c0	xor    %eax,%eax
   0x0000000000001649 <+25>:	0f 1f 80 00 00 00 00	nopl   0x0(%rax)

11	    total += (data[i] == check);
   0x0000000000001650 <+32>:	31 d2	xor    %edx,%edx
   0x0000000000001652 <+34>:	66 39 0f	cmp    %cx,(%rdi)
   0x0000000000001655 <+37>:	0f 94 c2	sete   %dl
   0x0000000000001658 <+40>:	48 83 c7 02	add    $0x2,%rdi
   0x000000000000165c <+44>:	48 01 d0	add    %rdx,%rax

10	  for (uint64_t i = 0; i < size; i++) {
   0x000000000000165f <+47>:	48 39 fe	cmp    %rdi,%rsi
   0x0000000000001662 <+50>:	75 ec	jne    0x1650 <count_pairs+32>
   0x0000000000001664 <+52>:	c3	retq   
   0x0000000000001665 <+53>:	0f 1f 00	nopl   (%rax)
   0x0000000000001668 <+56>:	31 c0	xor    %eax,%eax

12	  }
13	  return total;
   0x000000000000166a <+58>:	c3	retq   
End of assembler dump.


```

![x86 registers](source/registers.JPG "X86 registers")


| Version | time to process 1 GiB in ms |
|--- | --- |
| unoptimized | 1013 |
| O3 | 301 |


Now we are going to add a special flag "-march=native"
This tells the compiler that we will be running the code on the same machine that we are compiling it on.
This allows the compiler to use any special things that this computer has which are non-standard.
Most of the time when we compile code we want it to work across a broad range of machines, but newer hardware comes with more optimizations. 

If you want to see what sort of extra hardware is on your machine you can look with `lscpu` or `cat /proc/cpuinfo`

For example if we look at my laptop 

```
Architecture:            x86_64
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         46 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  20
  On-line CPU(s) list:   0-19
Vendor ID:               GenuineIntel
  Model name:            13th Gen Intel(R) Core(TM) i9-13900H
    CPU family:          6
    Model:               186
    Thread(s) per core:  2
    Core(s) per socket:  10
    Socket(s):           1
    Stepping:            2
    BogoMIPS:            5990.40
    Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid pni pclmulqdq vmx ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves avx_vnni umip waitpkg gfni vaes vpclmulqdq rdpid movdiri movdir64b fsrm serialize flush_l1d arch_capabilities
Virtualization features: 
  Virtualization:        VT-x
  Hypervisor vendor:     Microsoft
  Virtualization type:   full
Caches (sum of all):     
  L1d:                   480 KiB (10 instances)
  L1i:                   320 KiB (10 instances)
  L2:                    12.5 MiB (10 instances)
  L3:                    24 MiB (1 instance)
```
Specifically we are looking at the flags

Here is a modern high-end Intel server

```
Flags:                           
fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities
```

We will mostly be concerned with the following types

 - SSE (Streaming SIMD Extensions)
 - AVX (Advanced Vector Extensions)

This introduces 2 terms, SIMD and Vectors

SIMD stands for single instruction, multiple data, which means we have a single instruction which processes multiple pieces of data at the same time.
SSE and AVX are both SIMD instruction sets; AVX is basically just the successor to SSE, with wider registers.

These both allow us to use vectorization 

# What is vectorization

Operations on wide registers

Most registers are between 8 and 64 bits
Operations on these registers happen on the entire register

Vector registers are normally a fair amount bigger 128 - 512 bits

They are normally thought of as a vector of data, where operations happen individually on each item in the vector

![Vector Operation](source/vector_op.JPG "Vector Operation")


For example, `vpcmpeqw` is a special instruction which operates on two 256-bit registers. It treats each register as holding 16 separate 16-bit elements, compares all 16 pairs for equality, and writes -1 (all bits set) to the same position of the 256-bit output register if they are equal, and 0 otherwise.

This would allow us to perform the equivalent of 16 iterations of the scalar loop all at once.

This is still the same code, we are just changing how we compile it 
```c++
// Scalar reference version: counts how many of the `size` 16-bit elements in
// `data` consist of the byte `target` repeated twice, and returns that count.
uint64_t count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
  uint64_t total = 0;
  // we will want to compare each short to the pair of target bytes
  uint16_t check = target | (target << 8U);
  for (uint64_t i = 0; i < size; i++) {
    // the comparison yields 1 on a match and 0 otherwise, so it can be summed
    total += (data[i] == check);
  }
  return total;
}

```

Compiling with `-march=native` gives us more than another 2x speedup

| Version | time to process 1 GiB in ms |
|--- | --- |
| unoptimized | 1013 |
| O3 | 301 |
| march=native | 114 |

Let us take a look at the assembly for this one 

```
Dump of assembler code for function count_pairs:
ex1a.c:
7	count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
   0x0000000000001620 <+0>:	f3 0f 1e fa	endbr64 

8	  uint64_t total = 0;
9	  uint16_t check = target | (target << 8U);
   0x0000000000001624 <+4>:	41 89 d0	mov    %edx,%r8d
   0x0000000000001627 <+7>:	41 c1 e0 08	shl    $0x8,%r8d
   0x000000000000162b <+11>:	0f b6 d2	movzbl %dl,%edx
   0x000000000000162e <+14>:	41 09 d0	or     %edx,%r8d

10	  for (uint64_t i = 0; i < size; i++) {
   0x0000000000001631 <+17>:	48 85 f6	test   %rsi,%rsi
   0x0000000000001634 <+20>:	0f 84 ee 01 00 00	je     0x1828 <count_pairs+520>
   0x000000000000163a <+26>:	48 8d 46 ff	lea    -0x1(%rsi),%rax
   0x000000000000163e <+30>:	48 83 f8 0e	cmp    $0xe,%rax
   0x0000000000001642 <+34>:	0f 86 e3 01 00 00	jbe    0x182b <count_pairs+523>
   0x0000000000001648 <+40>:	48 89 f2	mov    %rsi,%rdx
   0x000000000000164b <+43>:	48 c1 ea 04	shr    $0x4,%rdx
   0x000000000000164f <+47>:	48 c1 e2 05	shl    $0x5,%rdx
   0x0000000000001653 <+51>:	c4 e2 7d 79 25 98 0a 00 00	vpbroadcastw 0xa98(%rip),%ymm4        # 0x20f4
   0x000000000000165c <+60>:	62 d2 7d 28 7b e8	vpbroadcastw %r8d,%ymm5
   0x0000000000001662 <+66>:	48 89 f8	mov    %rdi,%rax
   0x0000000000001665 <+69>:	48 01 fa	add    %rdi,%rdx
   0x0000000000001668 <+72>:	c5 e1 ef db	vpxor  %xmm3,%xmm3,%xmm3
   0x000000000000166c <+76>:	0f 1f 40 00	nopl   0x0(%rax)

11	    total += (data[i] == check);
   0x0000000000001670 <+80>:	c5 d5 75 00	vpcmpeqw (%rax),%ymm5,%ymm0
   0x0000000000001674 <+84>:	48 83 c0 20	add    $0x20,%rax
   0x0000000000001678 <+88>:	c5 fd db c4	vpand  %ymm4,%ymm0,%ymm0
   0x000000000000167c <+92>:	c4 e2 7d 33 c8	vpmovzxwd %xmm0,%ymm1
   0x0000000000001681 <+97>:	c4 e3 7d 39 c0 01	vextracti128 $0x1,%ymm0,%xmm0
   0x0000000000001687 <+103>:	c4 e2 7d 33 c0	vpmovzxwd %xmm0,%ymm0
   0x000000000000168c <+108>:	c4 e2 7d 35 d0	vpmovzxdq %xmm0,%ymm2
   0x0000000000001691 <+113>:	c4 e3 7d 39 c0 01	vextracti128 $0x1,%ymm0,%xmm0
   0x0000000000001697 <+119>:	c4 e2 7d 35 c0	vpmovzxdq %xmm0,%ymm0
   0x000000000000169c <+124>:	c5 ed d4 c0	vpaddq %ymm0,%ymm2,%ymm0
   0x00000000000016a0 <+128>:	c4 e2 7d 35 d1	vpmovzxdq %xmm1,%ymm2
   0x00000000000016a5 <+133>:	c4 e3 7d 39 c9 01	vextracti128 $0x1,%ymm1,%xmm1
   0x00000000000016ab <+139>:	c4 e2 7d 35 c9	vpmovzxdq %xmm1,%ymm1
   0x00000000000016b0 <+144>:	c5 ed d4 c9	vpaddq %ymm1,%ymm2,%ymm1
   0x00000000000016b4 <+148>:	c5 fd d4 c1	vpaddq %ymm1,%ymm0,%ymm0
   0x00000000000016b8 <+152>:	c5 e5 d4 d8	vpaddq %ymm0,%ymm3,%ymm3

10	  for (uint64_t i = 0; i < size; i++) {
   0x00000000000016bc <+156>:	48 39 d0	cmp    %rdx,%rax
   0x00000000000016bf <+159>:	75 af	jne    0x1670 <count_pairs+80>
   0x00000000000016c1 <+161>:	c5 f9 6f c3	vmovdqa %xmm3,%xmm0
   0x00000000000016c5 <+165>:	62 f3 fd 28 39 db 01	vextracti64x2 $0x1,%ymm3,%xmm3
   0x00000000000016cc <+172>:	c5 f9 d4 c3	vpaddq %xmm3,%xmm0,%xmm0
   0x00000000000016d0 <+176>:	c5 f1 73 d8 08	vpsrldq $0x8,%xmm0,%xmm1
   0x00000000000016d5 <+181>:	c5 f9 d4 c1	vpaddq %xmm1,%xmm0,%xmm0
   0x00000000000016d9 <+185>:	48 89 f1	mov    %rsi,%rcx
   0x00000000000016dc <+188>:	c4 e1 f9 7e c0	vmovq  %xmm0,%rax
   0x00000000000016e1 <+193>:	48 83 e1 f0	and    $0xfffffffffffffff0,%rcx
   0x00000000000016e5 <+197>:	40 f6 c6 0f	test   $0xf,%sil
   0x00000000000016e9 <+201>:	0f 84 45 01 00 00	je     0x1834 <count_pairs+532>
   0x00000000000016ef <+207>:	c5 f8 77	vzeroupper 
   0x00000000000016f2 <+210>:	49 89 f1	mov    %rsi,%r9
   0x00000000000016f5 <+213>:	49 29 c9	sub    %rcx,%r9
   0x00000000000016f8 <+216>:	49 8d 51 ff	lea    -0x1(%r9),%rdx
   0x00000000000016fc <+220>:	48 83 fa 06	cmp    $0x6,%rdx
   0x0000000000001700 <+224>:	76 75	jbe    0x1777 <count_pairs+343>

11	    total += (data[i] == check);
   0x0000000000001702 <+226>:	62 d2 7d 08 7b c0	vpbroadcastw %r8d,%xmm0
   0x0000000000001708 <+232>:	c5 f9 75 04 4f	vpcmpeqw (%rdi,%rcx,2),%xmm0,%xmm0
   0x000000000000170d <+237>:	c4 e2 79 79 0d de 09 00 00	vpbroadcastw 0x9de(%rip),%xmm1        # 0x20f4
   0x0000000000001716 <+246>:	c5 f9 db c1	vpand  %xmm1,%xmm0,%xmm0
   0x000000000000171a <+250>:	c4 e2 79 33 d0	vpmovzxwd %xmm0,%xmm2
   0x000000000000171f <+255>:	c4 e2 79 35 ca	vpmovzxdq %xmm2,%xmm1
   0x0000000000001724 <+260>:	c5 f9 73 d8 08	vpsrldq $0x8,%xmm0,%xmm0
   0x0000000000001729 <+265>:	c5 e9 73 da 08	vpsrldq $0x8,%xmm2,%xmm2
   0x000000000000172e <+270>:	c4 e2 79 33 c0	vpmovzxwd %xmm0,%xmm0
   0x0000000000001733 <+275>:	c4 e2 79 35 d2	vpmovzxdq %xmm2,%xmm2
   0x0000000000001738 <+280>:	c5 f1 d4 ca	vpaddq %xmm2,%xmm1,%xmm1
   0x000000000000173c <+284>:	c4 e2 79 35 d0	vpmovzxdq %xmm0,%xmm2
   0x0000000000001741 <+289>:	c5 f9 73 d8 08	vpsrldq $0x8,%xmm0,%xmm0
   0x0000000000001746 <+294>:	c4 e2 79 35 c0	vpmovzxdq %xmm0,%xmm0
   0x000000000000174b <+299>:	c5 e9 d4 c0	vpaddq %xmm0,%xmm2,%xmm0
   0x000000000000174f <+303>:	c5 f1 d4 c0	vpaddq %xmm0,%xmm1,%xmm0

10	  for (uint64_t i = 0; i < size; i++) {
   0x0000000000001753 <+307>:	c5 f1 73 d8 08	vpsrldq $0x8,%xmm0,%xmm1
   0x0000000000001758 <+312>:	c5 f9 d4 c1	vpaddq %xmm1,%xmm0,%xmm0
   0x000000000000175c <+316>:	c4 e1 f9 7e c2	vmovq  %xmm0,%rdx
   0x0000000000001761 <+321>:	48 01 d0	add    %rdx,%rax
   0x0000000000001764 <+324>:	4c 89 ca	mov    %r9,%rdx
   0x0000000000001767 <+327>:	48 83 e2 f8	and    $0xfffffffffffffff8,%rdx
   0x000000000000176b <+331>:	48 01 d1	add    %rdx,%rcx
   0x000000000000176e <+334>:	49 39 d1	cmp    %rdx,%r9
   0x0000000000001771 <+337>:	0f 84 b3 00 00 00	je     0x182a <count_pairs+522>

11	    total += (data[i] == check);
   0x0000000000001777 <+343>:	45 31 c9	xor    %r9d,%r9d
   0x000000000000177a <+346>:	66 44 39 04 4f	cmp    %r8w,(%rdi,%rcx,2)
   0x000000000000177f <+351>:	41 0f 94 c1	sete   %r9b
   0x0000000000001783 <+355>:	4c 01 c8	add    %r9,%rax

10	  for (uint64_t i = 0; i < size; i++) {
   0x0000000000001786 <+358>:	4c 8d 49 01	lea    0x1(%rcx),%r9
   0x000000000000178a <+362>:	48 8d 14 09	lea    (%rcx,%rcx,1),%rdx
   0x000000000000178e <+366>:	4c 39 ce	cmp    %r9,%rsi
   0x0000000000001791 <+369>:	0f 86 93 00 00 00	jbe    0x182a <count_pairs+522>

11	    total += (data[i] == check);
   0x0000000000001797 <+375>:	45 31 c9	xor    %r9d,%r9d
   0x000000000000179a <+378>:	66 44 39 44 17 02	cmp    %r8w,0x2(%rdi,%rdx,1)
   0x00000000000017a0 <+384>:	41 0f 94 c1	sete   %r9b
   0x00000000000017a4 <+388>:	4c 01 c8	add    %r9,%rax

10	  for (uint64_t i = 0; i < size; i++) {
   0x00000000000017a7 <+391>:	4c 8d 49 02	lea    0x2(%rcx),%r9
   0x00000000000017ab <+395>:	49 39 f1	cmp    %rsi,%r9
   0x00000000000017ae <+398>:	73 7a	jae    0x182a <count_pairs+522>

11	    total += (data[i] == check);
   0x00000000000017b0 <+400>:	45 31 c9	xor    %r9d,%r9d
   0x00000000000017b3 <+403>:	66 44 39 44 17 04	cmp    %r8w,0x4(%rdi,%rdx,1)
   0x00000000000017b9 <+409>:	41 0f 94 c1	sete   %r9b
   0x00000000000017bd <+413>:	4c 01 c8	add    %r9,%rax

10	  for (uint64_t i = 0; i < size; i++) {
   0x00000000000017c0 <+416>:	4c 8d 49 03	lea    0x3(%rcx),%r9
   0x00000000000017c4 <+420>:	4c 39 ce	cmp    %r9,%rsi
   0x00000000000017c7 <+423>:	76 61	jbe    0x182a <count_pairs+522>

11	    total += (data[i] == check);
   0x00000000000017c9 <+425>:	45 31 c9	xor    %r9d,%r9d
   0x00000000000017cc <+428>:	66 44 39 44 17 06	cmp    %r8w,0x6(%rdi,%rdx,1)
   0x00000000000017d2 <+434>:	41 0f 94 c1	sete   %r9b
   0x00000000000017d6 <+438>:	4c 01 c8	add    %r9,%rax

10	  for (uint64_t i = 0; i < size; i++) {
   0x00000000000017d9 <+441>:	4c 8d 49 04	lea    0x4(%rcx),%r9
   0x00000000000017dd <+445>:	4c 39 ce	cmp    %r9,%rsi
   0x00000000000017e0 <+448>:	76 48	jbe    0x182a <count_pairs+522>

11	    total += (data[i] == check);
   0x00000000000017e2 <+450>:	45 31 c9	xor    %r9d,%r9d
   0x00000000000017e5 <+453>:	66 44 39 44 17 08	cmp    %r8w,0x8(%rdi,%rdx,1)
   0x00000000000017eb <+459>:	41 0f 94 c1	sete   %r9b
   0x00000000000017ef <+463>:	4c 01 c8	add    %r9,%rax

10	  for (uint64_t i = 0; i < size; i++) {
   0x00000000000017f2 <+466>:	4c 8d 49 05	lea    0x5(%rcx),%r9
   0x00000000000017f6 <+470>:	4c 39 ce	cmp    %r9,%rsi
   0x00000000000017f9 <+473>:	76 2f	jbe    0x182a <count_pairs+522>

11	    total += (data[i] == check);
   0x00000000000017fb <+475>:	45 31 c9	xor    %r9d,%r9d
   0x00000000000017fe <+478>:	66 44 39 44 17 0a	cmp    %r8w,0xa(%rdi,%rdx,1)
   0x0000000000001804 <+484>:	41 0f 94 c1	sete   %r9b
   0x0000000000001808 <+488>:	48 83 c1 06	add    $0x6,%rcx
   0x000000000000180c <+492>:	4c 01 c8	add    %r9,%rax

10	  for (uint64_t i = 0; i < size; i++) {
   0x000000000000180f <+495>:	48 39 ce	cmp    %rcx,%rsi
   0x0000000000001812 <+498>:	76 16	jbe    0x182a <count_pairs+522>

11	    total += (data[i] == check);
   0x0000000000001814 <+500>:	66 44 39 44 17 0c	cmp    %r8w,0xc(%rdi,%rdx,1)
   0x000000000000181a <+506>:	0f 94 c2	sete   %dl
   0x000000000000181d <+509>:	0f b6 d2	movzbl %dl,%edx
   0x0000000000001820 <+512>:	48 01 d0	add    %rdx,%rax

10	  for (uint64_t i = 0; i < size; i++) {
   0x0000000000001823 <+515>:	c3	retq   
   0x0000000000001824 <+516>:	0f 1f 40 00	nopl   0x0(%rax)
   0x0000000000001828 <+520>:	31 c0	xor    %eax,%eax

12	  }
13	  return total;
   0x000000000000182a <+522>:	c3	retq   
   0x000000000000182b <+523>:	31 c9	xor    %ecx,%ecx
   0x000000000000182d <+525>:	31 c0	xor    %eax,%eax
   0x000000000000182f <+527>:	e9 be fe ff ff	jmpq   0x16f2 <count_pairs+210>
   0x0000000000001834 <+532>:	c5 f8 77	vzeroupper 
   0x0000000000001837 <+535>:	c3	retq   
End of assembler dump.


```

It becomes a giant mess

We need multiple versions of the loops depending on the number of iterations

Sometimes we need to do special work at the beginning or end if the alignment is not proper

We can fix some of these issues by giving the compiler more information.
Here is the same code, but with some extra information given to the compiler

```c++
// Count the 16-bit elements of data whose two bytes are both equal to target,
// giving the compiler enough information to skip alignment and remainder
// handling around the loop.
//   data:   array of 16-bit values; the caller must pass a 32-byte aligned
//           pointer (this is promised to the compiler, not checked)
//   size:   number of 16-bit elements; it is rounded DOWN to a multiple of
//           32, so up to 31 trailing elements are not examined
//   target: byte value that must appear in both halves of an element
// Returns the number of matching elements in the examined prefix.
uint64_t count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
  // tell the compiler that data is aligned to 32 bytes
  data = __builtin_assume_aligned(data, 32);
  // tell the compiler that size is a multiple of 32.
  // The mask has to be 64 bits wide: ~31U is a 32-bit value (0xffffffe0)
  // that is zero-extended, so it would also clear bits 32-63 of size.
  size &= ~(uint64_t)31;
  uint64_t total = 0;
  // the pair of target bytes we compare each short against
  uint16_t check = target | (target << 8U);
  for (uint64_t i = 0; i < size; i++) {
    total += (data[i] == check);
  }
  return total;
}
```

In this case the start and end work is trivial, so it runs at the same speed, but if we look at the assembly, it is a bit simpler 

```
Dump of assembler code for function count_pairs:
ex1b.c:
7	count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
   0x0000000000001620 <+0>:	f3 0f 1e fa	endbr64 

8	  // tell the compilier that data is aligned to 32 bytes
9	  data = __builtin_assume_aligned(data, 32);
10	  // tell the compilier that size is a multiple of 32
11	  size = size & (~31U);
12	  uint64_t total = 0;
13	  uint16_t check = target | (target << 8U);
   0x0000000000001624 <+4>:	89 d1	mov    %edx,%ecx
   0x0000000000001626 <+6>:	c1 e1 08	shl    $0x8,%ecx
   0x0000000000001629 <+9>:	0f b6 d2	movzbl %dl,%edx
   0x000000000000162c <+12>:	48 89 f0	mov    %rsi,%rax
   0x000000000000162f <+15>:	09 d1	or     %edx,%ecx

14	  for (uint64_t i = 0; i < size; i++) {
   0x0000000000001631 <+17>:	83 e0 e0	and    $0xffffffe0,%eax
   0x0000000000001634 <+20>:	0f 84 8f 00 00 00	je     0x16c9 <count_pairs+169>
   0x000000000000163a <+26>:	c4 e2 7d 79 2d b1 0a 00 00	vpbroadcastw 0xab1(%rip),%ymm5        # 0x20f4
   0x0000000000001643 <+35>:	62 f2 7d 28 7b e1	vpbroadcastw %ecx,%ymm4
   0x0000000000001649 <+41>:	48 8d 04 47	lea    (%rdi,%rax,2),%rax
   0x000000000000164d <+45>:	c5 e1 ef db	vpxor  %xmm3,%xmm3,%xmm3
   0x0000000000001651 <+49>:	0f 1f 80 00 00 00 00	nopl   0x0(%rax)

15	    total += (data[i] == check);
   0x0000000000001658 <+56>:	c5 dd 75 07	vpcmpeqw (%rdi),%ymm4,%ymm0
   0x000000000000165c <+60>:	48 83 c7 20	add    $0x20,%rdi
   0x0000000000001660 <+64>:	c5 fd db c5	vpand  %ymm5,%ymm0,%ymm0
   0x0000000000001664 <+68>:	c4 e2 7d 33 c8	vpmovzxwd %xmm0,%ymm1
   0x0000000000001669 <+73>:	c4 e3 7d 39 c0 01	vextracti128 $0x1,%ymm0,%xmm0
   0x000000000000166f <+79>:	c4 e2 7d 33 c0	vpmovzxwd %xmm0,%ymm0
   0x0000000000001674 <+84>:	c4 e2 7d 35 d0	vpmovzxdq %xmm0,%ymm2
   0x0000000000001679 <+89>:	c4 e3 7d 39 c0 01	vextracti128 $0x1,%ymm0,%xmm0
   0x000000000000167f <+95>:	c4 e2 7d 35 c0	vpmovzxdq %xmm0,%ymm0
   0x0000000000001684 <+100>:	c5 ed d4 c0	vpaddq %ymm0,%ymm2,%ymm0
   0x0000000000001688 <+104>:	c4 e2 7d 35 d1	vpmovzxdq %xmm1,%ymm2
   0x000000000000168d <+109>:	c4 e3 7d 39 c9 01	vextracti128 $0x1,%ymm1,%xmm1
   0x0000000000001693 <+115>:	c4 e2 7d 35 c9	vpmovzxdq %xmm1,%ymm1
   0x0000000000001698 <+120>:	c5 ed d4 c9	vpaddq %ymm1,%ymm2,%ymm1
   0x000000000000169c <+124>:	c5 fd d4 c1	vpaddq %ymm1,%ymm0,%ymm0
   0x00000000000016a0 <+128>:	c5 e5 d4 d8	vpaddq %ymm0,%ymm3,%ymm3

14	  for (uint64_t i = 0; i < size; i++) {
   0x00000000000016a4 <+132>:	48 39 f8	cmp    %rdi,%rax
   0x00000000000016a7 <+135>:	75 af	jne    0x1658 <count_pairs+56>
   0x00000000000016a9 <+137>:	c5 f9 6f c3	vmovdqa %xmm3,%xmm0
   0x00000000000016ad <+141>:	62 f3 fd 28 39 db 01	vextracti64x2 $0x1,%ymm3,%xmm3
   0x00000000000016b4 <+148>:	c5 f9 d4 c3	vpaddq %xmm3,%xmm0,%xmm0
   0x00000000000016b8 <+152>:	c5 f1 73 d8 08	vpsrldq $0x8,%xmm0,%xmm1
   0x00000000000016bd <+157>:	c5 f9 d4 c1	vpaddq %xmm1,%xmm0,%xmm0
   0x00000000000016c1 <+161>:	c4 e1 f9 7e c0	vmovq  %xmm0,%rax

16	  }
17	  return total;
   0x00000000000016c6 <+166>:	c5 f8 77	vzeroupper 
   0x00000000000016c9 <+169>:	c3	retq   
End of assembler dump.


```

The compiler can do a lot to optimize and use the vector registers to speed up the code, but we can also do it manually.

Intrinsics are basically functions which will compile down to a single special assembly instruction, so we can use the vector instructions in C or C++ code without having to write raw assembly.  The list of them can be found at [https://software.intel.com/sites/landingpage/IntrinsicsGuide/](https://software.intel.com/sites/landingpage/IntrinsicsGuide/)

![_mm256_cmpeq_epi16](source/_mm256_cmpeq_epi16.JPG "_mm256_cmpeq_epi16")




Here we manually vectorize this code 
```c++
// Manually vectorized count of the 16-bit elements of data whose two bytes
// are both equal to target.
//   data:   array of 16-bit values; must be 32-byte aligned, because
//           _mm256_load_si256 is an aligned load
//   size:   number of 16-bit elements in data (any value; elements past the
//           last whole block of 16 are handled by a scalar loop)
//   target: byte value that must appear in both halves of an element
// Returns the number of matching elements.
uint64_t count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
  data = __builtin_assume_aligned(data, 32);
  uint16_t check = target | (target << 8U);
  // set up a vector so that you can compare the pair of bytes in all 16 positions
  __m256i compare = _mm256_set1_epi16(check);
  // only whole blocks of 16 elements may be loaded as a vector; loading a
  // partial final block would read past the end of the data
  uint64_t vector_end = size & ~(uint64_t)15;
  uint64_t mask_bits = 0;
  uint64_t i = 0;
  for (; i < vector_end; i += 16) {
    // do 16 comparisons at once
    uint32_t block = _mm256_movemask_epi8(
        _mm256_cmpeq_epi16(_mm256_load_si256((const __m256i *)(data + i)), compare));
    // once we move the result of the comparison into a normal register, count the set bits
    mask_bits += __builtin_popcount(block);
  }
  // movemask produces one bit per byte, so every matching short set two bits
  uint64_t total = mask_bits / 2;
  // scalar tail for the remaining (size % 16) elements
  for (; i < size; i++) {
    total += (data[i] == check);
  }
  return total;
}
```

- _mm256_set1_epi16 takes in a 16-bit object and puts it at all 16 16-bit locations in a 256 bit register 
- _mm256_load_si256 loads 256 bits of data from the given memory address
- _mm256_cmpeq_epi16 compares the two vectors element wise
- _mm256_movemask_epi8 takes the top bit from each byte into a 32-bit object, we use it to get our data out of the vector register into the normal registers
- __builtin_popcount counts the number of set bits in a normal 32-bit register


Most of these require the `avx2` flag above, but popcount requires the special `popcnt` flag

So what we are doing here is, for each set of 16 elements, comparing them against the vector holding the pair of target bytes in every position, and then moving the result out into a normal register. We then count the number of equal shorts with the pop count. However, since we moved the result out one bit per byte and not one bit per short, every match is counted twice, so we need to divide by 2 at the end.

We get fairly good performance with a significant improvement over the compiler

| Version | time to process 1 GiB in ms |
|--- | --- |
| unoptimized | 1013 |
| O3 | 301 |
| march=native | 114 |
| manual vectorization | 77 |

Let us take a look at the assembly generated this time.

```
Dump of assembler code for function count_pairs:
ex1c.c:
8	count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
   0x0000000000001620 <+0>:	f3 0f 1e fa	endbr64 

9	  data = __builtin_assume_aligned(data, 32);
10	  uint64_t total = 0;
11	  uint16_t check = target | (target << 8U);
   0x0000000000001624 <+4>:	89 d0	mov    %edx,%eax
   0x0000000000001626 <+6>:	c1 e0 08	shl    $0x8,%eax
   0x0000000000001629 <+9>:	0f b6 d2	movzbl %dl,%edx
   0x000000000000162c <+12>:	09 d0	or     %edx,%eax

/usr/lib/gcc/x86_64-linux-gnu/11/include/avxintrin.h:
1335	  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
   0x000000000000162e <+14>:	62 f2 7d 28 7b c8	vpbroadcastw %eax,%ymm1

ex1c.c:
13	  for (uint64_t i = 0; i < size; i += 16) {
   0x0000000000001634 <+20>:	48 85 f6	test   %rsi,%rsi
   0x0000000000001637 <+23>:	74 27	je     0x1660 <count_pairs+64>
   0x0000000000001639 <+25>:	31 d2	xor    %edx,%edx
   0x000000000000163b <+27>:	31 c0	xor    %eax,%eax
   0x000000000000163d <+29>:	0f 1f 00	nopl   (%rax)

/usr/lib/gcc/x86_64-linux-gnu/11/include/avx2intrin.h:
240	  return (__m256i) ((__v16hi)__A == (__v16hi)__B);
   0x0000000000001640 <+32>:	c5 f5 75 04 57	vpcmpeqw (%rdi,%rdx,2),%ymm1,%ymm0

   0x0000000000001645 <+37>:	48 83 c2 10	add    $0x10,%rdx
   0x0000000000001649 <+41>:	c5 fd d7 c8	vpmovmskb %ymm0,%ecx

ex1c.c:
16	    total += __builtin_popcount(block);
   0x000000000000164d <+45>:	f3 0f b8 c9	popcnt %ecx,%ecx
   0x0000000000001651 <+49>:	48 01 c8	add    %rcx,%rax

13	  for (uint64_t i = 0; i < size; i += 16) {
   0x0000000000001654 <+52>:	48 39 d6	cmp    %rdx,%rsi
   0x0000000000001657 <+55>:	77 e7	ja     0x1640 <count_pairs+32>
   0x0000000000001659 <+57>:	48 d1 e8	shr    %rax
   0x000000000000165c <+60>:	c5 f8 77	vzeroupper 
   0x000000000000165f <+63>:	c3	retq   
   0x0000000000001660 <+64>:	31 c0	xor    %eax,%eax

17	  }
18	  return total / 2;
   0x0000000000001662 <+66>:	c5 f8 77	vzeroupper 
   0x0000000000001665 <+69>:	c3	retq   
End of assembler dump.


```

This ends up being one of the simpler assembly versions we have seen so far

This is mostly due to how it generated basically just what we told it to and nothing else.

| Version | time to process 1 GiB in ms |
|--- | --- |
| unoptimized | 1013 |
| O3 | 301 |
| march=native | 114 |
| manual vectorization | 77 |


So all together we are able to get a roughly 13x speedup over the unoptimized code, 4x over optimized code, and 1.5x over the best the compiler could do, but it takes a lot more effort to write the last two versions than the first three.

# Problem 2

Now we will look at a slight variation of the problem.

We remove the requirement that the pairs are aligned. Before, the pairs had to be in even-odd positions; now they can be either even-odd or odd-even.

The code for this is not much more complicated, we look at every byte position now instead of each short position.

```c++
// Count the byte positions at which a 16-bit value made of two target bytes
// starts, without requiring the pair to be aligned to an even offset.
//   data:   buffer of size * 2 bytes to scan
//   size:   length of the buffer in 16-bit units (the loop covers
//           size * 2 - 1 starting positions)
//   target: byte value that must appear twice in a row
// Returns the number of matching positions; overlapping matches each count.
// load16 is defined elsewhere in the file; it is presumably an unaligned
// 16-bit load from the given address.
uint64_t count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
  // an empty buffer has no positions; without this check the bound
  // size * 2 - 1 would wrap around to UINT64_MAX and run past the buffer
  if (size == 0) {
    return 0;
  }
  uint64_t total = 0;
  uint16_t check = target | (target << 8U);
  // the last 2-byte window starts one byte before the end of the buffer
  uint64_t positions = size * 2 - 1;
  for (uint64_t i = 0; i < positions; i++) {
    total += (load16(data + i) == check);
  }
  return total;
}
```


| Version | time to process 1 GiB in ms |
|--- | --- |
| unoptimized | 3265 |
| O3 | 587 |
| march=native | 619|

Unoptimized 
```
Dump of assembler code for function count_pairs:
ex2a.c:
7	count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
   0x0000000000001567 <+0>:	f3 0f 1e fa	endbr64 
   0x000000000000156b <+4>:	55	push   %rbp
   0x000000000000156c <+5>:	48 89 e5	mov    %rsp,%rbp
   0x000000000000156f <+8>:	48 83 ec 40	sub    $0x40,%rsp
   0x0000000000001573 <+12>:	48 89 7d d8	mov    %rdi,-0x28(%rbp)
   0x0000000000001577 <+16>:	48 89 75 d0	mov    %rsi,-0x30(%rbp)
   0x000000000000157b <+20>:	89 d0	mov    %edx,%eax
   0x000000000000157d <+22>:	88 45 cc	mov    %al,-0x34(%rbp)

8	  uint64_t total = 0;
   0x0000000000001580 <+25>:	48 c7 45 f0 00 00 00 00	movq   $0x0,-0x10(%rbp)

9	  uint16_t check = target | (target << 8U);
   0x0000000000001588 <+33>:	0f b6 55 cc	movzbl -0x34(%rbp),%edx
   0x000000000000158c <+37>:	0f b6 45 cc	movzbl -0x34(%rbp),%eax
   0x0000000000001590 <+41>:	c1 e0 08	shl    $0x8,%eax
   0x0000000000001593 <+44>:	09 d0	or     %edx,%eax
   0x0000000000001595 <+46>:	66 89 45 ee	mov    %ax,-0x12(%rbp)

10	  for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001599 <+50>:	48 c7 45 f8 00 00 00 00	movq   $0x0,-0x8(%rbp)
   0x00000000000015a1 <+58>:	eb 28	jmp    0x15cb <count_pairs+100>

11	    total += (load16(data + i) == check);
   0x00000000000015a3 <+60>:	48 8b 55 d8	mov    -0x28(%rbp),%rdx
   0x00000000000015a7 <+64>:	48 8b 45 f8	mov    -0x8(%rbp),%rax
   0x00000000000015ab <+68>:	48 01 d0	add    %rdx,%rax
   0x00000000000015ae <+71>:	48 89 c7	mov    %rax,%rdi
   0x00000000000015b1 <+74>:	e8 4f fd ff ff	callq  0x1305 <load16>
   0x00000000000015b6 <+79>:	66 39 45 ee	cmp    %ax,-0x12(%rbp)
   0x00000000000015ba <+83>:	0f 94 c0	sete   %al
   0x00000000000015bd <+86>:	0f b6 c0	movzbl %al,%eax
   0x00000000000015c0 <+89>:	48 98	cltq   
   0x00000000000015c2 <+91>:	48 01 45 f0	add    %rax,-0x10(%rbp)

10	  for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x00000000000015c6 <+95>:	48 83 45 f8 01	addq   $0x1,-0x8(%rbp)
   0x00000000000015cb <+100>:	48 8b 45 d0	mov    -0x30(%rbp),%rax
   0x00000000000015cf <+104>:	48 01 c0	add    %rax,%rax
   0x00000000000015d2 <+107>:	48 83 e8 01	sub    $0x1,%rax
   0x00000000000015d6 <+111>:	48 39 45 f8	cmp    %rax,-0x8(%rbp)
   0x00000000000015da <+115>:	72 c7	jb     0x15a3 <count_pairs+60>

12	  }
13	  return total;
   0x00000000000015dc <+117>:	48 8b 45 f0	mov    -0x10(%rbp),%rax

14	}
   0x00000000000015e0 <+121>:	c9	leaveq 
   0x00000000000015e1 <+122>:	c3	retq   
End of assembler dump.

```

O3
```
Dump of assembler code for function count_pairs:
ex2a.c:
7	count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
   0x0000000000001630 <+0>:	f3 0f 1e fa	endbr64 

8	  uint64_t total = 0;
9	  uint16_t check = target | (target << 8U);
   0x0000000000001634 <+4>:	89 d1	mov    %edx,%ecx
   0x0000000000001636 <+6>:	0f b6 d2	movzbl %dl,%edx
   0x0000000000001639 <+9>:	48 8d 74 77 ff	lea    -0x1(%rdi,%rsi,2),%rsi
   0x000000000000163e <+14>:	31 c0	xor    %eax,%eax
   0x0000000000001640 <+16>:	c1 e1 08	shl    $0x8,%ecx
   0x0000000000001643 <+19>:	09 d1	or     %edx,%ecx

10	  for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001645 <+21>:	0f 1f 00	nopl   (%rax)

11	    total += (load16(data + i) == check);
   0x0000000000001648 <+24>:	31 d2	xor    %edx,%edx
   0x000000000000164a <+26>:	66 39 0f	cmp    %cx,(%rdi)
   0x000000000000164d <+29>:	0f 94 c2	sete   %dl
   0x0000000000001650 <+32>:	48 83 c7 01	add    $0x1,%rdi
   0x0000000000001654 <+36>:	48 01 d0	add    %rdx,%rax

10	  for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001657 <+39>:	48 39 fe	cmp    %rdi,%rsi
   0x000000000000165a <+42>:	75 ec	jne    0x1648 <count_pairs+24>

12	  }
13	  return total;
   0x000000000000165c <+44>:	c3	retq   
End of assembler dump.


```

We see how it is able to simplify the inner loop as before, but it no longer unrolls the loop at all

march=native
```
Dump of assembler code for function count_pairs:
ex2a.c:
7	count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
   0x0000000000001620 <+0>:	f3 0f 1e fa	endbr64 

8	  uint64_t total = 0;
9	  uint16_t check = target | (target << 8U);
   0x0000000000001624 <+4>:	89 d1	mov    %edx,%ecx
   0x0000000000001626 <+6>:	c1 e1 08	shl    $0x8,%ecx
   0x0000000000001629 <+9>:	0f b6 d2	movzbl %dl,%edx
   0x000000000000162c <+12>:	09 d1	or     %edx,%ecx

10	  for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x000000000000162e <+14>:	48 8d 74 77 ff	lea    -0x1(%rdi,%rsi,2),%rsi
   0x0000000000001633 <+19>:	31 c0	xor    %eax,%eax
   0x0000000000001635 <+21>:	0f 1f 00	nopl   (%rax)

11	    total += (load16(data + i) == check);
   0x0000000000001638 <+24>:	31 d2	xor    %edx,%edx
   0x000000000000163a <+26>:	66 39 0f	cmp    %cx,(%rdi)
   0x000000000000163d <+29>:	0f 94 c2	sete   %dl
   0x0000000000001640 <+32>:	48 ff c7	inc    %rdi
   0x0000000000001643 <+35>:	48 01 d0	add    %rdx,%rax

10	  for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001646 <+38>:	48 39 fe	cmp    %rdi,%rsi
   0x0000000000001649 <+41>:	75 ed	jne    0x1638 <count_pairs+24>

12	  }
13	  return total;
   0x000000000000164b <+43>:	c3	retq   
End of assembler dump.


```

There are some minor differences in the output, and the time difference could just be noise. However, sometimes adding these flags can make your code slower, so always benchmark and check.

| Version | time to process 1 GiB in ms |
|--- | --- |
| unoptimized | 3265 |
| O3 | 587 |
| march=native | 619|

Here the work is about twice as much, so the first two numbers seem reasonable, but now the compiler is unable to gain benefit from vectorization.

This is because without the alignment restriction this is a much messier problem.



However, we can still do it by hand.

```c++
// Counts every byte position where that byte and the following byte both equal
// target (pairs may start at any byte offset), scanning size * 2 bytes of data
// 32 bytes at a time with AVX2.
// NOTE(review): each iteration loads a full 32 bytes starting at data + i, so this
// appears to assume size * 2 is a multiple of 32 -- confirm against the caller.
// NOTE(review): _mm256_load_si256 is the aligned-load intrinsic, so data presumably
// has to be 32-byte aligned -- confirm how the buffer is allocated.
uint64_t count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
  uint64_t total = 0;
  // bit 31 of the previous block's match mask (0 or 1): whether the last byte of
  // the previous 32-byte block matched target
  uint32_t last_bit = 0;
  // vector with every byte lane set to target
  __m256i compare = _mm256_set1_epi8(target);
  for (uint64_t i = 0; i < size * 2; i += 32) {
    // compare all 32 bytes against target at once, then pack the per-byte results
    // into a 32-bit mask: bit k is set when byte k of this block equals target
    uint32_t block = _mm256_movemask_epi8( _mm256_cmpeq_epi8(_mm256_load_si256((__m256i *)(data + i)), compare));
    // bit k of (block & (block >> 1)) is set when bytes k and k + 1 both match,
    // so the popcount is the number of pairs fully inside this block
    total += __builtin_popcount(block & (block >> 1U));
    // a pair can straddle two blocks: last byte of the previous block plus the
    // first byte of this one (bit 0 of block)
    if (last_bit) {
      total += last_bit & block;
    }
    // remember whether this block's last byte matched, for the next iteration
    last_bit = block >> 31U;
  }
  return total;
}
```

Let us walk through the loop together

We first do 
```c++
uint32_t block = _mm256_movemask_epi8(_mm256_cmpeq_epi8(_mm256_load_si256((__m256i *)(data + i)), compare));
```
This tells us which bytes are equal to the target byte and moves that information into a normal 32-bit register, one bit per byte.

We then do 
`block & (block >> 1U)`
This uses bit-level parallelism: the result has a 1 exactly where both that byte and the next byte (the bit to its left in the mask) are equal to the target byte, which is exactly what we are looking for.

Then we use popcount to see how many of these pairs there were.

Lastly we need to do some extra bookkeeping to manage when pairs go between vectors of data.


| Version | time to process 1 GiB in ms |
|--- | --- |
| unoptimized | 2897 |
| O3 | 610 |
| march=native | 584|
| manual | 77|

The generated assembly for the manual version:
```
Dump of assembler code for function count_pairs:
ex2b.c:
8	count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
   0x0000000000001620 <+0>:	f3 0f 1e fa	endbr64 

/usr/lib/gcc/x86_64-linux-gnu/11/include/avxintrin.h:
1342	  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
   0x0000000000001624 <+4>:	48 01 f6	add    %rsi,%rsi
   0x0000000000001627 <+7>:	49 89 f8	mov    %rdi,%r8
   0x000000000000162a <+10>:	62 f2 7d 28 7a ca	vpbroadcastb %edx,%ymm1

ex2b.c:
12	  for (uint64_t i = 0; i < size * 2; i += 32) {
   0x0000000000001630 <+16>:	49 89 f1	mov    %rsi,%r9
   0x0000000000001633 <+19>:	74 63	je     0x1698 <count_pairs+120>

/usr/lib/gcc/x86_64-linux-gnu/11/include/avx2intrin.h:
233	  return (__m256i) ((__v32qi)__A == (__v32qi)__B);
   0x0000000000001635 <+21>:	c5 f5 74 07	vpcmpeqb (%rdi),%ymm1,%ymm0

   0x0000000000001639 <+25>:	be 20 00 00 00	mov    $0x20,%esi
   0x000000000000163e <+30>:	c5 fd d7 f8	vpmovmskb %ymm0,%edi

ex2b.c:
15	    total += __builtin_popcount(block & (block >> 1U));
   0x0000000000001642 <+34>:	89 f8	mov    %edi,%eax
   0x0000000000001644 <+36>:	d1 e8	shr    %eax
   0x0000000000001646 <+38>:	21 f8	and    %edi,%eax
   0x0000000000001648 <+40>:	f3 0f b8 c0	popcnt %eax,%eax

17	      total += last_bit & block;
18	    }
19	    last_bit = block >> 31U;
   0x000000000000164c <+44>:	c1 ef 1f	shr    $0x1f,%edi

12	  for (uint64_t i = 0; i < size * 2; i += 32) {
   0x000000000000164f <+47>:	49 83 f9 20	cmp    $0x20,%r9
   0x0000000000001653 <+51>:	76 3b	jbe    0x1690 <count_pairs+112>
   0x0000000000001655 <+53>:	0f 1f 00	nopl   (%rax)

/usr/lib/gcc/x86_64-linux-gnu/11/include/avx2intrin.h:
233	  return (__m256i) ((__v32qi)__A == (__v32qi)__B);
   0x0000000000001658 <+56>:	c4 c1 75 74 04 30	vpcmpeqb (%r8,%rsi,1),%ymm1,%ymm0

   0x000000000000165e <+62>:	c5 fd d7 c8	vpmovmskb %ymm0,%ecx

ex2b.c:
15	    total += __builtin_popcount(block & (block >> 1U));
   0x0000000000001662 <+66>:	89 ca	mov    %ecx,%edx
   0x0000000000001664 <+68>:	d1 ea	shr    %edx
   0x0000000000001666 <+70>:	21 ca	and    %ecx,%edx
   0x0000000000001668 <+72>:	41 89 ca	mov    %ecx,%r10d
   0x000000000000166b <+75>:	f3 0f b8 d2	popcnt %edx,%edx
   0x000000000000166f <+79>:	41 83 e2 01	and    $0x1,%r10d
   0x0000000000001673 <+83>:	48 01 d0	add    %rdx,%rax

16	    if (last_bit) {
   0x0000000000001676 <+86>:	48 89 c2	mov    %rax,%rdx
   0x0000000000001679 <+89>:	4c 01 d0	add    %r10,%rax
   0x000000000000167c <+92>:	85 ff	test   %edi,%edi
   0x000000000000167e <+94>:	48 0f 44 c2	cmove  %rdx,%rax

17	      total += last_bit & block;
18	    }
19	    last_bit = block >> 31U;
   0x0000000000001682 <+98>:	89 cf	mov    %ecx,%edi
   0x0000000000001684 <+100>:	48 83 c6 20	add    $0x20,%rsi
   0x0000000000001688 <+104>:	c1 ef 1f	shr    $0x1f,%edi

12	  for (uint64_t i = 0; i < size * 2; i += 32) {
   0x000000000000168b <+107>:	49 39 f1	cmp    %rsi,%r9
   0x000000000000168e <+110>:	77 c8	ja     0x1658 <count_pairs+56>
   0x0000000000001690 <+112>:	c5 f8 77	vzeroupper 
   0x0000000000001693 <+115>:	c3	retq   
   0x0000000000001694 <+116>:	0f 1f 40 00	nopl   0x0(%rax)
   0x0000000000001698 <+120>:	31 c0	xor    %eax,%eax
   0x000000000000169a <+122>:	c5 f8 77	vzeroupper 
   0x000000000000169d <+125>:	c3	retq   
End of assembler dump.

```

So in total, manual vectorization was well worth it in this case: it achieved roughly an 8x speedup (77 ms versus about 600 ms) over what the compiler was able to achieve.

In general, whenever you think about manually vectorizing, it is worth weighing the extra cost in implementation, maintainability, and generality against the speedup that you can get.

# Summary 

- vectorization can help us speed up our code
- the compiler can help us with many forms of vectorization
- when the compiler can't help us we can do it manually using intrinsics



# More Bit and Byte Level Parallelism

## Compression

Let's say we want to store an array of integers using less space by compressing them.

Most commonly used integers use less than the full 64 bits, so we may be able to store them much smaller.

For example, instead of storing full integers, we can use what is known as variable-length encoding (often called varint encoding).

A standard integer is stored as follows (I will be using little endian for examples)

![Little Endian](source/little_endian.JPG "Little Endian")

Instead of always storing all 4 bytes, we can use the bottom 7 bits of each byte to store actual data, and the top bit of each byte to store whether we are done with the integer (0) or need to keep reading (1).

This way, if the integer can be represented in 49 bits or fewer, we are able to use 7 bytes or fewer and save space. This method does take more space if the integer needs more than 56 bits for its data.

Another thing we can do is sort the numbers, and only store the differences instead of the actual numbers to make the average size of the elements we are storing smaller. 

Let's look at some code for how we can decode an element.
At the end we want to know what the difference was, and how many bytes its encoding occupied.

```c
// Decodes one variable-length encoded difference starting at loc, one byte at a time.
// Each byte carries 7 data bits (least-significant group first); a set top bit
// means another byte follows.
// Returns the decoded value in ret.difference and the number of bytes consumed
// in ret.old_size.
decode_return Decode(const uint8_t *loc) {
  decode_return ret;
  const uint8_t first = *loc;
  // Top bit clear: the whole value is in this one byte. This case is handled up
  // front because a zero byte is special: it reports an old_size of 0.
  if (!(first & 0x80UL)) {
    ret.difference = first;
    ret.old_size = first > 0;
    return ret;
  }
  // Multi-byte encoding: the first byte supplies the low 7 bits.
  ret.difference = first & 0x7FUL;
  ret.old_size = 1;
  // Every following byte contributes 7 more bits, placed 7 positions higher than
  // the previous byte's bits; its top bit says whether yet another byte follows.
  for (uint64_t shift = 7;; shift += 7) {
    const uint8_t byte = *++loc;
    ret.difference |= (byte & 0x7FUL) << shift;
    ret.old_size += 1;
    if (!(byte & 0x80UL)) {
      break;
    }
  }
  return ret;
}
```

As an experiment we are going to decode and sum up the numbers stored in a compressed array. Our array will store 1 million elements, and each element takes between 1 and 6 bytes (chosen uniformly at random) to store the difference.

| Version | time to process 1 million elements in microseconds |
|--- | --- |
| unoptimized | 20003 |
| O3 | 8389 |
| march=native | 8227|

What if, instead of having to read these bytes in one at a time, we could read them all in parallel?

We will use some features from the BMI1 and BMI2 instruction sets, which allow us to do parallel operations on all bits in a word.

We will use two different instructions to help us.

The first is `_pext_u64`, which takes in a 64 bit integer and a 64 bit pattern. It extracts the bits selected by the pattern from the integer and packs them into the contiguous low bits of the output.

The second is `__tzcnt_u64` which returns the number of trailing zeros in an integer


```c

// Bit patterns handed to _pext_u64 to pull the 7 data bits out of every byte of
// an encoded value. Decode indexes this table with the number of continuation
// bytes it found, and entry i selects the data bits of the low i + 1 bytes.
// The last entry covers an encoding that fills the whole 64-bit load; without it
// an 8-byte encoding would index past the end of the table.
static const uint64_t extract_masks[] = {
    0x000000000000007FUL,  // 1 byte
    0x0000000000007F7FUL,  // 2 bytes
    0x00000000007F7F7FUL,  // 3 bytes
    0x000000007F7F7F7FUL,  // 4 bytes
    0x0000007F7F7F7F7FUL,  // 5 bytes
    0x00007F7F7F7F7F7FUL,  // 6 bytes
    0x007F7F7F7F7F7F7FUL,  // 7 bytes
    0x7F7F7F7F7F7F7F7FUL,  // 8 bytes
};


// Decodes one variable-length encoded difference starting at loc using BMI
// bit-manipulation intrinsics instead of a byte-at-a-time loop.
// Returns the decoded value in ret.difference and the number of bytes consumed
// in ret.old_size.
// NOTE(review): the multi-byte path loads 64 bits from loc regardless of how long
// the encoding is, so the buffer presumably needs readable padding after the
// last element -- confirm against the caller.
decode_return Decode(const uint8_t *loc) {
  decode_return ret;
  const uint8_t first = *loc;
  // as in the scalar version, a single-byte value is special-cased
  // (a zero byte reports an old_size of 0)
  if (!(first & 0x80UL)) {
    ret.difference = first;
    ret.old_size = first > 0;
    return ret;
  }
  // grab 64 bits at once; that is wide enough to hold the whole compressed integer
  const uint64_t word = load64(loc);

  // gather the top (continuation) bit of every byte into the low 8 bits
  const uint64_t continuation_bits = _pext_u64(word, 0x8080808080808080UL);

  // the position of the first clear continuation bit is the number of bytes that
  // are followed by another byte
  const int32_t index = __tzcnt_u64(~continuation_bits);

  // the encoding is one byte longer than the number of continuation bytes
  ret.old_size = index + 1;
  // pull out and pack together the 7 data bits of each byte that belongs to the value
  ret.difference = _pext_u64(word, extract_masks[index]);

  return ret;
}
```


| Version | time to process 1 million elements in microseconds |
|--- | --- |
| unoptimized | 20003 |
| O3 | 8389 |
| march=native | 8227|
| manual1 | 5457|

We can actually do this slightly better by exploiting some pipeline parallelism

```c

// Masks applied AFTER the 7 data bits of every byte have already been packed
// together, so each one is simply a run of low bits. Decode indexes this table
// with the number of continuation bytes it found, and entry i keeps the low
// 7 * (i + 1) bits.
// Written as shifts rather than 0b literals because binary literals are a
// compiler extension before C23. The last entry covers an encoding that fills
// the whole 64-bit load; without it an 8-byte encoding would index past the end
// of the table.
static const uint64_t extract_masks2[] = {
    (UINT64_C(1) << 7) - 1,   // 1 byte
    (UINT64_C(1) << 14) - 1,  // 2 bytes
    (UINT64_C(1) << 21) - 1,  // 3 bytes
    (UINT64_C(1) << 28) - 1,  // 4 bytes
    (UINT64_C(1) << 35) - 1,  // 5 bytes
    (UINT64_C(1) << 42) - 1,  // 6 bytes
    (UINT64_C(1) << 49) - 1,  // 7 bytes
    (UINT64_C(1) << 56) - 1,  // 8 bytes
};


// Decodes one variable-length encoded difference starting at loc. Same approach
// as the previous BMI version, but the data-bit extraction uses a fixed pattern
// so it does not have to wait for the length computation.
// Returns the decoded value in ret.difference and the number of bytes consumed
// in ret.old_size.
// NOTE(review): the multi-byte path loads 64 bits from loc regardless of how long
// the encoding is, so the buffer presumably needs readable padding after the
// last element -- confirm against the caller.
decode_return Decode(const uint8_t *loc) {
  decode_return ret;
  const uint8_t first = *loc;
  // as before, a single-byte value is special-cased
  // (a zero byte reports an old_size of 0)
  if (!(first & 0x80UL)) {
    ret.difference = first;
    ret.old_size = first > 0;
    return ret;
  }
  // grab 64 bits at once; that is wide enough to hold the whole compressed difference
  const uint64_t word = load64(loc);

  // pack the low 7 bits of all eight bytes together; this only needs the loaded
  // word, not the length, so the hardware can overlap it with the length
  // computation below
  const uint64_t packed = _pext_u64(word, 0x7F7F7F7F7F7F7F7FUL);

  // gather the top (continuation) bit of every byte into the low 8 bits
  const uint64_t continuation_bits = _pext_u64(word, 0x8080808080808080UL);

  // the position of the first clear continuation bit is the number of bytes that
  // are followed by another byte
  const int32_t index = __tzcnt_u64(~continuation_bits);

  // drop the bits that came from bytes past the end of this value; the masks
  // differ from the pext patterns because the bits are already packed
  ret.difference = packed & extract_masks2[index];
  // the encoding is one byte longer than the number of continuation bytes
  ret.old_size = index + 1;

  return ret;
}
```


| Version | time to process 1 million elements in microseconds |
|--- | --- |
| unoptimized | 20003 |
| O3 | 8389 |
| march=native | 8227|
| manual1 | 5457|
| manual2 | 5184|