# Vectorization

Big Picture
- What is vectorization?
- When is it useful?
- What can the compiler do?
- How can you do it manually?


All the code can be found at [https://github.com/wheatman/Vectorization_Notes](https://github.com/wheatman/Vectorization_Notes)

First let's look at an example problem

# Problem 1

Given a list of data, count how many 16-bit elements consist of a given byte repeated twice (a "pair" of that byte)

```c
/**
 * Count the 16-bit elements of `data` whose two bytes both equal `target`.
 *
 * @param data   array of at least `size` 16-bit elements; it is only read
 * @param size   number of 16-bit elements to examine
 * @param target byte value that must appear in both halves of an element
 * @return number of elements equal to (target << 8) | target
 */
uint64_t count_pairs(const uint16_t *data, uint64_t size, uint8_t target) {
  uint64_t total = 0;
  // we will want to compare each short to the pair of target bytes
  uint16_t check = target | (target << 8U);
  for (uint64_t i = 0; i < size; i++) {
    // boolean automatically converts to an integer (true = 1, false = 0)
    total += (data[i] == check);
  }
  return total;
}

```

We will be doing all of our experiments with 1 GiB of data

We compile and run this, and it takes 1 second to run

Let us take a look at what it is doing

```assembly
Dump of assembler code for function count_pairs:
ex1a.c:
7    count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
   0x0000000000001567 <+0>:    f3 0f 1e fa    endbr64 
   0x000000000000156b <+4>:    55             push   %rbp
   0x000000000000156c <+5>:    48 89 e5       mov    %rsp,%rbp
   0x000000000000156f <+8>:    48 89 7d d8    mov    %rdi,-0x28(%rbp)
   0x0000000000001573 <+12>:    48 89 75 d0   mov    %rsi,-0x30(%rbp)
   0x0000000000001577 <+16>:    89 d0         mov    %edx,%eax
   0x0000000000001579 <+18>:    88 45 cc      mov    %al,-0x34(%rbp)

8      uint64_t total = 0;
   0x000000000000157c <+21>:    48 c7 45 f0 00 00 00 00    movq   $0x0,-0x10(%rbp)

9      uint16_t check = target | (target << 8U);
   0x0000000000001584 <+29>:    0f b6 55 cc    movzbl -0x34(%rbp),%edx
   0x0000000000001588 <+33>:    0f b6 45 cc    movzbl -0x34(%rbp),%eax
   0x000000000000158c <+37>:    c1 e0 08       shl    $0x8,%eax
   0x000000000000158f <+40>:    09 d0          or    %edx,%eax
   0x0000000000001591 <+42>:    66 89 45 ee    mov    %ax,-0x12(%rbp)

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001595 <+46>:    48 c7 45 f8 00 00 00 00    movq   $0x0,-0x8(%rbp)
   0x000000000000159d <+54>:    eb 25                      jmp    0x15c4 <count_pairs+93>

11           total += (data[i] == check);
   0x000000000000159f <+56>:    48 8b 45 f8    mov    -0x8(%rbp),%rax
   0x00000000000015a3 <+60>:    48 8d 14 00    lea    (%rax,%rax,1),%rdx
   0x00000000000015a7 <+64>:    48 8b 45 d8    mov    -0x28(%rbp),%rax
   0x00000000000015ab <+68>:    48 01 d0       add    %rdx,%rax
   0x00000000000015ae <+71>:    0f b7 00       movzwl (%rax),%eax
   0x00000000000015b1 <+74>:    66 39 45 ee    cmp    %ax,-0x12(%rbp)
   0x00000000000015b5 <+78>:    0f 94 c0       sete   %al
   0x00000000000015b8 <+81>:    0f b6 c0       movzbl %al,%eax
   0x00000000000015bb <+84>:    48 01 45 f0    add    %rax,-0x10(%rbp)

10      for (uint64_t i = 0; i < size; i++) {
   0x00000000000015bf <+88>:    48 83 45 f8 01    addq   $0x1,-0x8(%rbp)
   0x00000000000015c4 <+93>:    48 8b 45 f8       mov    -0x8(%rbp),%rax
   0x00000000000015c8 <+97>:    48 3b 45 d0       cmp    -0x30(%rbp),%rax
   0x00000000000015cc <+101>:    72 d1    jb      0x159f <count_pairs+56>

12      }
13      return total;
   0x00000000000015ce <+103>:    48 8b 45 f0    mov    -0x10(%rbp),%rax

14    }
   0x00000000000015d2 <+107>:    5d    pop      %rbp
   0x00000000000015d3 <+108>:    c3    retq   
End of assembler dump.
```

However, we are running compiled code, and the compiler has the ability to optimize our code for us, so let us tell the compiler to take the same code, but this time optimize it.

We can add the compiler flag `-O3`

Now it only takes 301 milliseconds, which is a speedup of over 3x.

Let's see what it is doing

```assembly
Dump of assembler code for function count_pairs:
ex1a.c:
7    count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
   0x0000000000001630 <+0>:    f3 0f 1e fa    endbr64 

8      uint64_t total = 0;
9      uint16_t check = target | (target << 8U);
   0x0000000000001634 <+4>:    89 d1       mov    %edx,%ecx
   0x0000000000001636 <+6>:    0f b6 d2    movzbl %dl,%edx
   0x0000000000001639 <+9>:    c1 e1 08    shl    $0x8,%ecx
   0x000000000000163c <+12>:    09 d1      or     %edx,%ecx

10      for (uint64_t i = 0; i < size; i++) {
   0x000000000000163e <+14>:    48 85 f6                test   %rsi,%rsi
   0x0000000000001641 <+17>:    74 25                   je     0x1668 <count_pairs+56>
   0x0000000000001643 <+19>:    48 8d 34 77             lea    (%rdi,%rsi,2),%rsi
   0x0000000000001647 <+23>:    31 c0                   xor    %eax,%eax
   0x0000000000001649 <+25>:    0f 1f 80 00 00 00 00    nopl   0x0(%rax)

11        total += (data[i] == check);
   0x0000000000001650 <+32>:    31 d2          xor    %edx,%edx
   0x0000000000001652 <+34>:    66 39 0f       cmp    %cx,(%rdi)
   0x0000000000001655 <+37>:    0f 94 c2       sete   %dl
   0x0000000000001658 <+40>:    48 83 c7 02    add    $0x2,%rdi
   0x000000000000165c <+44>:    48 01 d0       add    %rdx,%rax

10      for (uint64_t i = 0; i < size; i++) {
   0x000000000000165f <+47>:    48 39 fe    cmp    %rdi,%rsi
   0x0000000000001662 <+50>:    75 ec       jne    0x1650 <count_pairs+32>
   0x0000000000001664 <+52>:    c3          retq   
   0x0000000000001665 <+53>:    0f 1f 00    nopl   (%rax)
   0x0000000000001668 <+56>:    31 c0       xor    %eax,%eax

12      }
13      return total;
   0x000000000000166a <+58>:    c3    retq   
End of assembler dump.


```

Let's zoom in on the main compute loop

```assembly
    // zero out the edx register 
   0x0000000000001650 <+32>:    31 d2        xor    %edx,%edx
    // compare what is currently at the location pointed at by rdi with the element in cx
    // cx was set to the 16 bit target in the setup phase before the loop
   0x0000000000001652 <+34>:    66 39 0f    cmp    %cx,(%rdi)
    // gets the result of the comparison and place it in dl
   0x0000000000001655 <+37>:    0f 94 c2    sete   %dl
    // move the pointer to the data forward by 2 bytes
   0x0000000000001658 <+40>:    48 83 c7 02    add    $0x2,%rdi
    // add rdx, which contains dl to rax which is storing the count
   0x000000000000165c <+44>:    48 01 d0    add    %rdx,%rax
```

![x86 registers](source/registers.JPG "X86 registers")

We can go one step further and add another compiler flag, `-funroll-loops`

This speeds up the code by almost another factor of 2, giving us the result in only about 180 ms

| Version | time to process 1 GiB in ms |
|--- | --- |
| unoptimized | 1045 |
| -O3 | 301 |
| -O3 -funroll-loops | 181 |


```assembly
Dump of assembler code for function count_pairs:
ex1a.c:
7    count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
   0x00000000000019c0 <+0>:    f3 0f 1e fa    endbr64 

8      uint64_t total = 0;
9      uint16_t check = target | (target << 8U);
   0x00000000000019c4 <+4>:    89 d0       mov    %edx,%eax
   0x00000000000019c6 <+6>:    c1 e2 08    shl    $0x8,%edx
   0x00000000000019c9 <+9>:    0f b6 c8    movzbl %al,%ecx
   0x00000000000019cc <+12>:    09 ca      or     %ecx,%edx

10      for (uint64_t i = 0; i < size; i++) {
   0x00000000000019ce <+14>:    48 85 f6             test   %rsi,%rsi
   0x00000000000019d1 <+17>:    0f 84 41 01 00 00    je     0x1b18 <count_pairs+344>
   0x00000000000019d7 <+23>:    4c 8d 0c 77          lea    (%rdi,%rsi,2),%r9
   0x00000000000019db <+27>:    48 8d 74 36 fe       lea    -0x2(%rsi,%rsi,1),%rsi
   0x00000000000019e0 <+32>:    45 31 c0             xor    %r8d,%r8d
   0x00000000000019e3 <+35>:    48 d1 ee             shr    %rsi
   0x00000000000019e6 <+38>:    48 83 c6 01          add    $0x1,%rsi
   0x00000000000019ea <+42>:    83 e6 07             and    $0x7,%esi
   0x00000000000019ed <+45>:    0f 84 8e 00 00 00    je     0x1a81 <count_pairs+193>
   0x00000000000019f3 <+51>:    48 83 fe 01          cmp    $0x1,%rsi
   0x00000000000019f7 <+55>:    74 72                je     0x1a6b <count_pairs+171>
   0x00000000000019f9 <+57>:    48 83 fe 02          cmp    $0x2,%rsi
   0x00000000000019fd <+61>:    74 5c                je     0x1a5b <count_pairs+155>
   0x00000000000019ff <+63>:    48 83 fe 03          cmp    $0x3,%rsi
   0x0000000000001a03 <+67>:    74 47                je     0x1a4c <count_pairs+140>
   0x0000000000001a05 <+69>:    48 83 fe 04          cmp    $0x4,%rsi
   0x0000000000001a09 <+73>:    74 32                je     0x1a3d <count_pairs+125>
   0x0000000000001a0b <+75>:    48 83 fe 05          cmp    $0x5,%rsi
   0x0000000000001a0f <+79>:    74 1b                je     0x1a2c <count_pairs+108>
   0x0000000000001a11 <+81>:    48 83 fe 06          cmp    $0x6,%rsi
   0x0000000000001a15 <+85>:    0f 85 e5 00 00 00    jne    0x1b00 <count_pairs+320>

11        total += (data[i] == check);
   0x0000000000001a1b <+91>:    45 31 d2        xor    %r10d,%r10d
   0x0000000000001a1e <+94>:    66 39 17        cmp    %dx,(%rdi)
   0x0000000000001a21 <+97>:    41 0f 94 c2     sete   %r10b
   0x0000000000001a25 <+101>:    48 83 c7 02    add    $0x2,%rdi
   0x0000000000001a29 <+105>:    4d 01 d0       add    %r10,%r8
   0x0000000000001a2c <+108>:    45 31 db       xor    %r11d,%r11d
   0x0000000000001a2f <+111>:    66 39 17       cmp    %dx,(%rdi)
   0x0000000000001a32 <+114>:    41 0f 94 c3    sete   %r11b
   0x0000000000001a36 <+118>:    48 83 c7 02    add    $0x2,%rdi
   0x0000000000001a3a <+122>:    4d 01 d8       add    %r11,%r8
   0x0000000000001a3d <+125>:    31 c0          xor    %eax,%eax
   0x0000000000001a3f <+127>:    66 39 17       cmp    %dx,(%rdi)
   0x0000000000001a42 <+130>:    0f 94 c0       sete   %al
   0x0000000000001a45 <+133>:    48 83 c7 02    add    $0x2,%rdi
   0x0000000000001a49 <+137>:    49 01 c0       add    %rax,%r8
   0x0000000000001a4c <+140>:    31 c9          xor    %ecx,%ecx
   0x0000000000001a4e <+142>:    66 39 17       cmp    %dx,(%rdi)
   0x0000000000001a51 <+145>:    0f 94 c1       sete   %cl
   0x0000000000001a54 <+148>:    48 83 c7 02    add    $0x2,%rdi
   0x0000000000001a58 <+152>:    49 01 c8       add    %rcx,%r8
   0x0000000000001a5b <+155>:    31 f6          xor    %esi,%esi
   0x0000000000001a5d <+157>:    66 39 17       cmp    %dx,(%rdi)
   0x0000000000001a60 <+160>:    40 0f 94 c6    sete   %sil
   0x0000000000001a64 <+164>:    48 83 c7 02    add    $0x2,%rdi
   0x0000000000001a68 <+168>:    49 01 f0       add    %rsi,%r8
   0x0000000000001a6b <+171>:    45 31 d2       xor    %r10d,%r10d
   0x0000000000001a6e <+174>:    66 39 17       cmp    %dx,(%rdi)
   0x0000000000001a71 <+177>:    41 0f 94 c2    sete   %r10b
   0x0000000000001a75 <+181>:    48 83 c7 02    add    $0x2,%rdi
   0x0000000000001a79 <+185>:    4d 01 d0       add    %r10,%r8

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001a7c <+188>:    49 39 f9    cmp    %rdi,%r9
   0x0000000000001a7f <+191>:    74 73       je     0x1af4 <count_pairs+308>

11        total += (data[i] == check);
   0x0000000000001a81 <+193>:    45 31 db       xor    %r11d,%r11d
   0x0000000000001a84 <+196>:    66 39 17       cmp    %dx,(%rdi)
   0x0000000000001a87 <+199>:    41 0f 94 c3    sete   %r11b
   0x0000000000001a8b <+203>:    4d 01 c3       add    %r8,%r11
   0x0000000000001a8e <+206>:    45 31 c0       xor    %r8d,%r8d
   0x0000000000001a91 <+209>:    66 39 57 02    cmp    %dx,0x2(%rdi)
   0x0000000000001a95 <+213>:    41 0f 94 c0    sete   %r8b
   0x0000000000001a99 <+217>:    31 c0          xor    %eax,%eax
   0x0000000000001a9b <+219>:    4d 01 c3       add    %r8,%r11
   0x0000000000001a9e <+222>:    66 39 57 04    cmp    %dx,0x4(%rdi)
   0x0000000000001aa2 <+226>:    0f 94 c0       sete   %al
   0x0000000000001aa5 <+229>:    31 c9          xor    %ecx,%ecx
   0x0000000000001aa7 <+231>:    49 01 c3       add    %rax,%r11
   0x0000000000001aaa <+234>:    66 39 57 06    cmp    %dx,0x6(%rdi)
   0x0000000000001aae <+238>:    0f 94 c1       sete   %cl
   0x0000000000001ab1 <+241>:    31 f6          xor    %esi,%esi
   0x0000000000001ab3 <+243>:    49 01 cb       add    %rcx,%r11
   0x0000000000001ab6 <+246>:    66 39 57 08    cmp    %dx,0x8(%rdi)
   0x0000000000001aba <+250>:    40 0f 94 c6    sete   %sil
   0x0000000000001abe <+254>:    45 31 d2       xor    %r10d,%r10d
   0x0000000000001ac1 <+257>:    49 01 f3       add    %rsi,%r11
   0x0000000000001ac4 <+260>:    66 39 57 0a    cmp    %dx,0xa(%rdi)
   0x0000000000001ac8 <+264>:    41 0f 94 c2    sete   %r10b
   0x0000000000001acc <+268>:    45 31 c0       xor    %r8d,%r8d
   0x0000000000001acf <+271>:    4d 01 d3       add    %r10,%r11
   0x0000000000001ad2 <+274>:    66 39 57 0c    cmp    %dx,0xc(%rdi)
   0x0000000000001ad6 <+278>:    41 0f 94 c0    sete   %r8b
   0x0000000000001ada <+282>:    4d 01 c3       add    %r8,%r11
   0x0000000000001add <+285>:    45 31 c0       xor    %r8d,%r8d
   0x0000000000001ae0 <+288>:    66 39 57 0e    cmp    %dx,0xe(%rdi)
   0x0000000000001ae4 <+292>:    41 0f 94 c0    sete   %r8b
   0x0000000000001ae8 <+296>:    48 83 c7 10    add    $0x10,%rdi
   0x0000000000001aec <+300>:    4d 01 d8       add    %r11,%r8

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001aef <+303>:    49 39 f9                   cmp    %rdi,%r9
   0x0000000000001af2 <+306>:    75 8d                      jne    0x1a81 <count_pairs+193>
   0x0000000000001af4 <+308>:    4c 89 c0                   mov    %r8,%rax
   0x0000000000001af7 <+311>:    c3                         retq   
   0x0000000000001af8 <+312>:    0f 1f 84 00 00 00 00 00    nopl   0x0(%rax,%rax,1)

11        total += (data[i] == check);
   0x0000000000001b00 <+320>:    45 31 c0       xor    %r8d,%r8d
   0x0000000000001b03 <+323>:    66 39 17       cmp    %dx,(%rdi)
   0x0000000000001b06 <+326>:    41 0f 94 c0    sete   %r8b

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001b0a <+330>:    48 83 c7 02        add    $0x2,%rdi
   0x0000000000001b0e <+334>:    e9 08 ff ff ff     jmpq   0x1a1b <count_pairs+91>
   0x0000000000001b13 <+339>:    0f 1f 44 00 00     nopl   0x0(%rax,%rax,1)
   0x0000000000001b18 <+344>:    45 31 c0           xor    %r8d,%r8d

12      }
13      return total;
   0x0000000000001b1b <+347>:    4c 89 c0    mov    %r8,%rax
   0x0000000000001b1e <+350>:    c3          retq   
End of assembler dump.

```

Now we are going to add a special flag, `-march=native`.
This tells the compiler that we will be running the code on the same machine that we are compiling it on.
This allows the compiler to use any instruction-set extensions that this computer has, even if they are not available on every machine.
Most of the time when we compile code we want it to work across a broad range of machines, but newer hardware comes with more optimizations.

If you want to see what sort of extra hardware is on your machine you can look with `lscpu` or `cat /proc/cpuinfo`

For example if we look at my laptop 

```
Architecture:            x86_64
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         46 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  20
  On-line CPU(s) list:   0-19
Vendor ID:               GenuineIntel
  Model name:            13th Gen Intel(R) Core(TM) i9-13900H
    CPU family:          6
    Model:               186
    Thread(s) per core:  2
    Core(s) per socket:  10
    Socket(s):           1
    Stepping:            2
    BogoMIPS:            5990.40
    Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid pni pclmulqdq vmx ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves avx_vnni umip waitpkg gfni vaes vpclmulqdq rdpid movdiri movdir64b fsrm serialize flush_l1d arch_capabilities
Virtualization features: 
  Virtualization:        VT-x
  Hypervisor vendor:     Microsoft
  Virtualization type:   full
Caches (sum of all):     
  L1d:                   480 KiB (10 instances)
  L1i:                   320 KiB (10 instances)
  L2:                    12.5 MiB (10 instances)
  L3:                    24 MiB (1 instance)
```
Specifically we are looking at the flags

Here are the flags of a modern high-end Intel server

```
Flags:                           
fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities
```

We will mostly be concerned with the following types

 - SSE (Streaming SIMD Extensions)
 - AVX (Advanced Vector Extensions)

This introduces 2 terms, SIMD and Vectors

SIMD stands for single instruction multiple data, which means we have a single instruction which processes multiple pieces of data at the same time.
AVX is basically just the successor to SSE; both are SIMD instruction sets.

These both allow us to use vectorization 

# What is vectorization

Operations on wide registers

Most registers are between 8 and 64 bits

Operations on these registers happen on the entire register

Vector registers are normally a fair amount bigger: 128 to 512 bits

They are normally thought of as a vector of data, where operations happen individually on each item in the vector

![Vector Operation](source/vector_op.JPG "Vector Operation")


For example, `vpcmpeqw` is an instruction which (in its 256-bit form) operates on two 256-bit registers. It treats each register as holding 16 16-bit elements, compares all 16 pairs of elements for equality, and writes -1 (all bits set) to the corresponding element of the 256-bit output register if they are equal, and 0 otherwise.

This would allow us to perform 16, 16-bit operations at once.

This is still the same code, we are just changing how we compile it 
```c
/**
 * Count the 16-bit elements of `data` whose two bytes both equal `target`.
 *
 * @param data   array of at least `size` 16-bit elements; it is only read
 * @param size   number of 16-bit elements to examine
 * @param target byte value that must appear in both halves of an element
 * @return number of elements equal to (target << 8) | target
 */
uint64_t count_pairs(const uint16_t *data, uint64_t size, uint8_t target) {
  uint64_t total = 0;
  // we will want to compare each short to the pair of target bytes
  uint16_t check = target | (target << 8U);
  for (uint64_t i = 0; i < size; i++) {
    // the comparison yields 1 on a match and 0 otherwise, so add it directly
    total += (data[i] == check);
  }
  return total;
}

```

Compiling with `-march=native` gives us another speedup of over 1.5x

| Version | time to process 1 GiB in ms |
|--- | --- |
| unoptimized | 1045 |
| -O3 | 301 |
| -O3 -funroll-loops | 181 |
| -march=native | 116 |

Let us take a look at the assembly for this one 

```assembly
Dump of assembler code for function count_pairs:
ex1a.c:
7    count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
   0x00000000000019c0 <+0>:    f3 0f 1e fa    endbr64 

8      uint64_t total = 0;
9      uint16_t check = target | (target << 8U);
   0x00000000000019c4 <+4>:    41 89 d0    mov    %edx,%r8d
   0x00000000000019c7 <+7>:    41 c1 e0 08    shl    $0x8,%r8d
   0x00000000000019cb <+11>:    0f b6 d2    movzbl %dl,%edx
   0x00000000000019ce <+14>:    41 09 d0    or     %edx,%r8d

10      for (uint64_t i = 0; i < size; i++) {
   0x00000000000019d1 <+17>:    48 85 f6    test   %rsi,%rsi
   0x00000000000019d4 <+20>:    0f 84 a6 02 00 00    je     0x1c80 <count_pairs+704>
   0x00000000000019da <+26>:    48 8d 46 ff    lea    -0x1(%rsi),%rax
   0x00000000000019de <+30>:    48 83 f8 0e    cmp    $0xe,%rax
   0x00000000000019e2 <+34>:    0f 86 9b 02 00 00    jbe    0x1c83 <count_pairs+707>
   0x00000000000019e8 <+40>:    49 89 f1    mov    %rsi,%r9
   0x00000000000019eb <+43>:    49 c1 e9 04    shr    $0x4,%r9
   0x00000000000019ef <+47>:    49 c1 e1 05    shl    $0x5,%r9
   0x00000000000019f3 <+51>:    49 8d 0c 39    lea    (%r9,%rdi,1),%rcx
   0x00000000000019f7 <+55>:    41 83 e1 20    and    $0x20,%r9d
   0x00000000000019fb <+59>:    c4 e2 7d 79 25 f0 06 00 00    vpbroadcastw 0x6f0(%rip),%ymm4        # 0x20f4
   0x0000000000001a04 <+68>:    62 d2 7d 28 7b e8    vpbroadcastw %r8d,%ymm5
   0x0000000000001a0a <+74>:    49 89 fa    mov    %rdi,%r10
   0x0000000000001a0d <+77>:    c4 41 01 ef ff    vpxor  %xmm15,%xmm15,%xmm15
   0x0000000000001a12 <+82>:    74 5c    je     0x1a70 <count_pairs+176>

11        total += (data[i] == check);
   0x0000000000001a14 <+84>:    c5 d5 75 07    vpcmpeqw (%rdi),%ymm5,%ymm0
   0x0000000000001a18 <+88>:    4c 8d 57 20    lea    0x20(%rdi),%r10
   0x0000000000001a1c <+92>:    c5 fd db d4    vpand  %ymm4,%ymm0,%ymm2
   0x0000000000001a20 <+96>:    c4 e3 7d 39 d3 01    vextracti128 $0x1,%ymm2,%xmm3
   0x0000000000001a26 <+102>:    c4 e2 7d 33 ca    vpmovzxwd %xmm2,%ymm1
   0x0000000000001a2b <+107>:    c4 e2 7d 33 f3    vpmovzxwd %xmm3,%ymm6
   0x0000000000001a30 <+112>:    c4 c3 7d 39 f0 01    vextracti128 $0x1,%ymm6,%xmm8
   0x0000000000001a36 <+118>:    c4 c3 7d 39 cc 01    vextracti128 $0x1,%ymm1,%xmm12
   0x0000000000001a3c <+124>:    c4 e2 7d 35 fe    vpmovzxdq %xmm6,%ymm7
   0x0000000000001a41 <+129>:    c4 42 7d 35 c8    vpmovzxdq %xmm8,%ymm9
   0x0000000000001a46 <+134>:    c4 62 7d 35 d9    vpmovzxdq %xmm1,%ymm11
   0x0000000000001a4b <+139>:    c4 42 7d 35 ec    vpmovzxdq %xmm12,%ymm13
   0x0000000000001a50 <+144>:    c4 41 45 d4 d1    vpaddq %ymm9,%ymm7,%ymm10
   0x0000000000001a55 <+149>:    c4 41 25 d4 f5    vpaddq %ymm13,%ymm11,%ymm14
   0x0000000000001a5a <+154>:    c4 41 2d d4 fe    vpaddq %ymm14,%ymm10,%ymm15

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001a5f <+159>:    49 39 ca    cmp    %rcx,%r10
   0x0000000000001a62 <+162>:    0f 84 ad 00 00 00    je     0x1b15 <count_pairs+341>
   0x0000000000001a68 <+168>:    0f 1f 84 00 00 00 00 00    nopl   0x0(%rax,%rax,1)

11        total += (data[i] == check);
   0x0000000000001a70 <+176>:    c4 c1 55 75 02    vpcmpeqw (%r10),%ymm5,%ymm0
   0x0000000000001a75 <+181>:    49 83 c2 40    add    $0x40,%r10
   0x0000000000001a79 <+185>:    c5 fd db d4    vpand  %ymm4,%ymm0,%ymm2
   0x0000000000001a7d <+189>:    c4 e2 7d 33 ca    vpmovzxwd %xmm2,%ymm1
   0x0000000000001a82 <+194>:    c4 e3 7d 39 d3 01    vextracti128 $0x1,%ymm2,%xmm3
   0x0000000000001a88 <+200>:    c4 c1 55 75 52 e0    vpcmpeqw -0x20(%r10),%ymm5,%ymm2
   0x0000000000001a8e <+206>:    c4 e2 7d 33 f3    vpmovzxwd %xmm3,%ymm6
   0x0000000000001a93 <+211>:    c4 c3 7d 39 f0 01    vextracti128 $0x1,%ymm6,%xmm8
   0x0000000000001a99 <+217>:    c5 ed db dc    vpand  %ymm4,%ymm2,%ymm3
   0x0000000000001a9d <+221>:    c4 e2 7d 35 fe    vpmovzxdq %xmm6,%ymm7
   0x0000000000001aa2 <+226>:    c4 42 7d 35 c8    vpmovzxdq %xmm8,%ymm9
   0x0000000000001aa7 <+231>:    c4 c3 7d 39 cc 01    vextracti128 $0x1,%ymm1,%xmm12
   0x0000000000001aad <+237>:    c4 e3 7d 39 de 01    vextracti128 $0x1,%ymm3,%xmm6
   0x0000000000001ab3 <+243>:    c4 41 45 d4 d1    vpaddq %ymm9,%ymm7,%ymm10
   0x0000000000001ab8 <+248>:    c4 62 7d 35 d9    vpmovzxdq %xmm1,%ymm11
   0x0000000000001abd <+253>:    c4 42 7d 35 ec    vpmovzxdq %xmm12,%ymm13
   0x0000000000001ac2 <+258>:    c4 e2 7d 33 cb    vpmovzxwd %xmm3,%ymm1
   0x0000000000001ac7 <+263>:    c4 e2 7d 33 fe    vpmovzxwd %xmm6,%ymm7
   0x0000000000001acc <+268>:    c4 41 25 d4 f5    vpaddq %ymm13,%ymm11,%ymm14
   0x0000000000001ad1 <+273>:    c4 c3 7d 39 f9 01    vextracti128 $0x1,%ymm7,%xmm9
   0x0000000000001ad7 <+279>:    c4 c3 7d 39 cd 01    vextracti128 $0x1,%ymm1,%xmm13
   0x0000000000001add <+285>:    c4 c1 2d d4 c6    vpaddq %ymm14,%ymm10,%ymm0
   0x0000000000001ae2 <+290>:    c4 62 7d 35 c7    vpmovzxdq %xmm7,%ymm8
   0x0000000000001ae7 <+295>:    c4 42 7d 35 d1    vpmovzxdq %xmm9,%ymm10
   0x0000000000001aec <+300>:    c4 62 7d 35 e1    vpmovzxdq %xmm1,%ymm12
   0x0000000000001af1 <+305>:    c4 42 7d 35 f5    vpmovzxdq %xmm13,%ymm14
   0x0000000000001af6 <+310>:    c5 05 d4 f8    vpaddq %ymm0,%ymm15,%ymm15
   0x0000000000001afa <+314>:    c4 41 3d d4 da    vpaddq %ymm10,%ymm8,%ymm11
   0x0000000000001aff <+319>:    c4 c1 1d d4 c6    vpaddq %ymm14,%ymm12,%ymm0
   0x0000000000001b04 <+324>:    c5 a5 d4 d0    vpaddq %ymm0,%ymm11,%ymm2
   0x0000000000001b08 <+328>:    c5 05 d4 fa    vpaddq %ymm2,%ymm15,%ymm15

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001b0c <+332>:    49 39 ca    cmp    %rcx,%r10
   0x0000000000001b0f <+335>:    0f 85 5b ff ff ff    jne    0x1a70 <count_pairs+176>
   0x0000000000001b15 <+341>:    62 73 fd 28 39 fc 01    vextracti64x2 $0x1,%ymm15,%xmm4
   0x0000000000001b1c <+348>:    c5 01 d4 fc    vpaddq %xmm4,%xmm15,%xmm15
   0x0000000000001b20 <+352>:    c4 c1 61 73 df 08    vpsrldq $0x8,%xmm15,%xmm3
   0x0000000000001b26 <+358>:    c5 81 d4 cb    vpaddq %xmm3,%xmm15,%xmm1
   0x0000000000001b2a <+362>:    49 89 f3    mov    %rsi,%r11
   0x0000000000001b2d <+365>:    c4 e1 f9 7e c8    vmovq  %xmm1,%rax
   0x0000000000001b32 <+370>:    49 83 e3 f0    and    $0xfffffffffffffff0,%r11
   0x0000000000001b36 <+374>:    40 f6 c6 0f    test   $0xf,%sil
   0x0000000000001b3a <+378>:    0f 84 4d 01 00 00    je     0x1c8d <count_pairs+717>
   0x0000000000001b40 <+384>:    c5 f8 77    vzeroupper 
   0x0000000000001b43 <+387>:    49 89 f1    mov    %rsi,%r9
   0x0000000000001b46 <+390>:    4d 29 d9    sub    %r11,%r9
   0x0000000000001b49 <+393>:    49 8d 51 ff    lea    -0x1(%r9),%rdx
   0x0000000000001b4d <+397>:    48 83 fa 06    cmp    $0x6,%rdx
   0x0000000000001b51 <+401>:    76 7b    jbe    0x1bce <count_pairs+526>

11        total += (data[i] == check);
   0x0000000000001b53 <+403>:    62 d2 7d 08 7b f0    vpbroadcastw %r8d,%xmm6
   0x0000000000001b59 <+409>:    c4 a1 49 75 3c 5f    vpcmpeqw (%rdi,%r11,2),%xmm6,%xmm7
   0x0000000000001b5f <+415>:    c4 62 79 79 05 8c 05 00 00    vpbroadcastw 0x58c(%rip),%xmm8        # 0x20f4
   0x0000000000001b68 <+424>:    4d 89 ca    mov    %r9,%r10
   0x0000000000001b6b <+427>:    c4 41 41 db c8    vpand  %xmm8,%xmm7,%xmm9
   0x0000000000001b70 <+432>:    c4 c1 21 73 d9 08    vpsrldq $0x8,%xmm9,%xmm11
   0x0000000000001b76 <+438>:    c4 42 79 33 d1    vpmovzxwd %xmm9,%xmm10
   0x0000000000001b7b <+443>:    c4 42 79 33 e3    vpmovzxwd %xmm11,%xmm12
   0x0000000000001b80 <+448>:    c4 c1 09 73 da 08    vpsrldq $0x8,%xmm10,%xmm14
   0x0000000000001b86 <+454>:    c4 c1 59 73 dc 08    vpsrldq $0x8,%xmm12,%xmm4
   0x0000000000001b8c <+460>:    c4 42 79 35 ea    vpmovzxdq %xmm10,%xmm13
   0x0000000000001b91 <+465>:    c4 c2 79 35 c6    vpmovzxdq %xmm14,%xmm0
   0x0000000000001b96 <+470>:    c4 c2 79 35 d4    vpmovzxdq %xmm12,%xmm2
   0x0000000000001b9b <+475>:    c4 62 79 35 fc    vpmovzxdq %xmm4,%xmm15
   0x0000000000001ba0 <+480>:    c5 91 d4 e8    vpaddq %xmm0,%xmm13,%xmm5
   0x0000000000001ba4 <+484>:    c4 c1 69 d4 df    vpaddq %xmm15,%xmm2,%xmm3
   0x0000000000001ba9 <+489>:    c5 d1 d4 f3    vpaddq %xmm3,%xmm5,%xmm6

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001bad <+493>:    c5 f1 73 de 08    vpsrldq $0x8,%xmm6,%xmm1
   0x0000000000001bb2 <+498>:    c5 c9 d4 f9    vpaddq %xmm1,%xmm6,%xmm7
   0x0000000000001bb6 <+502>:    c4 e1 f9 7e f9    vmovq  %xmm7,%rcx
   0x0000000000001bbb <+507>:    49 83 e2 f8    and    $0xfffffffffffffff8,%r10
   0x0000000000001bbf <+511>:    48 01 c8    add    %rcx,%rax
   0x0000000000001bc2 <+514>:    4d 01 d3    add    %r10,%r11
   0x0000000000001bc5 <+517>:    4d 39 d1    cmp    %r10,%r9
   0x0000000000001bc8 <+520>:    0f 84 b4 00 00 00    je     0x1c82 <count_pairs+706>

11        total += (data[i] == check);
   0x0000000000001bce <+526>:    45 31 c9    xor    %r9d,%r9d
   0x0000000000001bd1 <+529>:    66 46 39 04 5f    cmp    %r8w,(%rdi,%r11,2)
   0x0000000000001bd6 <+534>:    41 0f 94 c1    sete   %r9b
   0x0000000000001bda <+538>:    49 8d 4b 01    lea    0x1(%r11),%rcx
   0x0000000000001bde <+542>:    4b 8d 14 1b    lea    (%r11,%r11,1),%rdx
   0x0000000000001be2 <+546>:    4c 01 c8    add    %r9,%rax

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001be5 <+549>:    48 39 ce    cmp    %rcx,%rsi
   0x0000000000001be8 <+552>:    0f 86 94 00 00 00    jbe    0x1c82 <count_pairs+706>

11        total += (data[i] == check);
   0x0000000000001bee <+558>:    45 31 d2    xor    %r10d,%r10d
   0x0000000000001bf1 <+561>:    66 44 39 44 17 02    cmp    %r8w,0x2(%rdi,%rdx,1)
   0x0000000000001bf7 <+567>:    41 0f 94 c2    sete   %r10b
   0x0000000000001bfb <+571>:    4d 8d 4b 02    lea    0x2(%r11),%r9
   0x0000000000001bff <+575>:    4c 01 d0    add    %r10,%rax

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001c02 <+578>:    49 39 f1    cmp    %rsi,%r9
   0x0000000000001c05 <+581>:    73 7b    jae    0x1c82 <count_pairs+706>

11        total += (data[i] == check);
   0x0000000000001c07 <+583>:    31 c9    xor    %ecx,%ecx
   0x0000000000001c09 <+585>:    66 44 39 44 17 04    cmp    %r8w,0x4(%rdi,%rdx,1)
   0x0000000000001c0f <+591>:    0f 94 c1    sete   %cl
   0x0000000000001c12 <+594>:    4d 8d 53 03    lea    0x3(%r11),%r10
   0x0000000000001c16 <+598>:    48 01 c8    add    %rcx,%rax

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001c19 <+601>:    4c 39 d6    cmp    %r10,%rsi
   0x0000000000001c1c <+604>:    76 64    jbe    0x1c82 <count_pairs+706>

11        total += (data[i] == check);
   0x0000000000001c1e <+606>:    45 31 c9    xor    %r9d,%r9d
   0x0000000000001c21 <+609>:    66 44 39 44 17 06    cmp    %r8w,0x6(%rdi,%rdx,1)
   0x0000000000001c27 <+615>:    41 0f 94 c1    sete   %r9b
   0x0000000000001c2b <+619>:    49 8d 4b 04    lea    0x4(%r11),%rcx
   0x0000000000001c2f <+623>:    4c 01 c8    add    %r9,%rax

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001c32 <+626>:    48 39 ce    cmp    %rcx,%rsi
   0x0000000000001c35 <+629>:    76 4b    jbe    0x1c82 <count_pairs+706>

11        total += (data[i] == check);
   0x0000000000001c37 <+631>:    45 31 d2    xor    %r10d,%r10d
   0x0000000000001c3a <+634>:    66 44 39 44 17 08    cmp    %r8w,0x8(%rdi,%rdx,1)
   0x0000000000001c40 <+640>:    41 0f 94 c2    sete   %r10b
   0x0000000000001c44 <+644>:    4d 8d 4b 05    lea    0x5(%r11),%r9
   0x0000000000001c48 <+648>:    4c 01 d0    add    %r10,%rax

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001c4b <+651>:    4c 39 ce    cmp    %r9,%rsi
   0x0000000000001c4e <+654>:    76 32    jbe    0x1c82 <count_pairs+706>

11        total += (data[i] == check);
   0x0000000000001c50 <+656>:    31 c9    xor    %ecx,%ecx
   0x0000000000001c52 <+658>:    66 44 39 44 17 0a    cmp    %r8w,0xa(%rdi,%rdx,1)
   0x0000000000001c58 <+664>:    0f 94 c1    sete   %cl
   0x0000000000001c5b <+667>:    49 83 c3 06    add    $0x6,%r11
   0x0000000000001c5f <+671>:    48 01 c8    add    %rcx,%rax

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001c62 <+674>:    4c 39 de    cmp    %r11,%rsi
   0x0000000000001c65 <+677>:    76 1b    jbe    0x1c82 <count_pairs+706>

11        total += (data[i] == check);
   0x0000000000001c67 <+679>:    66 44 39 44 17 0c    cmp    %r8w,0xc(%rdi,%rdx,1)
   0x0000000000001c6d <+685>:    40 0f 94 c6    sete   %sil
   0x0000000000001c71 <+689>:    40 0f b6 fe    movzbl %sil,%edi
   0x0000000000001c75 <+693>:    48 01 f8    add    %rdi,%rax

10      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001c78 <+696>:    c3    retq   
   0x0000000000001c79 <+697>:    0f 1f 80 00 00 00 00    nopl   0x0(%rax)
   0x0000000000001c80 <+704>:    31 c0    xor    %eax,%eax

12      }
13      return total;
   0x0000000000001c82 <+706>:    c3    retq   
   0x0000000000001c83 <+707>:    45 31 db    xor    %r11d,%r11d
   0x0000000000001c86 <+710>:    31 c0    xor    %eax,%eax
   0x0000000000001c88 <+712>:    e9 b6 fe ff ff    jmpq   0x1b43 <count_pairs+387>
   0x0000000000001c8d <+717>:    c5 f8 77    vzeroupper 
   0x0000000000001c90 <+720>:    c3    retq   
End of assembler dump.


```

It becomes a giant mess

We need multiple versions of the loops depending on the number of iterations

Sometimes we need to do special work at the beginning or end if the alignment is not proper

We can fix some of these issues by giving the compiler more information.
Here is the same code, but with some extra information given to the compiler

```c
// Count the 16-bit elements of data whose two bytes both equal target.
//
// data   - array of 16-bit elements; promised below to be 32-byte aligned
// size   - number of 16-bit elements; it is rounded DOWN to a multiple of 32,
//          so up to 31 trailing elements are not examined
// target - byte value that must appear in both halves of an element
// returns the number of matching elements among the examined ones
uint64_t count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
  // tell the compiler that data is aligned to 32 bytes
  data = __builtin_assume_aligned(data, 32);
  // tell the compiler that size is a multiple of 32
  // the mask has to be 64 bits wide: ~31U is a 32-bit value that gets
  // zero-extended, which would also clear the upper 32 bits of size
  size = size & ~(uint64_t)31;
  uint64_t total = 0;
  // we will want to compare each short to the pair of target bytes
  uint16_t check = target | (target << 8U);
  for (uint64_t i = 0; i < size; i++) {
    // the comparison yields 1 on a match and 0 otherwise
    total += (data[i] == check);
  }
  return total;
}
```

In this case the start and end work is trivial, so it runs at the same speed, but if we look at the assembly, it is a bit simpler 

```assembly
Dump of assembler code for function count_pairs:
ex1b.c:
7    count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
   0x00000000000019c0 <+0>:    f3 0f 1e fa    endbr64 

8      // tell the compiler that data is aligned to 32 bytes
9      data = __builtin_assume_aligned(data, 32);
10      // tell the compiler that size is a multiple of 32
11      size = size & (~31U);
12      uint64_t total = 0;
13      uint16_t check = target | (target << 8U);
   0x00000000000019c4 <+4>:    89 d1        mov    %edx,%ecx
   0x00000000000019c6 <+6>:    c1 e1 08     shl    $0x8,%ecx
   0x00000000000019c9 <+9>:    0f b6 d2     movzbl %dl,%edx
   0x00000000000019cc <+12>:    48 89 f0    mov    %rsi,%rax
   0x00000000000019cf <+15>:    09 d1       or     %edx,%ecx

14      for (uint64_t i = 0; i < size; i++) {
   0x00000000000019d1 <+17>:    83 e0 e0                      and    $0xffffffe0,%eax
   0x00000000000019d4 <+20>:    0f 84 d6 00 00 00             je     0x1ab0 <count_pairs+240>
   0x00000000000019da <+26>:    c4 e2 7d 79 25 11 07 00 00    vpbroadcastw 0x711(%rip),%ymm4        # 0x20f4
   0x00000000000019e3 <+35>:    62 f2 7d 28 7b d9             vpbroadcastw %ecx,%ymm3
   0x00000000000019e9 <+41>:    48 8d 04 47                   lea    (%rdi,%rax,2),%rax
   0x00000000000019ed <+45>:    c5 d1 ef ed                   vpxor  %xmm5,%xmm5,%xmm5

15        total += (data[i] == check);
   0x00000000000019f1 <+49>:    c5 e5 75 07           vpcmpeqw (%rdi),%ymm3,%ymm0
   0x00000000000019f5 <+53>:    48 83 c7 40           add    $0x40,%rdi
   0x00000000000019f9 <+57>:    c5 fd db d4           vpand  %ymm4,%ymm0,%ymm2
   0x00000000000019fd <+61>:    c4 e2 7d 33 ca        vpmovzxwd %xmm2,%ymm1
   0x0000000000001a02 <+66>:    c4 e3 7d 39 d6 01     vextracti128 $0x1,%ymm2,%xmm6
   0x0000000000001a08 <+72>:    c5 e5 75 57 e0        vpcmpeqw -0x20(%rdi),%ymm3,%ymm2
   0x0000000000001a0d <+77>:    c4 e2 7d 33 fe        vpmovzxwd %xmm6,%ymm7
   0x0000000000001a12 <+82>:    c4 c3 7d 39 f9 01     vextracti128 $0x1,%ymm7,%xmm9
   0x0000000000001a18 <+88>:    c5 ed db f4           vpand  %ymm4,%ymm2,%ymm6
   0x0000000000001a1c <+92>:    c4 62 7d 35 c7        vpmovzxdq %xmm7,%ymm8
   0x0000000000001a21 <+97>:    c4 42 7d 35 d1        vpmovzxdq %xmm9,%ymm10
   0x0000000000001a26 <+102>:    c4 c3 7d 39 cd 01    vextracti128 $0x1,%ymm1,%xmm13
   0x0000000000001a2c <+108>:    c4 e3 7d 39 f7 01    vextracti128 $0x1,%ymm6,%xmm7
   0x0000000000001a32 <+114>:    c4 41 3d d4 da       vpaddq %ymm10,%ymm8,%ymm11
   0x0000000000001a37 <+119>:    c4 62 7d 35 e1       vpmovzxdq %xmm1,%ymm12
   0x0000000000001a3c <+124>:    c4 42 7d 35 f5       vpmovzxdq %xmm13,%ymm14
   0x0000000000001a41 <+129>:    c4 e2 7d 33 ce       vpmovzxwd %xmm6,%ymm1
   0x0000000000001a46 <+134>:    c4 62 7d 33 c7       vpmovzxwd %xmm7,%ymm8
   0x0000000000001a4b <+139>:    c4 41 1d d4 fe       vpaddq %ymm14,%ymm12,%ymm15
   0x0000000000001a50 <+144>:    c4 43 7d 39 c2 01    vextracti128 $0x1,%ymm8,%xmm10
   0x0000000000001a56 <+150>:    c4 c3 7d 39 ce 01    vextracti128 $0x1,%ymm1,%xmm14
   0x0000000000001a5c <+156>:    c4 c1 25 d4 c7       vpaddq %ymm15,%ymm11,%ymm0
   0x0000000000001a61 <+161>:    c4 42 7d 35 c8       vpmovzxdq %xmm8,%ymm9
   0x0000000000001a66 <+166>:    c4 42 7d 35 da       vpmovzxdq %xmm10,%ymm11
   0x0000000000001a6b <+171>:    c4 62 7d 35 e9       vpmovzxdq %xmm1,%ymm13
   0x0000000000001a70 <+176>:    c4 42 7d 35 fe       vpmovzxdq %xmm14,%ymm15
   0x0000000000001a75 <+181>:    c5 d5 d4 e8          vpaddq %ymm0,%ymm5,%ymm5
   0x0000000000001a79 <+185>:    c4 41 35 d4 e3       vpaddq %ymm11,%ymm9,%ymm12
   0x0000000000001a7e <+190>:    c4 c1 15 d4 c7       vpaddq %ymm15,%ymm13,%ymm0
   0x0000000000001a83 <+195>:    c5 9d d4 d0          vpaddq %ymm0,%ymm12,%ymm2
   0x0000000000001a87 <+199>:    c5 d5 d4 ea          vpaddq %ymm2,%ymm5,%ymm5

14      for (uint64_t i = 0; i < size; i++) {
   0x0000000000001a8b <+203>:    48 39 f8                cmp    %rdi,%rax
   0x0000000000001a8e <+206>:    0f 85 5d ff ff ff       jne    0x19f1 <count_pairs+49>
   0x0000000000001a94 <+212>:    62 f3 fd 28 39 ec 01    vextracti64x2 $0x1,%ymm5,%xmm4
   0x0000000000001a9b <+219>:    c5 d1 d4 f4             vpaddq %xmm4,%xmm5,%xmm6
   0x0000000000001a9f <+223>:    c5 f1 73 de 08          vpsrldq $0x8,%xmm6,%xmm1
   0x0000000000001aa4 <+228>:    c5 c9 d4 f9             vpaddq %xmm1,%xmm6,%xmm7
   0x0000000000001aa8 <+232>:    c4 e1 f9 7e f8          vmovq  %xmm7,%rax

16      }
17      return total;
   0x0000000000001aad <+237>:    c5 f8 77    vzeroupper 
   0x0000000000001ab0 <+240>:    c3          retq   
End of assembler dump.

```

The compiler can do a lot to optimize and use the vector registers to speed up the code, but we can also do it manually.

Intrinsics are basically functions that compile down to a single special assembly instruction, which lets us use the vector instructions from C or C++ code without having to write raw assembly.  The full list can be found at [https://software.intel.com/sites/landingpage/IntrinsicsGuide/](https://software.intel.com/sites/landingpage/IntrinsicsGuide/)

![_mm256_cmpeq_epi16](source/_mm256_cmpeq_epi16.JPG "_mm256_cmpeq_epi16")




Here we manually vectorize this code 
```c
// Count the 16-bit elements of data whose two bytes both equal target,
// comparing 16 elements per iteration with AVX2 intrinsics.
// NOTE: every iteration loads a full 16 elements, so when size is not a
// multiple of 16 the final load reads past the first size elements.
uint64_t count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
  // tell the compiler that data is aligned to 32 bytes
  data = __builtin_assume_aligned(data, 32);
  uint64_t total = 0;
  // the 16-bit value to look for: the target byte in both halves
  uint16_t check = target | (target << 8U);
  // set up a vector so that you can compare the pair of bytes in all 16 positions
  __m256i compare = _mm256_set1_epi16(check);
  for (uint64_t i = 0; i < size; i += 16) {
    // do 16 comparisons at once
    uint32_t block = _mm256_movemask_epi8( _mm256_cmpeq_epi16(_mm256_load_si256((__m256i *)(data + i)), compare));
    // once we move the result of the comparison into a normal register, count the set bits
    total += __builtin_popcount(block);
  }
  // the mask holds one bit per byte, so each matching 16-bit element was
  // counted twice; halve the total to get the number of elements
  return total / 2;
}
```

- `_mm256_set1_epi16` takes in a 16-bit object and puts it at all 16 16-bit locations in a 256 bit register 
- `_mm256_load_si256` loads 256 bits of data from the given memory address
- `_mm256_cmpeq_epi16` compares the two vectors element wise
- `_mm256_movemask_epi8` takes the top bit from each byte into a 32-bit object, we use it to get our data out of the vector register into the normal registers
- `__builtin_popcount` counts the number of set bits in a normal 32-bit register


Most of these require the `avx2` flag above, but popcount requires the special `popcnt` flag

For each group of 16 elements we compare them against a vector holding the pair of target bytes in every position, and then move the comparison result out into a normal register as a bit mask. Pop count on that mask then counts the matches. However, the mask has one bit per byte rather than one per short, so every matching short contributes two set bits and we need to divide the total by 2 at the end.

We get fairly good performance with a significant improvement over the compiler

| Version | time to process 1 GiB in ms |
|--- | --- |
| unoptimized | 1045 |
| -O3 | 301 |
| -O3 -funroll-loops | 181 |
| -march=native | 116 |
| manual vectorization | 79 |

Let us take a look at the assembly generated this time.

```assembly
Dump of assembler code for function count_pairs:
ex1c.c:
8    count_pairs(uint16_t *data, uint64_t size, uint8_t target) {
   0x0000000000001620 <+0>:    f3 0f 1e fa    endbr64 

9      data = __builtin_assume_aligned(data, 32);
10      uint64_t total = 0;
11      uint16_t check = target | (target << 8U);
   0x0000000000001624 <+4>:    89 d0        mov    %edx,%eax
   0x0000000000001626 <+6>:    c1 e0 08     shl    $0x8,%eax
   0x0000000000001629 <+9>:    0f b6 d2     movzbl %dl,%edx
   0x000000000000162c <+12>:    09 d0       or     %edx,%eax

/usr/lib/gcc/x86_64-linux-gnu/11/include/avxintrin.h:
1335      return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
   0x000000000000162e <+14>:    62 f2 7d 28 7b c8    vpbroadcastw %eax,%ymm1

ex1c.c:
13      for (uint64_t i = 0; i < size; i += 16) {
   0x0000000000001634 <+20>:    48 85 f6    test   %rsi,%rsi
   0x0000000000001637 <+23>:    74 27       je     0x1660 <count_pairs+64>
   0x0000000000001639 <+25>:    31 d2       xor    %edx,%edx
   0x000000000000163b <+27>:    31 c0       xor    %eax,%eax
   0x000000000000163d <+29>:    0f 1f 00    nopl   (%rax)

/usr/lib/gcc/x86_64-linux-gnu/11/include/avx2intrin.h:
240      return (__m256i) ((__v16hi)__A == (__v16hi)__B);
   0x0000000000001640 <+32>:    c5 f5 75 04 57    vpcmpeqw (%rdi,%rdx,2),%ymm1,%ymm0
   0x0000000000001645 <+37>:    48 83 c2 10       add    $0x10,%rdx
   0x0000000000001649 <+41>:    c5 fd d7 c8       vpmovmskb %ymm0,%ecx

ex1c.c:
16        total += __builtin_popcount(block);
   0x000000000000164d <+45>:    f3 0f b8 c9    popcnt %ecx,%ecx
   0x0000000000001651 <+49>:    48 01 c8       add    %rcx,%rax

13      for (uint64_t i = 0; i < size; i += 16) {
   0x0000000000001654 <+52>:    48 39 d6       cmp    %rdx,%rsi
   0x0000000000001657 <+55>:    77 e7          ja     0x1640 <count_pairs+32>
   0x0000000000001659 <+57>:    48 d1 e8       shr    %rax
   0x000000000000165c <+60>:    c5 f8 77       vzeroupper 
   0x000000000000165f <+63>:    c3             retq   
   0x0000000000001660 <+64>:    31 c0          xor    %eax,%eax

17      }
18      return total / 2;
   0x0000000000001662 <+66>:    c5 f8 77    vzeroupper 
   0x0000000000001665 <+69>:    c3          retq   
End of assembler dump.


```

| Version | time to process 1 GiB in ms |
|--- | --- |
| unoptimized | 1045 |
| -O3 | 301 |
| -O3 -funroll-loops | 181 |
| -march=native | 116 |
| manual vectorization | 79 |


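So altogether we are able to get about a 13x speedup over the unoptimized build, roughly 4x over plain `-O3`, and 1.5x over the best the compiler could do on its own, but the manually vectorized version takes a lot more effort to write than the simple loop that all of the compiler-only versions share.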

# Problem 2

Now we will look at a slight variation of the problem.

We remove the requirement that the pairs are aligned. Before, a pair had to sit in an even-odd position; now it can be either even-odd or odd-even.

The code for this is not much more complicated, we look at every byte position now instead of each short position.

```c
// Count the byte positions at which two adjacent bytes both equal target.
// Unlike problem 1, a pair may start at any byte offset.
//
// data   - buffer of size * 2 bytes
// size   - number of 16-bit elements in the buffer (so size * 2 bytes)
// target - byte value that must appear twice in a row
// returns the number of (possibly overlapping) pairs found
uint64_t count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
  if (size == 0) {
    // without this guard size * 2 - 1 wraps around to UINT64_MAX and the
    // loop below would run far past the end of the (empty) buffer
    return 0;
  }
  uint64_t total = 0;
  // we will want to compare each pair of adjacent bytes to the pair of target bytes
  uint16_t check = target | (target << 8U);
  // a pair starting at byte i also needs byte i + 1, so the last possible
  // start is byte size * 2 - 2
  for (uint64_t i = 0; i < size * 2 - 1; i++) {
    // load16 is defined elsewhere; presumably it reads the 16-bit value
    // starting at the given (possibly odd) byte address
    total += (load16(data + i) == check);
  }
  return total;
}
```


| Version | Problem 1 | Problem 2|
|--- | --- | -- |
| unoptimized |      1045    |3276 |
| -O3 |             301      |587 |
| -O3 -funroll-loops |  181  |324 |
| -march=native |    116     |325|

Unoptimized 
```assembly
Dump of assembler code for function count_pairs:
ex2a.c:
7    count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
   0x0000000000001567 <+0>:    f3 0f 1e fa    endbr64 
   0x000000000000156b <+4>:    55             push   %rbp
   0x000000000000156c <+5>:    48 89 e5       mov    %rsp,%rbp
   0x000000000000156f <+8>:    48 83 ec 40    sub    $0x40,%rsp
   0x0000000000001573 <+12>:    48 89 7d d8   mov    %rdi,-0x28(%rbp)
   0x0000000000001577 <+16>:    48 89 75 d0   mov    %rsi,-0x30(%rbp)
   0x000000000000157b <+20>:    89 d0         mov    %edx,%eax
   0x000000000000157d <+22>:    88 45 cc      mov    %al,-0x34(%rbp)

8      uint64_t total = 0;
   0x0000000000001580 <+25>:    48 c7 45 f0 00 00 00 00    movq   $0x0,-0x10(%rbp)

9      uint16_t check = target | (target << 8U);
   0x0000000000001588 <+33>:    0f b6 55 cc    movzbl -0x34(%rbp),%edx
   0x000000000000158c <+37>:    0f b6 45 cc    movzbl -0x34(%rbp),%eax
   0x0000000000001590 <+41>:    c1 e0 08       shl    $0x8,%eax
   0x0000000000001593 <+44>:    09 d0          or     %edx,%eax
   0x0000000000001595 <+46>:    66 89 45 ee    mov    %ax,-0x12(%rbp)

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001599 <+50>:    48 c7 45 f8 00 00 00 00    movq   $0x0,-0x8(%rbp)
   0x00000000000015a1 <+58>:    eb 28                      jmp    0x15cb <count_pairs+100>

11        total += (load16(data + i) == check);
   0x00000000000015a3 <+60>:    48 8b 55 d8       mov    -0x28(%rbp),%rdx
   0x00000000000015a7 <+64>:    48 8b 45 f8       mov    -0x8(%rbp),%rax
   0x00000000000015ab <+68>:    48 01 d0          add    %rdx,%rax
   0x00000000000015ae <+71>:    48 89 c7          mov    %rax,%rdi
   0x00000000000015b1 <+74>:    e8 4f fd ff ff    callq  0x1305 <load16>
   0x00000000000015b6 <+79>:    66 39 45 ee       cmp    %ax,-0x12(%rbp)
   0x00000000000015ba <+83>:    0f 94 c0          sete   %al
   0x00000000000015bd <+86>:    0f b6 c0          movzbl %al,%eax
   0x00000000000015c0 <+89>:    48 98             cltq   
   0x00000000000015c2 <+91>:    48 01 45 f0       add    %rax,-0x10(%rbp)

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x00000000000015c6 <+95>:    48 83 45 f8 01    addq   $0x1,-0x8(%rbp)
   0x00000000000015cb <+100>:    48 8b 45 d0      mov    -0x30(%rbp),%rax
   0x00000000000015cf <+104>:    48 01 c0         add    %rax,%rax
   0x00000000000015d2 <+107>:    48 83 e8 01      sub    $0x1,%rax
   0x00000000000015d6 <+111>:    48 39 45 f8      cmp    %rax,-0x8(%rbp)
   0x00000000000015da <+115>:    72 c7    jb      0x15a3 <count_pairs+60>

12      }
13      return total;
   0x00000000000015dc <+117>:    48 8b 45 f0    mov    -0x10(%rbp),%rax

14    }
   0x00000000000015e0 <+121>:    c9    leaveq 
   0x00000000000015e1 <+122>:    c3    retq   
End of assembler dump.


```

-O3
```assembly
Dump of assembler code for function count_pairs:
ex2a.c:
7    count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
   0x0000000000001630 <+0>:    f3 0f 1e fa    endbr64 

8      uint64_t total = 0;
9      uint16_t check = target | (target << 8U);
   0x0000000000001634 <+4>:    89 d1             mov    %edx,%ecx
   0x0000000000001636 <+6>:    0f b6 d2          movzbl %dl,%edx
   0x0000000000001639 <+9>:    48 8d 74 77 ff    lea    -0x1(%rdi,%rsi,2),%rsi
   0x000000000000163e <+14>:    31 c0            xor    %eax,%eax
   0x0000000000001640 <+16>:    c1 e1 08         shl    $0x8,%ecx
   0x0000000000001643 <+19>:    09 d1            or     %edx,%ecx

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001645 <+21>:    0f 1f 00    nopl   (%rax)

11        total += (load16(data + i) == check);
   0x0000000000001648 <+24>:    31 d2           xor    %edx,%edx
   0x000000000000164a <+26>:    66 39 0f        cmp    %cx,(%rdi)
   0x000000000000164d <+29>:    0f 94 c2        sete   %dl
   0x0000000000001650 <+32>:    48 83 c7 01     add    $0x1,%rdi
   0x0000000000001654 <+36>:    48 01 d0        add    %rdx,%rax

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001657 <+39>:    48 39 fe    cmp    %rdi,%rsi
   0x000000000000165a <+42>:    75 ec       jne    0x1648 <count_pairs+24>

12      }
13      return total;
   0x000000000000165c <+44>:    c3    retq   
End of assembler dump.
```

-O3 -funroll-loops
```assembly
Dump of assembler code for function count_pairs:
ex2a.c:
7    count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
   0x00000000000019c0 <+0>:    f3 0f 1e fa    endbr64 

8      uint64_t total = 0;
9      uint16_t check = target | (target << 8U);
   0x00000000000019c4 <+4>:    48 8d 74 77 ff    lea    -0x1(%rdi,%rsi,2),%rsi
   0x00000000000019c9 <+9>:    41 89 d0    mov    %edx,%r8d
   0x00000000000019cc <+12>:    0f b6 d2    movzbl %dl,%edx
   0x00000000000019cf <+15>:    31 c0    xor    %eax,%eax
   0x00000000000019d1 <+17>:    48 89 f1    mov    %rsi,%rcx
   0x00000000000019d4 <+20>:    41 c1 e0 08    shl    $0x8,%r8d
   0x00000000000019d8 <+24>:    48 29 f9    sub    %rdi,%rcx
   0x00000000000019db <+27>:    41 09 d0    or     %edx,%r8d

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x00000000000019de <+30>:    83 e1 07    and    $0x7,%ecx
   0x00000000000019e1 <+33>:    0f 84 99 00 00 00    je     0x1a80 <count_pairs+192>
   0x00000000000019e7 <+39>:    48 83 f9 01    cmp    $0x1,%rcx
   0x00000000000019eb <+43>:    74 78    je     0x1a65 <count_pairs+165>
   0x00000000000019ed <+45>:    48 83 f9 02    cmp    $0x2,%rcx
   0x00000000000019f1 <+49>:    74 62    je     0x1a55 <count_pairs+149>
   0x00000000000019f3 <+51>:    48 83 f9 03    cmp    $0x3,%rcx
   0x00000000000019f7 <+55>:    74 4c    je     0x1a45 <count_pairs+133>
   0x00000000000019f9 <+57>:    48 83 f9 04    cmp    $0x4,%rcx
   0x00000000000019fd <+61>:    74 34    je     0x1a33 <count_pairs+115>
   0x00000000000019ff <+63>:    48 83 f9 05    cmp    $0x5,%rcx
   0x0000000000001a03 <+67>:    74 1c    je     0x1a21 <count_pairs+97>
   0x0000000000001a05 <+69>:    48 83 f9 06    cmp    $0x6,%rcx
   0x0000000000001a09 <+73>:    0f 85 f1 00 00 00    jne    0x1b00 <count_pairs+320>

11        total += (load16(data + i) == check);
   0x0000000000001a0f <+79>:    45 31 c9    xor    %r9d,%r9d
   0x0000000000001a12 <+82>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a16 <+86>:    41 0f 94 c1    sete   %r9b
   0x0000000000001a1a <+90>:    48 83 c7 01    add    $0x1,%rdi
   0x0000000000001a1e <+94>:    4c 01 c8    add    %r9,%rax
   0x0000000000001a21 <+97>:    45 31 d2    xor    %r10d,%r10d
   0x0000000000001a24 <+100>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a28 <+104>:    41 0f 94 c2    sete   %r10b
   0x0000000000001a2c <+108>:    48 83 c7 01    add    $0x1,%rdi
   0x0000000000001a30 <+112>:    4c 01 d0    add    %r10,%rax
   0x0000000000001a33 <+115>:    45 31 db    xor    %r11d,%r11d
   0x0000000000001a36 <+118>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a3a <+122>:    41 0f 94 c3    sete   %r11b
   0x0000000000001a3e <+126>:    48 83 c7 01    add    $0x1,%rdi
   0x0000000000001a42 <+130>:    4c 01 d8    add    %r11,%rax
   0x0000000000001a45 <+133>:    31 d2    xor    %edx,%edx
   0x0000000000001a47 <+135>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a4b <+139>:    0f 94 c2    sete   %dl
   0x0000000000001a4e <+142>:    48 83 c7 01    add    $0x1,%rdi
   0x0000000000001a52 <+146>:    48 01 d0    add    %rdx,%rax
   0x0000000000001a55 <+149>:    31 c9    xor    %ecx,%ecx
   0x0000000000001a57 <+151>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a5b <+155>:    0f 94 c1    sete   %cl
   0x0000000000001a5e <+158>:    48 83 c7 01    add    $0x1,%rdi
   0x0000000000001a62 <+162>:    48 01 c8    add    %rcx,%rax
   0x0000000000001a65 <+165>:    45 31 c9    xor    %r9d,%r9d
   0x0000000000001a68 <+168>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a6c <+172>:    41 0f 94 c1    sete   %r9b
   0x0000000000001a70 <+176>:    48 83 c7 01    add    $0x1,%rdi
   0x0000000000001a74 <+180>:    4c 01 c8    add    %r9,%rax

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001a77 <+183>:    48 39 fe    cmp    %rdi,%rsi
   0x0000000000001a7a <+186>:    0f 84 98 00 00 00    je     0x1b18 <count_pairs+344>

11        total += (load16(data + i) == check);
   0x0000000000001a80 <+192>:    45 31 d2    xor    %r10d,%r10d
   0x0000000000001a83 <+195>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a87 <+199>:    41 0f 94 c2    sete   %r10b
   0x0000000000001a8b <+203>:    45 31 db    xor    %r11d,%r11d
   0x0000000000001a8e <+206>:    4c 01 d0    add    %r10,%rax
   0x0000000000001a91 <+209>:    66 44 39 47 01    cmp    %r8w,0x1(%rdi)
   0x0000000000001a96 <+214>:    41 0f 94 c3    sete   %r11b
   0x0000000000001a9a <+218>:    31 d2    xor    %edx,%edx
   0x0000000000001a9c <+220>:    4c 01 d8    add    %r11,%rax
   0x0000000000001a9f <+223>:    66 44 39 47 02    cmp    %r8w,0x2(%rdi)
   0x0000000000001aa4 <+228>:    0f 94 c2    sete   %dl
   0x0000000000001aa7 <+231>:    31 c9    xor    %ecx,%ecx
   0x0000000000001aa9 <+233>:    48 01 d0    add    %rdx,%rax
   0x0000000000001aac <+236>:    66 44 39 47 03    cmp    %r8w,0x3(%rdi)
   0x0000000000001ab1 <+241>:    0f 94 c1    sete   %cl
   0x0000000000001ab4 <+244>:    45 31 c9    xor    %r9d,%r9d
   0x0000000000001ab7 <+247>:    48 01 c8    add    %rcx,%rax
   0x0000000000001aba <+250>:    66 44 39 47 04    cmp    %r8w,0x4(%rdi)
   0x0000000000001abf <+255>:    41 0f 94 c1    sete   %r9b
   0x0000000000001ac3 <+259>:    45 31 d2    xor    %r10d,%r10d
   0x0000000000001ac6 <+262>:    4c 01 c8    add    %r9,%rax
   0x0000000000001ac9 <+265>:    66 44 39 47 05    cmp    %r8w,0x5(%rdi)
   0x0000000000001ace <+270>:    41 0f 94 c2    sete   %r10b
   0x0000000000001ad2 <+274>:    45 31 db    xor    %r11d,%r11d
   0x0000000000001ad5 <+277>:    4c 01 d0    add    %r10,%rax
   0x0000000000001ad8 <+280>:    66 44 39 47 06    cmp    %r8w,0x6(%rdi)
   0x0000000000001add <+285>:    41 0f 94 c3    sete   %r11b
   0x0000000000001ae1 <+289>:    31 d2    xor    %edx,%edx
   0x0000000000001ae3 <+291>:    4c 01 d8    add    %r11,%rax
   0x0000000000001ae6 <+294>:    66 44 39 47 07    cmp    %r8w,0x7(%rdi)
   0x0000000000001aeb <+299>:    0f 94 c2    sete   %dl
   0x0000000000001aee <+302>:    48 83 c7 08    add    $0x8,%rdi
   0x0000000000001af2 <+306>:    48 01 d0    add    %rdx,%rax

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001af5 <+309>:    48 39 fe    cmp    %rdi,%rsi
   0x0000000000001af8 <+312>:    75 86    jne    0x1a80 <count_pairs+192>

12      }
13      return total;
   0x0000000000001afa <+314>:    c3    retq   
   0x0000000000001afb <+315>:    0f 1f 44 00 00    nopl   0x0(%rax,%rax,1)

11        total += (load16(data + i) == check);
   0x0000000000001b00 <+320>:    31 c0    xor    %eax,%eax
   0x0000000000001b02 <+322>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001b06 <+326>:    0f 94 c0    sete   %al

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001b09 <+329>:    48 83 c7 01    add    $0x1,%rdi
   0x0000000000001b0d <+333>:    e9 fd fe ff ff    jmpq   0x1a0f <count_pairs+79>
   0x0000000000001b12 <+338>:    66 0f 1f 44 00 00    nopw   0x0(%rax,%rax,1)
   0x0000000000001b18 <+344>:    c3    retq   
End of assembler dump.
```

-march=native
```assembly
Dump of assembler code for function count_pairs:
ex2a.c:
7    count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
   0x00000000000019c0 <+0>:    f3 0f 1e fa    endbr64 

8      uint64_t total = 0;
9      uint16_t check = target | (target << 8U);
   0x00000000000019c4 <+4>:    48 8d 74 77 ff    lea    -0x1(%rdi,%rsi,2),%rsi
   0x00000000000019c9 <+9>:    41 89 d0    mov    %edx,%r8d
   0x00000000000019cc <+12>:    48 89 f1    mov    %rsi,%rcx
   0x00000000000019cf <+15>:    41 c1 e0 08    shl    $0x8,%r8d
   0x00000000000019d3 <+19>:    0f b6 d2    movzbl %dl,%edx
   0x00000000000019d6 <+22>:    48 29 f9    sub    %rdi,%rcx
   0x00000000000019d9 <+25>:    41 09 d0    or     %edx,%r8d

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x00000000000019dc <+28>:    31 c0    xor    %eax,%eax
   0x00000000000019de <+30>:    83 e1 07    and    $0x7,%ecx
   0x00000000000019e1 <+33>:    0f 84 93 00 00 00    je     0x1a7a <count_pairs+186>
   0x00000000000019e7 <+39>:    48 83 f9 01    cmp    $0x1,%rcx
   0x00000000000019eb <+43>:    74 73    je     0x1a60 <count_pairs+160>
   0x00000000000019ed <+45>:    48 83 f9 02    cmp    $0x2,%rcx
   0x00000000000019f1 <+49>:    74 5e    je     0x1a51 <count_pairs+145>
   0x00000000000019f3 <+51>:    48 83 f9 03    cmp    $0x3,%rcx
   0x00000000000019f7 <+55>:    74 49    je     0x1a42 <count_pairs+130>
   0x00000000000019f9 <+57>:    48 83 f9 04    cmp    $0x4,%rcx
   0x00000000000019fd <+61>:    74 32    je     0x1a31 <count_pairs+113>
   0x00000000000019ff <+63>:    48 83 f9 05    cmp    $0x5,%rcx
   0x0000000000001a03 <+67>:    74 1b    je     0x1a20 <count_pairs+96>
   0x0000000000001a05 <+69>:    48 83 f9 06    cmp    $0x6,%rcx
   0x0000000000001a09 <+73>:    0f 85 e9 00 00 00    jne    0x1af8 <count_pairs+312>

11        total += (load16(data + i) == check);
   0x0000000000001a0f <+79>:    45 31 c9    xor    %r9d,%r9d
   0x0000000000001a12 <+82>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a16 <+86>:    41 0f 94 c1    sete   %r9b
   0x0000000000001a1a <+90>:    4c 01 c8    add    %r9,%rax

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001a1d <+93>:    48 ff c7    inc    %rdi

11        total += (load16(data + i) == check);
   0x0000000000001a20 <+96>:    45 31 d2    xor    %r10d,%r10d
   0x0000000000001a23 <+99>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a27 <+103>:    41 0f 94 c2    sete   %r10b
   0x0000000000001a2b <+107>:    4c 01 d0    add    %r10,%rax

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001a2e <+110>:    48 ff c7    inc    %rdi

11        total += (load16(data + i) == check);
   0x0000000000001a31 <+113>:    45 31 db    xor    %r11d,%r11d
   0x0000000000001a34 <+116>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a38 <+120>:    41 0f 94 c3    sete   %r11b
   0x0000000000001a3c <+124>:    4c 01 d8    add    %r11,%rax

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001a3f <+127>:    48 ff c7    inc    %rdi

11        total += (load16(data + i) == check);
   0x0000000000001a42 <+130>:    31 d2    xor    %edx,%edx
   0x0000000000001a44 <+132>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a48 <+136>:    0f 94 c2    sete   %dl
   0x0000000000001a4b <+139>:    48 01 d0    add    %rdx,%rax

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001a4e <+142>:    48 ff c7    inc    %rdi

11        total += (load16(data + i) == check);
   0x0000000000001a51 <+145>:    31 c9    xor    %ecx,%ecx
   0x0000000000001a53 <+147>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a57 <+151>:    0f 94 c1    sete   %cl
   0x0000000000001a5a <+154>:    48 01 c8    add    %rcx,%rax

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001a5d <+157>:    48 ff c7    inc    %rdi

11        total += (load16(data + i) == check);
   0x0000000000001a60 <+160>:    45 31 c9    xor    %r9d,%r9d
   0x0000000000001a63 <+163>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a67 <+167>:    41 0f 94 c1    sete   %r9b
   0x0000000000001a6b <+171>:    48 ff c7    inc    %rdi
   0x0000000000001a6e <+174>:    4c 01 c8    add    %r9,%rax

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001a71 <+177>:    48 39 fe    cmp    %rdi,%rsi
   0x0000000000001a74 <+180>:    0f 84 96 00 00 00    je     0x1b10 <count_pairs+336>

11        total += (load16(data + i) == check);
   0x0000000000001a7a <+186>:    45 31 d2    xor    %r10d,%r10d
   0x0000000000001a7d <+189>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001a81 <+193>:    41 0f 94 c2    sete   %r10b
   0x0000000000001a85 <+197>:    4c 01 d0    add    %r10,%rax
   0x0000000000001a88 <+200>:    45 31 db    xor    %r11d,%r11d
   0x0000000000001a8b <+203>:    66 44 39 47 01    cmp    %r8w,0x1(%rdi)
   0x0000000000001a90 <+208>:    41 0f 94 c3    sete   %r11b
   0x0000000000001a94 <+212>:    4c 01 d8    add    %r11,%rax
   0x0000000000001a97 <+215>:    31 d2    xor    %edx,%edx
   0x0000000000001a99 <+217>:    66 44 39 47 02    cmp    %r8w,0x2(%rdi)
   0x0000000000001a9e <+222>:    0f 94 c2    sete   %dl
   0x0000000000001aa1 <+225>:    48 01 d0    add    %rdx,%rax
   0x0000000000001aa4 <+228>:    31 c9    xor    %ecx,%ecx
   0x0000000000001aa6 <+230>:    66 44 39 47 03    cmp    %r8w,0x3(%rdi)
   0x0000000000001aab <+235>:    0f 94 c1    sete   %cl
   0x0000000000001aae <+238>:    48 01 c8    add    %rcx,%rax
   0x0000000000001ab1 <+241>:    45 31 c9    xor    %r9d,%r9d
   0x0000000000001ab4 <+244>:    66 44 39 47 04    cmp    %r8w,0x4(%rdi)
   0x0000000000001ab9 <+249>:    41 0f 94 c1    sete   %r9b
   0x0000000000001abd <+253>:    4c 01 c8    add    %r9,%rax
   0x0000000000001ac0 <+256>:    45 31 d2    xor    %r10d,%r10d
   0x0000000000001ac3 <+259>:    66 44 39 47 05    cmp    %r8w,0x5(%rdi)
   0x0000000000001ac8 <+264>:    41 0f 94 c2    sete   %r10b
   0x0000000000001acc <+268>:    4c 01 d0    add    %r10,%rax
   0x0000000000001acf <+271>:    45 31 db    xor    %r11d,%r11d
   0x0000000000001ad2 <+274>:    66 44 39 47 06    cmp    %r8w,0x6(%rdi)
   0x0000000000001ad7 <+279>:    41 0f 94 c3    sete   %r11b
   0x0000000000001adb <+283>:    4c 01 d8    add    %r11,%rax
   0x0000000000001ade <+286>:    31 d2    xor    %edx,%edx
   0x0000000000001ae0 <+288>:    66 44 39 47 07    cmp    %r8w,0x7(%rdi)
   0x0000000000001ae5 <+293>:    0f 94 c2    sete   %dl
   0x0000000000001ae8 <+296>:    48 83 c7 08    add    $0x8,%rdi
   0x0000000000001aec <+300>:    48 01 d0    add    %rdx,%rax

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001aef <+303>:    48 39 fe    cmp    %rdi,%rsi
   0x0000000000001af2 <+306>:    75 86    jne    0x1a7a <count_pairs+186>

12      }
13      return total;
   0x0000000000001af4 <+308>:    c3    retq   
   0x0000000000001af5 <+309>:    0f 1f 00    nopl   (%rax)

11        total += (load16(data + i) == check);
   0x0000000000001af8 <+312>:    31 c0    xor    %eax,%eax
   0x0000000000001afa <+314>:    66 44 39 07    cmp    %r8w,(%rdi)
   0x0000000000001afe <+318>:    0f 94 c0    sete   %al

10      for (uint64_t i = 0; i < size * 2 - 1; i++) {
   0x0000000000001b01 <+321>:    48 ff c7    inc    %rdi
   0x0000000000001b04 <+324>:    e9 06 ff ff ff    jmpq   0x1a0f <count_pairs+79>
   0x0000000000001b09 <+329>:    0f 1f 80 00 00 00 00    nopl   0x0(%rax)
   0x0000000000001b10 <+336>:    c3    retq   
End of assembler dump.
```

`-march=native` was not able to help

| Version | Problem 1 | Problem 2|
|--- | --- | -- |
| unoptimized |      1045    |3276 |
| -O3 |             301      |587 |
| -O3 -funroll-loops |  181  |324 |
| -march=native |    116     |325|

Here the work is about twice as much, so the first two numbers seem reasonable, but now the compiler is unable to gain benefit from vectorization.

This is because without the alignment restriction this is a much messier problem.



However, we can still do it by hand.

```c
// Count the byte positions at which two adjacent bytes both equal target,
// processing 32 bytes per iteration with AVX2 intrinsics.
// size is the number of 16-bit elements, so size * 2 bytes are scanned.
// NOTE: every iteration loads a full 32 bytes, so when size * 2 is not a
// multiple of 32 the final load reads past the end of the scanned range.
// NOTE(review): _mm256_load_si256 is the aligned load, so data is presumably
// expected to be 32-byte aligned - confirm against the caller.
uint64_t count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
  uint64_t total = 0;
  // 1 if the last byte of the previous 32-byte block matched, otherwise 0
  uint32_t last_bit = 0;
  // the target byte replicated into all 32 byte positions
  __m256i compare = _mm256_set1_epi8(target);
  for (uint64_t i = 0; i < size * 2; i += 32) {
    // one bit per byte of this block, set where the byte equals target
    uint32_t block = _mm256_movemask_epi8( _mm256_cmpeq_epi8(_mm256_load_si256((__m256i *)(data + i)), compare));
    // a bit survives the AND only if it and the next higher bit are both
    // set, i.e. two adjacent bytes inside this block matched
    total += __builtin_popcount(block & (block >> 1U));
    if (last_bit) {
      // pair that straddles the block boundary: the previous block ended
      // with a match and bit 0 (the first byte) of this block matches too
      total += last_bit & block;
    }
    // remember whether the last byte of this block matched
    last_bit = block >> 31U;
  }
  return total;
}
```

Let us walk through the loop together

We first do 
```c++
uint32_t block = _mm256_movemask_epi8(_mm256_cmpeq_epi8(_mm256_load_si256((__m256i *)(data + i)), compare));
```
Which tells us which byte is equal to the target byte and moves this data into a normal 32-bit register

We then do 
`block & (block >> 1U)`
This uses bit-level parallelism and results in a 1 exactly where both that location and the location to its left in the mask (the next byte in memory) are equal to the target byte, which is exactly what we are looking for.

Then we use pop count to see how many of these pairs there were.

Lastly we need to do some extra bookkeeping to manage when pairs go between vectors of data.


| Version | Problem 1 | Problem 2|
|--- | --- | -- |
| unoptimized |      1045    |3276 |
| -O3 |             301      |587 |
| -O3 -funroll-loops |  181  |324 |
| -march=native |    116     |325|
| manual |          79         | 88|

```assembly
Dump of assembler code for function count_pairs:
ex2b.c:
8    count_pairs(uint8_t *data, uint64_t size, uint8_t target) {
   0x00000000000019c0 <+0>:    f3 0f 1e fa    endbr64 
   0x00000000000019c4 <+4>:    48 89 f9    mov    %rdi,%rcx
   0x00000000000019c7 <+7>:    48 89 f7    mov    %rsi,%rdi
   0x00000000000019ca <+10>:    62 f2 7d 28 7a c2    vpbroadcastb %edx,%ymm0ex2b.c:
12      for (uint64_t i = 0; i < size * 2; i += 32) {
   0x00000000000019d0 <+16>:    48 01 ff    add    %rdi,%rdi
   0x00000000000019d3 <+19>:    0f 84 27 02 00 00    je     0x1c00 <count_pairs+576>
   0x00000000000019d9 <+25>:    c5 fd 74 09    vpcmpeqb (%rcx),%ymm0,%ymm1
   0x00000000000019dd <+29>:    c5 fd d7 c1    vpmovmskb %ymm1,%eaxex2b.c:
15        total += __builtin_popcount(block & (block >> 1U));
   0x00000000000019e1 <+33>:    41 89 c0    mov    %eax,%r8d
   0x00000000000019e4 <+36>:    41 d1 e8    shr    %r8d
   0x00000000000019e7 <+39>:    41 21 c0    and    %eax,%r8d
   0x00000000000019ea <+42>:    f3 45 0f b8 c0    popcnt %r8d,%r8d17          total += last_bit & block;
18        }
19        last_bit = block >> 31U;
   0x00000000000019ef <+47>:    c1 e8 1f    shr    $0x1f,%eax12      for (uint64_t i = 0; i < size * 2; i += 32) {
   0x00000000000019f2 <+50>:    48 83 ff 20    cmp    $0x20,%rdi
   0x00000000000019f6 <+54>:    0f 86 fa 01 00 00    jbe    0x1bf6 <count_pairs+566>
   0x00000000000019fc <+60>:    c5 fd 74 51 20    vpcmpeqb 0x20(%rcx),%ymm0,%ymm2
   0x0000000000001a01 <+65>:    45 31 d2    xor    %r10d,%r10d
   0x0000000000001a04 <+68>:    4c 8d 4f df    lea    -0x21(%rdi),%r9
   0x0000000000001a08 <+72>:    c5 fd d7 d2    vpmovmskb %ymm2,%edx
   0x0000000000001a0c <+76>:    89 d6    mov    %edx,%esi
   0x0000000000001a0e <+78>:    d1 ee    shr    %esi
   0x0000000000001a10 <+80>:    21 d6    and    %edx,%esi
   0x0000000000001a12 <+82>:    f3 44 0f b8 d6    popcnt %esi,%r10d
   0x0000000000001a17 <+87>:    89 d6    mov    %edx,%esi
   0x0000000000001a19 <+89>:    4d 01 d0    add    %r10,%r8
   0x0000000000001a1c <+92>:    49 c1 e9 05    shr    $0x5,%r9
   0x0000000000001a20 <+96>:    83 e6 01    and    $0x1,%esi
   0x0000000000001a23 <+99>:    4c 01 c6    add    %r8,%rsi
   0x0000000000001a26 <+102>:    4d 89 c3    mov    %r8,%r11
   0x0000000000001a29 <+105>:    41 83 e1 03    and    $0x3,%r9d16        if (last_bit) {
   0x0000000000001a2d <+109>:    85 c0    test   %eax,%eax
   0x0000000000001a2f <+111>:    4c 0f 45 de    cmovne %rsi,%r11
   0x0000000000001a33 <+115>:    c1 ea 1f    shr    $0x1f,%edx
   0x0000000000001a36 <+118>:    4d 89 d8    mov    %r11,%r817          total += last_bit & block;
18        }
19        last_bit = block >> 31U;
   0x0000000000001a39 <+121>:    41 bb 40 00 00 00    mov    $0x40,%r11d12      for (uint64_t i = 0; i < size * 2; i += 32) {
   0x0000000000001a3f <+127>:    48 83 ff 40    cmp    $0x40,%rdi
   0x0000000000001a43 <+131>:    0f 86 ad 01 00 00    jbe    0x1bf6 <count_pairs+566>
   0x0000000000001a49 <+137>:    4d 85 c9    test   %r9,%r9
   0x0000000000001a4c <+140>:    0f 84 c9 00 00 00    je     0x1b1b <count_pairs+347>
   0x0000000000001a52 <+146>:    49 83 f9 01    cmp    $0x1,%r9
   0x0000000000001a56 <+150>:    74 7e    je     0x1ad6 <count_pairs+278>
   0x0000000000001a58 <+152>:    49 83 f9 02    cmp    $0x2,%r9
   0x0000000000001a5c <+156>:    74 3c    je     0x1a9a <count_pairs+218>
   0x0000000000001a5e <+158>:    c4 a1 7d 74 1c 19    vpcmpeqb (%rcx,%r11,1),%ymm0,%ymm3
   0x0000000000001a64 <+164>:    45 31 d2    xor    %r10d,%r10d
   0x0000000000001a67 <+167>:    c5 7d d7 cb    vpmovmskb %ymm3,%r9dex2b.c:
15        total += __builtin_popcount(block & (block >> 1U));
   0x0000000000001a6b <+171>:    44 89 c8    mov    %r9d,%eax
   0x0000000000001a6e <+174>:    d1 e8    shr    %eax
   0x0000000000001a70 <+176>:    44 21 c8    and    %r9d,%eax
   0x0000000000001a73 <+179>:    f3 44 0f b8 d0    popcnt %eax,%r10d
   0x0000000000001a78 <+184>:    44 89 c8    mov    %r9d,%eax
   0x0000000000001a7b <+187>:    4d 01 d0    add    %r10,%r816        if (last_bit) {
   0x0000000000001a7e <+190>:    83 e0 01    and    $0x1,%eax
   0x0000000000001a81 <+193>:    4c 01 c0    add    %r8,%rax
   0x0000000000001a84 <+196>:    4c 89 c6    mov    %r8,%rsi
   0x0000000000001a87 <+199>:    85 d2    test   %edx,%edx
   0x0000000000001a89 <+201>:    48 0f 45 f0    cmovne %rax,%rsi
   0x0000000000001a8d <+205>:    44 89 ca    mov    %r9d,%edx
   0x0000000000001a90 <+208>:    49 89 f0    mov    %rsi,%r817          total += last_bit & block;
18        }
19        last_bit = block >> 31U;
   0x0000000000001a93 <+211>:    c1 ea 1f    shr    $0x1f,%edx12      for (uint64_t i = 0; i < size * 2; i += 32) {
   0x0000000000001a96 <+214>:    49 83 c3 20    add    $0x20,%r11
   0x0000000000001a9a <+218>:    c4 a1 7d 74 24 19    vpcmpeqb (%rcx,%r11,1),%ymm0,%ymm4
   0x0000000000001aa0 <+224>:    31 f6    xor    %esi,%esi
   0x0000000000001aa2 <+226>:    c5 7d d7 cc    vpmovmskb %ymm4,%r9dex2b.c:
15        total += __builtin_popcount(block & (block >> 1U));
   0x0000000000001aa6 <+230>:    45 89 ca    mov    %r9d,%r10d
   0x0000000000001aa9 <+233>:    41 d1 ea    shr    %r10d
   0x0000000000001aac <+236>:    45 21 ca    and    %r9d,%r10d
   0x0000000000001aaf <+239>:    44 89 c8    mov    %r9d,%eax
   0x0000000000001ab2 <+242>:    f3 41 0f b8 f2    popcnt %r10d,%esi
   0x0000000000001ab7 <+247>:    83 e0 01    and    $0x1,%eax
   0x0000000000001aba <+250>:    49 01 f0    add    %rsi,%r816        if (last_bit) {
   0x0000000000001abd <+253>:    4c 01 c0    add    %r8,%rax
   0x0000000000001ac0 <+256>:    4d 89 c2    mov    %r8,%r10
   0x0000000000001ac3 <+259>:    85 d2    test   %edx,%edx
   0x0000000000001ac5 <+261>:    4c 0f 45 d0    cmovne %rax,%r10
   0x0000000000001ac9 <+265>:    44 89 ca    mov    %r9d,%edx
   0x0000000000001acc <+268>:    4d 89 d0    mov    %r10,%r817          total += last_bit & block;
18        }
19        last_bit = block >> 31U;
   0x0000000000001acf <+271>:    c1 ea 1f    shr    $0x1f,%edx12      for (uint64_t i = 0; i < size * 2; i += 32) {
   0x0000000000001ad2 <+274>:    49 83 c3 20    add    $0x20,%r11
   0x0000000000001ad6 <+278>:    c4 a1 7d 74 2c 19    vpcmpeqb (%rcx,%r11,1),%ymm0,%ymm5
   0x0000000000001adc <+284>:    45 31 d2    xor    %r10d,%r10d
   0x0000000000001adf <+287>:    c5 7d d7 cd    vpmovmskb %ymm5,%r9dex2b.c:
15        total += __builtin_popcount(block & (block >> 1U));
   0x0000000000001ae3 <+291>:    44 89 ce    mov    %r9d,%esi
   0x0000000000001ae6 <+294>:    d1 ee    shr    %esi
   0x0000000000001ae8 <+296>:    44 21 ce    and    %r9d,%esi
   0x0000000000001aeb <+299>:    44 89 c8    mov    %r9d,%eax
   0x0000000000001aee <+302>:    f3 44 0f b8 d6    popcnt %esi,%r10d
   0x0000000000001af3 <+307>:    83 e0 01    and    $0x1,%eax
   0x0000000000001af6 <+310>:    4d 01 d0    add    %r10,%r816        if (last_bit) {
   0x0000000000001af9 <+313>:    4c 01 c0    add    %r8,%rax
   0x0000000000001afc <+316>:    4c 89 c6    mov    %r8,%rsi
   0x0000000000001aff <+319>:    85 d2    test   %edx,%edx
   0x0000000000001b01 <+321>:    48 0f 45 f0    cmovne %rax,%rsi
   0x0000000000001b05 <+325>:    44 89 ca    mov    %r9d,%edx
   0x0000000000001b08 <+328>:    49 83 c3 20    add    $0x20,%r11
   0x0000000000001b0c <+332>:    49 89 f0    mov    %rsi,%r817          total += last_bit & block;
18        }
19        last_bit = block >> 31U;
   0x0000000000001b0f <+335>:    c1 ea 1f    shr    $0x1f,%edx12      for (uint64_t i = 0; i < size * 2; i += 32) {
   0x0000000000001b12 <+338>:    4c 39 df    cmp    %r11,%rdi
   0x0000000000001b15 <+341>:    0f 86 db 00 00 00    jbe    0x1bf6 <count_pairs+566>
   0x0000000000001b1b <+347>:    c4 a1 7d 74 34 19    vpcmpeqb (%rcx,%r11,1),%ymm0,%ymm6
   0x0000000000001b21 <+353>:    c4 a1 7d 74 7c 19 20    vpcmpeqb 0x20(%rcx,%r11,1),%ymm0,%ymm7
   0x0000000000001b28 <+360>:    31 c0    xor    %eax,%eax
   0x0000000000001b2a <+362>:    c5 7d d7 d6    vpmovmskb %ymm6,%r10dex2b.c:
15        total += __builtin_popcount(block & (block >> 1U));
   0x0000000000001b2e <+366>:    45 89 d1    mov    %r10d,%r9d
   0x0000000000001b31 <+369>:    41 d1 e9    shr    %r9d
   0x0000000000001b34 <+372>:    45 21 d1    and    %r10d,%r9d
   0x0000000000001b37 <+375>:    f3 41 0f b8 c1    popcnt %r9d,%eax
   0x0000000000001b3c <+380>:    4c 01 c0    add    %r8,%rax16        if (last_bit) {
   0x0000000000001b3f <+383>:    45 89 d0    mov    %r10d,%r8d
   0x0000000000001b42 <+386>:    4d 8d 4b 20    lea    0x20(%r11),%r9
   0x0000000000001b46 <+390>:    41 83 e0 01    and    $0x1,%r8d
   0x0000000000001b4a <+394>:    c5 7d d7 df    vpmovmskb %ymm7,%r11d
   0x0000000000001b4e <+398>:    48 89 c6    mov    %rax,%rsi
   0x0000000000001b51 <+401>:    c4 21 7d 74 44 09 20    vpcmpeqb 0x20(%rcx,%r9,1),%ymm0,%ymm8
   0x0000000000001b58 <+408>:    4c 01 c0    add    %r8,%rax
   0x0000000000001b5b <+411>:    85 d2    test   %edx,%edx
   0x0000000000001b5d <+413>:    44 89 da    mov    %r11d,%edx
   0x0000000000001b60 <+416>:    48 0f 44 c6    cmove  %rsi,%rax15        total += __builtin_popcount(block & (block >> 1U));
   0x0000000000001b64 <+420>:    d1 ea    shr    %edx
   0x0000000000001b66 <+422>:    44 21 da    and    %r11d,%edx
   0x0000000000001b69 <+425>:    31 f6    xor    %esi,%esi
   0x0000000000001b6b <+427>:    45 89 d8    mov    %r11d,%r8d
   0x0000000000001b6e <+430>:    f3 0f b8 f2    popcnt %edx,%esi
   0x0000000000001b72 <+434>:    41 83 e0 01    and    $0x1,%r8d
   0x0000000000001b76 <+438>:    48 01 f0    add    %rsi,%rax16        if (last_bit) {
   0x0000000000001b79 <+441>:    c4 c1 7d d7 f0    vpmovmskb %ymm8,%esi
   0x0000000000001b7e <+446>:    48 89 c2    mov    %rax,%rdx
   0x0000000000001b81 <+449>:    4c 01 c0    add    %r8,%rax
   0x0000000000001b84 <+452>:    45 85 d2    test   %r10d,%r10d
   0x0000000000001b87 <+455>:    41 89 f2    mov    %esi,%r10d
   0x0000000000001b8a <+458>:    48 0f 49 c2    cmovns %rdx,%rax15        total += __builtin_popcount(block & (block >> 1U));
   0x0000000000001b8e <+462>:    c4 21 7d 74 4c 09 40    vpcmpeqb 0x40(%rcx,%r9,1),%ymm0,%ymm9
   0x0000000000001b95 <+469>:    41 d1 ea    shr    %r10d
   0x0000000000001b98 <+472>:    41 21 f2    and    %esi,%r10d
   0x0000000000001b9b <+475>:    31 d2    xor    %edx,%edx
   0x0000000000001b9d <+477>:    f3 41 0f b8 d2    popcnt %r10d,%edx
   0x0000000000001ba2 <+482>:    41 89 f2    mov    %esi,%r10d
   0x0000000000001ba5 <+485>:    48 01 d0    add    %rdx,%rax16        if (last_bit) {
   0x0000000000001ba8 <+488>:    41 83 e2 01    and    $0x1,%r10d
   0x0000000000001bac <+492>:    c4 c1 7d d7 d1    vpmovmskb %ymm9,%edx
   0x0000000000001bb1 <+497>:    49 89 c0    mov    %rax,%r8
   0x0000000000001bb4 <+500>:    4c 01 d0    add    %r10,%rax
   0x0000000000001bb7 <+503>:    45 85 db    test   %r11d,%r11d
   0x0000000000001bba <+506>:    41 89 d3    mov    %edx,%r11d
   0x0000000000001bbd <+509>:    49 0f 49 c0    cmovns %r8,%rax15        total += __builtin_popcount(block & (block >> 1U));
   0x0000000000001bc1 <+513>:    41 d1 eb    shr    %r11d
   0x0000000000001bc4 <+516>:    41 21 d3    and    %edx,%r11d
   0x0000000000001bc7 <+519>:    45 31 c0    xor    %r8d,%r8d
   0x0000000000001bca <+522>:    f3 45 0f b8 c3    popcnt %r11d,%r8d
   0x0000000000001bcf <+527>:    49 01 c0    add    %rax,%r816        if (last_bit) {
   0x0000000000001bd2 <+530>:    89 d0    mov    %edx,%eax
   0x0000000000001bd4 <+532>:    83 e0 01    and    $0x1,%eax
   0x0000000000001bd7 <+535>:    4c 01 c0    add    %r8,%rax
   0x0000000000001bda <+538>:    4d 89 c2    mov    %r8,%r10
   0x0000000000001bdd <+541>:    85 f6    test   %esi,%esi
   0x0000000000001bdf <+543>:    4c 0f 48 d0    cmovs  %rax,%r10
   0x0000000000001be3 <+547>:    4d 8d 59 60    lea    0x60(%r9),%r11
   0x0000000000001be7 <+551>:    4d 89 d0    mov    %r10,%r817          total += last_bit & block;
18        }
19        last_bit = block >> 31U;
   0x0000000000001bea <+554>:    c1 ea 1f    shr    $0x1f,%edx12      for (uint64_t i = 0; i < size * 2; i += 32) {
   0x0000000000001bed <+557>:    4c 39 df    cmp    %r11,%rdi
   0x0000000000001bf0 <+560>:    0f 87 25 ff ff ff    ja     0x1b1b <count_pairs+347>
   0x0000000000001bf6 <+566>:    4c 89 c0    mov    %r8,%rax
   0x0000000000001bf9 <+569>:    c5 f8 77    vzeroupper 
   0x0000000000001bfc <+572>:    c3    retq   
   0x0000000000001bfd <+573>:    0f 1f 00    nopl   (%rax)
   0x0000000000001c00 <+576>:    45 31 c0    xor    %r8d,%r8d
   0x0000000000001c03 <+579>:    4c 89 c0    mov    %r8,%rax
   0x0000000000001c06 <+582>:    c5 f8 77    vzeroupper 
   0x0000000000001c09 <+585>:    c3    retq   
End of assembler dump.
```

So in total in this version manual vectorization was very worth it since it achieved almost 4x speedup over what the compiler was able to achieve.

| Version | Problem 1 | Problem 2|
|--- | --- | -- |
| unoptimized |      1045    |3276 |
| -O3 |             301      |587 |
| -O3 -funroll-loops |  181  |324 |
| -march=native |    116     |325|
| manual |          79         | 88|

In general, whenever you think about manually vectorizing, it is worth weighing the extra cost in implementation, maintainability, and generality against the speedup that you can get.

# Summary 

- vectorization can help us speed up our code
- the compiler can help us with many forms of vectorization
- when the compiler can't help us we can do it manually using intrinsics



# More Bit and Byte Level Parallelism

## Compression

Let's say we want to store an array of integers using less space by compressing them.

Most commonly used integers use less than the full 64 bits, so we may be able to store them much smaller.

A standard integer is stored as follows (I will be using little endian for examples)

![Little Endian](source/little_endian.JPG "Little Endian")

For example, instead of storing full integers we can use what is known as variable-length encoding (also called variable-byte or varint encoding)

Instead of always storing all 8 bytes, we can instead use the bottom 7 bits of each byte to store actual data, then the top bit of each byte to store whether we are done with the integer or need to keep reading.

This way, if the integer can be represented in 49 bits or fewer, we are able to use 7 bytes or fewer and save space.  This method does take more space if the integer needs more than 56 bits for its data.

Another thing we can do is sort the numbers, and only store the differences instead of the actual numbers to make the average size of the elements we are storing smaller. 

Let's look at some code for how we can decode an element.
At the end we want to know what the difference we found was, and how many bytes its encoding occupied.

```c
// Decodes one variable-length encoded difference starting at `loc`, one byte
// at a time.
//
// Every encoded byte carries 7 data bits in its low bits; a set top bit means
// another byte follows. Bytes are stored least-significant group first.
//
// Returns the decoded value in `difference` and the number of bytes consumed
// in `old_size`. A single zero byte is special-cased: it yields difference 0
// with old_size 0.
decode_return Decode(const uint8_t *loc) {
  decode_return ret;
  // Single-byte encoding: top bit clear, so the byte is the value itself.
  // old_size becomes 0 for a zero byte and 1 for anything else.
  if (!(*loc & 0x80UL)) {
    ret.old_size = *loc > 0;
    ret.difference = *loc;
    return ret;
  }
  // Multi-byte encoding: seed the result with the first byte's 7 data bits.
  ret.difference = *loc & 0x7FUL;
  ret.old_size = 1;
  // Each further byte contributes 7 more bits, placed 7 positions higher
  // than the previous group. At least one further byte is always read,
  // because the first byte announced a continuation.
  for (uint64_t shift = 7;; shift += 7) {
    ++loc;
    ret.difference |= (*loc & 0x7FUL) << shift;
    ++ret.old_size;
    // A clear top bit marks the final byte of this value.
    if (!(*loc & 0x80UL)) {
      break;
    }
  }
  return ret;
}
```

As an experiment we are going to decode and sum up the numbers stored in a compressed array.  Our array will store 1 million elements; the number of bytes each element takes to store its difference is chosen uniformly at random between 1 and 6.

| Version | time to process 1 million elements in microseconds |
|--- | --- |
| unoptimized | 19898 |
| -O3 | 8595 |
| -march=native | 8239|

What if, instead of having to read these bytes in one at a time, we could read them all in parallel?

We will use some features from the BMI2 instruction set which allows us to do parallel operations on all bits in a word

We will use two different instructions to help us.

The first is `_pext_u64`, which takes in a 64-bit integer and a 64-bit pattern; it extracts the specified bits from the integer and puts them in the contiguous low bits of the output.

The second is `__tzcnt_u64` which returns the number of trailing zeros in an integer


```c

// Bit-selection patterns for _pext_u64, indexed by the number of continuation
// bytes in an encoded value: entry k selects the low 7 bits of each of the
// first k + 1 bytes of a 64-bit word.
//
// The table is read-only, hence const. It has one entry for every length that
// fits in a single 64-bit load (1 to 8 bytes), so an 8-byte encoding does not
// index past the end. Encodings longer than 8 bytes are not covered.
static const uint64_t extract_masks[] = {
    0x000000000000007FUL, 0x0000000000007F7FUL, 0x00000000007F7F7FUL,
    0x000000007F7F7F7FUL, 0x0000007F7F7F7F7FUL, 0x00007F7F7F7F7F7FUL,
    0x007F7F7F7F7F7F7FUL, 0x7F7F7F7F7F7F7F7FUL};


// Decodes one variable-length encoded difference starting at `loc` using
// BMI2 bit extraction instead of a byte-by-byte loop.
//
// Returns the decoded value in `difference` and the number of bytes consumed
// in `old_size`. A single zero byte yields difference 0 with old_size 0.
//
// NOTE(review): load64 is defined elsewhere; presumably it reads 8 bytes
// starting at `loc`, so 8 bytes must be readable there even when the encoded
// value is shorter -- confirm against its definition.
// The length derived below indexes extract_masks without a range check, so
// encodings longer than the table covers are not supported.
decode_return Decode(const uint8_t *loc) {
  decode_return ret;
  // Single-byte encoding (top bit clear): the byte is the value itself.
  // old_size becomes 0 for a zero byte and 1 for anything else.
  if (!(*loc & 0x80UL)) {
    ret.old_size = *loc > 0;
    ret.difference = *loc;
    return ret;
  }
  // Fetch a whole 64-bit word so every byte can be inspected at once.
  const uint64_t word = load64(loc);

  // Gather the top (continuation) bit of each byte into the low bits.
  const uint64_t continuation_bits = _pext_u64(word, 0x8080808080808080UL);

  // The lowest clear continuation bit marks the final byte; its position is
  // the number of bytes that are followed by another one.
  const int32_t extra_bytes = __tzcnt_u64(~continuation_bits);

  // Pull the 7 data bits out of exactly the bytes that belong to this value.
  ret.difference = _pext_u64(word, extract_masks[extra_bytes]);
  ret.old_size = extra_bytes + 1;

  return ret;
}
```


| Version | time to process 1 million elements in microseconds |
|--- | --- |
| unoptimized | 19898 |
| -O3 | 8595 |
| -march=native | 8239|
| manual1 | 5454|

We can actually do this slightly better by exploiting some pipeline parallelism

```c

// Masks applied AFTER the 7 data bits of every byte have been packed
// together: entry k keeps the low 7 * (k + 1) bits, i.e. the data bits of an
// encoding that occupies k + 1 bytes.
//
// Written in hexadecimal because 0b binary literals are a compiler extension
// before C23. The table is read-only, hence const, and has one entry for
// every length that fits in a single 64-bit load (1 to 8 bytes). Encodings
// longer than 8 bytes are not covered.
static const uint64_t extract_masks2[] = {
    0x000000000000007FUL,  //  7 bits
    0x0000000000003FFFUL,  // 14 bits
    0x00000000001FFFFFUL,  // 21 bits
    0x000000000FFFFFFFUL,  // 28 bits
    0x00000007FFFFFFFFUL,  // 35 bits
    0x000003FFFFFFFFFFUL,  // 42 bits
    0x0001FFFFFFFFFFFFUL,  // 49 bits
    0x00FFFFFFFFFFFFFFUL   // 56 bits
    };


// Decodes one variable-length encoded difference starting at `loc`, arranged
// so that the two bit extractions do not depend on each other.
//
// Returns the decoded value in `difference` and the number of bytes consumed
// in `old_size`. A single zero byte yields difference 0 with old_size 0.
//
// NOTE(review): load64 is defined elsewhere; presumably it reads 8 bytes
// starting at `loc`, so 8 bytes must be readable there even when the encoded
// value is shorter -- confirm against its definition.
// The length derived below indexes extract_masks2 without a range check, so
// encodings longer than the table covers are not supported.
decode_return Decode(const uint8_t *loc) {
  decode_return ret;
  // Single-byte encoding (top bit clear): the byte is the value itself.
  // old_size becomes 0 for a zero byte and 1 for anything else.
  if (!(*loc & 0x80UL)) {
    ret.old_size = *loc > 0;
    ret.difference = *loc;
    return ret;
  }
  // Fetch a whole 64-bit word so every byte can be inspected at once.
  const uint64_t word = load64(loc);

  // Gather the top (continuation) bit of each byte into the low bits.
  const uint64_t continuation_bits = _pext_u64(word, 0x8080808080808080UL);

  // The lowest clear continuation bit marks the final byte; its position is
  // the number of bytes that are followed by another one.
  const int32_t extra_bytes = __tzcnt_u64(~continuation_bits);

  // Pack the low 7 bits of all eight bytes using a fixed pattern. Because the
  // pattern no longer comes from the length computed above, this extraction
  // is not dependent on the __tzcnt_u64 result and the hardware can overlap
  // the two.
  const uint64_t packed = _pext_u64(word, 0x7F7F7F7F7F7F7F7FUL);

  // Drop the packed bits that belong to bytes past the end of this value.
  // The masks differ from the byte-wise ones because they apply to the
  // already-packed bits.
  ret.difference = packed & extract_masks2[extra_bytes];
  ret.old_size = extra_bytes + 1;

  return ret;
}
```


| Version | time to process 1 million elements in microseconds |
|--- | --- |
| unoptimized | 19898 |
| -O3 | 8595 |
| -march=native | 8239|
| manual1 | 5454|
| manual2 | 5185|