In [1]:
import numpy as np
from numba import jit

In [2]:
# Benchmark configuration: vector length and RNG seed.
N = 1_000_000
SEED = 0

# A seeded generator makes the input vectors identical on every
# Restart & Run All, so timings are always measured on the same data.
rng = np.random.default_rng(SEED)
a = rng.random(N)
b = rng.random(N)

# Without using Numba

## Normal for loop

In [3]:
def normal_for_dot(a, b):
    """Compute the dot product of ``a`` and ``b`` with a plain ``for`` loop.

    The loop bound is taken from ``a`` itself rather than from the
    notebook-level ``N``, so the result is correct for inputs of any length
    and does not depend on kernel state.

    Args:
        a: Indexable sequence of numbers (e.g. a 1-D NumPy array).
        b: Indexable sequence with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``; ``0`` when
        ``a`` is empty.
    """
    # Named ``total`` so the built-in ``sum`` is not shadowed.
    total = 0

    for i in range(len(a)):
        total += a[i] * b[i]

    return total

%timeit normal_for_dot(a, b)

141 ms ± 2.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Normal while loop

In [4]:
def normal_while_dot(a, b):
    """Compute the dot product of ``a`` and ``b`` with a plain ``while`` loop.

    The loop bound is taken from ``a`` itself rather than from the
    notebook-level ``N``, so the result is correct for inputs of any length.

    Args:
        a: Indexable sequence of numbers (e.g. a 1-D NumPy array).
        b: Indexable sequence with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``; ``0`` when
        ``a`` is empty.
    """
    n = len(a)
    i = 0
    # Named ``total`` so the built-in ``sum`` is not shadowed.
    total = 0

    while i < n:
        total += a[i] * b[i]

        i += 1

    return total

%timeit normal_while_dot(a, b)

169 ms ± 1.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## While loop with 2-step unrolling (single sum)

In [5]:
def step_2_while_dot(a, b):
    """Dot product with a 2-way unrolled ``while`` loop and one accumulator.

    The length comes from ``a`` instead of the notebook-level ``N``, and a
    tail loop handles the last element when the length is odd, so no element
    is silently dropped.

    Args:
        a: Indexable sequence of numbers (e.g. a 1-D NumPy array).
        b: Indexable sequence with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``; ``0`` when
        ``a`` is empty.
    """
    n = len(a)
    # Number of full 2-element groups; computed once instead of per iteration.
    groups = n // 2
    i = 0
    total = 0

    while i < groups:
        total += a[2 * i + 0] * b[2 * i + 0]
        total += a[2 * i + 1] * b[2 * i + 1]

        i += 1

    # Tail: the element left over when n is not a multiple of 2.
    i = 2 * groups
    while i < n:
        total += a[i] * b[i]
        i += 1

    return total

%timeit step_2_while_dot(a, b)

224 ms ± 4.97 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## While loop with 2-step unrolling (multiple sum)

In [6]:
def step_2_while_dot(a, b):
    """Dot product with a 2-way unrolled ``while`` loop and two accumulators.

    Each unrolled lane adds into its own partial sum; the partial sums are
    combined at the end. The length comes from ``a`` instead of the
    notebook-level ``N``, and a tail loop handles the last element when the
    length is odd.

    Args:
        a: Indexable sequence of numbers (e.g. a 1-D NumPy array).
        b: Indexable sequence with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``; ``0`` when
        ``a`` is empty.
    """
    n = len(a)
    # Number of full 2-element groups; computed once instead of per iteration.
    groups = n // 2
    i = 0
    sum0 = 0
    sum1 = 0

    while i < groups:
        sum0 += a[2 * i + 0] * b[2 * i + 0]
        sum1 += a[2 * i + 1] * b[2 * i + 1]

        i += 1

    # Tail: fold the leftover element into the first accumulator.
    i = 2 * groups
    while i < n:
        sum0 += a[i] * b[i]
        i += 1

    return sum0 + sum1

%timeit step_2_while_dot(a, b)

217 ms ± 1.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## While loop with 4-step unrolling (single sum)

In [7]:
def step_4_while_dot(a, b):
    """Dot product with a 4-way unrolled ``while`` loop and one accumulator.

    The length comes from ``a`` instead of the notebook-level ``N``, and a
    tail loop handles the up-to-3 elements left over when the length is not
    a multiple of 4, so no element is silently dropped.

    Args:
        a: Indexable sequence of numbers (e.g. a 1-D NumPy array).
        b: Indexable sequence with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``; ``0`` when
        ``a`` is empty.
    """
    n = len(a)
    # Number of full 4-element groups; computed once instead of per iteration.
    groups = n // 4
    i = 0
    total = 0

    while i < groups:
        total += a[4 * i + 0] * b[4 * i + 0]
        total += a[4 * i + 1] * b[4 * i + 1]
        total += a[4 * i + 2] * b[4 * i + 2]
        total += a[4 * i + 3] * b[4 * i + 3]

        i += 1

    # Tail: elements left over when n is not a multiple of 4.
    i = 4 * groups
    while i < n:
        total += a[i] * b[i]
        i += 1

    return total

%timeit step_4_while_dot(a, b)

202 ms ± 1.26 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## While loop with 4-step unrolling (multiple sum)

In [8]:
def step_4_while_dot(a, b):
    """Dot product with a 4-way unrolled ``while`` loop and four accumulators.

    Each unrolled lane adds into its own partial sum; the partial sums are
    combined at the end. The length comes from ``a`` instead of the
    notebook-level ``N``, and a tail loop handles the up-to-3 elements left
    over when the length is not a multiple of 4.

    Args:
        a: Indexable sequence of numbers (e.g. a 1-D NumPy array).
        b: Indexable sequence with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``; ``0`` when
        ``a`` is empty.
    """
    n = len(a)
    # Number of full 4-element groups; computed once instead of per iteration.
    groups = n // 4
    i = 0
    sum0 = 0
    sum1 = 0
    sum2 = 0
    sum3 = 0

    while i < groups:
        sum0 += a[4 * i + 0] * b[4 * i + 0]
        sum1 += a[4 * i + 1] * b[4 * i + 1]
        sum2 += a[4 * i + 2] * b[4 * i + 2]
        sum3 += a[4 * i + 3] * b[4 * i + 3]

        i += 1

    # Tail: fold the leftover elements into the first accumulator.
    i = 4 * groups
    while i < n:
        sum0 += a[i] * b[i]
        i += 1

    return sum0 + sum1 + sum2 + sum3

%timeit step_4_while_dot(a, b)

199 ms ± 1.22 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## While loop with 8-step unrolling (single sum)

In [9]:
def step_8_while_dot(a, b):
    """Dot product with an 8-way unrolled ``while`` loop and one accumulator.

    The length comes from ``a`` instead of the notebook-level ``N``, and a
    tail loop handles the up-to-7 elements left over when the length is not
    a multiple of 8, so no element is silently dropped.

    Args:
        a: Indexable sequence of numbers (e.g. a 1-D NumPy array).
        b: Indexable sequence with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``; ``0`` when
        ``a`` is empty.
    """
    n = len(a)
    # Number of full 8-element groups; computed once instead of per iteration.
    groups = n // 8
    i = 0
    total = 0

    while i < groups:
        total += a[8 * i + 0] * b[8 * i + 0]
        total += a[8 * i + 1] * b[8 * i + 1]
        total += a[8 * i + 2] * b[8 * i + 2]
        total += a[8 * i + 3] * b[8 * i + 3]
        total += a[8 * i + 4] * b[8 * i + 4]
        total += a[8 * i + 5] * b[8 * i + 5]
        total += a[8 * i + 6] * b[8 * i + 6]
        total += a[8 * i + 7] * b[8 * i + 7]

        i += 1

    # Tail: elements left over when n is not a multiple of 8.
    i = 8 * groups
    while i < n:
        total += a[i] * b[i]
        i += 1

    return total

%timeit step_8_while_dot(a, b)

196 ms ± 781 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## While loop with 8-step unrolling (multiple sum)

In [10]:
def step_8_while_dot(a, b):
    """Dot product with an 8-way unrolled ``while`` loop and eight accumulators.

    Each unrolled lane adds into its own partial sum; the partial sums are
    combined at the end. The length comes from ``a`` instead of the
    notebook-level ``N``, and a tail loop handles the up-to-7 elements left
    over when the length is not a multiple of 8.

    Args:
        a: Indexable sequence of numbers (e.g. a 1-D NumPy array).
        b: Indexable sequence with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``; ``0`` when
        ``a`` is empty.
    """
    n = len(a)
    # Number of full 8-element groups; computed once instead of per iteration.
    groups = n // 8
    i = 0
    sum0 = 0
    sum1 = 0
    sum2 = 0
    sum3 = 0
    sum4 = 0
    sum5 = 0
    sum6 = 0
    sum7 = 0

    while i < groups:
        sum0 += a[8 * i + 0] * b[8 * i + 0]
        sum1 += a[8 * i + 1] * b[8 * i + 1]
        sum2 += a[8 * i + 2] * b[8 * i + 2]
        sum3 += a[8 * i + 3] * b[8 * i + 3]
        sum4 += a[8 * i + 4] * b[8 * i + 4]
        sum5 += a[8 * i + 5] * b[8 * i + 5]
        sum6 += a[8 * i + 6] * b[8 * i + 6]
        sum7 += a[8 * i + 7] * b[8 * i + 7]

        i += 1

    # Tail: fold the leftover elements into the first accumulator.
    i = 8 * groups
    while i < n:
        sum0 += a[i] * b[i]
        i += 1

    return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7

%timeit step_8_while_dot(a, b)

197 ms ± 422 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Numpy dot product

In [11]:
def numpy_dot(a, b):
    """Return the dot product of ``a`` and ``b`` as computed by ``np.dot``.

    Thin wrapper so NumPy's implementation can be timed through the same
    call shape as the hand-written loops.
    """
    product = np.dot(a, b)
    return product

%timeit numpy_dot(a, b)

515 µs ± 5.63 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# With using Numba

## Normal for loop

In [12]:
@jit(nopython=True)
def normal_for_dot(a, b):
    """Numba-compiled dot product of ``a`` and ``b`` using a plain ``for`` loop.

    The loop bound is ``len(a)`` rather than the notebook-level ``N``: Numba
    treats globals as compile-time constants, so reading ``N`` would freeze
    its value at first compilation and ignore later changes.

    Args:
        a: 1-D array of numbers.
        b: 1-D array with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``.
    """
    # Named ``total`` so the built-in ``sum`` is not shadowed.
    total = 0

    for i in range(len(a)):
        total += a[i] * b[i]

    return total

%timeit normal_for_dot(a, b)

1.04 ms ± 46.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## Normal while loop

In [13]:
@jit(nopython=True)
def normal_while_dot(a, b):
    """Numba-compiled dot product of ``a`` and ``b`` using a ``while`` loop.

    The loop bound is ``len(a)`` rather than the notebook-level ``N``: Numba
    treats globals as compile-time constants, so reading ``N`` would freeze
    its value at first compilation and ignore later changes.

    Args:
        a: 1-D array of numbers.
        b: 1-D array with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``.
    """
    n = len(a)
    i = 0
    # Named ``total`` so the built-in ``sum`` is not shadowed.
    total = 0

    while i < n:
        total += a[i] * b[i]

        i += 1

    return total

%timeit normal_while_dot(a, b)

980 µs ± 5.47 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## While loop with 2-step unrolling (single sum)

In [14]:
@jit(nopython=True)
def step_2_while_dot(a, b):
    """Numba-compiled dot product, 2-way unrolled with one accumulator.

    The length comes from ``a`` rather than the notebook-level ``N`` (which
    Numba would freeze as a compile-time constant), and a tail loop handles
    the last element when the length is odd.

    Args:
        a: 1-D array of numbers.
        b: 1-D array with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``.
    """
    n = len(a)
    # Number of full 2-element groups.
    groups = n // 2
    i = 0
    total = 0

    while i < groups:
        total += a[2 * i + 0] * b[2 * i + 0]
        total += a[2 * i + 1] * b[2 * i + 1]

        i += 1

    # Tail: the element left over when n is not a multiple of 2.
    i = 2 * groups
    while i < n:
        total += a[i] * b[i]
        i += 1

    return total

%timeit step_2_while_dot(a, b)

1.03 ms ± 26.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## While loop with 2-step unrolling (multiple sum)

In [15]:
@jit(nopython=True)
def step_2_while_dot(a, b):
    """Numba-compiled dot product, 2-way unrolled with two accumulators.

    Each unrolled lane adds into its own partial sum; the partial sums are
    combined at the end. The length comes from ``a`` rather than the
    notebook-level ``N`` (which Numba would freeze as a compile-time
    constant), and a tail loop handles the last element when the length is
    odd.

    Args:
        a: 1-D array of numbers.
        b: 1-D array with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``.
    """
    n = len(a)
    # Number of full 2-element groups.
    groups = n // 2
    i = 0
    sum0 = 0
    sum1 = 0

    while i < groups:
        sum0 += a[2 * i + 0] * b[2 * i + 0]
        sum1 += a[2 * i + 1] * b[2 * i + 1]

        i += 1

    # Tail: fold the leftover element into the first accumulator.
    i = 2 * groups
    while i < n:
        sum0 += a[i] * b[i]
        i += 1

    return sum0 + sum1

%timeit step_2_while_dot(a, b)

546 µs ± 15.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## While loop with 4-step unrolling (single sum)

In [16]:
@jit(nopython=True)
def step_4_while_dot(a, b):
    """Numba-compiled dot product, 4-way unrolled with one accumulator.

    The length comes from ``a`` rather than the notebook-level ``N`` (which
    Numba would freeze as a compile-time constant), and a tail loop handles
    the up-to-3 elements left over when the length is not a multiple of 4.

    Args:
        a: 1-D array of numbers.
        b: 1-D array with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``.
    """
    n = len(a)
    # Number of full 4-element groups.
    groups = n // 4
    i = 0
    total = 0

    while i < groups:
        total += a[4 * i + 0] * b[4 * i + 0]
        total += a[4 * i + 1] * b[4 * i + 1]
        total += a[4 * i + 2] * b[4 * i + 2]
        total += a[4 * i + 3] * b[4 * i + 3]

        i += 1

    # Tail: elements left over when n is not a multiple of 4.
    i = 4 * groups
    while i < n:
        total += a[i] * b[i]
        i += 1

    return total

%timeit step_4_while_dot(a, b)

1 ms ± 24.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## While loop with 4-step unrolling (multiple sum)

In [17]:
@jit(nopython=True)
def step_4_while_dot(a, b):
    """Numba-compiled dot product, 4-way unrolled with four accumulators.

    Each unrolled lane adds into its own partial sum; the partial sums are
    combined at the end. The length comes from ``a`` rather than the
    notebook-level ``N`` (which Numba would freeze as a compile-time
    constant), and a tail loop handles the up-to-3 elements left over when
    the length is not a multiple of 4.

    Args:
        a: 1-D array of numbers.
        b: 1-D array with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``.
    """
    n = len(a)
    # Number of full 4-element groups.
    groups = n // 4
    i = 0
    sum0 = 0
    sum1 = 0
    sum2 = 0
    sum3 = 0

    while i < groups:
        sum0 += a[4 * i + 0] * b[4 * i + 0]
        sum1 += a[4 * i + 1] * b[4 * i + 1]
        sum2 += a[4 * i + 2] * b[4 * i + 2]
        sum3 += a[4 * i + 3] * b[4 * i + 3]

        i += 1

    # Tail: fold the leftover elements into the first accumulator.
    i = 4 * groups
    while i < n:
        sum0 += a[i] * b[i]
        i += 1

    return sum0 + sum1 + sum2 + sum3

%timeit step_4_while_dot(a, b)

338 µs ± 22.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## While loop with 8-step unrolling (single sum)

In [18]:
@jit(nopython=True)
def step_8_while_dot(a, b):
    """Numba-compiled dot product, 8-way unrolled with one accumulator.

    The length comes from ``a`` rather than the notebook-level ``N`` (which
    Numba would freeze as a compile-time constant), and a tail loop handles
    the up-to-7 elements left over when the length is not a multiple of 8.

    Args:
        a: 1-D array of numbers.
        b: 1-D array with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``.
    """
    n = len(a)
    # Number of full 8-element groups.
    groups = n // 8
    i = 0
    total = 0

    while i < groups:
        total += a[8 * i + 0] * b[8 * i + 0]
        total += a[8 * i + 1] * b[8 * i + 1]
        total += a[8 * i + 2] * b[8 * i + 2]
        total += a[8 * i + 3] * b[8 * i + 3]
        total += a[8 * i + 4] * b[8 * i + 4]
        total += a[8 * i + 5] * b[8 * i + 5]
        total += a[8 * i + 6] * b[8 * i + 6]
        total += a[8 * i + 7] * b[8 * i + 7]

        i += 1

    # Tail: elements left over when n is not a multiple of 8.
    i = 8 * groups
    while i < n:
        total += a[i] * b[i]
        i += 1

    return total

%timeit step_8_while_dot(a, b)

975 µs ± 18.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## While loop with 8-step unrolling (multiple sum)

In [19]:
@jit(nopython=True)
def step_8_while_dot(a, b):
    """Numba-compiled dot product, 8-way unrolled with eight accumulators.

    Each unrolled lane adds into its own partial sum; the partial sums are
    combined at the end. The length comes from ``a`` rather than the
    notebook-level ``N`` (which Numba would freeze as a compile-time
    constant), and a tail loop handles the up-to-7 elements left over when
    the length is not a multiple of 8.

    Args:
        a: 1-D array of numbers.
        b: 1-D array with at least ``len(a)`` elements.

    Returns:
        The sum of ``a[i] * b[i]`` over every index of ``a``.
    """
    n = len(a)
    # Number of full 8-element groups.
    groups = n // 8
    i = 0
    sum0 = 0
    sum1 = 0
    sum2 = 0
    sum3 = 0
    sum4 = 0
    sum5 = 0
    sum6 = 0
    sum7 = 0

    while i < groups:
        sum0 += a[8 * i + 0] * b[8 * i + 0]
        sum1 += a[8 * i + 1] * b[8 * i + 1]
        sum2 += a[8 * i + 2] * b[8 * i + 2]
        sum3 += a[8 * i + 3] * b[8 * i + 3]
        sum4 += a[8 * i + 4] * b[8 * i + 4]
        sum5 += a[8 * i + 5] * b[8 * i + 5]
        sum6 += a[8 * i + 6] * b[8 * i + 6]
        sum7 += a[8 * i + 7] * b[8 * i + 7]

        i += 1

    # Tail: fold the leftover elements into the first accumulator.
    i = 8 * groups
    while i < n:
        sum0 += a[i] * b[i]
        i += 1

    return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7

%timeit step_8_while_dot(a, b)

329 µs ± 28.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


## Numpy dot product

In [20]:
@jit(nopython=True)
def numpy_dot(a, b):
    """Return ``np.dot(a, b)`` from inside a Numba nopython-compiled function.

    Thin wrapper so the ``np.dot`` call can be timed under ``@jit`` through
    the same call shape as the hand-written loops.
    """
    product = np.dot(a, b)
    return product

%timeit numpy_dot(a, b)

508 µs ± 1.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# Results and conclusion

The benchmarks were run on an Apple M1 CPU. The results are shown in the tables below. Each timing is the mean ± standard deviation reported by `%timeit` over 7 runs; the rows with 1 unrolling are the plain while loop.

## Without using Numba

| Unrollings | Summation | Time              |
|:----------:|:---------:|:-----------------:|
| 1          | Single    | 169 ms ± 1.4 ms   |
| 2          | Single    | 224 ms ± 4.97 ms  |
| 2          | Multiple  | 217 ms ± 1.36 ms  |
| 4          | Single    | 202 ms ± 1.26 ms  |
| 4          | Multiple  | 199 ms ± 1.22 ms  |
| 8          | Single    | 196 ms ± 781 µs   |
| 8          | Multiple  | 197 ms ± 422 µs   |
| numpy.dot  | Single    | 515 µs ± 5.63 µs  |

## With using Numba

| Unrollings | Summation | Time              |
|:----------:|:---------:|:-----------------:|
| 1          | Single    | 980 µs ± 5.47 µs  |
| 2          | Single    | 1.03 ms ± 26.5 µs |
| 2          | Multiple  | 546 µs ± 15.6 µs  |
| 4          | Single    | 1 ms ± 24.7 µs    |
| 4          | Multiple  | 338 µs ± 22.2 µs  |
| 8          | Single    | 975 µs ± 18.2 µs  |
| 8          | Multiple  | 329 µs ± 28.6 µs  |
| numpy.dot  | Single    | 508 µs ± 1.4 µs   |

## Conclusion

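It seems that Numba is very useful for speeding up the code: every hand-written loop runs more than 100× faster once it is compiled. Without Numba, unrolling the loop does not improve the speed; the unrolled while loops (196–224 ms) are actually slower than the plain while loop (169 ms), and using a single sum or multiple sums doesn't make a meaningful difference. With Numba, unrolling only improves the speed if we use multiple sums: the single-sum variants all stay at about 1 ms, whereas multiple sums bring the time down to 546 µs (2-step), 338 µs (4-step) and 329 µs (8-step). numpy.dot is by far the fastest method without Numba (515 µs), but with Numba the 4-step and 8-step multiple-sum loops are faster than numpy.dot (508 µs).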