<a href="https://colab.research.google.com/github/trefftzc/cis677/blob/main/Exploring_numba.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NUMBA compiler messages

This example is taken from numba's tutorial
 https://numba.readthedocs.io/en/stable/user/5minguide.html

In [None]:
import numba
import time

@numba.jit(nopython=True)
def go_fast(n):
  """Return the sum of the integers 0, 1, ..., n-1.

  The sum is computed with an explicit loop over range(n); the function is
  decorated so that numba compiles it in nopython mode.
  """
  acum = 0  # running total of the loop indices
  for i in range(n):
    acum += i
  return acum

if __name__ == "__main__":
  # Time go_fast twice on the same input so that the cost of the first call
  # (which includes compilation) can be compared with a later call.
  x = 1_000_000
  # DO NOT REPORT THIS... COMPILATION TIME IS INCLUDED IN THE EXECUTION TIME!
  start = time.perf_counter()
  result = go_fast(x)
  end = time.perf_counter()
  print("Elapsed (with compilation) = {}s".format((end - start)))

  # NOW THE FUNCTION IS COMPILED, RE-TIME IT EXECUTING FROM CACHE
  start = time.perf_counter()
  result = go_fast(x)
  end = time.perf_counter()
  # Report the second measurement as well, so both timings can be compared.
  print("Elapsed (after compilation) = {}s".format((end - start)))
  print("Result = {}".format(result))


Elapsed (with compilation) = 0.058271418000003905s
Result = 499999500000


Now include a report about the activities that the compiler performed.
This is achieved by adding this line after the function has been called:
 go_fast.parallel_diagnostics(level=4)

In [None]:
import numba
import time

@numba.jit(nopython=True,parallel=True)
def go_fast(n):
  """Return the sum of the integers 0, 1, ..., n-1.

  parallel=True is requested in the decorator, but the loop is still written
  with the plain range, so the code itself marks nothing as parallel.
  """
  acum = 0  # running total of the loop indices
  for i in range(n):
    acum += i
  return acum

if __name__ == "__main__":
  # Time go_fast twice on the same input, print the computed sum, and then
  # ask numba for its parallel diagnostics report.
  x = 1_000_000
  # DO NOT REPORT THIS... COMPILATION TIME IS INCLUDED IN THE EXECUTION TIME!
  start = time.perf_counter()
  result = go_fast(x)
  end = time.perf_counter()
  print("Elapsed (with compilation) = {}s".format((end - start)))

  # NOW THE FUNCTION IS COMPILED, RE-TIME IT EXECUTING FROM CACHE
  start = time.perf_counter()
  # Keep the return value: the "Result" line below must show what THIS
  # version of go_fast computed, not a value left over from an earlier cell.
  result = go_fast(x)
  end = time.perf_counter()
  print("Elapsed (after compilation) = {}s".format((end - start)))
  print("Result = {}".format(result))
  go_fast.parallel_diagnostics(level=4)

Elapsed (with compilation) = 0.06036286900007326s
Elapsed (after compilation) = 6.329999905574368e-06s
Result = 499999500000
 
 Parallel Accelerator Optimizing:  Function go_fast, <ipython-
input-22-ba059c642406> (4)  
No source available
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
--------------------------------------------------------------------------------
------------------------------ After Optimisation ------------------------------
Parallel structure is already optimal.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 
---------------------------Loop invariant code motion---------------------------
Allocation hoisting:
No allocation hoisting found

Instruction hoisting:


The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.

File "<ipython-input-22-ba059c642406>", line 5:
@numba.jit(nopython=True,parallel=True)
def go_fast(n):
^



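Now, let's use prange. Writing a for loop with prange instead of range tells numba that the iterations of the loop may be executed in parallel; numba then distributes the iterations among several threads of execution.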

In [None]:
import numba
import time

@numba.jit(nopython=True,parallel=True)
def go_fast(n):
  """Return the sum of the integers 0, 1, ..., n-1.

  The loop is written with numba.prange instead of range, and the decorator
  requests parallel=True.
  """
  acum = 0  # running total of the loop indices
  # NOTE(review): acum is updated with += by every iteration; numba's
  # documentation lists this form as a supported reduction under prange -
  # confirm against the numba version in use.
  for i in numba.prange(n):
    acum += i
  return acum

if __name__ == "__main__":
  # Time the prange version of go_fast twice on the same input, print the
  # computed sum, and then ask numba for its parallel diagnostics report.
  x = 1_000_000
  # DO NOT REPORT THIS... COMPILATION TIME IS INCLUDED IN THE EXECUTION TIME!
  start = time.perf_counter()
  result = go_fast(x)
  end = time.perf_counter()
  print("Elapsed (with compilation) = {}s".format((end - start)))

  # NOW THE FUNCTION IS COMPILED, RE-TIME IT EXECUTING FROM CACHE
  start = time.perf_counter()
  # Keep the return value: the "Result" line below must show what the
  # parallel loop computed, not a value left over from an earlier cell.
  result = go_fast(x)
  end = time.perf_counter()
  print("Elapsed (after compilation) = {}s".format((end - start)))
  print("Result = {}".format(result))
  go_fast.parallel_diagnostics(level=4)

Elapsed (with compilation) = 0.16227224499994009s
Elapsed (after compilation) = 0.0006767499999114079s
Result = 499999500000
 
 Parallel Accelerator Optimizing:  Function go_fast, <ipython-
input-23-0050f6a72f08> (4)  


Parallel loop listing for  Function go_fast, <ipython-input-23-0050f6a72f08> (4) 
-------------------------------------------|loop #ID
@numba.jit(nopython=True,parallel=True)    | 
def go_fast(n):                            | 
  acum = 0                                 | 
  for i in numba.prange(n):----------------| #2
    acum += i                              | 
  return acum                              | 
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
--------------------------------------------------------------------------------
------------------------------ After Optimi

# Let's try matrix multiplication


In [None]:
import numpy

def matrix_multiplication(A,B,C,N):
  """Accumulate the N x N matrix product A*B into C and return C.

  C is not cleared first: every product term is added to the value that C
  already holds, so the caller must pass zeros to obtain the plain product.
  C is updated in place and the same object is returned.
  """
  for row in range(N):
    a_row = A[row]
    c_row = C[row]
    for col in range(N):
      for inner in range(N):
        c_row[col] += a_row[inner]*B[inner][col]
  return C

if __name__ == "__main__":
  # Imported here so the cell does not depend on an earlier cell having
  # imported time.
  import time

  # Multiply a random matrix by the identity with the pure Python triple
  # loop and report how long the multiplication takes.
  size = 400
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  # Multiply the matrices
  # perf_counter is the clock meant for measuring elapsed intervals.
  start_time = time.perf_counter()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.perf_counter()
  print("Elapsed time = {}s".format((end_time - start_time)))

Elapsed time = 43.38834571838379s


# Now let's try with NUMBA (parallel=True is requested, but without prange nothing is actually parallelized)

In [None]:
import numpy
import numba

@numba.jit(nopython=True,parallel=True)
def matrix_multiplication(A,B,C,N):
  """Accumulate the N x N matrix product A*B into C and return C.

  parallel=True is requested in the decorator, but all three loops are
  written with the plain range.
  """
  # C is not initialized here: every product term is added to the value C
  # already holds, so the caller must pass a matrix of 0s.
  for i in range(N):
    for j in range(N):
      for k in range(N):
        C[i][j] += A[i][k]*B[k][j]
  return C

if __name__ == "__main__":
  # Imported here so the cell does not depend on an earlier cell having
  # imported time.
  import time

  size = 400

  # First a run to compile the code
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  # perf_counter is the clock meant for measuring elapsed intervals.
  start_time = time.perf_counter()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.perf_counter()
  print("Elapsed time with compilation = {}s".format((end_time - start_time)))

  # Now a second run that uses the compiled code to multiply the matrices.
  # Fresh inputs are built because matrix_multiplication adds into C.
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  start_time = time.perf_counter()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.perf_counter()
  print("Elapsed time (withtout compilation)= {}s".format((end_time - start_time)))
  matrix_multiplication.parallel_diagnostics(level=4)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.

File "<ipython-input-38-b94fd9b70383>", line 5:
@numba.jit(nopython=True,parallel=True)
def matrix_multiplication(A,B,C,N):
^



Elapsed time with compilation = 0.17173337936401367s
Elapsed time (withtout compilation)= 0.06038093566894531s
 
 Parallel Accelerator Optimizing:  Function matrix_multiplication, <ipython-
input-38-b94fd9b70383> (4)  
No source available
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
--------------------------------------------------------------------------------
------------------------------ After Optimisation ------------------------------
Parallel structure is already optimal.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 
---------------------------Loop invariant code motion---------------------------
Allocation hoisting:
No allocation hoisting found

Instruction hoisting:


# Now with prange

In [None]:
import numpy
import numba

@numba.jit(nopython=True,parallel=True)
def matrix_multiplication(A,B,C,N):
  """Accumulate the N x N matrix product A*B into C and return C.

  Only the outermost loop (over the rows i of C) is written with
  numba.prange; the j and k loops use the plain range. Each value of i
  updates only row i of C.
  """
  # C is not initialized here: every product term is added to the value C
  # already holds, so the caller must pass a matrix of 0s.
  for i in numba.prange(N):
    for j in range(N):
      for k in range(N):
        C[i][j] += A[i][k]*B[k][j]
  return C

if __name__ == "__main__":
  # Imported here so the cell does not depend on an earlier cell having
  # imported time.
  import time

  size = 400

  # First a run to compile the code
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  # perf_counter is the clock meant for measuring elapsed intervals.
  start_time = time.perf_counter()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.perf_counter()
  print("Elapsed time with compilation = {}s".format((end_time - start_time)))

  # Multiply the matrices again, now with the function already compiled.
  # Fresh inputs are built because matrix_multiplication adds into C.
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  start_time = time.perf_counter()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.perf_counter()
  print("Elapsed time (withtout compilation)= {}s".format((end_time - start_time)))
  matrix_multiplication.parallel_diagnostics(level=4)

Elapsed time with compilation = 0.2875936031341553s
Elapsed time (withtout compilation)= 0.04929685592651367s
 
 Parallel Accelerator Optimizing:  Function matrix_multiplication, <ipython-
input-41-1f16350b723f> (4)  


Parallel loop listing for  Function matrix_multiplication, <ipython-input-41-1f16350b723f> (4) 
-------------------------------------------|loop #ID
@numba.jit(nopython=True,parallel=True)    | 
def matrix_multiplication(A,B,C,N):        | 
  # Initialize the result matrix to 0s     | 
  for i in numba.prange(N):----------------| #6
    for j in range(N):                     | 
      for k in range(N):                   | 
        C[i][j] += A[i][k]*B[k][j]         | 
  return C                                 | 
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
-------------------

# What happens if we use prange on the two outer loops, instead of just the outermost loop?


In [None]:
import numpy
import numba

@numba.jit(nopython=True,parallel=True)
def matrix_multiplication(A,B,C,N):
  """Accumulate the N x N matrix product A*B into C and return C.

  Both the i loop and the j loop are written with numba.prange; only the
  innermost k loop uses the plain range.
  """
  # C is not initialized here: every product term is added to the value C
  # already holds, so the caller must pass a matrix of 0s.
  # NOTE(review): numba's documentation describes a prange nested inside
  # another prange as being executed like an ordinary serial loop - confirm
  # against the diagnostics report for this function.
  for i in numba.prange(N):
    for j in numba.prange(N):
      for k in range(N):
        C[i][j] += A[i][k]*B[k][j]
  return C

if __name__ == "__main__":
  # Imported here so the cell does not depend on an earlier cell having
  # imported time.
  import time

  size = 400

  # First a run to compile the code
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  # perf_counter is the clock meant for measuring elapsed intervals.
  start_time = time.perf_counter()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.perf_counter()
  print("Elapsed time with compilation = {}s".format((end_time - start_time)))

  # Multiply the matrices again, now with the function already compiled.
  # Fresh inputs are built because matrix_multiplication adds into C.
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  start_time = time.perf_counter()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.perf_counter()
  print("Elapsed time (withtout compilation)= {}s".format((end_time - start_time)))
  matrix_multiplication.parallel_diagnostics(level=4)

Elapsed time with compilation = 0.29268360137939453s
Elapsed time (withtout compilation)= 0.04774808883666992s
 
 Parallel Accelerator Optimizing:  Function matrix_multiplication, <ipython-
input-42-ba7bc4f3de40> (4)  


Parallel loop listing for  Function matrix_multiplication, <ipython-input-42-ba7bc4f3de40> (4) 
-------------------------------------------|loop #ID
@numba.jit(nopython=True,parallel=True)    | 
def matrix_multiplication(A,B,C,N):        | 
  # Initialize the result matrix to 0s     | 
  for i in numba.prange(N):----------------| #8
    for j in numba.prange(N):--------------| #7
      for k in range(N):                   | 
        C[i][j] += A[i][k]*B[k][j]         | 
  return C                                 | 
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
Parallel region 

# Now with a useless, redundant statement to show how code hoisting works

In [None]:
import numpy
import numba

@numba.jit(nopython=True,parallel=True)
def matrix_multiplication(A,B,C,N):
  """Accumulate the N x N matrix product A*B into C and return C.

  The outermost loop is written with numba.prange. The variable x is
  assigned but never read: it is there only so that the innermost loop
  contains a statement that does not depend on the loop indices.
  """
  # C is not initialized here: every product term is added to the value C
  # already holds, so the caller must pass a matrix of 0s.
  x = 0
  for i in numba.prange(N):
    for j in range(N):
      for k in range(N):
        # The following instruction is unnecessary: it assigns the same
        # constant on every iteration and x is never used afterwards.
        x = 1
        C[i][j] += A[i][k]*B[k][j]
  return C

if __name__ == "__main__":
  # Imported here so the cell does not depend on an earlier cell having
  # imported time.
  import time

  size = 400

  # First a run to compile the code
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  # perf_counter is the clock meant for measuring elapsed intervals.
  start_time = time.perf_counter()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.perf_counter()
  print("Elapsed time with compilation = {}s".format((end_time - start_time)))

  # Multiply the matrices again, now with the function already compiled.
  # Fresh inputs are built because matrix_multiplication adds into C.
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  start_time = time.perf_counter()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.perf_counter()
  print("Elapsed time (withtout compilation)= {}s".format((end_time - start_time)))
  matrix_multiplication.parallel_diagnostics(level=4)

Elapsed time with compilation = 0.3004417419433594s
Elapsed time (withtout compilation)= 0.0637974739074707s
 
 Parallel Accelerator Optimizing:  Function matrix_multiplication, <ipython-
input-54-0b2a14f66cfd> (4)  


Parallel loop listing for  Function matrix_multiplication, <ipython-input-54-0b2a14f66cfd> (4) 
------------------------------------------------------|loop #ID
@numba.jit(nopython=True,parallel=True)               | 
def matrix_multiplication(A,B,C,N):                   | 
  # Initialize the result matrix to 0s                | 
  x = 0                                               | 
  for i in numba.prange(N):---------------------------| #12
    for j in range(N):                                | 
      for k in range(N):                              | 
        # The following instruction is unnecessary    | 
        x = 1                                         | 
        C[i][j] += A[i][k]*B[k][j]                    | 
  return C                                      

# What happens if we try to parallelize a loop in which each iteration depends on the result of a previous iteration?

The result is wrong... but the compiler does not warn us...

In [17]:
import numba
import numpy as np

@numba.jit(nopython=True,parallel=True)
def depends_on_previous_iteration(A):
  """Fill result so that result[i] = A[i] + result[i-1], and return it.

  Executed one iteration after another, this recurrence makes result[i] the
  sum of A[0..i]. The loop is nevertheless written with numba.prange even
  though iteration i reads result[i-1], a value written by iteration i-1,
  so the iterations are not independent of each other. This function is a
  deliberate example of such a loop-carried dependency.
  """
  result = np.zeros(A.shape)
  result[0] = A[0]
  for i in numba.prange(1,A.shape[0]):
    # Reads the element written by the previous iteration.
    result[i] = A[i] + result[i-1]
  return result

if __name__ == "__main__":
  # Print the true total of A next to the last entry produced by
  # depends_on_previous_iteration, then show numba's diagnostics report.
  size = 1_000_000
  A = np.arange(size)
  sum_all_of_A = A.sum()
  B = depends_on_previous_iteration(A)
  print(sum_all_of_A)
  print(B[-1])
  depends_on_previous_iteration.parallel_diagnostics(level=4)

499999500000
374999250000.0
 
 Parallel Accelerator Optimizing:  Function depends_on_previous_iteration, 
<ipython-input-17-10f45d01a851> (4)  


Parallel loop listing for  Function depends_on_previous_iteration, <ipython-input-17-10f45d01a851> (4) 
-------------------------------------------|loop #ID
@numba.jit(nopython=True,parallel=True)    | 
def depends_on_previous_iteration(A):      | 
  result = np.zeros(A.shape)---------------| #30
  result[0] = A[0]                         | 
  for i in numba.prange(1,A.shape[0]):-----| #31
    result[i] = A[i] + result[i-1]         | 
  return result                            | 
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
--------------------------------------------------------------------------------
------------------------------ After Optimisat