<a href="https://colab.research.google.com/github/trefftzc/cis677/blob/main/Exploring_numba.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NUMBA compiler messages

This example is taken from numba's tutorial
 https://numba.readthedocs.io/en/stable/user/5minguide.html

In [None]:
!find / -iname 'libdevice'
!find / -iname 'libnvvm.so'

Set the version of the cuda package according to the output of the previous cell.

In [1]:
import os
os.environ['NUMBAPRO_LIBDEVICE'] = "/usr/local/cuda-12.5/nvvm/libdevice"
os.environ['NUMBAPRO_NVVM'] = "/usr/local/cuda-12.5/nvvm/lib64/libnvvm.so"

In [2]:
!uv pip install -q --system numba-cuda==0.4.0

In [3]:
import numba
import time

@numba.jit(nopython=True)
def go_fast(n):
  acum = 0
  for i in range(n):
    acum += i
  return acum

if __name__ == "__main__":
  x = 1_000_000
# DO NOT REPORT THIS... COMPILATION TIME IS INCLUDED IN THE EXECUTION TIME!
  start = time.perf_counter()
  result = go_fast(x)
  end = time.perf_counter()
  print("Elapsed (with compilation) = {}s".format((end - start)))

# NOW THE FUNCTION IS COMPILED, RE-TIME IT EXECUTING FROM CACHE
  start = time.perf_counter()
  result = go_fast(x)
  end = time.perf_counter()
  print("Result = {}".format(result))


Elapsed (with compilation) = 1.2162973130000125s
Result = 499999500000


Now include a report about the activites that the compiler performed.
This is achieved with this line:
 go_fast.parallel_diagnostics(level=4)

In [4]:
import numba
import time

@numba.jit(nopython=True,parallel=True)
def go_fast(n):
  acum = 0
  for i in range(n):
    acum += i
  return acum

if __name__ == "__main__":
  x = 1_000_000
# DO NOT REPORT THIS... COMPILATION TIME IS INCLUDED IN THE EXECUTION TIME!
  start = time.perf_counter()
  go_fast(x)
  end = time.perf_counter()
  print("Elapsed (with compilation) = {}s".format((end - start)))

# NOW THE FUNCTION IS COMPILED, RE-TIME IT EXECUTING FROM CACHE
  start = time.perf_counter()
  go_fast(x)
  end = time.perf_counter()
  print("Elapsed (after compilation) = {}s".format((end - start)))
  print("Result = {}".format(result))
  go_fast.parallel_diagnostics(level=4)

Elapsed (with compilation) = 0.09202581499999951s
Elapsed (after compilation) = 2.8059999976903782e-06s
Result = 499999500000
 
 Parallel Accelerator Optimizing:  Function go_fast, /tmp/ipython-
input-4049838468.py (4)  
No source available
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
--------------------------------------------------------------------------------
------------------------------ After Optimisation ------------------------------
Parallel structure is already optimal.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 
---------------------------Loop invariant code motion---------------------------
Allocation hoisting:
No allocation hoisting found

Instruction hoisting

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.

File "../tmp/ipython-input-4049838468.py", line 4:
<source missing, REPL/exec in use?>



Now, let's use prange. prange parallelizes a for statement. It creates several threads of execution.

In [5]:
import numba
import time

@numba.jit(nopython=True,parallel=True)
def go_fast(n):
  acum = 0
  for i in numba.prange(n):
    acum += i
  return acum

if __name__ == "__main__":
  x = 1_000_000
# DO NOT REPORT THIS... COMPILATION TIME IS INCLUDED IN THE EXECUTION TIME!
  start = time.perf_counter()
  go_fast(x)
  end = time.perf_counter()
  print("Elapsed (with compilation) = {}s".format((end - start)))

# NOW THE FUNCTION IS COMPILED, RE-TIME IT EXECUTING FROM CACHE
  start = time.perf_counter()
  go_fast(x)
  end = time.perf_counter()
  print("Elapsed (after compilation) = {}s".format((end - start)))
  print("Result = {}".format(result))
  go_fast.parallel_diagnostics(level=4)

Elapsed (with compilation) = 0.4855637940000008s
Elapsed (after compilation) = 0.0002784860000133449s
Result = 499999500000
 
 Parallel Accelerator Optimizing:  Function go_fast, /tmp/ipython-
input-3566615550.py (4)  


Parallel loop listing for  Function go_fast, /tmp/ipython-input-3566615550.py (4) 
-------------------------------------------|loop #ID
@numba.jit(nopython=True,parallel=True)    | 
def go_fast(n):                            | 
  acum = 0                                 | 
  for i in numba.prange(n):----------------| #0
    acum += i                              | 
  return acum                              | 
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
--------------------------------------------------------------------------------
------------------------------ After Optim

# Let's try matrix multiplication


In [6]:
import numpy

def matrix_multiplication(A,B,C,N):
  # Initialize the result matrix to 0s

  for i in range(N):
    for j in range(N):
      for k in range(N):
        C[i][j] += A[i][k]*B[k][j]
  return C

if __name__ == "__main__":
  size = 400
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
# Multiply the matrices
  start_time = time.time()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.time()
  print("Elapsed time = {}s".format((end_time - start_time)))

Elapsed time = 70.26487374305725s


# Now let's try with NUMBA (without parallelization)

In [7]:
import numpy
import numba

@numba.jit(nopython=True,parallel=True)
def matrix_multiplication(A,B,C,N):
  # Initialize the result matrix to 0s
  for i in range(N):
    for j in range(N):
      for k in range(N):
        C[i][j] += A[i][k]*B[k][j]
  return C

if __name__ == "__main__":

# First a run to compile the code
  size = 400
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  start_time = time.time()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.time()
  print("Elapsed time with compilation = {}s".format((end_time - start_time)))
# Now a second run that uses the compiled code to multiply the matrices
  size = 400
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  start_time = time.time()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.time()
  print("Elapsed time (withtout compilation)= {}s".format((end_time - start_time)))
  matrix_multiplication.parallel_diagnostics(level=4)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see https://numba.readthedocs.io/en/stable/user/parallel.html#diagnostics for help.

File "../tmp/ipython-input-1523915101.py", line 4:
<source missing, REPL/exec in use?>



Elapsed time with compilation = 0.27545619010925293s
Elapsed time (withtout compilation)= 0.10170125961303711s
 
 Parallel Accelerator Optimizing:  Function matrix_multiplication, /tmp/ipython-
input-1523915101.py (4)  
No source available
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
--------------------------------------------------------------------------------
------------------------------ After Optimisation ------------------------------
Parallel structure is already optimal.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
 
---------------------------Loop invariant code motion---------------------------
Allocation hoisting:
No allocation hoisting found

Instruction hoisting:

# Now with prange

In [8]:
import numpy
import numba

@numba.jit(nopython=True,parallel=True)
def matrix_multiplication(A,B,C,N):
  # Initialize the result matrix to 0s
  for i in numba.prange(N):
    for j in range(N):
      for k in range(N):
        C[i][j] += A[i][k]*B[k][j]
  return C

if __name__ == "__main__":

# First a run to compile the code
  size = 400
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  start_time = time.time()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.time()
  print("Elapsed time with compilation = {}s".format((end_time - start_time)))
# Multiply the matrices
  size = 400
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  start_time = time.time()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.time()
  print("Elapsed time (withtout compilation)= {}s".format((end_time - start_time)))
  matrix_multiplication.parallel_diagnostics(level=4)

Elapsed time with compilation = 0.8328351974487305s
Elapsed time (withtout compilation)= 0.10880875587463379s
 
 Parallel Accelerator Optimizing:  Function matrix_multiplication, /tmp/ipython-
input-3184916463.py (4)  


Parallel loop listing for  Function matrix_multiplication, /tmp/ipython-input-3184916463.py (4) 
-------------------------------------------|loop #ID
@numba.jit(nopython=True,parallel=True)    | 
def matrix_multiplication(A,B,C,N):        | 
  # Initialize the result matrix to 0s     | 
  for i in numba.prange(N):----------------| #1
    for j in range(N):                     | 
      for k in range(N):                   | 
        C[i][j] += A[i][k]*B[k][j]         | 
  return C                                 | 
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
-----------------

# If we try two parallel prange statements, instead of just the outermost loop.


In [9]:
import numpy
import numba

@numba.jit(nopython=True,parallel=True)
def matrix_multiplication(A,B,C,N):
  # Initialize the result matrix to 0s
  for i in numba.prange(N):
    for j in numba.prange(N):
      for k in range(N):
        C[i][j] += A[i][k]*B[k][j]
  return C

if __name__ == "__main__":

# First a run to compile the code
  size = 400
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  start_time = time.time()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.time()
  print("Elapsed time with compilation = {}s".format((end_time - start_time)))
# Multiply the matrices
  size = 400
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  start_time = time.time()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.time()
  print("Elapsed time (withtout compilation)= {}s".format((end_time - start_time)))
  matrix_multiplication.parallel_diagnostics(level=4)

Elapsed time with compilation = 0.5877289772033691s
Elapsed time (withtout compilation)= 0.0644083023071289s
 
 Parallel Accelerator Optimizing:  Function matrix_multiplication, /tmp/ipython-
input-553046427.py (4)  


Parallel loop listing for  Function matrix_multiplication, /tmp/ipython-input-553046427.py (4) 
-------------------------------------------|loop #ID
@numba.jit(nopython=True,parallel=True)    | 
def matrix_multiplication(A,B,C,N):        | 
  # Initialize the result matrix to 0s     | 
  for i in numba.prange(N):----------------| #3
    for j in numba.prange(N):--------------| #2
      for k in range(N):                   | 
        C[i][j] += A[i][k]*B[k][j]         | 
  return C                                 | 
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
Parallel region 0:

# Now with a useless redundant statment to show how hoisting code works

In [10]:
import numpy
import numba

@numba.jit(nopython=True,parallel=True)
def matrix_multiplication(A,B,C,N):
  # Initialize the result matrix to 0s
  x = 0
  for i in numba.prange(N):
    for j in range(N):
      for k in range(N):
        # The following instruction is unnecessary
        x = 1
        C[i][j] += A[i][k]*B[k][j]
  return C

if __name__ == "__main__":

# First a run to compile the code
  size = 400
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  start_time = time.time()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.time()
  print("Elapsed time with compilation = {}s".format((end_time - start_time)))
# Multiply the matrices
  size = 400
  A = numpy.random.rand(size,size)
  B = numpy.eye(size,size)
  C = numpy.zeros([size,size])
  start_time = time.time()
  C = matrix_multiplication(A,B,C,size)
  end_time = time.time()
  print("Elapsed time (withtout compilation)= {}s".format((end_time - start_time)))
  matrix_multiplication.parallel_diagnostics(level=4)

Elapsed time with compilation = 0.8892784118652344s
Elapsed time (withtout compilation)= 0.12043929100036621s
 
 Parallel Accelerator Optimizing:  Function matrix_multiplication, /tmp/ipython-
input-3120092189.py (4)  


Parallel loop listing for  Function matrix_multiplication, /tmp/ipython-input-3120092189.py (4) 
------------------------------------------------------|loop #ID
@numba.jit(nopython=True,parallel=True)               | 
def matrix_multiplication(A,B,C,N):                   | 
  # Initialize the result matrix to 0s                | 
  x = 0                                               | 
  for i in numba.prange(N):---------------------------| #4
    for j in range(N):                                | 
      for k in range(N):                              | 
        # The following instruction is unnecessary    | 
        x = 1                                         | 
        C[i][j] += A[i][k]*B[k][j]                    | 
  return C                                    

# What happens if we try to parallelize code that depends on other iterations.

There is something wrong... but the compiler does not warn us...

In [11]:
import numba
import numpy as np

@numba.jit(nopython=True,parallel=True)
def depends_on_previous_iteration(A):
  result = np.zeros(A.shape)
  result[0] = A[0]
  for i in numba.prange(1,A.shape[0]):
    result[i] = A[i] + result[i-1]
  return result

if __name__ == "__main__":
  size = 1_000_000
  A = np.arange(size)
  B = depends_on_previous_iteration(A)
  sum_all_of_A = np.sum(A)
  print(sum_all_of_A)
  print(B[size-1])
  depends_on_previous_iteration.parallel_diagnostics(level=4)

499999500000
374999250000.0
 
 Parallel Accelerator Optimizing:  Function depends_on_previous_iteration, 
/tmp/ipython-input-3374079850.py (4)  


Parallel loop listing for  Function depends_on_previous_iteration, /tmp/ipython-input-3374079850.py (4) 
-------------------------------------------|loop #ID
@numba.jit(nopython=True,parallel=True)    | 
def depends_on_previous_iteration(A):      | 
  result = np.zeros(A.shape)---------------| #5
  result[0] = A[0]                         | 
  for i in numba.prange(1,A.shape[0]):-----| #6
    result[i] = A[i] + result[i-1]         | 
  return result                            | 
--------------------------------- Fusing loops ---------------------------------
Attempting fusion of parallel loops (combines loops with similar properties)...
----------------------------- Before Optimisation ------------------------------
--------------------------------------------------------------------------------
------------------------------ After Optimisat