<a href="https://colab.research.google.com/github/vadhri/hpc-notebook/blob/main/Math/Matrix/multiprocess_py_matrix_mul.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Processes

The code below uses Python's `multiprocessing` module to perform matrix multiplication, keeping the two source matrices and the one target matrix in shared memory.

In [2]:
import multiprocessing as mp
import numpy as np
from multiprocessing import shared_memory
from itertools import product

# Random operands for the product: m1 is 117 x 213 and m2 is 213 x 300,
# so the inner dimensions agree and the result is 117 x 300.
m1 = np.random.random_sample(size=(117, 213))
m2 = np.random.random_sample(size=(213, 300))

def multiply(args):
    """Compute one element of a matrix product directly in shared memory.

    All inputs arrive packed in a single tuple so the function can be used
    as the callable of ``Pool.map``.

    Args:
        args: ``(row, col, shape_m1, shape_m2, name_m1, name_m2, name_out)``.
            ``row`` and ``col`` select the output element to compute,
            ``shape_m1`` and ``shape_m2`` are the shapes of the two operands,
            and ``name_m1``, ``name_m2`` and ``name_out`` are the names of
            existing shared-memory blocks that are interpreted as float64
            arrays for the operands and the result.

    Returns:
        None. The dot product of row ``row`` of the first operand and column
        ``col`` of the second is written to ``[row, col]`` of the output block.
    """
    row, col, shape_m1, shape_m2, name_m1, name_m2, name_out = args

    # Handles are collected as they are attached so the `finally` clause can
    # close exactly the ones that were opened, whatever step fails.
    attached = []
    try:
        # Attach to existing shared memory blocks
        for block_name in (name_m1, name_m2, name_out):
            attached.append(shared_memory.SharedMemory(name=block_name))
        existing_m1, existing_m2, existing_out = attached

        # Reconstruct arrays from shared memory
        sm1 = np.ndarray(shape_m1, dtype=np.float64, buffer=existing_m1.buf)
        sm2 = np.ndarray(shape_m2, dtype=np.float64, buffer=existing_m2.buf)
        out_matrix = np.ndarray(
            (shape_m1[0], shape_m2[1]), dtype=np.float64, buffer=existing_out.buf
        )

        # Perform multiplication directly in shared memory
        out_matrix[row, col] = np.dot(sm1[row, :], sm2[:, col])
    finally:
        # Close (but do not unlink) this process's handles; the blocks stay
        # available to whoever created them.
        for block in attached:
            block.close()

if __name__ == "__main__":
    # Driver: place m1 and m2 in shared memory, compute every element of
    # m1 x m2 with a process pool (one task per output element), then verify
    # the result against np.dot.
    total_ops = m1.shape[0] * m2.shape[1]
    out_shape = (m1.shape[0], m2.shape[1])
    out_nbytes = total_ops * np.dtype(np.float64).itemsize

    # Every block that gets created is recorded here so the `finally` clause
    # releases exactly those blocks, even when a later step raises. Without
    # this, a failure in the pool would leave the segments allocated.
    shm_blocks = []
    try:
        # Create shared memory blocks
        shm_m1 = shared_memory.SharedMemory(create=True, size=m1.nbytes)
        shm_blocks.append(shm_m1)
        shm_m2 = shared_memory.SharedMemory(create=True, size=m2.nbytes)
        shm_blocks.append(shm_m2)
        shm_out = shared_memory.SharedMemory(create=True, size=out_nbytes)
        shm_blocks.append(shm_out)

        # Create NumPy views backed by the shared blocks
        sm1 = np.ndarray(m1.shape, dtype=np.float64, buffer=shm_m1.buf)
        sm2 = np.ndarray(m2.shape, dtype=np.float64, buffer=shm_m2.buf)
        out_matrix = np.ndarray(out_shape, dtype=np.float64, buffer=shm_out.buf)

        # Copy the operands into the shared blocks
        np.copyto(sm1, m1)
        np.copyto(sm2, m2)

        print(f"Multiplication operations {m1.shape} X {m2.shape} = {total_ops} mp.cpu_count() = ", mp.cpu_count())

        # Generate tasks with shared memory names: workers receive only the
        # block names and shapes and re-attach to the blocks themselves.
        tasks = [(row, col, m1.shape, m2.shape, shm_m1.name, shm_m2.name, shm_out.name)
                 for row, col in product(range(m1.shape[0]), range(m2.shape[1]))]

        with mp.Pool(processes=mp.cpu_count()) as pool:
            pool.map(multiply, tasks)

        # Take a private copy of the result before the shared block goes away
        output = np.copy(out_matrix)

        # Drop the views: they must not be used once the blocks are closed.
        del sm1, sm2, out_matrix
    finally:
        # Cleanup shared memory: close this process's handle and remove the block
        for block in shm_blocks:
            block.close()
            block.unlink()

    print(np.all(np.isclose(np.dot(m1, m2), output, atol=0.0001)))


Multiplication operations (117, 213) X (213, 300) = 35100 mp.cpu_count() =  2
True


In [3]:
from time import sleep
from datetime import datetime

def add(a, b, index=None):
    """Print ``a + b`` tagged with a process label and the current time.

    Args:
        a: First operand.
        b: Second operand.
        index: Label printed after "Process". When omitted, the name of the
            current multiprocessing process is used, so the function does not
            depend on a global loop variable being visible in the child.
    """
    label = mp.current_process().name if index is None else index
    print(f"Process {label} at {datetime.now()}; value = {a+b} \n")


# Guarded so that importing this code (as child processes do under the
# spawn/forkserver start methods) does not launch further processes.
if __name__ == "__main__":
    processes = []
    for i in range(2):
        # The index is passed explicitly instead of being read from the global `i`.
        process = mp.Process(target=add, args=(i, i), kwargs={"index": i})
        processes.append(process)
        process.start()

    # Wait for every child to finish
    for process in processes:
        process.join()

Process 0 at 2025-02-24 05:18:11.665544; value = 0 

Process 1 at 2025-02-24 05:18:11.676098; value = 2 



### Threads

Use threads to calculate sum(a*b).

In [5]:
import threading

# Problem size: each operand is an N x N matrix of random floats
# (N * N * 8 bytes apiece for float64).
N = 20000
m1 = np.random.random(size=(N, N))
m2 = np.random.random(size=(N, N))

# Work split: each thread is assigned `blocksize` consecutive rows.
max_threads = 2
blocksize = N // max_threads

# Shared accumulator and the lock that guards updates to it.
# Note: this module-level name shadows the built-in sum().
sum = 0.0
tlock = threading.Lock()

def array_multiply(thread_idx):
  """Add one thread's share of sum(m1 * m2) to the global ``sum``.

  The rows of the global matrices ``m1`` and ``m2`` are split into
  ``max_threads`` blocks of ``blocksize`` rows. The thread with index
  ``thread_idx`` multiplies its block element-wise, sums it, and adds the
  partial result to the global accumulator while holding ``tlock``.

  Args:
    thread_idx: Zero-based index of the calling thread, in
      ``range(max_threads)``.
  """
  global sum
  start = thread_idx * blocksize
  # The last thread also takes the rows left over when the row count is not
  # an exact multiple of max_threads; otherwise they would never be summed.
  if thread_idx == max_threads - 1:
    end = m1.shape[0]
  else:
    end = start + blocksize
  local_sum = np.sum(m1[start:end, :] * m2[start:end, :])
  # `with` releases the lock even if the update raises.
  with tlock:
    sum += local_sum

# --- Threaded run: one thread per block of rows, timed from creation to join ---
t1 = datetime.now()

threads = [
  threading.Thread(target=array_multiply, args=(idx,))
  for idx in range(max_threads)
]
for worker in threads:
  worker.start()
for worker in threads:
  worker.join()

print (f'Time taken with {max_threads} threads : {datetime.now()-t1}')

# --- Single-threaded reference: same reduction in one NumPy expression ---
# Prints the reference value next to the threaded total for comparison.
t1 = datetime.now()
print (np.sum(m1 * m2), sum)
print (f'Time taken without threads : {datetime.now()-t1}')


Time taken with 2 threads : 0:00:01.663371
99999573.15497263 99999573.15497178
Time taken without threads : 0:00:01.937003
