# Basic Python multiprocessing

In [1]:
from time import sleep, time

In [3]:
def add(xy):
    sleep(0.1) # imagine this is some complicated, slow calculation
    return xy[1] + xy[1]

t0 = time()
print("result:", add((2,3)))
t1 = time()
print(t1-t0, "seconds")

result: 6
0.10050749778747559 seconds


In [4]:
xy_pairs = [(10,1),(10,2),(10,3),(10,4),(10,5),(10,6),(10,7),(10,8),(10,9),(10,10)]

t0 = time()
for xy in xy_pairs:
    print("result:", add(xy))
t1 = time()
print(t1-t0, "seconds")

result: 2
result: 4
result: 6
result: 8
result: 10
result: 12
result: 14
result: 16
result: 18
result: 20
1.0040991306304932 seconds


In [5]:
from multiprocessing import Pool

with Pool(5) as p:
    t0 = time()
    for result in p.map(add, xy_pairs):
        print("result:", result)
    t1 = time()
    print(t1-t0, "seconds")

result: 2
result: 4
result: 6
result: 8
result: 10
result: 12
result: 14
result: 16
result: 18
result: 20
0.20257806777954102 seconds


We can see some good speedups from the above examples. That's because the function mostly does sleep and you can pack many "sleep" functions with very limited CPU resources. 

# Python multiprocessing with some compute-intensive functions

In [6]:
def add(xy):
    for i in range(3000000): # loop 3 million times
        pass
    return xy[0] + xy[1]

with Pool(1) as p:
    t0 = time()
    for result in p.map(add, xy_pairs):
        print("result:", result)
    t1 = time()
    print(t1-t0, "seconds (1 process)")

with Pool(10) as p:
    t0 = time()
    for result in p.map(add, xy_pairs):
        print("result:", result)
    t1 = time()
    print(t1-t0, "seconds (10 processes)")

result: 11
result: 12
result: 13
result: 14
result: 15
result: 16
result: 17
result: 18
result: 19
result: 20
0.6887228488922119 seconds (1 process)
result: 11
result: 12
result: 13
result: 14
result: 15
result: 16
result: 17
result: 18
result: 19
result: 20
0.3569304943084717 seconds (10 processes)


In [7]:
0.6887228488922119 / 0.3569304943084717

1.929571330761652

For compute-intensive tasks, the "speedup" I can achieve is really bounded by the number of CPU cores you have on that computer, no matter how many processes I actually launched (in the above example, 10 processes). In this case, you can pretty much guess how many CPU cores I have on my EC2 VM. 

# Python thread-level parallelism

In [8]:
import threading

def cpu_bound_task():
    for i in range(3000000):
        pass

In [9]:
threads = []
t0 = time()
for _ in range(1):
    thread = threading.Thread(target=cpu_bound_task)
    thread.start()
    threads.append(thread) # insert created thread objcet into the list

for thread in threads:
    thread.join() # parent thread waits for child thread to complete and join
t1 = time()

print(t1-t0, "seconds (1 single python thread)")

0.06687521934509277 seconds (1 single python thread)


In [12]:
threads = []
t0 = time()
for _ in range(10):
    thread = threading.Thread(target=cpu_bound_task)
    thread.start()
    threads.append(thread) # insert created thread objcet into the list

for thread in threads:
    thread.join() # parent thread waits for child thread to complete and join
t1 = time()

print(t1-t0, "seconds (10 python threads)")

0.6619820594787598 seconds (10 python threads)


In [11]:
0.331967830657959 /0.06687521934509277

4.963988662899517

Python threads share the virtual memory address space, therefore different threads created within the same process see the same copy of global variable 'total'. 

# Global variable in Python threads vs. Python processes

In [16]:
total = 0 # define a global var

def increment(amt):
    global total
    total += amt
    print(f"sub total so far: {total}\n")

In [17]:
threads = []
for _ in range(8):
    thread = threading.Thread(target=increment, args=(5,))
    thread.start()
    threads.append(thread) # insert created thread objcet into the list

for thread in threads:
    thread.join() # parent thread waits for child thread to complete and join

print("Final result:", total)

sub total so far: 5

sub total so far: 10

sub total so far: 15

sub total so far: 20

sub total so far: 25

sub total so far: 30

sub total so far: 35

sub total so far: 40

Final result: 40


In [22]:
# using multiprocessing
import os

total = 0

def increment(amt):
    pid = os.getpid() # get the process identifier
    global total
    total += amt
    print(f"{pid}: sub total so far: {total}\n")

with Pool(2) as p:
    p.map(increment, [5,5,5,5,5,5,5,5])

2233: sub total so far: 5
2232: sub total so far: 5

2233: sub total so far: 10


2233: sub total so far: 15

2233: sub total so far: 20

2232: sub total so far: 10
2233: sub total so far: 25


2233: sub total so far: 30



In [23]:
total 

0

However, in the case of multi-process parallelism, things become a bit complicated. Child processes created from a parent process also create a separate copy of the global variable 'total' in their own virtual memory address space. Each child process will then work on its own copy of 'total'. 