In [14]:
# imports
import urllib
from time import time
import numpy as np
import string
import matplotlib.pyplot as plt
import threading
from multiprocessing import Process

source_url = "https://www.gutenberg.org/cache/epub/100/pg100.txt" # complete works
#source_url = "https://www.gutenberg.org/cache/epub/1513/pg1513.txt" # romeo and juliet

In [None]:
rawdata = urllib.request.urlopen(source_url)

In [None]:
letter_counts = np.zeros((26,1))

In [None]:
start_time = time()
for line in rawdata:
    thisline = line.decode('utf-8').lower()
    for i in range(0,26):
        letter_counts[i] += thisline.count(chr(i+97))
end_time = time()
print("Processing time: {:.4f}".format(end_time - start_time))

In [None]:
letter_counts.flatten()

In [None]:
fig, ax = plt.subplots()
letter_labels = list(string.ascii_lowercase)
ax.bar(letter_labels, letter_counts.flatten())
ax.set_ylabel('Count')
ax.set_title('Bard Letters')
plt.show()

In [15]:
def count_task():
    while(1):
        try:
            this_line = next(rawdata)   # note: uses iterator not readline()
        except StopIteration:
            return
        this_line = this_line.decode('utf-8').lower()
        for i in range(0,26):
            letter_counts[i] += this_line.count(chr(i+97))

# try this sequentially
rawdata = urllib.request.urlopen(source_url)
letter_counts = np.zeros((26,1))
start_time = time()
count_task()
end_time = time()
print("Processing time: {:.4f}".format(end_time - start_time))
print(letter_counts)

Processing time: 36.6955
[[312189.]
 [ 64358.]
 [ 92458.]
 [159836.]
 [487743.]
 [ 86457.]
 [ 73176.]
 [257182.]
 [272142.]
 [  5029.]
 [ 38002.]
 [181675.]
 [118339.]
 [262450.]
 [334272.]
 [ 61949.]
 [  4011.]
 [253822.]
 [269068.]
 [356288.]
 [137836.]
 [ 40543.]
 [ 96471.]
 [  5431.]
 [ 99792.]
 [  1796.]]


In [16]:
sourcelock = threading.Lock()
listlock = threading.Lock()
letter_counts = np.zeros((26,1))
def count_task(task_url,start_line,num_lines_to_read):

    # open url
    this_file = urllib.request.urlopen(task_url)
    
    # discard all lines ahead of our starting poitn
    for line_idx in range(0,start_line):
        next(this_file)

    # iterate through lines in the desired range
    for line_idx in range(start_line,start_line + num_lines_to_read):
        
        # read line, but fail if we've hit the end of the file iterator
        try:
            #with sourcelock:
            this_line = next(this_file)   # note: uses iterator not readline()
        except StopIteration:
            return
        
        this_line = this_line.decode('utf-8').lower()
        for i in range(0,2):
            this_count = this_line.count(chr(i+97))
            #with listlock:
            letter_counts[i] += this_count  # this is the critical line!

# try this with threads and locks
NUM_THREADS = 10

all_threads = list()

# open file from url and count lines
rawdata = urllib.request.urlopen(source_url)
letter_counts = np.zeros((26,1))
numlines = sum(1 for l in rawdata)
lines_per_thread = np.ceil(numlines/NUM_THREADS).astype(int)

# launch threads
start_time = time()
for thread_num in range(0,NUM_THREADS):
    start_line = np.ceil((thread_num)*lines_per_thread).astype(int)
    end_line = np.ceil(start_line + (lines_per_thread - 1)).astype(int)
    print('Thread {0:d} -> start: {1:d}, stop: {2:d}'.format(thread_num,start_line,end_line))
    thd = threading.Thread(target=count_task,args=(source_url,start_line,lines_per_thread))
    thd.start()
    all_threads.append(thd)

for thd in all_threads:
    thd.join()

end_time = time()
print("Processing time: {:.4f}".format(end_time - start_time))
print(letter_counts)

Thread 0 -> start: 0, stop: 19638
Thread 1 -> start: 19639, stop: 39277
Thread 2 -> start: 39278, stop: 58916
Thread 3 -> start: 58917, stop: 78555
Thread 4 -> start: 78556, stop: 98194
Thread 5 -> start: 98195, stop: 117833
Thread 6 -> start: 117834, stop: 137472
Thread 7 -> start: 137473, stop: 157111
Thread 8 -> start: 157112, stop: 176750
Thread 9 -> start: 176751, stop: 196389
Processing time: 5.9858
[[312189.]
 [ 64358.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]]


In [3]:
def p_count_task(task_url,start_line,num_lines_to_read):

    print("Starting process")
    
    # open url
    this_file = urllib.request.urlopen(task_url)
    
    # discard all lines ahead of our starting poitn
    for line_idx in range(0,start_line):
        next(this_file)

    # iterate through lines in the desired range
    for line_idx in range(start_line,start_line + num_lines_to_read):
        
        # read line, but fail if we've hit the end of the file iterator
        try:
            #with sourcelock:
            this_line = next(this_file)   # note: uses iterator not readline()
        except StopIteration:
            return
        
        this_line = this_line.decode('utf-8').lower()
        for i in range(0,26):
            this_count = this_line.count(chr(i+97))
            #with listlock:
            letter_counts[i] += this_count  # this is the critical line!

# try this with threads and locks
NUM_PROCESSES = 1

all_processes = list()

# open file from url and count lines
rawdata = urllib.request.urlopen(source_url)
letter_counts = np.zeros((26,1))
numlines = sum(1 for l in rawdata)
lines_per_process = np.ceil(numlines/NUM_PROCESSES).astype(int)

# launch threads
start_time = time()
for process_num in range(0,NUM_PROCESSES):
    start_line = np.ceil((process_num)*lines_per_process).astype(int)
    end_line = np.ceil(start_line + (lines_per_process - 1)).astype(int)
    print('Process {0:d} -> start: {1:d}, stop: {2:d}'.format(process_num,start_line,end_line))
    p = Process(target=p_count_task,args=(source_url,start_line,lines_per_process))
    p.start()
    all_processes.append(p)

for p in all_processes:
    p.join()

end_time = time()
print("Processing time: {:.4f}".format(end_time - start_time))
print(letter_counts)

Process 0 -> start: 0, stop: 196389
Processing time: 0.1818
[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [5]:
def f(name):
    print('hello', name)

p = Process(target=f, args=('bob',))
p.start()
p.join()

In [13]:
from random import randint
from time import sleep

def count_task():
    global total_count
    for i in range(0,50000):
        total_count += 1
        #sleep(randint(0,4)/1000000.0)
        
# try this with threads and locks
NUM_THREADS = 50
total_count = 0;
all_threads = list()

# launch threads
start_time = time()
for thread_num in range(0,NUM_THREADS):
    thd = threading.Thread(target=count_task,args=())
    thd.start()
    all_threads.append(thd)

for thd in all_threads:
    thd.join()

end_time = time()
print("Processing time: {:.4f}".format(end_time - start_time))
print(total_count)

Processing time: 0.4466
2500000
