# Sonnets

Build an inverted index of the words in the Shakespear sonnets.    
The sonnets can be found in the subdirectory sonnets.      
Create a python dictionary with:      
word file1,line [[file2,line] .. [filen,line]] 

Take multiple approaches:   
- single core
- multiprocessor
- multiprocessor vector

## Approach

Create a list of files    
Open the first files.   
- If the word is not already stored, create a new entry      
- If the word already stored, add to the list starting with the word the file name, line number   

Open the next file  

The storage is a dictionary of a list. 

In [12]:
import sys 
import os
import multiprocessing

In [13]:
sonnetfiles=os.listdir('sonnets')

In [15]:
%%timeit # This line has to be the first line in the cell
# one processor
mydict = {}
for sonnetfile in sonnetfiles:
    with open("sonnets/"+sonnetfile) as f:
        for lineno, line in enumerate(f):
            for word in line.split():
                # strip punctuation, quotes, ..
                clean_word = word.strip(',.:;\'?').lower()
                location = sonnetfile + '/' + str(lineno+1)
                #print(clean_word, location)
                if clean_word not in mydict:
                    mydict[clean_word] = [location]
                else:
                    mydict[clean_word].append(location)
print(mydict)
               
        

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



11.4 ms ± 73.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
%%file process_file.py

def process_file(file):
    this_dict={}
    with open("sonnets/"+file) as f:
        for lineno, line in enumerate(f):
            for word in line.split():
                # strip punctuation, quotes, ..
                clean_word = word.strip(',.:;\'?').lower()
                location = file + '/' + str(lineno+1)
                #print(clean_word, location)
                if clean_word not in this_dict:
                    this_dict[clean_word] = [location]
                else:
                    this_dict[clean_word].append(location)
    return this_dict

def ListMerge(dict1, dict2):
    mydict2 = {**dict1, **dict2}
    for key, value in mydict2.items():
        if key in dict1 and key in dict2:
               mydict2[key] = [value , dict1[key]]
    return mydict2

Overwriting process_file.py


In [17]:
import process_file

In [23]:
#%%timeit
# put file processing in a function, so it can be called in parallel
# Problem: 2 processes could modify the same dictionary entry at the same time. 
#   Which write is going to win? Both, last, first? 
sonnetfiles=os.listdir('sonnets')
mydict2 = {}
dict_list = []

for sonnetfile in sonnetfiles:
    dict_list.append(process_file.process_file(sonnetfile))
    
for sonnet_dict in dict_list:    
    print(sonnet_dict)
    print()
    mydict2 = process_file.ListMerge(mydict2, sonnet_dict)
    print("Combined dictionary size: ", len(mydict2))
    print() 
print(sorted(mydict2))

{'whoever': ['CXXXV.txt/1'], 'hath': ['CXXXV.txt/1'], 'her': ['CXXXV.txt/1'], 'wish': ['CXXXV.txt/1'], 'thou': ['CXXXV.txt/1', 'CXXXV.txt/5', 'CXXXV.txt/11'], 'hast': ['CXXXV.txt/1'], 'thy': ['CXXXV.txt/1', 'CXXXV.txt/4', 'CXXXV.txt/11', 'CXXXV.txt/12'], 'will': ['CXXXV.txt/1', 'CXXXV.txt/2', 'CXXXV.txt/2', 'CXXXV.txt/4', 'CXXXV.txt/5', 'CXXXV.txt/6', 'CXXXV.txt/7', 'CXXXV.txt/8', 'CXXXV.txt/11', 'CXXXV.txt/11', 'CXXXV.txt/12', 'CXXXV.txt/12', 'CXXXV.txt/14'], 'and': ['CXXXV.txt/2', 'CXXXV.txt/2', 'CXXXV.txt/5', 'CXXXV.txt/8', 'CXXXV.txt/10', 'CXXXV.txt/14'], 'to': ['CXXXV.txt/2', 'CXXXV.txt/4', 'CXXXV.txt/6', 'CXXXV.txt/10', 'CXXXV.txt/11', 'CXXXV.txt/12'], 'boot': ['CXXXV.txt/2'], 'in': ['CXXXV.txt/2', 'CXXXV.txt/6', 'CXXXV.txt/7', 'CXXXV.txt/8', 'CXXXV.txt/10', 'CXXXV.txt/11', 'CXXXV.txt/14'], 'overplus': ['CXXXV.txt/2'], 'more': ['CXXXV.txt/3', 'CXXXV.txt/12'], 'than': ['CXXXV.txt/3'], 'enough': ['CXXXV.txt/3'], 'am': ['CXXXV.txt/3'], 'i': ['CXXXV.txt/3'], 'that': ['CXXXV.txt/3', '

In [20]:
mydict == mydict2 

False

In [27]:
# put file processing in a function, so it can be called in parallel
# Problem: 2 processes could modify the same dictionary entry at the same time. 
#   Which write is going to win? Both, last, first? 
sonnetfiles=os.listdir('sonnets')
mydict2 = {}
dict_list = []
# About this Mac, System Report, Hardware, Total number of Cores
# Total Number of Cores: 10 (8 performance and 2 efficiency)
# % sysctl -n hw.ncpu
# % sysctl -a | grep physicalcpu:
# % system_profiler SPHardwareDataType
# % getconf _NPROCESSORS_ONLN
nproc = multiprocessing.cpu_count()
# nproc=4

# create the pool of nproc processes
pool = multiprocessing.Pool(processes=nproc)

# one processor
#for sonnetfile in sonnetfiles:
#    dict_list.append(process_file.process_file(sonnetfile))

# multiprocessor
#    pool.map(function, iterable)
dict_list=pool.map(process_file.process_file, sonnetfiles)

#print(dict_list[0])
#print()
#print(len(dict_list[0]))
for sonnet_dict in dict_list:
    print(sonnet_dict)
    print()
    mydict2 = process_file.ListMerge(mydict2, sonnet_dict)
    print("Combined dictionary size: ", len(mydict2))
    print()

print(sorted(mydict2))

{'whoever': ['CXXXV.txt/1'], 'hath': ['CXXXV.txt/1'], 'her': ['CXXXV.txt/1'], 'wish': ['CXXXV.txt/1'], 'thou': ['CXXXV.txt/1', 'CXXXV.txt/5', 'CXXXV.txt/11'], 'hast': ['CXXXV.txt/1'], 'thy': ['CXXXV.txt/1', 'CXXXV.txt/4', 'CXXXV.txt/11', 'CXXXV.txt/12'], 'will': ['CXXXV.txt/1', 'CXXXV.txt/2', 'CXXXV.txt/2', 'CXXXV.txt/4', 'CXXXV.txt/5', 'CXXXV.txt/6', 'CXXXV.txt/7', 'CXXXV.txt/8', 'CXXXV.txt/11', 'CXXXV.txt/11', 'CXXXV.txt/12', 'CXXXV.txt/12', 'CXXXV.txt/14'], 'and': ['CXXXV.txt/2', 'CXXXV.txt/2', 'CXXXV.txt/5', 'CXXXV.txt/8', 'CXXXV.txt/10', 'CXXXV.txt/14'], 'to': ['CXXXV.txt/2', 'CXXXV.txt/4', 'CXXXV.txt/6', 'CXXXV.txt/10', 'CXXXV.txt/11', 'CXXXV.txt/12'], 'boot': ['CXXXV.txt/2'], 'in': ['CXXXV.txt/2', 'CXXXV.txt/6', 'CXXXV.txt/7', 'CXXXV.txt/8', 'CXXXV.txt/10', 'CXXXV.txt/11', 'CXXXV.txt/14'], 'overplus': ['CXXXV.txt/2'], 'more': ['CXXXV.txt/3', 'CXXXV.txt/12'], 'than': ['CXXXV.txt/3'], 'enough': ['CXXXV.txt/3'], 'am': ['CXXXV.txt/3'], 'i': ['CXXXV.txt/3'], 'that': ['CXXXV.txt/3', '