## 05_07: Inverted index solution

In [1]:
import multiprocessing
import glob
import collections

In [2]:
%%file index.py

# punctuation characters that are stripped from a line before it is split into words
exclude = set('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

def words(line):
    """Return the lowercase words found in `line`, ignoring punctuation.

    Punctuation characters (see `exclude`) are deleted rather than replaced
    by a space, so "well-known" becomes the single word "wellknown".

    Args:
        line: the text to tokenize.

    Returns:
        A list of lowercase words, in order of appearance; empty when the
        line holds no words.
    """
    # keep every character that is not punctuation
    kept = [ch for ch in line if ch not in exclude]

    # split the cleaned text on whitespace and normalize each word to lowercase
    return list(map(str.lower, ''.join(kept).split()))

def indexfile(filename):
    """Build the partial inverted index for a single text file.

    Every word occurrence yields one entry of the form
    ``<word> <filename>,<line number>``, where line numbers start at 0.

    Args:
        filename: path of the text file to index.

    Returns:
        One string holding all entries joined by newlines, in reading
        order; the empty string when the file contains no words.
    """
    entries = []

    # the context manager closes the file as soon as it has been read,
    # even if tokenizing a line raises
    with open(filename, 'r') as infile:
        # enumerate lines in file (streamed, not loaded all at once)
        for lineno, line in enumerate(infile):
            # for each word in line...
            for word in words(line):
                # ...record the word together with its location
                entries.append(f'{word} {filename},{lineno}')

    return '\n'.join(entries)

Writing index.py


In [3]:
import index

In [4]:
# index every sonnet in parallel with four worker processes; pool.map blocks
# until all results are in, and the context manager shuts the workers down
# afterwards even if one of the indexing jobs raises
with multiprocessing.Pool(processes=4) as pool:
    partials = pool.map(index.indexfile, glob.glob('sonnets/*.txt'))

In [5]:
# number of partial indexes returned by pool.map: one per file matched by the glob
len(partials)

154

In [6]:
# show the first partial index: a newline-separated string of "<word> <filename>,<line>" entries
print(partials[0])

whoever sonnets/CXXXV.txt,0
hath sonnets/CXXXV.txt,0
her sonnets/CXXXV.txt,0
wish sonnets/CXXXV.txt,0
thou sonnets/CXXXV.txt,0
hast sonnets/CXXXV.txt,0
thy sonnets/CXXXV.txt,0
will sonnets/CXXXV.txt,0
and sonnets/CXXXV.txt,1
will sonnets/CXXXV.txt,1
to sonnets/CXXXV.txt,1
boot sonnets/CXXXV.txt,1
and sonnets/CXXXV.txt,1
will sonnets/CXXXV.txt,1
in sonnets/CXXXV.txt,1
overplus sonnets/CXXXV.txt,1
more sonnets/CXXXV.txt,2
than sonnets/CXXXV.txt,2
enough sonnets/CXXXV.txt,2
am sonnets/CXXXV.txt,2
i sonnets/CXXXV.txt,2
that sonnets/CXXXV.txt,2
vex sonnets/CXXXV.txt,2
thee sonnets/CXXXV.txt,2
still sonnets/CXXXV.txt,2
to sonnets/CXXXV.txt,3
thy sonnets/CXXXV.txt,3
sweet sonnets/CXXXV.txt,3
will sonnets/CXXXV.txt,3
making sonnets/CXXXV.txt,3
addition sonnets/CXXXV.txt,3
thus sonnets/CXXXV.txt,3
wilt sonnets/CXXXV.txt,4
thou sonnets/CXXXV.txt,4
whose sonnets/CXXXV.txt,4
will sonnets/CXXXV.txt,4
is sonnets/CXXXV.txt,4
large sonnets/CXXXV.txt,4
and sonnets/CXXXV.txt,4
spacious sonnets/CXXXV.txt

In [7]:
# merge the per-file partial indexes into one mapping: word -> list of locations
indexdict = collections.defaultdict(list)

# loop over sonnets
for partial in partials:
    # loop over lines
    for entry in partial.split('\n'):
        # a file without any words produces an empty partial; there is nothing to record
        if not entry:
            continue

        # get word and location; split on the first space only, because the
        # word never contains one but the file path inside the location might
        word, loc = entry.split(' ', 1)

        # append to the location list for the word
        indexdict[word].append(loc)

In [8]:
# list every recorded location ("<filename>,<line>") of one word; indexdict is a
# defaultdict, so looking up a word that was never indexed returns (and stores) an empty list
indexdict['love']

['sonnets/CXLVII.txt,0',
 'sonnets/CXLVII.txt,4',
 'sonnets/CXVIII.txt,8',
 'sonnets/XIII.txt,0',
 'sonnets/XIII.txt,12',
 'sonnets/IX.txt,12',
 'sonnets/CLIV.txt,13',
 'sonnets/XXIII.txt,10',
 'sonnets/XXIII.txt,12',
 'sonnets/LXXXIX.txt,4',
 'sonnets/LXXXIX.txt,13',
 'sonnets/XXI.txt,8',
 'sonnets/XXI.txt,9',
 'sonnets/LXIV.txt,11',
 'sonnets/XXXIV.txt,12',
 'sonnets/CXL.txt,5',
 'sonnets/CXL.txt,5',
 'sonnets/XLVI.txt,13',
 'sonnets/CLI.txt,0',
 'sonnets/CLI.txt,1',
 'sonnets/CLI.txt,7',
 'sonnets/CLI.txt,13',
 'sonnets/CLI.txt,13',
 'sonnets/LXXXII.txt,8',
 'sonnets/CXVI.txt,1',
 'sonnets/CXVI.txt,1',
 'sonnets/CXVI.txt,10',
 'sonnets/LVII.txt,12',
 'sonnets/LXXXVIII.txt,12',
 'sonnets/XLVII.txt,3',
 'sonnets/XLVII.txt,7',
 'sonnets/XLVII.txt,8',
 'sonnets/LXIII.txt,0',
 'sonnets/CX.txt,7',
 'sonnets/CX.txt,11',
 'sonnets/LXXI.txt,5',
 'sonnets/LXXI.txt,11',
 'sonnets/XLIX.txt,2',
 'sonnets/XLIX.txt,6',
 'sonnets/XLIX.txt,13',
 'sonnets/XXXI.txt,2',
 'sonnets/XXXI.txt,5',
 'sonnets

In [9]:
# persist the merged index: one line per word, words in alphabetical order
with open('index.txt', 'w') as outfile:
    # keys are unique, so sorting the (word, locations) pairs orders them by word alone
    for word, locations in sorted(indexdict.items()):
        # locations are ordered as plain strings, so "...,10" sorts before "...,3"
        locstring = ' '.join(sorted(locations))

        # word followed by its space-separated locations
        outfile.write(f'{word} {locstring}\n')

In [10]:
# peek at the first ten lines of the index file; the context manager closes the
# file, and zip() stops pulling lines from it once range(10) is exhausted
with open('index.txt', 'r') as infile:
    head = [line for _, line in zip(range(10), infile)]

head

['a sonnets/C.txt,10 sonnets/CI.txt,10 sonnets/CIII.txt,1 sonnets/CIII.txt,5 sonnets/CIV.txt,8 sonnets/CLII.txt,13 sonnets/CLIII.txt,1 sonnets/CLIII.txt,11 sonnets/CLIII.txt,3 sonnets/CLIII.txt,5 sonnets/CLIII.txt,6 sonnets/CLIII.txt,7 sonnets/CLIV.txt,10 sonnets/CLIV.txt,7 sonnets/CLIV.txt,8 sonnets/CV.txt,5 sonnets/CVI.txt,7 sonnets/CVII.txt,3 sonnets/CX.txt,1 sonnets/CX.txt,11 sonnets/CXI.txt,4 sonnets/CXI.txt,8 sonnets/CXIV.txt,6 sonnets/CXLI.txt,1 sonnets/CXLI.txt,10 sonnets/CXLIII.txt,0 sonnets/CXLIV.txt,2 sonnets/CXLIV.txt,3 sonnets/CXLIV.txt,6 sonnets/CXLV.txt,10 sonnets/CXLVI.txt,4 sonnets/CXLVII.txt,0 sonnets/CXV.txt,12 sonnets/CXVIII.txt,10 sonnets/CXVIII.txt,6 sonnets/CXX.txt,12 sonnets/CXX.txt,5 sonnets/CXX.txt,6 sonnets/CXXIII.txt,3 sonnets/CXXIX.txt,0 sonnets/CXXIX.txt,10 sonnets/CXXIX.txt,10 sonnets/CXXIX.txt,11 sonnets/CXXIX.txt,11 sonnets/CXXIX.txt,6 sonnets/CXXV.txt,12 sonnets/CXXVII.txt,11 sonnets/CXXVII.txt,3 sonnets/CXXX.txt,10 sonnets/CXXX.txt,9 sonnets/CXXXI.txt

In [11]:
# rank the words from most to fewest recorded locations; negating the count gives
# descending order, and words with equal counts keep their insertion order in indexdict
sorted(indexdict, key=lambda word: -len(indexdict[word]))

['and',
 'the',
 'to',
 'of',
 'my',
 'i',
 'in',
 'that',
 'thy',
 'thou',
 'with',
 'for',
 'is',
 'not',
 'but',
 'me',
 'a',
 'thee',
 'love',
 'so',
 'be',
 'as',
 'all',
 'you',
 'his',
 'which',
 'when',
 'this',
 'it',
 'by',
 'your',
 'doth',
 'do',
 'from',
 'on',
 'or',
 'no',
 'have',
 'then',
 'what',
 'are',
 'if',
 'will',
 'more',
 'mine',
 'their',
 'shall',
 'sweet',
 'eyes',
 'they',
 'time',
 'beauty',
 'nor',
 'her',
 'art',
 'yet',
 'heart',
 'o',
 'than',
 'now',
 'can',
 'thine',
 'should',
 'hath',
 'fair',
 'one',
 'make',
 'he',
 'where',
 'still',
 'how',
 'eye',
 'true',
 'him',
 'am',
 'see',
 'loves',
 'like',
 'she',
 'those',
 'though',
 'being',
 'every',
 'some',
 'were',
 'such',
 'own',
 'who',
 'was',
 'dost',
 'may',
 'upon',
 'myself',
 'say',
 'live',
 'praise',
 'most',
 'give',
 'world',
 'let',
 'did',
 'at',
 'might',
 'ill',
 'day',
 'why',
 'times',
 'even',
 'since',
 'truth',
 'show',
 'life',
 'new',
 'thus',
 'night',
 'best',
 'look',

In [12]:
# number of recorded occurrences for a few selected words
[len(indexdict[chosen]) for chosen in ('love', 'sweet', 'eyes')]

[162, 55, 55]