## Dictionary
This is just a collection of words that get assigned a number. It is much easier and faster to process integers than text, so creating a map is useful. This is a dictionary over the entire book of SPA.

## Corpus
This will be a processed corpus per poem. 
1. Excluding stopwords
2. Lowercased

In [1]:
import os
import fnmatch
import logging
from collections import defaultdict
from pprint import pprint

import re

from gensim import corpora

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
def remove_ascii(line):
    return re.sub(r'[\x90-\x99]','', line)

poems = defaultdict(list)

for file_name in os.listdir('../data/poems/'):
    if fnmatch.fnmatch(file_name, '*_cleaned_poem.txt'):
        poem_num = file_name.split('_')[0]
        
        with open(f'../data/poems/{file_name}', 'r') as f:
            poems[poem_num] = [remove_ascii(line) for line in f.read().splitlines()]

In [3]:
poems['6']

['THEY have chiseled on my stone the words:',
 'His life was gentle, and the elements so mixed in him',
 'That nature might stand up and say to all the world,',
 'This was a man.',
 'Those who knew me smile',
 'As they read this empty rhetoric.',
 'My epitaph should have been:',
 'Life was not gentle to him,',
 'And the elements so mixed in him',
 'That he made warfare on life,',
 'In the which he was slain.',
 'While I lived I could not cope with slanderous tongues,',
 'Now that I am dead I must submit to an epitaph',
 'Graven by a fool!']

In [8]:
# remove some stop words, lowercase everything, and split it up
# each poem is it's own corpus.
stoplist = set('for a of the and to in'.split())
texts = {
    poem_num: [[word for word in document.lower().split() 
                 if word not in stoplist]
                for document in corpus]
    for poem_num, corpus in poems.items()
}

In [9]:
pprint(texts)

{'1': [['where', 'are', 'elmer,', 'herman,', 'bert,', 'tom', 'charley,'],
       ['weak', 'will,', 'strong', 'arm,', 'clown,', 'boozer,', 'fighter?'],
       ['all,', 'all,', 'are', 'sleeping', 'on', 'hill.'],
       ['one', 'passed', 'fever,'],
       ['one', 'was', 'burned', 'mine,'],
       ['one', 'was', 'killed', 'brawl,'],
       ['one', 'died', 'jail,'],
       ['one', 'fell', 'from', 'bridge', 'toiling', 'children', 'wife'],
       ['all,',
        'all',
        'are',
        'sleeping,',
        'sleeping,',
        'sleeping',
        'on',
        'hill.'],
       ['where', 'are', 'ella,', 'kate,', 'mag,', 'lizzie', 'edith,'],
       ['tender',
        'heart,',
        'simple',
        'soul,',
        'loud,',
        'proud,',
        'happy',
        'one?'],
       ['all,', 'all,', 'are', 'sleeping', 'on', 'hill.'],
       ['one', 'died', 'shameful', 'child-birth,'],
       ['one', 'thwarted', 'love,'],
       ['one', 'at', 'hands', 'brute', 'brothel,'],
       ['one

         ['compasses', 'mathematical', 'instruments,'],
         ['irony', 'under', 'tenants', 'ignorance'],
         ['determinants', 'calculus', 'variations.'],
         ['anchors,', 'those', 'who', 'never', 'sailed.'],
         ['gates', 'ajaryes,', 'so', 'they', 'were;'],
         ['you',
          'left',
          'them',
          'open',
          'stray',
          'goats',
          'entered',
          'your',
          'garden.'],
         ['an', 'eye', 'watching', 'like', 'one', 'arimaspi'],
         ['so', 'did', 'youwith', 'one', 'eye.'],
         ['angels', 'blowing', 'trumpetsyou', 'are', 'heralded'],
         ['it',
          'is',
          'your',
          'horn',
          'your',
          'angel',
          'your',
          'familys',
          'estimate.'],
         ['it', 'is', 'all', 'very', 'well,', 'but', 'myself', 'i', 'know'],
         ['i', 'stirred', 'certain', 'vibrations', 'spoon', 'river'],
         ['which',
          'are',
          'my',
       

         ['nor',
          'did',
          'you',
          'carry',
          'great',
          'wounds',
          'your',
          'old',
          'age.'],
         ['you', 'did', 'not', 'starve,', 'government', 'fed', 'you.'],
         ['you', 'did', 'not', 'suffer', 'yet', 'cry', 'forward'],
         ['an', 'army', 'which', 'you', 'led'],
         ['against', 'foe', 'with', 'mocking', 'smiles,'],
         ['sharper',
          'than',
          'bayonets.',
          'you',
          'were',
          'not',
          'smitten',
          'down'],
         ['by', 'invisible', 'bombs.', 'you', 'were', 'not', 'rejected'],
         ['by', 'those', 'whom', 'you', 'were', 'defeated.'],
         ['you', 'did', 'not', 'eat', 'savorless', 'bread'],
         ['which', 'poor', 'alchemy', 'had', 'made', 'from', 'ideals,'],
         ['you', 'went', 'manila,', 'harry', 'wilmans,'],
         ['while', 'i', 'enlisted', 'bedraggled', 'army'],
         ['bright-eyed,', 'divine', 'youths,'],
  

         ['when',
          'i',
          'seemed',
          'be',
          'turned',
          'tree',
          'with',
          'trunk',
          'branches'],
         ['growing', 'indurate,', 'turning', 'stone,', 'yet', 'burgeoning'],
         ['laurel', 'leaves,', 'hosts', 'lambent', 'laurel,'],
         ['quivering,', 'fluttering,', 'shrinking,', 'fighting', 'numbness'],
         ['creeping',
          'into',
          'their',
          'veins',
          'from',
          'dying',
          'trunk',
          'branches!'],
         ['tis', 'vain,', 'o', 'youth,', 'fly', 'call', 'apollo.'],
         ['fling', 'yourselves', 'fire,', 'die', 'with', 'song', 'spring,'],
         ['if', 'die', 'you', 'must', 'spring.', 'none', 'shall', 'look'],
         ['on', 'face', 'apollo', 'live,', 'choose', 'you', 'must'],
         ['twixt', 'death', 'flame', 'death', 'after', 'years', 'sorrow,'],
         ['rooted', 'fast', 'earth,', 'feeling', 'grisly', 'hand,'],
         ['not', 'so', 

        ['i', 'loved', 'watched', 'pruned'],
        ['with', 'gnarled', 'hands'],
        ['long,', 'long', 'years;'],
        ['here', 'under', 'roots', 'this', 'northern-spy'],
        ['move', 'chemic', 'change', 'circle', 'life,'],
        ['into', 'soil', 'into', 'flesh', 'tree,'],
        ['into', 'living', 'epitaphs'],
        ['redder', 'apples!']],
 '31': [['i', 'went', 'up', 'down', 'streets'],
        ['here', 'there', 'by', 'day', 'night,'],
        ['through',
         'all',
         'hours',
         'night',
         'caring',
         'poor',
         'who',
         'were',
         'sick.'],
        ['do', 'you', 'know', 'why?'],
        ['my', 'wife', 'hated', 'me,', 'my', 'son', 'went', 'dogs.'],
        ['i', 'turned', 'people', 'poured', 'out', 'my', 'love', 'them.'],
        ['sweet',
         'it',
         'was',
         'see',
         'crowds',
         'about',
         'lawns',
         'on',
         'day',
         'my',
         'funeral,'],
        [

       ['but', 'i', 'proclaim', 'from', 'dust'],
       ['that', 'he', 'slew', 'me', 'gratify', 'his', 'hatred.']],
 '80': [['i', 'could', 'not', 'run', 'or', 'play'],
        ['boyhood.'],
        ['manhood', 'i', 'could', 'only', 'sip', 'cup,'],
        ['not', 'drink'],
        ['scarlet-fever', 'left', 'my', 'heart', 'diseased.'],
        ['yet', 'i', 'lie', 'here'],
        ['soothed', 'by', 'secret', 'none', 'but', 'mary', 'knows:'],
        ['there', 'is', 'garden', 'acacia,'],
        ['catalpa', 'trees,', 'arbors', 'sweet', 'with', 'vines'],
        ['there', 'on', 'that', 'afternoon', 'june'],
        ['by', 'marys', 'side'],
        ['kissing', 'her', 'with', 'my', 'soul', 'upon', 'my', 'lips'],
        ['it', 'suddenly', 'took', 'flight.']],
 '81': [['if', 'i', 'could', 'have', 'lived', 'another', 'year'],
        ['i', 'could', 'have', 'finished', 'my', 'flying', 'machine,'],
        ['become', 'rich', 'famous.'],
        ['hence', 'it', 'is', 'fitting', 'workman'],
      

In [10]:
text_dictionary = corpora.Dictionary()
for text in texts.values():
    text_dictionary.add_documents(text)

2018-06-26 21:19:09,243 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-06-26 21:19:09,245 : INFO : built Dictionary(91 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 20 documents (total 120 corpus positions)
2018-06-26 21:19:09,246 : INFO : adding document #0 to Dictionary(91 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,247 : INFO : built Dictionary(156 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 38 documents (total 218 corpus positions)
2018-06-26 21:19:09,249 : INFO : adding document #0 to Dictionary(156 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,250 : INFO : built Dictionary(226 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 56 documents (total 330 corpus positions)
2018-06-26 21:19:09,251 : INFO : adding document #0 to Dictionary(226 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 2

2018-06-26 21:19:09,508 : INFO : built Dictionary(1332 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 472 documents (total 2547 corpus positions)
2018-06-26 21:19:09,509 : INFO : adding document #0 to Dictionary(1332 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,515 : INFO : built Dictionary(1408 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 497 documents (total 2679 corpus positions)
2018-06-26 21:19:09,517 : INFO : adding document #0 to Dictionary(1408 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,522 : INFO : built Dictionary(1473 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 519 documents (total 2817 corpus positions)
2018-06-26 21:19:09,524 : INFO : adding document #0 to Dictionary(1473 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,526 : INFO : built Dictionary(1509 unique tokens: ['dear', 'jane!

2018-06-26 21:19:09,617 : INFO : built Dictionary(2503 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 1007 documents (total 5639 corpus positions)
2018-06-26 21:19:09,619 : INFO : adding document #0 to Dictionary(2503 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,622 : INFO : built Dictionary(2557 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 1029 documents (total 5764 corpus positions)
2018-06-26 21:19:09,624 : INFO : adding document #0 to Dictionary(2557 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,627 : INFO : built Dictionary(2604 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 1047 documents (total 5872 corpus positions)
2018-06-26 21:19:09,628 : INFO : adding document #0 to Dictionary(2604 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,630 : INFO : built Dictionary(2665 unique tokens: ['dear', 'ja

2018-06-26 21:19:09,710 : INFO : built Dictionary(3526 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 1582 documents (total 8812 corpus positions)
2018-06-26 21:19:09,711 : INFO : adding document #0 to Dictionary(3526 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,713 : INFO : built Dictionary(3559 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 1604 documents (total 8923 corpus positions)
2018-06-26 21:19:09,714 : INFO : adding document #0 to Dictionary(3559 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,716 : INFO : built Dictionary(3588 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 1623 documents (total 9035 corpus positions)
2018-06-26 21:19:09,718 : INFO : adding document #0 to Dictionary(3588 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,719 : INFO : built Dictionary(3610 unique tokens: ['dear', 'ja

2018-06-26 21:19:09,817 : INFO : built Dictionary(4360 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 2142 documents (total 11840 corpus positions)
2018-06-26 21:19:09,819 : INFO : adding document #0 to Dictionary(4360 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,821 : INFO : built Dictionary(4392 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 2161 documents (total 11949 corpus positions)
2018-06-26 21:19:09,822 : INFO : adding document #0 to Dictionary(4392 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,824 : INFO : built Dictionary(4462 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 2198 documents (total 12161 corpus positions)
2018-06-26 21:19:09,825 : INFO : adding document #0 to Dictionary(4462 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,826 : INFO : built Dictionary(4478 unique tokens: ['dear', 

2018-06-26 21:19:09,925 : INFO : built Dictionary(5165 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 2667 documents (total 14840 corpus positions)
2018-06-26 21:19:09,926 : INFO : adding document #0 to Dictionary(5165 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,928 : INFO : built Dictionary(5194 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 2690 documents (total 14952 corpus positions)
2018-06-26 21:19:09,936 : INFO : adding document #0 to Dictionary(5194 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,945 : INFO : built Dictionary(5220 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 2715 documents (total 15098 corpus positions)
2018-06-26 21:19:09,952 : INFO : adding document #0 to Dictionary(5220 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:09,958 : INFO : built Dictionary(5237 unique tokens: ['dear', 

2018-06-26 21:19:10,141 : INFO : built Dictionary(6976 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 3841 documents (total 21631 corpus positions)
2018-06-26 21:19:10,143 : INFO : adding document #0 to Dictionary(6976 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:10,145 : INFO : built Dictionary(6999 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 3862 documents (total 21745 corpus positions)
2018-06-26 21:19:10,150 : INFO : adding document #0 to Dictionary(6999 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:10,153 : INFO : built Dictionary(7013 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 3872 documents (total 21802 corpus positions)
2018-06-26 21:19:10,159 : INFO : adding document #0 to Dictionary(7013 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:10,161 : INFO : built Dictionary(7024 unique tokens: ['dear', 

2018-06-26 21:19:10,305 : INFO : built Dictionary(7430 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 4249 documents (total 23870 corpus positions)
2018-06-26 21:19:10,307 : INFO : adding document #0 to Dictionary(7430 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:10,309 : INFO : built Dictionary(7454 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 4273 documents (total 23989 corpus positions)
2018-06-26 21:19:10,310 : INFO : adding document #0 to Dictionary(7454 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:10,311 : INFO : built Dictionary(7465 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 4281 documents (total 24021 corpus positions)
2018-06-26 21:19:10,314 : INFO : adding document #0 to Dictionary(7465 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:10,316 : INFO : built Dictionary(7482 unique tokens: ['dear', 

2018-06-26 21:19:10,481 : INFO : built Dictionary(7948 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 4713 documents (total 26398 corpus positions)
2018-06-26 21:19:10,482 : INFO : adding document #0 to Dictionary(7948 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:10,484 : INFO : built Dictionary(7962 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 4727 documents (total 26476 corpus positions)
2018-06-26 21:19:10,485 : INFO : adding document #0 to Dictionary(7962 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:10,490 : INFO : built Dictionary(7982 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 4744 documents (total 26565 corpus positions)
2018-06-26 21:19:10,492 : INFO : adding document #0 to Dictionary(7982 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:10,494 : INFO : built Dictionary(8005 unique tokens: ['dear', 

2018-06-26 21:19:10,618 : INFO : built Dictionary(8409 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 5135 documents (total 28777 corpus positions)
2018-06-26 21:19:10,623 : INFO : adding document #0 to Dictionary(8409 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:10,630 : INFO : built Dictionary(8435 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 5164 documents (total 28943 corpus positions)
2018-06-26 21:19:10,632 : INFO : adding document #0 to Dictionary(8435 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...)
2018-06-26 21:19:10,634 : INFO : built Dictionary(8446 unique tokens: ['dear', 'jane!', 'winsome', '(where', 'how']...) from 5174 documents (total 28990 corpus positions)


In [11]:
text_dictionary.save('../data/spa.dict')

2018-06-26 21:20:08,758 : INFO : saving Dictionary object under ../data/spa.dict, separately None
2018-06-26 21:20:08,766 : INFO : saved ../data/spa.dict
