## Dictionary
A dictionary is a collection of words, each assigned an integer id. Integers are much easier and faster to process than text, so building this mapping is useful. This dictionary covers the entire book of SPA.

## Corpus
Each poem is processed into its own document in the corpus:
1. Excluding stopwords
2. Lowercased

In [1]:
import os
import fnmatch
import logging
import string
import re
from collections import defaultdict, Counter
from pprint import pprint
from itertools import chain

from gensim import corpora

In [2]:
def remove_ascii_and_punctuation(line):
    """Return *line* with stray control characters and punctuation removed.

    Two passes are applied in order:

    1. Characters in the range U+0090..U+0099 are deleted. (Despite the
       function name, ordinary ASCII text is left untouched by this step.)
    2. Every character listed in ``string.punctuation`` is deleted. Nothing
       is put in its place, so ``"you'll"`` becomes ``"youll"``.
    """
    without_controls = re.sub(r'[\x90-\x99]', '', line)

    # A translation table that maps each ASCII punctuation mark to "delete".
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return without_controls.translate(drop_punctuation)

# Maps poem number (the part of the file name before the first underscore)
# to that poem's cleaned text as a single space-joined string.
# NOTE: because this is a defaultdict(list), looking up a poem number that
# was never loaded yields an empty list rather than raising KeyError.
poems = defaultdict(list)

for file_name in fnmatch.filter(os.listdir('../../data/poems/'), '*_cleaned_poem.txt'):
    poem_num = file_name.partition('_')[0]

    with open(f'../../data/poems/{file_name}', 'r') as f:
        # Collapse the poem's lines into one string before cleaning it.
        poems[poem_num] = remove_ascii_and_punctuation(' '.join(f.read().splitlines()))

In [3]:
# Spot-check the cleaned text stored for poem '6'.
poems['6']

'They have chiseled on my stone the words His life was gentle and the elements so mixed in him That nature might stand up and say to all the world This was a man Those who knew me smile As they read this empty rhetoric My epitaph should have been Life was not gentle to him And the elements so mixed in him That he made warfare on life In the which he was slain While I lived I could not cope with slanderous tongues Now that I am dead I must submit to an epitaph Graven by a fool'

In [4]:
# Tokenise each poem: lowercase it, split on whitespace, and keep only the
# words that are not stop words and are longer than three characters.
# Each poem is its own document in the corpus.
stoplist = set('for a of the and to in'.split())


def _content_words(document):
    """Return the lowercased words of *document* that survive filtering."""
    return [
        token
        for token in document.lower().split()
        if len(token) > 3 and token not in stoplist
    ]


# poem number -> list of kept tokens, in their original order.
texts = {
    poem_num: _content_words(document)
    for poem_num, document in poems.items()
}

In [5]:
# Print every poem's token list, each followed by a blank line.
# A plain loop is used because this is done purely for its side effect;
# a list comprehension would build (and discard) a list of None values.
for text in texts.values():
    print(text, '\n')

['dear', 'jane', 'dear', 'winsome', 'jane', 'stole', 'room', 'where', 'your', 'nurses', 'linen', 'cuffs', 'took', 'hand', 'said', 'with', 'smile', 'youll', 'soon', 'well', 'liquid', 'thought', 'your', 'eyes', 'sank', 'eyes', 'like', 'that', 'slips', 'into', 'heart', 'flower', 'dear', 'jane', 'whole', 'mcneely', 'fortune', 'could', 'have', 'bought', 'your', 'care', 'night', 'night', 'paid', 'smile', 'warmth', 'your', 'soul', 'your', 'little', 'hands', 'laid', 'brow', 'jane', 'till', 'flame', 'life', 'went', 'dark', 'above', 'disk', 'night', 'longed', 'hoped', 'well', 'again', 'pillow', 'head', 'your', 'little', 'breasts', 'hold', 'fast', 'clasp', 'love', 'father', 'provide', 'when', 'died', 'jane', 'dear', 'jane'] 

['passer', 'love', 'find', 'your', 'soul', 'through', 'soul', 'beloved', 'when', 'beloved', 'withdraws', 'itself', 'from', 'your', 'soul', 'then', 'have', 'lost', 'your', 'soul', 'written', 'have', 'friend', 'sorrow', 'friend', 'hence', 'long', 'years', 'solitude', 'home', '

In [6]:
# Build the word -> integer id mapping from every poem's token list.
text_dictionary = corpora.Dictionary(texts.values())

In [7]:
# Write the dictionary to ./spa.dict.
text_dictionary.save('./spa.dict')

In [8]:
# One bag-of-words vector per poem, ordered by poem number.
# Poems are looked up under the keys '1', '2', ..., str(len(texts)), so the
# poem numbers must be contiguous; a gap raises KeyError.
corpus = [
    text_dictionary.doc2bow(texts[poem_id])
    for poem_id in map(str, range(1, len(texts) + 1))
]

In [9]:
# Write the corpus to ./spa.mm using gensim's MmCorpus serializer.
corpora.MmCorpus.serialize('./spa.mm', corpus)  # store to disk, for later use

In [10]:
# Display the corpus: one list of (token id, count) pairs per poem.
corpus

[[(11, 2),
  (20, 1),
  (23, 1),
  (29, 3),
  (33, 1),
  (35, 2),
  (43, 1),
  (48, 1),
  (58, 4),
  (61, 3),
  (73, 2),
  (79, 2),
  (107, 1),
  (108, 1),
  (115, 1),
  (125, 1),
  (159, 1),
  (175, 1),
  (196, 1),
  (215, 1),
  (227, 1),
  (231, 2),
  (247, 1),
  (295, 1),
  (314, 1),
  (352, 1),
  (359, 1),
  (390, 1),
  (391, 1),
  (425, 1),
  (436, 1),
  (439, 1),
  (469, 1),
  (476, 1),
  (477, 1),
  (541, 1),
  (576, 1),
  (612, 1),
  (646, 1),
  (807, 1),
  (887, 1),
  (907, 1),
  (1149, 2),
  (1175, 1),
  (1208, 1),
  (1345, 1),
  (1475, 1),
  (1544, 6),
  (1617, 1),
  (1778, 1),
  (1824, 1),
  (1845, 1),
  (2100, 1),
  (2101, 1),
  (2182, 1),
  (2205, 1),
  (2219, 1),
  (2298, 1),
  (2548, 1),
  (2806, 1),
  (2817, 1),
  (2833, 1),
  (3234, 1),
  (3275, 1),
  (3276, 1),
  (3277, 1),
  (3278, 1),
  (3279, 1),
  (3280, 1),
  (3281, 1),
  (3282, 1),
  (3283, 2),
  (3284, 1),
  (3285, 1),
  (3286, 1),
  (3287, 1),
  (3288, 1),
  (3289, 1),
  (3290, 2),
  (3291, 1),
  (3292, 1),
 