In [1]:
import numpy as np
import tensorflow as tf

### Load the Corpus
##### Get book names

In [3]:
import glob

book_filenames = sorted(glob.glob("./data/*.txt"))

print("Found {} books".format(len(book_filenames)))

Found 5 books


##### Combine books into a string

In [13]:
import codecs

corpus_raw = u""
for filename in book_filenames:
    with codecs.open(filename, 'r', 'utf-8') as book_file:
        corpus_raw += book_file.read()

print("Corpus is {} characters long".format(len(corpus_raw)))

Corpus is 9719485 characters long


### Process Corpus
##### Create lookup tables

In [8]:
def create_lookup_tables(text):
    """
    Create lookup tables for vocab
    :param text: The GOT text split into words
    :return: A tuple of dicts (vocab_to_int, int_to_vocab)
    """
    int_to_vocab = set(text)
    vocab_to_int = {word: key for key, word in enumerate(int_to_vocab)}
    return vocab_to_int, int_to_vocab
    

##### Tokenize punctuation

In [9]:
def token_lookup():
    """
    Generate a dict to map punctuation into a token
    :return: dictionary mapping puncuation to token
    """
    return {
        '.': '||period||',
        ',': '||comma||',
        '"': '||quotes||',
        ';': '||semicolon||',
        '!': '||exclamation-mark||',
        '?': '||question-mark||',
        '(': '||left-parentheses||',
        ')': '||right-parentheses||',
        '--': '||emm-dash||',
        '\n': '||return||'
        
    }


##### Process data

In [15]:
import pickle

token_dict = token_lookup()
for token, replacement in token_dict.items():
    corpus_raw = corpus_raw.replace(token, ' {} '.format(replacement))
corpus_raw = corpus_raw.lower()
corpus_raw = corpus_raw.split()

vocab_to_int, int_to_vocab = create_lookup_tables(corpus_raw)
corpus_int = [vocab_to_int[word] for word in corpus_raw]
pickle.dump((corpus_int, vocab_to_int, int_to_vocab, token_dict), open('preprocess.p', 'wb'))

AttributeError: 'list' object has no attribute 'replace'