In [1]:
import nltk
from nltk import tokenize
import numpy as np
import random
import pandas as pd

In [2]:
def split_text(filepath, min_char):
    """Convert text file to a list of sentences.
    
    Args:
    filepath: string. Filepath of text file.
    min_char: int. Minimum number of characters required for a sentence to be
    included.

    Returns:
    sentences: list of strings. List of sentences containined in the text file.
    """
    # Load data into string variable and remove new line characters
    file = open(filepath, "r", encoding="utf8")
    text = file.read().replace('\n', ' ')
    text = text.replace('.”', '”.').replace('."', '".').replace('?”', '”?').replace('!”', '”!')
    text = text.replace('--', ' ').replace('. . .', '').replace('_', '')
    file.close()
    
    # Split text into a list of sentences
    sentences = tokenize.sent_tokenize(text)
    
    # Remove sentences that are less than min_char long
    sentences = [sent for sent in sentences if len(sent) >= min_char]

    return list(sentences)

In [3]:
# Set parameter values
min_char = 5

# Create lists
baron = split_text('Books/The_Works_Of_Lord_Baron.txt', min_char = min_char)\
        + split_text('Books/Frankenstein.txt', min_char = min_char)
polidori = split_text('Books/The_Vampyre_John_Polidori.txt', min_char = min_char)\
           + split_text('Books/The_Dairy_Of_John_William_Polidori.txt', min_char = min_char)\
           + split_text('Books/Frankenstein.txt', min_char = min_char)
percy = split_text('Books/The_Complete_Poetical_Works_Of_Percy_Bysshe_Shelley.txt', min_char = min_char)\
        + split_text('Books/Frankenstein.txt', min_char = min_char)
mary = split_text('Books/The_Last_Man_By_Mary_Wollstonecraft_Shelley.txt', min_char = min_char)\
       + split_text('Books/Frankenstein.txt', min_char = min_char)
alcott = split_text('Books/Little_Women_By_Louisa_May_Alcott.txt', min_char = min_char)
bronte = split_text('Books/Jane_Eyre_An_Autobiography_By_Charlotte_Bronte.txt', min_char = min_char)

In [4]:
# Print length of each list

text_dict = {'Baron': baron, 'Polidori': polidori, 'Percy': percy, 'Mary': mary,
             'Alcott': alcott, 'Bronte': bronte}

for key in text_dict.keys():
    print(key, ':', len(text_dict[key]))

Baron : 21753
Polidori : 9680
Percy : 22942
Mary : 13667
Alcott : 9563
Bronte : 9699


In [5]:
# Set random seed
np.random.seed(1)

# Set length parameter
max_len = 9000

# Select sentences
names = [baron, polidori, percy, mary, alcott, bronte]
combined = []

for name in names:
    name = np.random.choice(name, max_len, replace = False)
    combined += list(name)

print('The length of the combined list is:', len(combined))

The length of the combined list is: 54000


In [6]:
labels = ['Baron']*max_len + ['Polidori']*max_len + ['Percy']*max_len + ['Mary']*max_len\
        + ['Alcott']*max_len + ['Bronte']*max_len + ['Collins']*max_len

print('The length of the labels list is:', len(labels))

The length of the labels list is: 63000


In [7]:
# Set random seed
random.seed(3)

# Randomly shuffle data
zipped = list(zip(combined, labels))
random.shuffle(zipped)
combined, labels = zip(*zipped)

In [8]:
# Create pandas dataframe
out_data = pd.DataFrame()
out_data['text'] = combined
out_data['author'] = labels

print(out_data.head())

                                                text    author
0                                  What is he doing?    Bronte
1  "Isn't a fellow to have any pleasure after a f...    Alcott
2  The original story, Ernestus Berchtold, may po...  Polidori
3  Later in the evening, when his mind had been s...    Alcott
4  “But this was a luxury of sensation that could...     Percy


In [10]:
# Export as a csv file
out_data.to_csv('author_dataset.csv', index=False)