In [220]:
# regular expressions
import re

# project helpers for reading the cleaned book files and preprocessing their text
from gut_tokenize import read_data, preprocess

# math.ceil rounds the head-matter offset up to a whole index
import math 

# fixed seed so the randomly drawn sample positions are repeatable between runs
import random
random.seed(42)

# NLTK word list, loaded once into a set for fast membership tests;
# sample_from_text uses it to decide whether a sample begins/ends on a whole word
import nltk
nltk.download('words')
from nltk.corpus import words
word_set = set(words.words())

[nltk_data] Downloading package words to /Users/tim/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [221]:
def sample_from_text(text, num_samples, sample_size, advance):
    """Draw `num_samples` random, non-overlapping samples of `sample_size` characters from `text`.

    The text is treated as one long string, so a sample is not limited to a single
    paragraph or sentence -- it may run across those boundaries.

    The optional `advance` argument skips a leading percentage of the text
    (recommended: 1.5) to avoid extraneous head matter not captured by the
    tokenization pipeline.

    Each sample is trimmed so that it starts and ends on a full word, changing
    something like "...ough he told her" to "...he told her". A leading or trailing
    token is dropped when it is two characters or shorter, or is not found in the
    module-level `word_set` (the lookup is exact, so a token with attached
    punctuation is dropped as well). The returned text can therefore be a few
    characters shorter than `sample_size`. This is negligible for the experience
    of reading a sample.

    Samples are wrapped in ellipses ("...") to make clear that they are excerpts
    and not standalone paragraphs.

    Args:
        text: full text of the book (str)
        num_samples: how many samples to draw from the text (int)
        sample_size: how large the samples should be, in characters (int)
        advance: how far to advance past head matter (float, entered as a percent,
            i.e. 1.5 starts at the 1.5th percentile), or None to start at the beginning

    Returns:
        None if `text` is None or if `num_samples` or `sample_size` is below 1.
        Otherwise a tuple `(samples, starts, stops)` of three equally long lists,
        ordered by position in the text:
            samples: the trimmed, ellipsis-wrapped sample strings
            starts: character index in `text` where each untrimmed sample begins
            stops: character index in `text` where each untrimmed sample ends (exclusive)

    Raises:
        ValueError: if the text after the `advance` offset is too short to hold
            `num_samples` non-overlapping samples of `sample_size` characters.
    """
    if text is None or num_samples < 1 or sample_size < 1:
        return None

    text_length = len(text)
    # Skip the first `advance` percent of the text in case head matter was retained.
    first_start = 0 if advance is None else math.ceil(text_length * (float(advance) / 100.0))
    last_start = text_length - sample_size

    # Room left over once all samples are packed back to back from `first_start`.
    # A negative value means they cannot fit without overlapping.
    slack = (last_start - first_start) - (num_samples - 1) * sample_size
    if slack < 0:
        raise ValueError(
            "text is too short to draw " + str(num_samples) + " non-overlapping samples of "
            + str(sample_size) + " characters starting at index " + str(first_start)
        )

    # Spread the slack randomly between the samples: the i-th smallest offset is
    # shifted by i whole sample widths, so consecutive starts are always at least
    # `sample_size` apart. This cannot overlap and always terminates.
    offsets = sorted(random.randint(0, slack) for _ in range(num_samples))
    starts = [first_start + offset + position * sample_size
              for position, offset in enumerate(offsets)]
    stops = [start + sample_size for start in starts]

    samples = []
    for start, stop in zip(starts, stops):
        tokens = text[start:stop].split(" ")
        # try to start with a normal word, not something like "th in the ...."
        if len(tokens[0]) <= 2 or tokens[0] not in word_set:
            tokens = tokens[1:]
        # same for the end; the list can be empty here if the sample was a single token
        if tokens and (len(tokens[-1]) <= 2 or tokens[-1] not in word_set):
            tokens = tokens[:-1]
        # and then adding ... at the beginning and at the end
        samples.append("..." + " ".join(tokens) + "...")
    return samples, starts, stops

def write_sample_to_file(sample, filename):
    """Write an inputted `sample` to a file, replacing the file if it already exists.

    Nothing is written (and no file is created) when `sample` is None.

    Args:
        sample: text of the sample (str)
        filename: filename, should have directory prefixed already (str)

    Note that write_samples_to_file encodes each sample's start and stop index in
    the filename, so from the filename you can look up where in the original text
    the sample was drawn from (give or take a few characters, since samples are
    trimmed to whole words).
    """
    if sample is None:
        return
    # The context manager closes the handle even if the write raises. The explicit
    # encoding keeps non-ASCII characters from failing on platforms whose default
    # encoding is not UTF-8.
    with open(filename, "w", encoding="utf-8") as sample_file:
        sample_file.write(sample)

    
def write_samples_to_file(titles, texts, num_samples, sample_size, advance,
                          output_dir="../Gutenberg/samples/"):
    """Generate samples from each of `texts` and write every sample to its own file.

    Each file is named `<title>_<start>_<stop>.txt`, where `<title>` is the book's
    title with any "_clean.txt" removed and `<start>`/`<stop>` are the indices
    returned by sample_from_text for that sample.

    Args:
        titles: book titles for the texts, in the same order as `texts` (list of str)
        texts: book texts (list of str)
        num_samples: how many samples to draw from each text (int)
        sample_size: how large the samples should be, in characters (int)
        advance: how far to advance past head matter (float, entered as a percent,
            i.e. 1.5 starts at the 1.5th percentile)
        output_dir: directory the sample files are written to (str); it is not
            created here, so it must already exist

    Return:
        summary string of how many samples were written and their location

    Texts for which sample_from_text returns None are skipped. Any exception raised
    by sample_from_text or write_sample_to_file propagates to the caller.
    """
    # Local import so this function does not depend on the file's import cell.
    import os

    count = 0
    for book_index, text in enumerate(texts):
        book_name = titles[book_index].replace("_clean.txt", "")
        drawn = sample_from_text(text, num_samples, sample_size, advance)
        if drawn is None:
            # Nothing could be sampled from this text; unpacking None would raise TypeError.
            continue
        samples, starts, stops = drawn
        for sample, start, stop in zip(samples, starts, stops):
            file_name = os.path.join(output_dir, "{}_{}_{}.txt".format(book_name, start, stop))
            write_sample_to_file(sample, file_name)
            count += 1

    # Trailing slash is dropped so the default location reads "../Gutenberg/samples."
    return "Wrote " + str(count) + " samples to " + output_dir.rstrip("/") + "."

In [222]:
directory = "../Gutenberg/cleaned_texts/"

# Load every cleaned book, then run each raw text through the preprocessing step.
titles, texts = read_data(directory)
texts = [preprocess(raw_text) for raw_text in texts]

# 13 samples of 800 characters per book, skipping the first 1.5% of each text.
write_samples_to_file(titles, texts, 13, 800, 1.5)

'Wrote 364 samples to ../Gutenberg/samples.'