In [None]:
import glob
import json
import random

import extract_textbooks

In [None]:
# Extract each raw textbook collection into individual .txt files,
# one source directory at a time.
astro_textbooks_dir = 'datasets/astro_textbooks/'
physics_textbooks_dir = 'datasets/physics_textbooks/'

extract_textbooks.process_textbooks_multiprocess(astro_textbooks_dir)
extract_textbooks.process_textbooks_multiprocess(physics_textbooks_dir)

In [None]:
# Preprocess, to get a list of books, where each book is a list of paragraphs.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# paths are sorted to keep the book order -- and therefore the order of any
# per-book output written later -- stable across runs and machines.
book_paths = sorted(glob.glob('datasets/textbooks_extracted/*.txt'))
books_paragraphs = [extract_textbooks.preprocess_text(file_path) for file_path in book_paths]

In [None]:
# Mark each paragraph as good or bad based on whether the rate of certain
# characters is within the distribution.
# This cell only plots: it passes every book's paragraphs to
# histogram_percentages so the distributions can be inspected by eye.
# NOTE(review): presumably these histograms are what the filtering bounds are
# read off from -- confirm against extract_textbooks.
# `%matplotlib widget` selects the interactive widget backend (needs ipympl).
%matplotlib widget
extract_textbooks.histogram_percentages(books_paragraphs)

In [None]:
# Accepted (lower, upper) range for each character class.
# NOTE(review): presumably these are percentages of a paragraph's characters,
# on the same scale as the histograms -- confirm against filter_textbooks.
bounds = dict([
    ('spaces', (6, 24)),
    ('digits', (0, 15)),
    ('capital_letters', (1, 23)),
    ('lowercase_letters', (50, 95)),
    ('newlines', (0, 5)),
    ('backslashes', (0, 5)),
    ('periods', (0, 8)),
    ('exclamation_marks', (0, 5)),
    ('question_marks', (0, 6)),
])

# Run the textbook filter over every book using the ranges above
filtered_books_paragraphs = extract_textbooks.filter_textbooks(
    books_paragraphs,
    bounds,
)

In [None]:
# Save to JSONL (one JSON object per line): full, training, and evaluation sets.
# `json` and `random` come from the notebook's import cell.

# NOTE(review): machine-specific absolute path; the earlier cells use paths
# relative to the working directory. Adjust when running elsewhere.
root = '/home/tijmen/cosmosage/datasets/'
train_file = root+'textbooks_train.jsonl'
eval_file = root+'textbooks_eval.jsonl'
full_file = root+'textbooks.jsonl'

# A dedicated, seeded generator makes both the shuffle and the train/eval
# assignment repeatable for a given input ordering, and keeps this cell
# independent of whatever else has consumed the global `random` state.
SPLIT_SEED = 0
TRAIN_FRACTION = 0.8  # probability that a paragraph lands in the training set
split_rng = random.Random(SPLIT_SEED)

# Collect all paragraphs from all books as {'text': ...} records
all_paragraphs = [
    {'text': para}
    for book in filtered_books_paragraphs
    for para in book
]

# Shuffle the paragraphs so the output files do not follow book order
split_rng.shuffle(all_paragraphs)

# Every paragraph goes to the full file and to exactly one of train/eval.
# The assignment is an independent draw per paragraph, so the split is
# approximately (not exactly) TRAIN_FRACTION : 1 - TRAIN_FRACTION.
with open(train_file, 'w', encoding='utf-8') as train_f, \
     open(eval_file, 'w', encoding='utf-8') as eval_f, \
     open(full_file, 'w', encoding='utf-8') as full_f:

    for para in all_paragraphs:
        # Serialize once and reuse the line for both destinations
        line = json.dumps(para) + '\n'
        full_f.write(line)

        if split_rng.random() < TRAIN_FRACTION:
            train_f.write(line)
        else:
            eval_f.write(line)

# Additionally write a "flat" JSONL file with a single record per book:
# the book's paragraphs are joined with a blank line between them.
flat_file = root + 'textbooks_flat.jsonl'
with open(flat_file, 'w', encoding='utf-8') as flat_f:
    flat_f.writelines(
        json.dumps({'text': '\n\n'.join(paragraphs)}) + '\n'
        for paragraphs in filtered_books_paragraphs
    )


In [None]:
# Next steps:
# 1. Investigate why some of the samples are extremely long, and see whether they can be split into paragraphs.
# 2. Tokenize; pad or discard short samples, and chunk long samples.