# Textbooks to JSON

#### Author

Tijmen de Haan <tijmen@post.kek.jp>

#### History

 - 2023 Nov 25 started coding
 - 2023 Nov 29 changed from a model loss-based book selection to one based on special character counts

#### Description

I have extracted text data from public-domain astro-related textbooks. Unfortunately, some of the books are misformatted, irrelevant, or low-quality. This notebook serves to turn the large amount of raw data into medium-quality data chunks ready to be tokenized and pretrained on.

### Step 1: Filter books with special character statistics

We'll use the rate of special characters to evaluate the quality of these textbooks. The books that are out of distribution get thrown out.

In [None]:
import json
import matplotlib.pyplot as plt

def load_data(file_path):
    '''
    Read the UTF-8 encoded JSON file at `file_path` and return the parsed object.
    '''
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def calculate_percentages(textbook):
    '''
    Measure how much of `textbook` consists of spaces, newlines and backslashes.

    Parameters:
        textbook: the full text of one book as a string.

    Returns:
        A tuple (space_percentage, newline_percentage, backslash_percentage),
        each the count of that character as a percentage of len(textbook).
        An empty textbook yields (0.0, 0.0, 0.0) instead of raising
        ZeroDivisionError, so a single empty entry cannot abort a whole run.
    '''
    total_chars = len(textbook)
    if total_chars == 0:
        # Nothing to measure; avoid dividing by zero below.
        return 0.0, 0.0, 0.0

    space_percentage = 100 * textbook.count(' ') / total_chars
    newline_percentage = 100 * textbook.count('\n') / total_chars
    backslash_percentage = 100 * textbook.count('\\') / total_chars
    return space_percentage, newline_percentage, backslash_percentage

def histogram_percentages(textbooks):
    '''
    Show overlaid histograms of the per-book space, newline and backslash
    percentages, as computed by `calculate_percentages` for every book.
    '''
    per_book_stats = [calculate_percentages(book) for book in textbooks]

    # One (legend label, values) pair per character class, in plotting order.
    labelled_series = [
        ('Spaces', [stats[0] for stats in per_book_stats]),
        ('Newlines', [stats[1] for stats in per_book_stats]),
        ('Backslashes', [stats[2] for stats in per_book_stats]),
    ]

    plt.figure()
    for legend_label, values in labelled_series:
        # All three histograms share the same binning so they can be compared.
        plt.hist(values, bins=50, range=(0, 30), alpha=0.5, label=legend_label)
    plt.legend()
    plt.xlabel('Percentage')
    plt.ylabel('Number of Textbooks')
    plt.title('Character Distribution in Textbooks')
    plt.show()

def filter_textbooks(textbooks, space_bounds, newline_bounds, backslash_bounds):
    '''
    Return the books whose space, newline and backslash percentages all lie
    within the given bounds.

    Each `*_bounds` argument is indexed as (minimum, maximum); both ends are
    inclusive. The order of the input books is preserved.
    '''
    all_bounds = (space_bounds, newline_bounds, backslash_bounds)

    def within_all_bounds(book):
        # calculate_percentages returns its values in the same order as all_bounds.
        percentages = calculate_percentages(book)
        return all(
            bounds[0] <= value <= bounds[1]
            for bounds, value in zip(all_bounds, percentages)
        )

    return [book for book in textbooks if within_all_bounds(book)]

def save_data(filtered_books, output_file_path):
    '''
    Write `filtered_books` to `output_file_path` as UTF-8 JSON indented by
    four spaces, replacing any existing file at that path.
    '''
    with open(output_file_path, mode='w', encoding='utf-8') as handle:
        handle.write(json.dumps(filtered_books, indent=4))

# Driver for Step 1: load the books, plot their character statistics, keep
# only the books inside the chosen bounds and write them to a new JSON file.
# Input file and output file for the filtered subset.
# NOTE(review): the input is presumably a JSON list of book strings
# (each entry is passed to calculate_percentages) - confirm.
file_path = 'datasets/textbooks_clean.json'
output_file_path = 'datasets/textbooks_clean2.json'

textbooks = load_data(file_path)
# Plot first: the bounds below were picked by looking at these histograms.
histogram_percentages(textbooks)

# Adjusted based on visual inspection of the histograms
# Each tuple is (min, max) in percent; filter_textbooks treats both ends as inclusive.
space_bounds = (13.0, 19.5)  # min and max percentage of spaces
newline_bounds = (1.0, 6.0)  # min and max percentage of newlines
backslash_bounds = (0.0, 1.0)  # min and max percentage of backslashes

filtered_books = filter_textbooks(textbooks, space_bounds, newline_bounds, backslash_bounds)
print(f"After filtering, {len(filtered_books)} out of {len(textbooks)} books remain.")
# Overwrites output_file_path if it already exists.
save_data(filtered_books, output_file_path)

### Step 2: Tokenize and save

Textbooks are very long. Ideally we'd do something like in-context learning on parts of each book and summarize them with an advanced model like GPT-4. Since I don't have the budget for this right now, I'll just chunk the textbooks into fixed-length tokenized samples. These samples will be used for pretraining.

We use pickle for now to save to disk.

In [None]:
import json
import random
import multiprocessing
from transformers import PreTrainedTokenizerFast

def load_data(file_path):
    '''
    Parse the UTF-8 encoded JSON file at `file_path` and return its contents.
    '''
    with open(file_path, mode='r', encoding='utf-8') as source:
        parsed = json.loads(source.read())
    return parsed

def load_tokenizer():
    '''
    Build a `PreTrainedTokenizerFast` from the tokenizer file
    "Yi-6B/tokenizer.json" (path relative to the working directory).
    '''
    tokenizer_path = "Yi-6B/tokenizer.json"
    return PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)

def tokenize_book(book, tokenizer_working_size=10000, chunk_size=512, tokenizer=None):
    '''
    Tokenize a book and cut it into fixed-length, padded chunks of token ids.

    We want to cut the book into `chunk_size` token chunks, but we don't a
    priori know how much of the book is `chunk_size` tokens long. So we first
    tokenize the whole book, then chunk it into `chunk_size` token chunks.

    Parameters:
        book: the full text of one book as a string.
        tokenizer_working_size: number of characters handed to the tokenizer
            per `encode` call.
        chunk_size: number of token ids in every returned chunk.
        tokenizer: optional object providing `encode(text, add_special_tokens=...)`
            and `convert_tokens_to_ids(token)`. When omitted, the tokenizer from
            `load_tokenizer()` is used and cached on this function, so each
            worker process loads it once instead of once per book.

    Returns:
        A list of chunks, each a list of exactly `chunk_size` token ids as
        produced by `tokenizer.encode`. The last chunk is right-padded with the
        id of the '<unk>' token. An empty book yields an empty list.

    Raises:
        ValueError: if `tokenizer_working_size` or `chunk_size` is not positive.
    '''
    if tokenizer_working_size <= 0 or chunk_size <= 0:
        raise ValueError("tokenizer_working_size and chunk_size must be positive")

    if tokenizer is None:
        # Load a separate tokenizer in each process for multiprocessing, but
        # only once per process: reloading it for every book is wasted work.
        tokenizer = getattr(tokenize_book, '_cached_tokenizer', None)
        if tokenizer is None:
            tokenizer = load_tokenizer()
            tokenize_book._cached_tokenizer = tokenizer

    pad_token_id = tokenizer.convert_tokens_to_ids('<unk>')  # Use <unk> as the pad token

    # First tokenize the whole book, in pieces "tokenizer_working_size" characters long.
    # NOTE(review): pieces are cut at fixed character offsets, so text spanning a
    # piece boundary may tokenize differently than in a single pass - confirm
    # this is acceptable.
    tokenized_book = []
    for start in range(0, len(book), tokenizer_working_size):
        part = book[start:start + tokenizer_working_size]
        tokenized_book.extend(tokenizer.encode(part, add_special_tokens=False))

    # Now cut the token list into chunks of size "chunk_size" and manually pad
    # the (possibly shorter) final chunk to ensure consistent length.
    chunked_book = []
    for start in range(0, len(tokenized_book), chunk_size):
        chunk = tokenized_book[start:start + chunk_size]
        chunked_book.append(chunk + [pad_token_id] * (chunk_size - len(chunk)))

    return chunked_book

# Driver for Step 2: tokenize every filtered book in parallel, merge the
# per-book chunks into one shuffled list and pickle that list to disk.
file_path = 'datasets/textbooks_clean2.json'
textbooks = load_data(file_path)

print(f"Starting tokenization of {len(textbooks)} textbooks using multiprocessing...")

# Using Pool to utilize all available CPUs
# Each book is one task; chunked_books holds one list of chunks per book.
with multiprocessing.Pool() as pool:
    chunked_books = pool.map(tokenize_book, textbooks)

print("Flattening and shuffling...")

# Merge the per-book chunk lists into one flat list of chunks.
flat_chunks = []
for chunked_book in chunked_books:
    flat_chunks.extend(chunked_book)
# Shuffle in place so chunks from the same book are no longer adjacent.
# NOTE(review): no random seed is set in this cell, so the order presumably
# differs between runs - confirm whether reproducibility is needed.
random.shuffle(flat_chunks)


# Save tokenized data as pkl
# The directory "Yi-6B_textbooks_v1" must already exist; open() does not create it.
print("Saving pickle...")
import pickle
with open("Yi-6B_textbooks_v1/tokenized_dataset.pkl", "wb") as file:
    pickle.dump(flat_chunks, file)

print("Done.")

## Ideas for improvement

 - One idea is not to pad during tokenization but rather to pad on the fly in the `__getitem__` method of the dataset:
```
def __getitem__(self, idx):
    '''
    Sketch: return sample `idx`, padded or truncated to `self.max_length` at access time.

    NOTE(review): assumes self.data holds unpadded token-id lists and
    self.tokenizer is a Hugging Face tokenizer - confirm against the dataset class.
    '''
    token_ids = self.data[idx]
    # Pad/truncate here instead of storing padded chunks on disk.
    encoded_dict = self.tokenizer.prepare_for_model(
        token_ids,
        max_length=self.max_length,
        padding="max_length",  # always pad up to max_length
        truncation=True,  # cut samples longer than max_length
        return_attention_mask=True,  # request an attention mask alongside the ids
        return_tensors="pt",  # request PyTorch tensors
    )
    return encoded_dict
```

 - Books tend to have low-quality material at the start and the end (front and back matter). We could cut this out, potentially by using the model itself.

 - Here's some debugging code
 ```
# logging.info(f"memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
# logging.info(input_ids.shape)
# logging.info(input_ids)
# logging.info(input_ids.dtype)

# output_512 = model(input_ids[:, :512], labels=input_ids[:, :512])
# logging.info(f"memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
# output_1024 = model(input_ids[:, :1024], labels=input_ids[:, :1024])
# logging.info(f"memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
# output_2048 = model(input_ids[:, :2048], labels=input_ids[:, :2048])
# logging.info(f"memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
# output_4096 = model(input_ids[:, :4096], labels=input_ids[:, :4096])
# logging.info(f"memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
```