**goal**

data preparation: generate tokenized np array for train and eval

1. download a sampled version from [HuggingFaceFW/fineweb-edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu)
2. load encoder and tokenize  
3. save as np array


In [23]:
import os
import numpy as np
from tqdm import tqdm
import tiktoken
from datasets import load_dataset
import multiprocessing as mp
from IPython.display import display, HTML


In [2]:
shard_size = int(1e7)

# LOCAL dir to save the cached data
local_dir = "edu_fineweb10B"
os.makedirs(local_dir, exist_ok=True)

1. download a sampled version from [HuggingFaceFW/fineweb-edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu)

https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu

In [13]:
# HF datapath 
datapath = "HuggingFaceFW/fineweb-edu"
sub_name = "sample-10BT"

fw = load_dataset(datapath, sub_name, split = 'train')

print("object type", type(fw))
print("# of data points", f"{len(fw):,}")
print("one data point\n", fw[0], sep = '')

n_sample = int(len(fw) * 0.01)

fw_sample = [{'text': fw[i]['text']} for i in range(n_sample)]
del fw


Resolving data files:   0%|          | 0/2110 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/98 [00:00<?, ?it/s]

object type <class 'datasets.arrow_dataset.Dataset'>
# of data points 9,672,101
one data point
{'text': 'The Independent Jane\nFor all the love, romance and scandal in Jane Austen’s books, what they are really about is freedom and independence. Independence of thought and the freedom to choose.\nElizabeth’s refusal of Mr. Collins offer of marriage showed an independence seldom seen in heroines of the day. Her refusal of Mr. Darcy while triggered by anger showed a level of independence that left him shocked and stunned.\nThe freedom she exhibited in finally accepting him in direct defiance of Lady Catherine and knowing her father would disapprove was unusual even for Austen. In her last book Anne Elliot is persuaded to refuse Captain Wentworth at Lady Russel’s insistence.\nAlthough Jane played by the rules of the day, all of her writing is infused with how she wanted life to be. She ‘screams’ her outrage at the limitations for women in Emma.\nWhen accosted by Mrs. Elton, Jane Fairfax sa

# 2.  load encoder and tokenize func (removed to utils due to multi-thread process)

In [14]:
enc = tiktoken.get_encoding("gpt2")

# show stored objects 
display(HTML(f"<h2>vars</h2>"))
print("all vars:\n", [k for k in vars(enc)], end = '\n\n', sep = '')
print("max token value:\n", enc.max_token_value, end = '\n\n', sep = '')
print("special tokens:\n", enc._special_tokens, end = '\n\n', sep = '')

# show callable functions that are not dunder methods
display(HTML(f"<h2>dir</h2>"))
print("callable functions:\n", [k for k in dir(enc) if callable(getattr(enc, k)) and not k.startswith('_')], end = '\n\n', sep = '')

# show the result of encode_ordinary
display(HTML(f"<h2>encode_ordinary</h2>"))
print(enc.encode_ordinary("Hello, world!"))

all vars:
['name', '_pat_str', '_mergeable_ranks', '_special_tokens', 'max_token_value', '_core_bpe', 'special_tokens_set']

max token value:
50256

special tokens:
{'<|endoftext|>': 50256}



callable functions:
['decode', 'decode_batch', 'decode_bytes', 'decode_bytes_batch', 'decode_single_token_bytes', 'decode_tokens_bytes', 'decode_with_offsets', 'encode', 'encode_batch', 'encode_ordinary', 'encode_ordinary_batch', 'encode_single_token', 'encode_with_unstable', 'token_byte_values']



[15496, 11, 995, 0]


In [15]:
## move tokenize to utils.py to ensure it is in __main__
# def tokenize(doc):
#     """
#     doc: string of a single document
#     returns a numpy array of unit 16 tokens
#     """
 
#     tokens = [eot] # the special <|endoftext|> token delimits all documents
#     tokens.extend(enc.encode_ordinary(doc["text"]))
#     tokens_np = np.array(tokens)

#     ## note: remove below token check, checking once is enough 
#     ## assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
   
#     tokens_np_uint16 = tokens_np.astype(np.uint16)
#     return tokens_np_uint16

In [18]:
# each example has about 800-1000 tokens
n_tokens = len(enc.encode_ordinary(fw_sample[5]['text']))
print("# of tokens in each data point", n_tokens )
print("# of data points in each shard", shard_size//n_tokens)
print("# of shards, approx", len(fw_sample)*n_tokens//shard_size)

# of tokens in each data point 869
# of data points in each shard 11507
# of shards, approx 8


## 3. write 1d array to file

In [19]:
nprocs = max(0, os.cpu_count())

print("# of cores for compute", nprocs)

nprocs //= 2

# of cores for compute 14


In [28]:
from utils import tokenize
def _write_datafile(filename, tokens_np):
    np.save(filename, tokens_np)

def write_shard_to_file(shard_index, tokens_np, local_dir):
    split = 'val' if shard_index == 0 else 'train'
    filename = os.path.join(local_dir, f'edufineweb_{split}_{shard_index:06d}')
    _write_datafile(filename, tokens_np)

shard_size = int(1e7) # number of tokens per shard
chunk_size = 64

with mp.Pool(nprocs) as pool:

    shard_index = 0
    token_count = 0
    all_tokens_np = np.empty((shard_size, ), dtype = np.uint16)
    progress_bar = tqdm(total = shard_size, unit = 'tokens', desc=f"Shard {shard_index}")

    for tokens in pool.imap(tokenize, fw_sample, chunksize = chunk_size):
        
        if token_count + len(tokens) < shard_size:
            delta_token_count = len(tokens)
            all_tokens_np[token_count: token_count + delta_token_count] = tokens
            token_count += delta_token_count
            progress_bar.update(delta_token_count)
        else:
            delta_token_count = shard_size - token_count
            remain_token_count = len(tokens) - delta_token_count

            all_tokens_np[token_count:] = tokens[:delta_token_count]
            
            # save the shard to local dir
            write_shard_to_file(shard_index, all_tokens_np, local_dir)

            # initiate for the next shard
            shard_index += 1
            token_count = remain_token_count
            all_tokens_np = np.empty((shard_size, ), dtype = np.uint16)
            all_tokens_np[:token_count] = tokens[delta_token_count: ] 
            
            progress_bar = tqdm(total = shard_size, unit = 'tokens', desc=f"Shard {shard_index}")
            progress_bar.update(remain_token_count)

    if token_count > 0:
        write_shard_to_file(shard_index, all_tokens_np, local_dir)  


Shard 0: 100%|█████████▉| 9999305/10000000 [01:07<00:00, 148999.59tokens/s]  
Shard 0: 100%|█████████▉| 9999305/10000000 [00:00<00:00, 15174552.04tokens/s]
Shard 1: 100%|█████████▉| 9999180/10000000 [00:00<00:00, 21658105.77tokens/s]
Shard 2: 100%|█████████▉| 9999727/10000000 [00:00<00:00, 21775226.35tokens/s]
Shard 3: 100%|█████████▉| 9999707/10000000 [00:00<00:00, 21429036.91tokens/s]
Shard 4: 100%|█████████▉| 9999178/10000000 [00:00<00:00, 21555291.75tokens/s]
Shard 5: 100%|█████████▉| 9999790/10000000 [00:00<00:00, 22312649.99tokens/s]
Shard 6: 100%|█████████▉| 9999474/10000000 [00:00<00:00, 22032215.46tokens/s]
Shard 7: 100%|█████████▉| 9999915/10000000 [00:00<00:00, 21890227.84tokens/s]
Shard 8: 100%|█████████▉| 9999648/10000000 [00:00<00:00, 21740380.40tokens/s]
Shard 9: 100%|█████████▉| 9999431/10000000 [00:00<00:00, 21286797.54tokens/s]
