In [None]:
!pip install boto3

In [None]:
# Download the FineWeb-Edu 10B-token sample, tokenize it, and upload the shards to S3.
# Adapted from: https://github.com/karpathy/build-nanogpt/blob/master/fineweb.py

In [None]:
import boto3

import os

# Read the AWS credentials from the standard environment variables instead of
# pasting secrets into the notebook, where they would be saved along with it.
# When a variable is unset the value is "" (the placeholder used previously).
aws_access_key = os.environ.get("AWS_ACCESS_KEY_ID", "")
aws_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "")

## Fineweb dataset

In [None]:
!pip install tiktoken datasets tqdm

In [None]:
import os
import multiprocessing as mp
import numpy as np
import tiktoken
from datasets import load_dataset # pip install datasets
from tqdm import tqdm # pip install tqdm

In [None]:
# Use half of the reported CPU count for the tokenizer pool, but never fewer than one
# worker. os.cpu_count() returns None when the count cannot be determined, so that
# case is treated as a single CPU instead of raising a TypeError on `None // 2`.
nprocs = max(1, (os.cpu_count() or 1) // 2)
nprocs

In [None]:
# Dataset configuration to read and the local directory the shards are written to.
remote_name = "sample-10BT"  # alternative configuration: "CC-MAIN-2024-10"
local_dir = "edu_fineweb10B"
# 100M tokens per shard, i.e. a total of 100 shards for the 10B-token sample.
shard_size = 100_000_000
print(shard_size)

In [None]:
# Create the shard output directory; an already existing directory is not an error.
os.makedirs(name=local_dir, exist_ok=True)

In [None]:
# Open the train split of the selected fineweb-edu configuration in streaming mode,
# so `fw` is iterated document by document rather than materialised up front.
fw = load_dataset(
    path="HuggingFaceFW/fineweb-edu",
    name=remote_name,
    split="train",
    streaming=True,
)

In [None]:
# init the tokenizer
enc = tiktoken.get_encoding("gpt2")
# Read the <|endoftext|> id through the public accessor instead of the private
# `_special_tokens` mapping.
eot = enc.eot_token  # end of text token
def tokenize(doc):
    """Encode one document into a uint16 NumPy array of token ids.

    The array starts with the module-level ``eot`` id, which delimits documents,
    followed by ``enc.encode_ordinary(doc["text"])``.

    Args:
        doc: Mapping with a ``"text"`` entry holding the document text.

    Returns:
        1-D ``np.uint16`` array: ``[eot, *encoded_text]``.

    Raises:
        AssertionError: If any token id falls outside ``[0, 2**16)``.
    """
    token_ids = np.array([eot, *enc.encode_ordinary(doc["text"])])
    # Validate the range before narrowing: astype() would otherwise wrap silently.
    in_range = (token_ids >= 0).all() and (token_ids < 2**16).all()
    assert in_range, "token dictionary too large for uint16"
    return token_ids.astype(np.uint16)

In [None]:
def write_datafile(filename, tokens_np):
    """Persist ``tokens_np`` to ``filename`` with ``np.save``.

    Args:
        filename: Destination path handed to ``np.save``.
        tokens_np: Array of tokens to store.
    """
    np.save(file=filename, arr=tokens_np)

In [None]:
# Common path prefix of the shard files; displayed as the cell output.
filename = os.path.join(local_dir, "edufineweb_")
filename

In [None]:
# Tokenize every document in `fw` with a worker pool and pack the resulting token
# stream into fixed-size shards of `shard_size` tokens written under `local_dir`.
# Shard 0 is named "val"; every later shard is named "train".
with mp.Pool(nprocs) as pool:
    shard_index = 0
    # preallocate buffer to hold current shard
    all_tokens_np = np.empty((shard_size,), dtype=np.uint16)
    token_count = 0
    for tokens in pool.imap(tokenize, fw, chunksize=16):
        # While the pending tokens reach the end of the current shard, fill the
        # shard, write it and continue with the leftovers. Looping (instead of a
        # single spill) keeps a document longer than one shard from overflowing
        # the buffer.
        while token_count + len(tokens) >= shard_size:
            # split the document into whatever fits in this shard
            remainder = shard_size - token_count
            all_tokens_np[token_count:] = tokens[:remainder]
            split = "val" if shard_index == 0 else "train"
            filename = os.path.join(local_dir, f"edufineweb_{split}_{shard_index:06d}")
            print(filename)
            write_datafile(filename, all_tokens_np)
            shard_index += 1
            # the rest of the document starts the next shard
            tokens = tokens[remainder:]
            token_count = 0

        # whatever is left fits in the current shard: append it
        all_tokens_np[token_count:token_count + len(tokens)] = tokens
        token_count += len(tokens)

    # write any remaining tokens as the last (partial) shard
    if token_count != 0:
        split = "val" if shard_index == 0 else "train"
        filename = os.path.join(local_dir, f"edufineweb_{split}_{shard_index:06d}")
        write_datafile(filename, all_tokens_np[:token_count])

## Upload files to S3

In [None]:
# Empty credential strings are passed as None so that boto3 resolves credentials
# through its own default lookup instead of using blank keys.
s3 = boto3.client(
    "s3",
    aws_access_key_id=aws_access_key or None,
    aws_secret_access_key=aws_secret_key or None,
)
bucket_name = "fineweb-10b-tokenized-071024"


In [None]:
def upload_file_to_s3(file_path, bucket_name):
    """Upload one local file to S3 through the module-level ``s3`` client.

    The object key is the local ``file_path`` itself. Errors are reported with
    ``print`` rather than raised, so a failed upload does not stop the caller.

    Args:
        file_path: Path of the local file; also used as the S3 object key.
        bucket_name: Name of the destination bucket.

    Returns:
        True if the upload call completed, False if it raised an exception.
    """
    try:
        s3.upload_file(file_path, bucket_name, file_path)
    except Exception as e:
        print(f"Error uploading file to S3: {e}")
        return False
    print(f"File {file_path} successfully uploaded to S3 bucket {bucket_name}.")
    return True

In [None]:
import os

# Specify the directory path
directory_path = "edu_fineweb10B"

# List the regular files in the directory. os.listdir() returns entries in
# arbitrary order and includes sub-directories, so sort the names for a
# deterministic upload order and keep only entries that are files.
files = sorted(
    name
    for name in os.listdir(directory_path)
    if os.path.isfile(os.path.join(directory_path, name))
)

# Print each file path and upload it
print("Files in the directory:")
for file in files:
    file_path = os.path.join(directory_path, file)
    print(file_path)
    upload_file_to_s3(file_path, bucket_name)


