In [2]:
!pip install datasets tokenizers tiktoken

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 M

In [3]:
# Import libraries
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from datasets import load_dataset
import os
import torch
import tiktoken

# Check GPU availability.
# torch.cuda.get_device_name(0) raises when no CUDA device is present, so only
# query the device name after confirming CUDA is available; otherwise the
# whole cell would abort on a CPU-only runtime before any tokenizer work runs.
print("GPU Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    print("GPU Name: none (CUDA not available; running on CPU)")

# Step 1: Load C4 dataset from Hugging Face
def load_c4_data(output_file="c4_sample.txt", num_examples=100000):
    """Materialise a plain-text sample of the English C4 training split.

    If ``output_file`` already exists it is reused as a cache and nothing is
    downloaded. Otherwise the ``allenai/c4`` "en" train split is streamed and
    the ``text`` field of the first ``num_examples`` records is written, one
    record followed by a newline.

    The sample is first written to ``output_file + ".part"`` and only moved
    into place once complete, so an interrupted or failed download can never
    leave a truncated file that later runs would mistake for a finished cache.

    Args:
        output_file: Path of the cached sample file.
        num_examples: Maximum number of records to write (default 100k, chosen
            for Colab feasibility).

    Returns:
        ``output_file``, the path to the text sample.

    Raises:
        ValueError: If ``num_examples`` is negative.
    """
    if num_examples < 0:
        raise ValueError("num_examples must be non-negative")

    # Cache hit: a previous run already produced the full sample.
    if os.path.exists(output_file):
        return output_file

    print("Loading C4 dataset from Hugging Face...")
    # Stream the English subset so the full dataset is never downloaded.
    dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)

    partial_file = output_file + ".part"
    try:
        with open(partial_file, 'w', encoding='utf-8') as f:
            for i, example in enumerate(dataset):
                if i >= num_examples:
                    break
                f.write(example['text'] + "\n")
        # Publish the finished sample in one step.
        os.replace(partial_file, output_file)
    finally:
        # After a successful os.replace the partial file is gone; if anything
        # failed (including Ctrl-C) remove the leftover so no stale data stays.
        if os.path.exists(partial_file):
            os.remove(partial_file)

    print("C4 sample saved to", output_file)
    return output_file

# Step 2: Define and train the tokenizer
def train_tokenizer(file_path, vocab_size=30000, min_frequency=2,
                    save_path="custom_tokenizer.json"):
    """Train a BPE tokenizer on a text file and save it as JSON.

    Args:
        file_path: Path to the training corpus (plain text).
        vocab_size: Target vocabulary size passed to ``BpeTrainer``.
        min_frequency: Minimum pair frequency passed to ``BpeTrainer``.
        save_path: Where the trained tokenizer is serialised.

    Returns:
        The trained ``Tokenizer`` instance.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Training corpus not found: {file_path}")

    unk_token = "[UNK]"
    # The model must be told which token stands in for unknown input;
    # registering "[UNK]" only as a trainer special token is not enough for
    # the BPE model to emit it at encode time.
    tokenizer = Tokenizer(models.BPE(unk_token=unk_token))
    tokenizer.pre_tokenizer = Whitespace()
    # NOTE: no decoder is configured, so decode() joins tokens with single
    # spaces ("dog." comes back as "dog ."); the round trip is lossy by design
    # of this whitespace/punctuation pre-tokenisation.

    trainer = BpeTrainer(
        vocab_size=vocab_size,  # Adjustable; GPT-4 uses ~100k
        min_frequency=min_frequency,
        special_tokens=[unk_token, "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )

    print("Training tokenizer...")
    tokenizer.train([file_path], trainer)
    print("Training complete!")

    tokenizer.save(save_path)
    return tokenizer

# Step 3: Test the tokenizer
def test_tokenizer(tokenizer):
    test_sentence = "The quick brown fox jumps over the lazy dog."
    encoded = tokenizer.encode(test_sentence)

    print("Original:", test_sentence)
    print("Tokens:", encoded.tokens)
    print("Token IDs:", encoded.ids)
    print("Decoded:", tokenizer.decode(encoded.ids))

# Main execution
if __name__ == "__main__":
    # Pipeline: obtain the C4 text sample, fit the BPE tokenizer on it, then
    # print a quick encode/decode sanity check.
    text_file = load_c4_data()
    tokenizer = train_tokenizer(text_file)
    test_tokenizer(tokenizer)

    # Side-by-side comparison with tiktoken's cl100k_base encoding on the
    # same sentence.
    test_sentence = "The quick brown fox jumps over the lazy dog."
    gpt_tokenizer = tiktoken.get_encoding("cl100k_base")
    gpt_encoded = gpt_tokenizer.encode(test_sentence)

    print("\nGPT Tokenizer Comparison:")
    # Decode each id on its own so the individual token strings are visible.
    gpt_pieces = []
    for token_id in gpt_encoded:
        gpt_pieces.append(gpt_tokenizer.decode([token_id]))
    print("GPT Tokens:", gpt_pieces)
    print("GPT Token Count:", len(gpt_encoded))

    custom_encoding = tokenizer.encode(test_sentence)
    print("Custom Token Count:", len(custom_encoding.tokens))

GPU Available: True
GPU Name: NVIDIA A100-SXM4-40GB
Loading C4 dataset from Hugging Face...


README.md:   0%|          | 0.00/41.1k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

C4 sample saved to c4_sample.txt
Training tokenizer...
Training complete!
Original: The quick brown fox jumps over the lazy dog.
Tokens: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']
Token IDs: [4481, 5668, 9486, 15281, 25743, 4663, 4410, 18541, 6499, 37]
Decoded: The quick brown fox jumps over the lazy dog .

GPT Tokenizer Comparison:
GPT Tokens: ['The', ' quick', ' brown', ' fox', ' jumps', ' over', ' the', ' lazy', ' dog', '.']
GPT Token Count: 10
Custom Token Count: 10
