In [None]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from tokenizers.pre_tokenizers import Whitespace

In [3]:
# Path to your Amharic poem dataset
dataset_path = "../data/normalized_poems.txt"

# Initialize a BPE tokenizer
tokenizer = Tokenizer(models.BPE())

# Use Whitespace pre-tokenization (splits on spaces, preserving Amharic characters)
tokenizer.pre_tokenizer = Whitespace()

# Define special tokens (same as GPT-2 for compatibility)
special_tokens = ["<|endoftext|>"]

# Configure the trainer
trainer = trainers.BpeTrainer(
    vocab_size=12000,  # Adjust based on your dataset size; GPT-2 uses ~50k
    min_frequency=2,   # Ignore tokens appearing less than twice
    special_tokens=special_tokens
)

# Train the tokenizer on your dataset
def get_training_corpus():
    with open(dataset_path, "r", encoding="utf-8") as f:
        # Yield chunks of text to save memory
        for line in f:
            yield line

tokenizer.train_from_iterator(get_training_corpus(), trainer)

# Save the tokenizer
tokenizer.save("amharic_bpe_tokenizer.json")

In [4]:
from transformers import PreTrainedTokenizerFast, GPT2TokenizerFast

# Load the trained tokenizer
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="amharic_bpe_tokenizer.json",
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
    unk_token="<|unk|>",
    pad_token="<|pad|>"
)

# Save it in a format compatible with Hugging Face
tokenizer.save_pretrained("amharic_gpt2_tokenizer")

# Test the tokenizer
text = "አንቺ የፅዮን ልጅ ዘምሪ እልል በይ"
tokens = tokenizer.encode(text)
print("Tokens:", tokens)
print("Decoded:", tokenizer.decode(tokens))

Tokens: [435, 146, 5540, 328, 6512, 2496, 1010]
Decoded: አንቺ የ ፅዮን ልጅ ዘምሪ እልል በይ


In [5]:
from transformers import GPT2LMHeadModel

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Resize the token embeddings to match the new tokenizer's vocabulary
model.resize_token_embeddings(len(tokenizer))

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f17434d/63bed80836ee0758c8fd4f8975d59bb0b864263ee2753547c358e8a37cde8758?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T172205Z&X-Amz-Expires=3600&X-Amz-Signature=ba7a0285fb4841ab02be3e3db0e8213e9dcee64afbdedfb3b7b5b1a1e30868d9&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747246925&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0NjkyNX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzNGQvNjNiZWQ4MDgzNmVlMDc1OGM4ZmQ0Zjg5NzVkNTliYjBiODY0MjYzZWUyNzUzNTQ3YzM1OGU4YTM3Y2RlODc1OCoifV19&Signature=H0YyRX7PBtidqpFrZqR2zg-hOeAcD09aheF2hnPI6xagHJHy2-W48

pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(7212, 768)