In [1]:
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    # Corpus in; model files are written under the `amharic_bpe` prefix.
    input='../data/normalized_poems.txt',
    model_prefix='amharic_bpe',
    # Model shape: BPE with a 12k vocabulary ('unigram' is the alternative type;
    # smaller vocabularies such as 4k or 10k are also worth trying).
    model_type='bpe',
    vocab_size=12_000,
    # 1.0 = keep every character that occurs in the corpus.
    character_coverage=1.0,
    # Reserved IDs for the special tokens.
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    # Train on a shuffled sample capped at 1M sentences (only matters for large corpora).
    shuffle_input_sentence=True,
    input_sentence_size=1_000_000,
)


In [4]:

# Load the tokenizer model produced by the training step.
sp = spm.SentencePieceProcessor(model_file="amharic_bpe.model")

# Sample line used as a quick sanity check of the tokenizer.
text = "ድንግል ፈጣሪዋን ወለደችው"
tokens = sp.encode(text, out_type=str)

print(tokens)                         # subword pieces
print(sp.encode(text, out_type=int))  # vocabulary IDs for the same text


['▁ድንግል', '▁ፈጣሪ', 'ዋን', '▁ወለ', 'ደ', 'ችው']
[174, 565, 214, 948, 11775, 407]


In [5]:
# Decode the subword pieces back into plain text (round-trip check).
decoded = sp.decode(tokens)
print(decoded)
# Expected output for the sample text: "ድንግል ፈጣሪዋን ወለደችው"


ድንግል ፈጣሪዋን ወለደችው


In [6]:
# Read the normalized poem corpus into memory, one entry per text line.
with open("../data/normalized_poems.txt", encoding="utf-8") as f:
    lines = list(f)

# Encode every non-blank line (whitespace-trimmed) into a list of token IDs.
tokenized_lines = [
    sp.encode(stripped, out_type=int)
    for stripped in map(str.strip, lines)
    if stripped
]


In [8]:
from itertools import chain

# Concatenate the per-line ID lists into one continuous token stream.
flattened_tokens = []
flattened_tokens.extend(chain.from_iterable(tokenized_lines))


In [10]:
seq_length = 128  # number of tokens per training chunk

# Cut the token stream into consecutive, non-overlapping windows of exactly
# `seq_length` tokens. The range stop is `len - seq_length + 1` so the last full
# window is kept even when the stream length is an exact multiple of
# `seq_length` (a stop of `len - seq_length` silently dropped it). A trailing
# partial window is still discarded, so every chunk has the same length.
chunks = [
    flattened_tokens[start:start + seq_length]
    for start in range(0, len(flattened_tokens) - seq_length + 1, seq_length)
]
print(f"Total number of chunks: {len(chunks)}")

Total number of chunks: 249


In [None]:
# Requires the third-party `datasets` package in the kernel environment; the
# recorded run of this cell stopped at this import with ModuleNotFoundError.
from datasets import Dataset

# Build a one-column dataset: each row of "input_ids" is one fixed-length chunk of token IDs.
dataset = Dataset.from_dict({"input_ids": chunks})


ModuleNotFoundError: No module named 'datasets'