In [None]:
# load dataset
from datasets import load_dataset
# Load the TinyStories corpus through the `datasets` library.
dataset = load_dataset("roneneldan/TinyStories")
# Preview the first ten training records, then report how many records the
# training split holds (one print call, each value on its own line).
print(dataset['train'][:10], len(dataset['train']), sep="\n")

In [None]:
import torch
class TinyDataset(torch.utils.data.Dataset):
    """Pass-through ``torch.utils.data.Dataset`` view over an indexable collection.

    Length and item lookups are delegated unchanged to the wrapped object,
    which stays reachable as ``self.dataset``.
    """

    def __init__(self, dataset):
        """Remember the wrapped collection; nothing is copied or transformed."""
        self.dataset = dataset

    def __len__(self):
        """Return the number of items reported by the wrapped collection."""
        size = len(self.dataset)
        return size

    def __getitem__(self, idx):
        """Return whatever the wrapped collection yields for ``idx``."""
        record = self.dataset[idx]
        return record

In [None]:
# Tokenize
import sentencepiece as spm
import os
from typing import List
class Tokenizer:
    """Wrapper around a SentencePiece model that can add BOS/EOS markers.

    After construction the following attributes are available, all read from
    the loaded model: ``sp_model``, ``vocab_size``, ``bos_id``, ``eos_id`` and
    ``pad_id``.
    """

    def __init__(self, model_path):
        """Load the SentencePiece model stored at ``model_path``.

        Args:
            model_path: Path to the serialized SentencePiece model file.

        Raises:
            AssertionError: If ``model_path`` does not point to an existing
                file (the offending path is the assertion message).
        """
        assert os.path.isfile(model_path), model_path
        # Pass the constructor argument through; the previous code referenced
        # an undefined name here and failed with NameError on every call.
        self.sp_model = spm.SentencePieceProcessor(model_file=model_path)
        # Vocabulary size and special-token IDs, cached from the model.
        self.vocab_size: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        print(f"vocab_size: {self.vocab_size} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}")

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        """Convert text to token IDs, optionally wrapped in BOS/EOS markers.

        Args:
            s: Text to tokenize.
            bos: When true, prepend ``self.bos_id`` to the result.
            eos: When true, append ``self.eos_id`` to the result.

        Returns:
            The token IDs produced by the model for ``s``, with the requested
            markers added.
        """
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, ids: List[int]) -> str:
        """Convert a sequence of token IDs back to text via the model.

        Args:
            ids: Token IDs to decode.

        Returns:
            The decoded string.
        """
        return self.sp_model.decode(ids)

# Build the tokenizer from the `tokenizer.model` file (relative path, so it is
# resolved against the current working directory).
tokenizer = Tokenizer("tokenizer.model")
# Smoke check: encode a sample string with both BOS and EOS requested. As the
# cell's last expression, the resulting ID list is shown as the cell output.
tokenizer.encode("hello world", bos=True, eos=True)