|
1 | 1 | # ztoken |
2 | 2 |
|
| 3 | +[![Go Reference](https://pkg.go.dev/badge/github.com/zerfoo/ztoken.svg)](https://pkg.go.dev/github.com/zerfoo/ztoken) |
| 4 | +[![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) |
| 5 | + |
3 | 6 | BPE tokenizer library for Go with HuggingFace compatibility. |
4 | 7 |
|
5 | 8 | Part of the [Zerfoo](https://github.com/zerfoo) ML ecosystem. |
6 | 9 |
|
7 | | -## Install |
| 10 | +## Features |
| 11 | + |
| 12 | +- **Byte-Pair Encoding (BPE)** tokenizer with full merge-based encoding/decoding |
| 13 | +- **HuggingFace `tokenizer.json`** loading — compatible with GPT-2, Llama, Gemma, Mistral, and other models |
| 14 | +- **GGUF tokenizer extraction** — extract tokenizer data directly from GGUF model files via `ztoken/gguf` |
| 15 | +- **SentencePiece compatibility** — handles U+2581 space markers used by Llama-family models |
| 16 | +- **Special token handling** — BOS, EOS, PAD, UNK with exact-match encoding for control tokens |
| 17 | +- **Byte-level BPE** — GPT-2 style byte-to-Unicode encoding for full UTF-8 coverage |
| 18 | +- **Text normalization** — configurable normalizer pipeline (NFC, NFD, NFKC, lowercase, etc.) |
| 19 | +- **Minimal dependencies** — Go standard library only, plus `golang.org/x/text` for Unicode normalization |
8 | 20 |
|
9 | | -```sh |
| 21 | +## Installation |
| 22 | + |
| 23 | +```bash |
10 | 24 | go get github.com/zerfoo/ztoken |
11 | 25 | ``` |
12 | 26 |
|
13 | | -## Features |
| 27 | +## Quick Start |
| 28 | + |
| 29 | +### Load from HuggingFace tokenizer.json |
| 30 | + |
| 31 | +```go |
| 32 | +package main |
| 33 | + |
| 34 | +import ( |
| 35 | + "fmt" |
| 36 | + |
| 37 | + "github.com/zerfoo/ztoken" |
| 38 | +) |
| 39 | + |
| 40 | +func main() { |
| 41 | + // Load a HuggingFace tokenizer.json file |
| 42 | + tok, err := ztoken.LoadFromJSON("tokenizer.json") |
| 43 | + if err != nil { |
| 44 | + panic(err) |
| 45 | + } |
| 46 | + |
| 47 | + // Encode text to token IDs |
| 48 | + ids, _ := tok.Encode("Hello, world!") |
| 49 | + fmt.Println(ids) |
| 50 | + |
| 51 | + // Decode token IDs back to text |
| 52 | + text, _ := tok.Decode(ids) |
| 53 | + fmt.Println(text) // Hello, world! |
| 54 | + |
| 55 | + // Inspect vocabulary |
| 56 | + fmt.Println(tok.VocabSize()) |
| 57 | + |
| 58 | + // Access special tokens |
| 59 | + special := tok.SpecialTokens() |
| 60 | + fmt.Printf("BOS=%d EOS=%d PAD=%d UNK=%d\n", |
| 61 | + special.BOS, special.EOS, special.PAD, special.UNK) |
| 62 | +} |
| 63 | +``` |
| 64 | + |
| 65 | +### Extract Tokenizer from GGUF Model Files |
| 66 | + |
| 67 | +The `ztoken/gguf` sub-package extracts tokenizer data directly from GGUF model files, so you don't need a separate `tokenizer.json`: |
| 68 | + |
| 69 | +```go |
| 70 | +package main |
| 71 | + |
| 72 | +import ( |
| 73 | + "fmt" |
| 74 | + |
| 75 | + "github.com/zerfoo/ztoken/gguf" |
| 76 | +) |
14 | 77 |
|
15 | | -- Byte-Pair Encoding (BPE) tokenizer with HuggingFace tokenizer.json support |
16 | | -- SentencePiece compatibility mode |
17 | | -- Special token handling (BOS, EOS, PAD, UNK) |
18 | | -- GGUF tokenizer extraction via `ztoken/gguf` sub-package |
19 | | -- Zero external dependencies (stdlib only, plus golang.org/x/text) |
| 78 | +func main() { |
| 79 | + // metadata is a caller-supplied value whose type implements the gguf.Metadata interface: |
| 80 | + // GetString(key string) (string, bool) |
| 81 | + // GetStringArray(key string) ([]string, bool) |
| 82 | + // GetUint32(key string) (uint32, bool) |
| 83 | + // GetInt32Array(key string) ([]int32, bool) |
| 84 | + tok, err := gguf.ExtractTokenizer(metadata) |
| 85 | + if err != nil { |
| 86 | + panic(err) |
| 87 | + } |
| 88 | + |
| 89 | + ids, _ := tok.Encode("Hello from GGUF!") |
| 90 | + fmt.Println(ids) |
| 91 | +} |
| 92 | +``` |
20 | 93 |
|
21 | | -## Quick start |
| 94 | +### Build a Tokenizer Programmatically |
22 | 95 |
|
23 | 96 | ```go |
24 | 97 | package main |
25 | 98 |
|
26 | 99 | import ( |
27 | | - "fmt" |
| 100 | + "fmt" |
28 | 101 |
|
29 | | - "github.com/zerfoo/ztoken" |
| 102 | + "github.com/zerfoo/ztoken" |
30 | 103 | ) |
31 | 104 |
|
32 | 105 | func main() { |
33 | | - tok, err := ztoken.LoadFromJSON("tokenizer.json") |
34 | | - if err != nil { |
35 | | - panic(err) |
36 | | - } |
37 | | - ids, _ := tok.Encode("Hello, world!") |
38 | | - fmt.Println(ids) |
39 | | - text, _ := tok.Decode(ids) |
40 | | - fmt.Println(text) |
| 106 | + vocab := map[string]int{ |
| 107 | + "hello": 0, "world": 1, " ": 2, |
| 108 | + "<unk>": 3, "<s>": 4, "</s>": 5, "<pad>": 6, |
| 109 | + } |
| 110 | + merges := []ztoken.MergePair{ |
| 111 | + {Left: "hel", Right: "lo"}, |
| 112 | + {Left: "wor", Right: "ld"}, |
| 113 | + } |
| 114 | + special := ztoken.SpecialTokens{BOS: 4, EOS: 5, PAD: 6, UNK: 3} |
| 115 | + |
| 116 | + tok := ztoken.NewBPETokenizer(vocab, merges, special, false) |
| 117 | + ids, _ := tok.Encode("hello") |
| 118 | + fmt.Println(ids) // [0] |
41 | 119 | } |
42 | 120 | ``` |
43 | 121 |
|
| 122 | +## SentencePiece Compatibility |
| 123 | + |
| 124 | +Models using SentencePiece tokenization (Llama, Gemma) encode spaces as the `▁` character (U+2581, LOWER ONE EIGHTH BLOCK). ztoken handles this automatically when loading from GGUF files with `tokenizer.ggml.model = "llama"`, or you can enable it manually: |
| 125 | + |
| 126 | +```go |
| 127 | +tok := ztoken.NewBPETokenizer(vocab, merges, special, false) |
| 128 | +tok.SetSentencePiece(true) |
| 129 | +``` |
| 130 | + |
| 131 | +## Use Cases |
| 132 | + |
| 133 | +- **ML inference preprocessing** — tokenize prompts before feeding them to transformer models via [zerfoo](https://github.com/zerfoo/zerfoo) |
| 134 | +- **Text processing pipelines** — encode/decode text with production-grade BPE |
| 135 | +- **Model tooling** — extract and inspect tokenizers from GGUF and HuggingFace model files |
| 136 | +- **Embedding in Go services** — pure-Go tokenization (no cgo) that builds with a plain `go build` |
| 137 | + |
| 138 | +## Package Structure |
| 139 | + |
| 140 | +| Package | Description | |
| 141 | +|---------|-------------| |
| 142 | +| `ztoken` | Core tokenizer interface, BPE implementation, HuggingFace JSON loader | |
| 143 | +| `ztoken/gguf` | GGUF metadata-based tokenizer extraction | |
| 144 | + |
| 145 | +## Dependencies |
| 146 | + |
| 147 | +ztoken's only dependency outside the Go standard library is `golang.org/x/text`, which it uses for Unicode normalization. |
| 148 | + |
| 149 | +ztoken is used by: |
| 150 | + |
| 151 | +- [zerfoo](https://github.com/zerfoo/zerfoo) — ML inference, training, and serving framework |
| 152 | + |
44 | 153 | ## License |
45 | 154 |
|
46 | 155 | Apache 2.0 |
0 commit comments