-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtokenizer.py
35 lines (27 loc) · 818 Bytes
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from curtsies.fmtfuncs import red, green, on_blue, yellow, blue, cyan
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
# Flip to True to retrain the byte-level BPE tokenizer from scratch;
# with False the tokenizer previously saved in TOKENIZER_DIR is reused.
TRAIN_BASE = False
TOKENIZER_DIR = "../tokenizer"
paths = ["../data.txt"]

if TRAIN_BASE:
    # Train a fresh byte-level BPE vocabulary on the raw corpus and
    # persist the vocab/merges files so GPT2Tokenizer can load them.
    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=paths,
        vocab_size=52000,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )
    bpe.save_model(TOKENIZER_DIR)
sample_code = "print('hello world!')"

# Load the trained BPE files through the HF GPT-2 tokenizer wrapper and
# register the same special tokens that the training step declared.
tokenizer = GPT2Tokenizer.from_pretrained(TOKENIZER_DIR)
special_tokens = {
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
}
tokenizer.add_special_tokens(special_tokens)

# Sanity check: encode a sample snippet and show the resulting token ids.
token_ids = tokenizer.encode(sample_code)
print(token_ids)