In [1]:
from gensim.models import KeyedVectors

# Load the pretrained GoogleNews word2vec vectors; `limit` caps how many
# vectors are read from the file.
model: KeyedVectors = KeyedVectors.load_word2vec_format(
    "../data/GoogleNews-vectors-negative300.bin.gz",
    binary=True,
    limit=100_000,
)
# Preview the first five (word -> index) entries of the vocabulary.
{word: idx for word, idx in list(model.key_to_index.items())[:5]}

{'</s>': 0, 'in': 1, 'for': 2, 'that': 3, 'is': 4}

In [3]:
import json
from pathlib import Path


# Vocabulary words kept for the tokenizer, and the index each one has in `model`.
used_words = []
used_indices = []

for word, index in model.key_to_index.items():
    # Drop noisy entries and multi-word tokens: anything longer than one
    # character that contains "_" or "#". The bare one-character "_" and "#"
    # entries are kept.
    if len(word) > 1 and ("_" in word or "#" in word):
        continue

    used_words.append(word)
    used_indices.append(index)

# Add the unknown token as the last vocabulary entry.
unk_token = "[UNK]"
used_words.append(unk_token)
# NOTE(review): this value is the [UNK] position within `used_words`, not an
# index into `model` like the other entries — confirm downstream code expects that.
used_indices.append(len(used_indices))


# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError when opening the vocab file for writing.
Path("google-news").mkdir(parents=True, exist_ok=True)
with open("google-news/vocab.json", "w") as f:
    # Token ids are the positions in `used_words`.
    json.dump({k: i for i, k in enumerate(used_words)}, f, indent=2)

In [4]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace


# Build a word-level tokenizer from the vocabulary file written above; words
# missing from the vocabulary map to `unk_token`.
tokenizer = Tokenizer(
    WordLevel.from_file("google-news/vocab.json", unk_token=unk_token)
)
# Register the unknown token as a special token.
tokenizer.add_special_tokens([unk_token])
# Use the library's Whitespace pre-tokenizer to split raw text before lookup.
tokenizer.pre_tokenizer = Whitespace()
# Persist the full tokenizer definition so it can be reloaded from one file.
tokenizer.save("google-news/tokenizer.json")

In [5]:
from transformers import PreTrainedTokenizerFast

# Wrap the saved tokenizer file for use with transformers. The unknown token is
# passed explicitly: the wrapper takes its special-token roles from constructor
# arguments, so without this `tokenizer.unk_token` would be unset even though
# the vocabulary contains "[UNK]".
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="google-news/tokenizer.json",
    unk_token=unk_token,
)

In [6]:
# Encode a sample sentence; the bare `tokens` expression displays the encoding.
tokens = tokenizer("Hello, world!")
tokens

{'input_ids': [15494, 75697, 150, 75697], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [7]:
# Decode the ids back to text to inspect how the sentence was tokenized.
tokenizer.decode(tokens["input_ids"])

'Hello [UNK] world [UNK]'

`GoogleNews-vectors-negative300.bin.gz` は "単語" 埋め込みなので，記号（`,` や `!` など）や冠詞の a などが語彙に含まれておらず，上の例のように `[UNK]` になってしまう．

語彙集合は単語埋め込みのキーではなく訓練データから作ったほうが良さそう．