<a href="https://colab.research.google.com/github/tomoyahiroe/transformers-playground/blob/main/special_tokens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# transformersのインストール
!pip install transformers[ja,sentencepiece,torch]

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
import pprint

# Special tokens by model

## BERT

In [None]:
# Load the tokenizer for the "bert-base-uncased" checkpoint and pretty-print its
# special-token map; sort_dicts=False keeps the map's own key order.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
pprint.pprint(tokenizer.special_tokens_map, sort_dicts=False)

# {'unk_token': '[UNK]',
#  'sep_token': '[SEP]',
#  'pad_token': '[PAD]',
#  'cls_token': '[CLS]',
#  'mask_token': '[MASK]'}

In [None]:
# Load the tokenizer for "llm-book/bert-base-japanese-v3-marc_ja" and pretty-print
# its special-token map; sort_dicts=False keeps the map's own key order.
tokenizer = AutoTokenizer.from_pretrained("llm-book/bert-base-japanese-v3-marc_ja")
pprint.pprint(tokenizer.special_tokens_map, sort_dicts=False)

# {'unk_token': '[UNK]',
#  'sep_token': '[SEP]',
#  'pad_token': '[PAD]',
#  'cls_token': '[CLS]',
#  'mask_token': '[MASK]'}

## GPT

In [None]:
# Load the tokenizer for "openai-community/gpt2" and pretty-print its
# special-token map; sort_dicts=False keeps the map's own key order.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
pprint.pprint(tokenizer.special_tokens_map, sort_dicts=False)

# {'bos_token': '<|endoftext|>',
#  'eos_token': '<|endoftext|>',
#  'unk_token': '<|endoftext|>'}

In [None]:
# Load the tokenizer for "abeja/gpt2-large-japanese" and pretty-print its
# special-token map; sort_dicts=False keeps the map's own key order.
tokenizer = AutoTokenizer.from_pretrained("abeja/gpt2-large-japanese")
pprint.pprint(tokenizer.special_tokens_map, sort_dicts=False)

# {'bos_token': '<s>',
#  'eos_token': '</s>',
#  'unk_token': '<unk>',
#  'sep_token': '[SEP]',
#  'pad_token': '[PAD]',
#  'cls_token': '[CLS]',
#  'mask_token': '[MASK]'}

## T5

In [None]:
# Load the tokenizer for "google/t5-v1_1-xl" and pretty-print its
# special-token map; sort_dicts=False keeps the map's own key order.
tokenizer = AutoTokenizer.from_pretrained("google/t5-v1_1-xl")
pprint.pprint(tokenizer.special_tokens_map, sort_dicts=False)

# {'eos_token': '</s>',
#  'unk_token': '<unk>',
#  'pad_token': '<pad>',
#  'additional_special_tokens': ['<extra_id_0>',
#                                '<extra_id_1>',
#                                '<extra_id_2>',
#                                '<extra_id_3>',
#                                '<extra_id_4>',
#                                '<extra_id_5>',
#                                '<extra_id_6>',
#                                '<extra_id_7>',
#                                ...
#                                '<extra_id_99>']}

In [None]:
# Load the tokenizer for "llm-book/t5-base-long-livedoor-news-corpus" and pretty-print
# its special-token map; sort_dicts=False keeps the map's own key order.
tokenizer = AutoTokenizer.from_pretrained("llm-book/t5-base-long-livedoor-news-corpus")
pprint.pprint(tokenizer.special_tokens_map, sort_dicts=False)

# {'eos_token': '</s>',
#  'unk_token': '<unk>',
#  'pad_token': '<pad>',
#  'additional_special_tokens': ['<extra_id_0>',
#                                '<extra_id_1>',
#                                '<extra_id_2>',
#                                '<extra_id_3>',
#                                '<extra_id_4>',
#                                '<extra_id_5>',
#                                '<extra_id_6>',
#                                '<extra_id_7>',
#                                ...
#                                '<extra_id_99>']}

## その他？

In [None]:
# Load the tokenizer for "Qwen/Qwen2-7B-Instruct" and pretty-print its
# special-token map; sort_dicts=False keeps the map's own key order.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")
pprint.pprint(tokenizer.special_tokens_map, sort_dicts=False)

# {'eos_token': '<|im_end|>',
#  'pad_token': '<|endoftext|>',
#  'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}

In [None]:
# Load the tokenizer for "nanonets/Nanonets-OCR-s" (author's note: Image Text to Text)
# and pretty-print its special-token map; sort_dicts=False keeps the map's own key order.
tokenizer = AutoTokenizer.from_pretrained("nanonets/Nanonets-OCR-s")
pprint.pprint(tokenizer.special_tokens_map, sort_dicts=False)

# {'eos_token': '<|im_end|>',
#  'pad_token': '<|endoftext|>',
#  'additional_special_tokens': ['<|im_start|>',
#                                '<|im_end|>',
#                                '<|object_ref_start|>',
#                                '<|object_ref_end|>',
#                                '<|box_start|>',
#                                '<|box_end|>',
#                                '<|quad_start|>',
#                                '<|quad_end|>',
#                                '<|vision_start|>',
#                                '<|vision_end|>',
#                                '<|vision_pad|>',
#                                '<|image_pad|>',
#                                '<|video_pad|>']}

In [None]:
# Load the tokenizer for "MiniMaxAI/MiniMax-M1-80k" and pretty-print its
# special-token map; sort_dicts=False keeps the map's own key order.
tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-M1-80k")
pprint.pprint(tokenizer.special_tokens_map, sort_dicts=False)

# {'bos_token': '<beginning_of_sentence>',
#  'eos_token': '<end_of_sentence>',
#  'unk_token': '<end_of_document>'}

# Model config

In [None]:
# Instantiate the "abeja/gpt2-large-japanese" causal-LM checkpoint; the trailing
# expression makes its configuration the cell's displayed output.
# NOTE(review): this loads the whole model only to read `.config`. If nothing later
# needs `model`, loading just the config would presumably be cheaper — confirm.
model = AutoModelForCausalLM.from_pretrained("abeja/gpt2-large-japanese")
model.config

# GPT2Config {
#   "activation_function": "gelu_new",
#   "architectures": [
#     "GPT2LMHeadModel"
#   ],
#   "attn_pdrop": 0.1,
#   "bos_token_id": 1,
#   "embd_pdrop": 0.1,
#   "eos_token_id": 2,
#   "initializer_range": 0.02,
#   "layer_norm_epsilon": 1e-05,
#   "model_type": "gpt2",
#   "n_ctx": 1024,
#   "n_embd": 1280,
#   "n_head": 20,
#   "n_inner": null,
#   "n_layer": 36,
#   "n_positions": 1024,
#   "reorder_and_upcast_attn": false,
#   "resid_pdrop": 0.1,
#   "scale_attn_by_inverse_layer_idx": false,
#   "scale_attn_weights": true,
#   "summary_activation": null,
#   "summary_first_dropout": 0.1,
#   "summary_proj_to_labels": true,
#   "summary_type": "cls_index",
#   "summary_use_proj": true,
#   "task_specific_params": {
#     "text-generation": {
#       "do_sample": true,
#       "max_length": 50
#     }
#   },
#   "tokenizer_class": "T5Tokenizer",
#   "torch_dtype": "float32",
#   "transformers_version": "4.52.4",
#   "use_cache": true,
#   "vocab_size": 32000
# }