<a href="https://colab.research.google.com/github/tomoyahiroe/transformers-playground/blob/main/pad_token_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers[ja,sentencepiece,torch]

In [None]:
import torch
from transformers import AutoTokenizer
import pprint

In [None]:
# Identifier of the pretrained checkpoint whose tokenizer is used in the cells below.
MODEL_NAME = "cl-tohoku/bert-base-japanese-v3"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Two Japanese sentences that tokenize to different lengths, so batching them
# together requires padding.
inputs = ["明日は晴れるだろう", "今日も一日良い天気だった"]

In [None]:
# Tokenize both sentences as a single batch: the shorter one is padded up to
# the longest sequence in the batch, and the result is returned as PyTorch tensors.
tokenized_inputs = tokenizer(
    inputs,
    padding="longest",
    return_tensors="pt",
)

In [None]:
# Show the padded ID matrix. In the recorded output below, the shorter first
# sentence is right-padded with ID 0 so that both rows have the same length;
# those positions appear as [PAD] once the IDs are converted back to tokens.
pprint.pp(tokenized_inputs["input_ids"])
# tensor([[    2, 18767,   465, 25466,  7056, 15063,     3,     0,     0,     0],
#         [    2, 15028,   484,   601,  2698, 14459, 18834, 12563,   449,     3]])

In [None]:
# Convert every row of token IDs back to its token strings.
# Iterating over the ID matrix itself (rather than range(len(inputs))) keeps
# this cell correct even if `inputs` was edited after tokenization, and
# .tolist() hands the tokenizer plain Python ints instead of tensor rows.
decoded_tokens = [
    tokenizer.convert_ids_to_tokens(row_ids)
    for row_ids in tokenized_inputs["input_ids"].tolist()
]

In [None]:
# Display the token strings of each sentence. The recorded output below shows
# [CLS]/[SEP] wrapping every sentence and [PAD] filling the tail of the
# shorter one.
pprint.pp(decoded_tokens)
# [['[CLS]', '明日', 'は', '晴れ', '##る', 'だろう', '[SEP]', '[PAD]', '[PAD]', '[PAD]'],
#  ['[CLS]', '今日', 'も', '一', '日', '良い', '天気', 'だっ', 'た', '[SEP]']]

## モデルの最大長に合わせてパディングする

In [None]:
# padding="max_length" without an explicit max_length pads every sequence up to
# the model's maximum length (512 in the recorded output below).
tokenized_inputs = tokenizer(inputs, padding="max_length", return_tensors="pt")

# Iterate over the ID matrix itself (rather than range(len(inputs))) so the
# decoding never depends on the current state of `inputs`; .tolist() hands the
# tokenizer plain Python ints instead of tensor rows.
decoded_tokens = [
    tokenizer.convert_ids_to_tokens(row_ids)
    for row_ids in tokenized_inputs["input_ids"].tolist()
]

# Padded length of the first sequence, followed by its tokens.
print(len(tokenized_inputs["input_ids"][0]))
pprint.pp(decoded_tokens[0])

# 512
# ['[CLS]',
#  '明日',
#  'は',
#  '晴れ',
#  '##る',
#  'だろう',
#  '[SEP]',
#  '[PAD]',
#  '[PAD]',
#   ...
#  '[PAD]',
#  '[PAD]']

## 任意の長さでパディングする

In [None]:
# padding="max_length" together with max_length=15 pads every sequence to
# exactly 15 tokens (see the recorded output below).
# NOTE(review): sequences longer than max_length are presumably left untruncated
# unless truncation=True is also passed — confirm against the tokenizer docs.
tokenized_inputs = tokenizer(
    inputs,
    padding="max_length",
    max_length=15,
    return_tensors="pt",
)

# Iterate over the ID matrix itself (rather than range(len(inputs))) so the
# decoding never depends on the current state of `inputs`; .tolist() hands the
# tokenizer plain Python ints instead of tensor rows.
decoded_tokens = [
    tokenizer.convert_ids_to_tokens(row_ids)
    for row_ids in tokenized_inputs["input_ids"].tolist()
]

# Padded length of the first sequence, followed by its tokens.
print(len(tokenized_inputs["input_ids"][0]))
pprint.pp(decoded_tokens[0])

# 15
# ['[CLS]',
#  '明日',
#  'は',
#  '晴れ',
#  '##る',
#  'だろう',
#  '[SEP]',
#  '[PAD]',
#  '[PAD]',
#  '[PAD]',
#  '[PAD]',
#  '[PAD]',
#  '[PAD]',
#  '[PAD]',
#  '[PAD]']