### Introduce the pre-trained model

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from jtk import RobertaJumanTokenizer

In [2]:
checkpoint = "nlp-waseda/roberta-base-japanese-with-auto-jumanpp"
# checkpoint = "cl-tohoku/bert-base-japanese-char-v2"
tokenizer = RobertaJumanTokenizer.from_pretrained("nlp-waseda/roberta-base-japanese")
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'AlbertTokenizer'. 
The class this function is called from is 'RobertaJumanTokenizer'.


Exception: Can't find JUMAN command: jumanpp

In [8]:
tokenizer

PreTrainedTokenizerFast(name_or_path='nlp-waseda/roberta-base-japanese', vocab_size=32000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True)})

### Loading data

In [9]:
from datasets import load_dataset
dataset = load_dataset(path="dataset/processed/")

Using custom data configuration processed-56ae36322fe58fe1
Found cached dataset text (/home/tian/.cache/huggingface/datasets/text/processed-56ae36322fe58fe1/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad)


  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 86555
    })
})

In [11]:
sample = dataset['train'].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> Text: {row['text']}'")

Loading cached shuffled indices for dataset at /home/tian/.cache/huggingface/datasets/text/processed-56ae36322fe58fe1/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad/cache-64978b3cbd241a8a.arrow



'>>> Text: 許そうと思う夜道の沈丁花'

'>>> Text: 夏空で身長測定雲たちよ'

'>>> Text: しょうがつのかいものたてやまよくみえる'


In [12]:
def tokenize_function(examples):
    result = tokenizer(examples["text"], padding=True, return_tensors='pt')
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

In [13]:
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

Loading cached processed dataset at /home/tian/.cache/huggingface/datasets/text/processed-56ae36322fe58fe1/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad/cache-55d048f4f3b8c8d6.arrow


In [14]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 86555
    })
})

In [15]:
tokenized_datasets['train']['input_ids'][0], tokenized_datasets['train']['attention_mask'][0], tokenized_datasets['train']['word_ids'][0]

([2,
  10549,
  4778,
  8295,
  23367,
  23367,
  20762,
  1954,
  1652,
  1378,
  961,
  3008,
  769,
  368,
  1285,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 [None,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  None,
  None,
  None,
  None,
  None,
  None,
  None,
  None])

In [16]:
tokenizer.convert_tokens_to_ids(['UNK'])

[25624]

In [17]:
n = 0
for line in tokenized_datasets['train']['input_ids']:
    for w in line:
        if w == 25624:
            n += 1

print(f"#[UNK] = {n}")

#[UNK] = 0


In [18]:
tokenizer.decode(tokenized_datasets["train"][0]["input_ids"])

'[CLS] じゅうさんさいぱぱをおやじとよんでなつ[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

### Training model

In [19]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

In [45]:
samples = [tokenized_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] じゅうさん[MASK]ぱぱ[MASK]おやじとよんでなつ[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

'>>> [CLS][MASK]るこにーでぶんこいちさつぶんのひや[MASK][SEP][PAD][PAD][PAD][PAD][PAD][PAD]'


In [50]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [51]:
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

loading configuration file config.json from cache at /home/tian/.cache/huggingface/hub/models--nlp-waseda--roberta-base-japanese/snapshots/49ce73ecd6eb4dab3c1c06edaaab35892b3cea80/config.json
Model config RobertaConfig {
  "_name_or_path": "nlp-waseda/roberta-base-japanese",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 2,
  "classifier_dropout": null,
  "eos_token_id": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file pytorch_model.bin from cache at /home/tian/.cache/huggingface/hub/m

In [52]:
training_args.num_train_epochs = 8.0
training_args.save_steps = 10000
training_args.weight_decay = 0.01
num_warmup_steps = 1_000,
init_lr = 2e-5,
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_nam

In [53]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [54]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 86555
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 86560
  Number of trainable parameters = 110652416


  0%|          | 0/86560 [00:00<?, ?it/s]

{'loss': 4.6163, 'learning_rate': 4.971118299445471e-05, 'epoch': 0.05}
{'loss': 4.2999, 'learning_rate': 4.942236598890943e-05, 'epoch': 0.09}
{'loss': 4.1822, 'learning_rate': 4.9133548983364145e-05, 'epoch': 0.14}
{'loss': 4.1085, 'learning_rate': 4.884473197781886e-05, 'epoch': 0.18}
{'loss': 4.0244, 'learning_rate': 4.855591497227357e-05, 'epoch': 0.23}
{'loss': 3.9483, 'learning_rate': 4.826709796672828e-05, 'epoch': 0.28}
{'loss': 3.9225, 'learning_rate': 4.7978280961182996e-05, 'epoch': 0.32}
{'loss': 3.8385, 'learning_rate': 4.7689463955637706e-05, 'epoch': 0.37}
{'loss': 3.816, 'learning_rate': 4.740064695009242e-05, 'epoch': 0.42}
{'loss': 3.7875, 'learning_rate': 4.711182994454714e-05, 'epoch': 0.46}
{'loss': 3.7351, 'learning_rate': 4.682301293900185e-05, 'epoch': 0.51}
{'loss': 3.6842, 'learning_rate': 4.6534195933456565e-05, 'epoch': 0.55}
{'loss': 3.676, 'learning_rate': 4.6245378927911274e-05, 'epoch': 0.6}
{'loss': 3.577, 'learning_rate': 4.595656192236599e-05, 'epoch

Saving model checkpoint to test-trainer/checkpoint-10000
Configuration saved in test-trainer/checkpoint-10000/config.json


{'loss': 3.5315, 'learning_rate': 4.422365988909427e-05, 'epoch': 0.92}


Model weights saved in test-trainer/checkpoint-10000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-10000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-10000/special_tokens_map.json


{'loss': 3.4656, 'learning_rate': 4.3934842883548985e-05, 'epoch': 0.97}
{'loss': 3.4053, 'learning_rate': 4.36460258780037e-05, 'epoch': 1.02}
{'loss': 3.3991, 'learning_rate': 4.335720887245841e-05, 'epoch': 1.06}
{'loss': 3.3815, 'learning_rate': 4.306839186691313e-05, 'epoch': 1.11}
{'loss': 3.3449, 'learning_rate': 4.2779574861367836e-05, 'epoch': 1.16}
{'loss': 3.3742, 'learning_rate': 4.249075785582255e-05, 'epoch': 1.2}
{'loss': 3.319, 'learning_rate': 4.220194085027727e-05, 'epoch': 1.25}
{'loss': 3.2667, 'learning_rate': 4.191312384473198e-05, 'epoch': 1.29}
{'loss': 3.3347, 'learning_rate': 4.1624306839186695e-05, 'epoch': 1.34}
{'loss': 3.2407, 'learning_rate': 4.1335489833641405e-05, 'epoch': 1.39}
{'loss': 3.3009, 'learning_rate': 4.104667282809612e-05, 'epoch': 1.43}
{'loss': 3.2549, 'learning_rate': 4.075785582255083e-05, 'epoch': 1.48}
{'loss': 3.1463, 'learning_rate': 4.046903881700555e-05, 'epoch': 1.52}
{'loss': 3.2513, 'learning_rate': 4.018022181146026e-05, 'epoch

Saving model checkpoint to test-trainer/checkpoint-20000
Configuration saved in test-trainer/checkpoint-20000/config.json


{'loss': 3.0912, 'learning_rate': 3.844731977818854e-05, 'epoch': 1.85}


Model weights saved in test-trainer/checkpoint-20000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-20000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-20000/special_tokens_map.json


{'loss': 3.1289, 'learning_rate': 3.815850277264326e-05, 'epoch': 1.89}
{'loss': 3.0968, 'learning_rate': 3.786968576709797e-05, 'epoch': 1.94}
{'loss': 3.1429, 'learning_rate': 3.758086876155268e-05, 'epoch': 1.99}
{'loss': 3.0806, 'learning_rate': 3.729205175600739e-05, 'epoch': 2.03}
{'loss': 3.0341, 'learning_rate': 3.700323475046211e-05, 'epoch': 2.08}
{'loss': 3.0414, 'learning_rate': 3.6714417744916825e-05, 'epoch': 2.13}
{'loss': 3.0157, 'learning_rate': 3.6425600739371535e-05, 'epoch': 2.17}
{'loss': 2.9226, 'learning_rate': 3.613678373382625e-05, 'epoch': 2.22}
{'loss': 3.0411, 'learning_rate': 3.584796672828096e-05, 'epoch': 2.26}
{'loss': 2.9884, 'learning_rate': 3.555914972273568e-05, 'epoch': 2.31}
{'loss': 3.0055, 'learning_rate': 3.5270332717190394e-05, 'epoch': 2.36}
{'loss': 2.9623, 'learning_rate': 3.49815157116451e-05, 'epoch': 2.4}
{'loss': 2.9341, 'learning_rate': 3.469269870609982e-05, 'epoch': 2.45}
{'loss': 2.9418, 'learning_rate': 3.440388170055453e-05, 'epoch

Saving model checkpoint to test-trainer/checkpoint-30000
Configuration saved in test-trainer/checkpoint-30000/config.json


{'loss': 2.9498, 'learning_rate': 3.2670979667282814e-05, 'epoch': 2.77}


Model weights saved in test-trainer/checkpoint-30000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-30000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-30000/special_tokens_map.json


{'loss': 2.889, 'learning_rate': 3.238216266173752e-05, 'epoch': 2.82}
{'loss': 2.8763, 'learning_rate': 3.209334565619224e-05, 'epoch': 2.87}
{'loss': 2.8842, 'learning_rate': 3.1804528650646956e-05, 'epoch': 2.91}
{'loss': 2.9415, 'learning_rate': 3.1515711645101665e-05, 'epoch': 2.96}
{'loss': 2.8505, 'learning_rate': 3.122689463955638e-05, 'epoch': 3.0}
{'loss': 2.8034, 'learning_rate': 3.093807763401109e-05, 'epoch': 3.05}
{'loss': 2.8332, 'learning_rate': 3.06492606284658e-05, 'epoch': 3.1}
{'loss': 2.7894, 'learning_rate': 3.036044362292052e-05, 'epoch': 3.14}
{'loss': 2.7725, 'learning_rate': 3.007162661737523e-05, 'epoch': 3.19}
{'loss': 2.7797, 'learning_rate': 2.9782809611829947e-05, 'epoch': 3.23}
{'loss': 2.8012, 'learning_rate': 2.949399260628466e-05, 'epoch': 3.28}
{'loss': 2.8015, 'learning_rate': 2.920517560073937e-05, 'epoch': 3.33}
{'loss': 2.7493, 'learning_rate': 2.891635859519409e-05, 'epoch': 3.37}
{'loss': 2.7599, 'learning_rate': 2.86275415896488e-05, 'epoch': 

Saving model checkpoint to test-trainer/checkpoint-40000
Configuration saved in test-trainer/checkpoint-40000/config.json


{'loss': 2.7829, 'learning_rate': 2.6894639556377083e-05, 'epoch': 3.7}


Model weights saved in test-trainer/checkpoint-40000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-40000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-40000/special_tokens_map.json


{'loss': 2.705, 'learning_rate': 2.6605822550831792e-05, 'epoch': 3.74}
{'loss': 2.7187, 'learning_rate': 2.6317005545286505e-05, 'epoch': 3.79}
{'loss': 2.7312, 'learning_rate': 2.6028188539741222e-05, 'epoch': 3.84}
{'loss': 2.6907, 'learning_rate': 2.573937153419593e-05, 'epoch': 3.88}
{'loss': 2.7684, 'learning_rate': 2.545055452865065e-05, 'epoch': 3.93}
{'loss': 2.7059, 'learning_rate': 2.516173752310536e-05, 'epoch': 3.97}
{'loss': 2.7523, 'learning_rate': 2.4872920517560074e-05, 'epoch': 4.02}
{'loss': 2.6353, 'learning_rate': 2.458410351201479e-05, 'epoch': 4.07}
{'loss': 2.5971, 'learning_rate': 2.4295286506469503e-05, 'epoch': 4.11}
{'loss': 2.6489, 'learning_rate': 2.4006469500924216e-05, 'epoch': 4.16}
{'loss': 2.6093, 'learning_rate': 2.371765249537893e-05, 'epoch': 4.21}
{'loss': 2.5788, 'learning_rate': 2.3428835489833642e-05, 'epoch': 4.25}
{'loss': 2.6341, 'learning_rate': 2.3140018484288355e-05, 'epoch': 4.3}
{'loss': 2.6222, 'learning_rate': 2.285120147874307e-05, '

Saving model checkpoint to test-trainer/checkpoint-50000
Configuration saved in test-trainer/checkpoint-50000/config.json


{'loss': 2.5807, 'learning_rate': 2.1118299445471352e-05, 'epoch': 4.62}


Model weights saved in test-trainer/checkpoint-50000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-50000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-50000/special_tokens_map.json


{'loss': 2.6048, 'learning_rate': 2.0829482439926065e-05, 'epoch': 4.67}
{'loss': 2.5887, 'learning_rate': 2.0540665434380778e-05, 'epoch': 4.71}
{'loss': 2.5156, 'learning_rate': 2.025184842883549e-05, 'epoch': 4.76}
{'loss': 2.5505, 'learning_rate': 1.9963031423290204e-05, 'epoch': 4.81}
{'loss': 2.5703, 'learning_rate': 1.9674214417744917e-05, 'epoch': 4.85}
{'loss': 2.483, 'learning_rate': 1.9385397412199633e-05, 'epoch': 4.9}
{'loss': 2.5397, 'learning_rate': 1.9096580406654343e-05, 'epoch': 4.94}
{'loss': 2.5797, 'learning_rate': 1.880776340110906e-05, 'epoch': 4.99}
{'loss': 2.4474, 'learning_rate': 1.8518946395563772e-05, 'epoch': 5.04}
{'loss': 2.4871, 'learning_rate': 1.8230129390018485e-05, 'epoch': 5.08}
{'loss': 2.4077, 'learning_rate': 1.7941312384473198e-05, 'epoch': 5.13}
{'loss': 2.4807, 'learning_rate': 1.7652495378927914e-05, 'epoch': 5.18}
{'loss': 2.4253, 'learning_rate': 1.7363678373382624e-05, 'epoch': 5.22}
{'loss': 2.5013, 'learning_rate': 1.707486136783734e-05

Saving model checkpoint to test-trainer/checkpoint-60000
Configuration saved in test-trainer/checkpoint-60000/config.json


{'loss': 2.4002, 'learning_rate': 1.534195933456562e-05, 'epoch': 5.55}


Model weights saved in test-trainer/checkpoint-60000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-60000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-60000/special_tokens_map.json


{'loss': 2.4853, 'learning_rate': 1.5053142329020334e-05, 'epoch': 5.59}
{'loss': 2.4264, 'learning_rate': 1.4764325323475047e-05, 'epoch': 5.64}
{'loss': 2.4573, 'learning_rate': 1.4475508317929759e-05, 'epoch': 5.68}
{'loss': 2.4068, 'learning_rate': 1.4186691312384473e-05, 'epoch': 5.73}
{'loss': 2.3893, 'learning_rate': 1.3897874306839186e-05, 'epoch': 5.78}
{'loss': 2.3939, 'learning_rate': 1.36090573012939e-05, 'epoch': 5.82}
{'loss': 2.405, 'learning_rate': 1.3320240295748615e-05, 'epoch': 5.87}
{'loss': 2.4079, 'learning_rate': 1.3031423290203328e-05, 'epoch': 5.91}
{'loss': 2.4392, 'learning_rate': 1.274260628465804e-05, 'epoch': 5.96}
{'loss': 2.3951, 'learning_rate': 1.2453789279112754e-05, 'epoch': 6.01}
{'loss': 2.3416, 'learning_rate': 1.2164972273567469e-05, 'epoch': 6.05}
{'loss': 2.3597, 'learning_rate': 1.1876155268022182e-05, 'epoch': 6.1}
{'loss': 2.353, 'learning_rate': 1.1587338262476895e-05, 'epoch': 6.15}
{'loss': 2.3155, 'learning_rate': 1.129852125693161e-05, 

Saving model checkpoint to test-trainer/checkpoint-70000
Configuration saved in test-trainer/checkpoint-70000/config.json


{'loss': 2.2346, 'learning_rate': 9.56561922365989e-06, 'epoch': 6.47}


Model weights saved in test-trainer/checkpoint-70000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-70000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-70000/special_tokens_map.json


{'loss': 2.3493, 'learning_rate': 9.276802218114602e-06, 'epoch': 6.52}
{'loss': 2.2863, 'learning_rate': 8.987985212569317e-06, 'epoch': 6.56}
{'loss': 2.3292, 'learning_rate': 8.699168207024031e-06, 'epoch': 6.61}
{'loss': 2.2874, 'learning_rate': 8.410351201478742e-06, 'epoch': 6.65}
{'loss': 2.311, 'learning_rate': 8.121534195933457e-06, 'epoch': 6.7}
{'loss': 2.3614, 'learning_rate': 7.832717190388172e-06, 'epoch': 6.75}
{'loss': 2.3663, 'learning_rate': 7.543900184842884e-06, 'epoch': 6.79}
{'loss': 2.2771, 'learning_rate': 7.255083179297598e-06, 'epoch': 6.84}
{'loss': 2.2974, 'learning_rate': 6.96626617375231e-06, 'epoch': 6.89}
{'loss': 2.3446, 'learning_rate': 6.6774491682070244e-06, 'epoch': 6.93}
{'loss': 2.2799, 'learning_rate': 6.388632162661738e-06, 'epoch': 6.98}
{'loss': 2.3054, 'learning_rate': 6.099815157116451e-06, 'epoch': 7.02}
{'loss': 2.2588, 'learning_rate': 5.810998151571165e-06, 'epoch': 7.07}
{'loss': 2.204, 'learning_rate': 5.522181146025878e-06, 'epoch': 7

Saving model checkpoint to test-trainer/checkpoint-80000
Configuration saved in test-trainer/checkpoint-80000/config.json


{'loss': 2.2122, 'learning_rate': 3.789279112754159e-06, 'epoch': 7.39}


Model weights saved in test-trainer/checkpoint-80000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-80000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-80000/special_tokens_map.json


{'loss': 2.2133, 'learning_rate': 3.5004621072088725e-06, 'epoch': 7.44}
{'loss': 2.1673, 'learning_rate': 3.211645101663586e-06, 'epoch': 7.49}
{'loss': 2.2702, 'learning_rate': 2.9228280961182996e-06, 'epoch': 7.53}
{'loss': 2.2308, 'learning_rate': 2.634011090573013e-06, 'epoch': 7.58}
{'loss': 2.1932, 'learning_rate': 2.3451940850277264e-06, 'epoch': 7.62}
{'loss': 2.2399, 'learning_rate': 2.05637707948244e-06, 'epoch': 7.67}
{'loss': 2.2039, 'learning_rate': 1.7675600739371536e-06, 'epoch': 7.72}
{'loss': 2.2261, 'learning_rate': 1.478743068391867e-06, 'epoch': 7.76}
{'loss': 2.2424, 'learning_rate': 1.1899260628465803e-06, 'epoch': 7.81}
{'loss': 2.2375, 'learning_rate': 9.011090573012939e-07, 'epoch': 7.86}
{'loss': 2.1981, 'learning_rate': 6.122920517560074e-07, 'epoch': 7.9}
{'loss': 2.226, 'learning_rate': 3.234750462107209e-07, 'epoch': 7.95}
{'loss': 2.211, 'learning_rate': 3.465804066543438e-08, 'epoch': 7.99}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 5274.7398, 'train_samples_per_second': 131.275, 'train_steps_per_second': 16.41, 'train_loss': 2.789232480019166, 'epoch': 8.0}


TrainOutput(global_step=86560, training_loss=2.789232480019166, metrics={'train_runtime': 5274.7398, 'train_samples_per_second': 131.275, 'train_steps_per_second': 16.41, 'train_loss': 2.789232480019166, 'epoch': 8.0})

In [55]:
trainer.save_model()

Saving model checkpoint to test-trainer
Configuration saved in test-trainer/config.json
Model weights saved in test-trainer/pytorch_model.bin
tokenizer config file saved in test-trainer/tokenizer_config.json
Special tokens file saved in test-trainer/special_tokens_map.json
