In [1]:
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, DefaultDataCollator, TrainingArguments, Trainer, AutoModelForQuestionAnswering
from datasets import Dataset
import json
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use model 
model = RobertaForQuestionAnswering.from_pretrained("vinai/phobert-base")
tokenizer = RobertaTokenizerFast.from_pretrained("vinai/phobert-base")


def tokenize_function(examples):
    return tokenizer(examples["context"], padding="max_length", truncation=True)

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a

In [3]:
# Function read file train
def read_data(filePath):
    f = open(filePath,encoding='utf-8')
    fileRead = json.load(f)
    context = []
    question = []
    answer = []
    for object in fileRead:
        for qa in object["qas"]:
            context.append(object["context"].lower())
            question.append(qa["question"].lower())
            answer.append({
                "text": [qa["answer"]["text"][0].lower()],
                "start": [qa["answer"]["start"][0]]
            })
    dataset = {
    "context": context,
    "question": question,
    "answer": answer
    }
    return dataset

In [83]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=128,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answer"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["start"][0]
        end_char = answer["start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [105]:
dataset = read_data("./QA_data/qa_train.json")
eval_dataset = read_data("./QA_data/qa_eval.json")

In [106]:
dataset_train = Dataset.from_dict(dataset)
dataset_eval = Dataset.from_dict(eval_dataset)

In [107]:
dataset_train

Dataset({
    features: ['context', 'question', 'answer'],
    num_rows: 1119
})

In [108]:
tokenized_squad = dataset_train.map(preprocess_function, batched=True, remove_columns=dataset_train.column_names)
tokenized_squad_eval = dataset_eval.map(preprocess_function, batched=True, remove_columns=dataset_eval.column_names)



In [110]:
tokenized_squad

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 1119
})

In [111]:
data_collator = DefaultDataCollator()

In [115]:
training_args = TrainingArguments(
    output_dir="phobert_law",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=100,
    push_to_hub=False,
)

In [116]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad,
    eval_dataset=tokenized_squad_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    )

In [117]:
trainer.train()

 11%|█         | 666/6300 [2:53:38<24:28:58, 15.64s/it]
  1%|          | 70/7000 [16:07<27:14:09, 14.15s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                    
[A                                                 
  1%|          | 70/7000 [16:57<27:14:09, 14.15s/it]
[A

{'eval_loss': 1.005435824394226, 'eval_runtime': 50.3835, 'eval_samples_per_second': 3.672, 'eval_steps_per_second': 0.238, 'epoch': 1.0}


  2%|▏         | 140/7000 [32:53<23:28:57, 12.32s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                     
[A                                                 
  2%|▏         | 140/7000 [33:37<23:28:57, 12.32s/it]
[A

{'eval_loss': 0.7314278483390808, 'eval_runtime': 43.7416, 'eval_samples_per_second': 4.229, 'eval_steps_per_second': 0.274, 'epoch': 2.0}


  3%|▎         | 210/7000 [48:59<25:36:55, 13.58s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                     
[A                                                 
  3%|▎         | 210/7000 [49:47<25:36:55, 13.58s/it]
[A

{'eval_loss': 0.5932534337043762, 'eval_runtime': 48.0168, 'eval_samples_per_second': 3.853, 'eval_steps_per_second': 0.25, 'epoch': 3.0}


  4%|▍         | 280/7000 [1:05:05<23:56:54, 12.83s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                       
[A                                                   
  4%|▍         | 280/7000 [1:05:53<23:56:54, 12.83s/it]
[A

{'eval_loss': 0.5414730310440063, 'eval_runtime': 47.3099, 'eval_samples_per_second': 3.91, 'eval_steps_per_second': 0.254, 'epoch': 4.0}


  5%|▌         | 350/7000 [1:20:38<22:50:48, 12.37s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                       
[A                                                   
  5%|▌         | 350/7000 [1:21:22<22:50:48, 12.37s/it]
[A

{'eval_loss': 0.5151845812797546, 'eval_runtime': 43.7461, 'eval_samples_per_second': 4.229, 'eval_steps_per_second': 0.274, 'epoch': 5.0}


  6%|▌         | 420/7000 [1:36:15<23:42:41, 12.97s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                       
[A                                                   
  6%|▌         | 420/7000 [1:37:01<23:42:41, 12.97s/it]
[A

{'eval_loss': 0.4983881711959839, 'eval_runtime': 46.051, 'eval_samples_per_second': 4.017, 'eval_steps_per_second': 0.261, 'epoch': 6.0}


  7%|▋         | 490/7000 [1:52:15<23:31:14, 13.01s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                       
[A                                                   
  7%|▋         | 490/7000 [1:53:00<23:31:14, 13.01s/it]
[A

{'eval_loss': 0.4552173912525177, 'eval_runtime': 45.4328, 'eval_samples_per_second': 4.072, 'eval_steps_per_second': 0.264, 'epoch': 7.0}


                                                       
  7%|▋         | 500/7000 [1:55:04<22:34:38, 12.50s/it]

{'loss': 0.1742, 'learning_rate': 1.8571428571428575e-05, 'epoch': 7.14}


  8%|▊         | 560/7000 [2:07:53<22:41:03, 12.68s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                       
[A                                                   
  8%|▊         | 560/7000 [2:08:40<22:41:03, 12.68s/it]
[A

{'eval_loss': 0.4397463798522949, 'eval_runtime': 46.7412, 'eval_samples_per_second': 3.958, 'eval_steps_per_second': 0.257, 'epoch': 8.0}


  9%|▉         | 630/7000 [2:23:59<23:18:59, 13.18s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                       
[A                                                   
  9%|▉         | 630/7000 [2:24:47<23:18:59, 13.18s/it]
[A

{'eval_loss': 0.5358627438545227, 'eval_runtime': 47.8481, 'eval_samples_per_second': 3.866, 'eval_steps_per_second': 0.251, 'epoch': 9.0}


 10%|█         | 700/7000 [2:39:54<22:29:01, 12.85s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                       
[A                                                   
 10%|█         | 700/7000 [2:40:40<22:29:01, 12.85s/it]
[A

{'eval_loss': 0.47583237290382385, 'eval_runtime': 45.9886, 'eval_samples_per_second': 4.023, 'eval_steps_per_second': 0.261, 'epoch': 10.0}


 11%|█         | 770/7000 [2:55:45<20:41:12, 11.95s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                       
[A                                                   
 11%|█         | 770/7000 [2:56:31<20:41:12, 11.95s/it]
[A

{'eval_loss': 0.40715309977531433, 'eval_runtime': 46.2285, 'eval_samples_per_second': 4.002, 'eval_steps_per_second': 0.26, 'epoch': 11.0}


 12%|█▏        | 840/7000 [3:11:44<21:09:11, 12.36s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                       
[A                                                   
 12%|█▏        | 840/7000 [3:12:28<21:09:11, 12.36s/it]
[A

{'eval_loss': 0.4625696837902069, 'eval_runtime': 43.9682, 'eval_samples_per_second': 4.208, 'eval_steps_per_second': 0.273, 'epoch': 12.0}


 13%|█▎        | 910/7000 [3:27:23<22:00:38, 13.01s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                       
[A                                                   
 13%|█▎        | 910/7000 [3:28:09<22:00:38, 13.01s/it]
[A

{'eval_loss': 0.45435255765914917, 'eval_runtime': 46.2387, 'eval_samples_per_second': 4.001, 'eval_steps_per_second': 0.26, 'epoch': 13.0}


 14%|█▍        | 980/7000 [3:42:55<19:56:34, 11.93s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                       
[A                                                   
 14%|█▍        | 980/7000 [3:43:39<19:56:34, 11.93s/it]
[A

{'eval_loss': 0.5825703144073486, 'eval_runtime': 44.3224, 'eval_samples_per_second': 4.174, 'eval_steps_per_second': 0.271, 'epoch': 14.0}


                                                        
 14%|█▍        | 1000/7000 [3:47:46<21:27:27, 12.87s/it]

{'loss': 0.1056, 'learning_rate': 1.7142857142857142e-05, 'epoch': 14.29}


 15%|█▌        | 1050/7000 [3:59:35<22:59:11, 13.91s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 15%|█▌        | 1050/7000 [4:00:25<22:59:11, 13.91s/it]
[A

{'eval_loss': 0.4166968762874603, 'eval_runtime': 49.4482, 'eval_samples_per_second': 3.741, 'eval_steps_per_second': 0.243, 'epoch': 15.0}


 16%|█▌        | 1120/7000 [4:16:17<20:56:55, 12.83s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 16%|█▌        | 1120/7000 [4:17:03<20:56:55, 12.83s/it]
[A

{'eval_loss': 0.43925565481185913, 'eval_runtime': 46.6095, 'eval_samples_per_second': 3.969, 'eval_steps_per_second': 0.257, 'epoch': 16.0}


 17%|█▋        | 1190/7000 [4:32:05<19:50:25, 12.29s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 17%|█▋        | 1190/7000 [4:32:50<19:50:25, 12.29s/it]
[A

{'eval_loss': 0.43483462929725647, 'eval_runtime': 44.4392, 'eval_samples_per_second': 4.163, 'eval_steps_per_second': 0.27, 'epoch': 17.0}


 18%|█▊        | 1260/7000 [4:48:28<23:39:06, 14.83s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 18%|█▊        | 1260/7000 [4:49:21<23:39:06, 14.83s/it]
[A

{'eval_loss': 0.42404982447624207, 'eval_runtime': 52.6705, 'eval_samples_per_second': 3.512, 'eval_steps_per_second': 0.228, 'epoch': 18.0}


 19%|█▉        | 1330/7000 [5:05:42<20:42:07, 13.14s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 19%|█▉        | 1330/7000 [5:06:28<20:42:07, 13.14s/it]
[A

{'eval_loss': 0.484701007604599, 'eval_runtime': 45.9296, 'eval_samples_per_second': 4.028, 'eval_steps_per_second': 0.261, 'epoch': 19.0}


 20%|██        | 1400/7000 [5:21:47<20:06:36, 12.93s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 20%|██        | 1400/7000 [5:22:33<20:06:36, 12.93s/it]
[A

{'eval_loss': 0.499732106924057, 'eval_runtime': 45.8886, 'eval_samples_per_second': 4.032, 'eval_steps_per_second': 0.262, 'epoch': 20.0}


 21%|██        | 1470/7000 [5:37:50<19:48:01, 12.89s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 21%|██        | 1470/7000 [5:38:35<19:48:01, 12.89s/it]
[A

{'eval_loss': 0.46898049116134644, 'eval_runtime': 45.6575, 'eval_samples_per_second': 4.052, 'eval_steps_per_second': 0.263, 'epoch': 21.0}


                                                        
 21%|██▏       | 1500/7000 [5:45:08<20:00:31, 13.10s/it]

{'loss': 0.0848, 'learning_rate': 1.5714285714285715e-05, 'epoch': 21.43}


 22%|██▏       | 1540/7000 [5:53:52<19:32:53, 12.89s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 22%|██▏       | 1540/7000 [5:54:38<19:32:53, 12.89s/it]
[A

{'eval_loss': 0.4929385185241699, 'eval_runtime': 45.4305, 'eval_samples_per_second': 4.072, 'eval_steps_per_second': 0.264, 'epoch': 22.0}


 23%|██▎       | 1610/7000 [6:09:49<19:12:24, 12.83s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 23%|██▎       | 1610/7000 [6:10:34<19:12:24, 12.83s/it]
[A

{'eval_loss': 0.4431319832801819, 'eval_runtime': 44.8414, 'eval_samples_per_second': 4.126, 'eval_steps_per_second': 0.268, 'epoch': 23.0}


 24%|██▍       | 1680/7000 [6:25:46<18:55:24, 12.81s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 24%|██▍       | 1680/7000 [6:26:31<18:55:24, 12.81s/it]
[A

{'eval_loss': 0.4644157588481903, 'eval_runtime': 45.0024, 'eval_samples_per_second': 4.111, 'eval_steps_per_second': 0.267, 'epoch': 24.0}


 25%|██▌       | 1750/7000 [6:41:39<18:41:53, 12.82s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 25%|██▌       | 1750/7000 [6:42:24<18:41:53, 12.82s/it]
[A

{'eval_loss': 0.4775567650794983, 'eval_runtime': 44.8034, 'eval_samples_per_second': 4.129, 'eval_steps_per_second': 0.268, 'epoch': 25.0}


 26%|██▌       | 1820/7000 [6:57:34<18:25:06, 12.80s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 26%|██▌       | 1820/7000 [6:58:19<18:25:06, 12.80s/it]
[A

{'eval_loss': 0.4685598909854889, 'eval_runtime': 44.7973, 'eval_samples_per_second': 4.13, 'eval_steps_per_second': 0.268, 'epoch': 26.0}


 27%|██▋       | 1890/7000 [7:13:29<18:05:07, 12.74s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 27%|██▋       | 1890/7000 [7:14:14<18:05:07, 12.74s/it]
[A

{'eval_loss': 0.46925023198127747, 'eval_runtime': 45.3255, 'eval_samples_per_second': 4.082, 'eval_steps_per_second': 0.265, 'epoch': 27.0}


 28%|██▊       | 1960/7000 [7:29:26<17:51:48, 12.76s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 28%|██▊       | 1960/7000 [7:30:11<17:51:48, 12.76s/it]
[A

{'eval_loss': 0.5100871920585632, 'eval_runtime': 45.1864, 'eval_samples_per_second': 4.094, 'eval_steps_per_second': 0.266, 'epoch': 28.0}


                                                        
 29%|██▊       | 2000/7000 [7:38:53<18:20:56, 13.21s/it]

{'loss': 0.0673, 'learning_rate': 1.4285714285714287e-05, 'epoch': 28.57}


 29%|██▉       | 2030/7000 [7:45:26<17:38:47, 12.78s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 29%|██▉       | 2030/7000 [7:46:11<17:38:47, 12.78s/it]
[A

{'eval_loss': 0.488718181848526, 'eval_runtime': 45.1684, 'eval_samples_per_second': 4.096, 'eval_steps_per_second': 0.266, 'epoch': 29.0}


 30%|███       | 2100/7000 [8:01:26<17:31:46, 12.88s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 30%|███       | 2100/7000 [8:02:10<17:31:46, 12.88s/it]
[A

{'eval_loss': 0.48461848497390747, 'eval_runtime': 44.8454, 'eval_samples_per_second': 4.125, 'eval_steps_per_second': 0.268, 'epoch': 30.0}


 31%|███       | 2170/7000 [8:17:24<17:09:58, 12.79s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 31%|███       | 2170/7000 [8:18:09<17:09:58, 12.79s/it]
[A

{'eval_loss': 0.4587068259716034, 'eval_runtime': 45.0214, 'eval_samples_per_second': 4.109, 'eval_steps_per_second': 0.267, 'epoch': 31.0}


 32%|███▏      | 2240/7000 [8:33:27<16:58:01, 12.83s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 32%|███▏      | 2240/7000 [8:34:12<16:58:01, 12.83s/it]
[A

{'eval_loss': 0.4954650402069092, 'eval_runtime': 45.0654, 'eval_samples_per_second': 4.105, 'eval_steps_per_second': 0.266, 'epoch': 32.0}


 33%|███▎      | 2310/7000 [8:49:29<16:44:24, 12.85s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 33%|███▎      | 2310/7000 [8:50:14<16:44:24, 12.85s/it]
[A

{'eval_loss': 0.49469447135925293, 'eval_runtime': 45.2975, 'eval_samples_per_second': 4.084, 'eval_steps_per_second': 0.265, 'epoch': 33.0}


 34%|███▍      | 2380/7000 [9:05:28<16:30:47, 12.87s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 34%|███▍      | 2380/7000 [9:06:13<16:30:47, 12.87s/it]
[A

{'eval_loss': 0.5160640478134155, 'eval_runtime': 44.8734, 'eval_samples_per_second': 4.123, 'eval_steps_per_second': 0.267, 'epoch': 34.0}


 35%|███▌      | 2450/7000 [9:21:27<16:15:01, 12.86s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 35%|███▌      | 2450/7000 [9:22:12<16:15:01, 12.86s/it]
[A

{'eval_loss': 0.5154871344566345, 'eval_runtime': 45.2194, 'eval_samples_per_second': 4.091, 'eval_steps_per_second': 0.265, 'epoch': 35.0}


                                                        
 36%|███▌      | 2500/7000 [9:33:04<16:23:15, 13.11s/it]

{'loss': 0.0609, 'learning_rate': 1.2857142857142859e-05, 'epoch': 35.71}


 36%|███▌      | 2520/7000 [9:37:26<16:00:26, 12.86s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 36%|███▌      | 2520/7000 [9:38:11<16:00:26, 12.86s/it]
[A

{'eval_loss': 0.4828377664089203, 'eval_runtime': 44.9364, 'eval_samples_per_second': 4.117, 'eval_steps_per_second': 0.267, 'epoch': 36.0}


 37%|███▋      | 2590/7000 [9:53:26<15:49:23, 12.92s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                   
 37%|███▋      | 2590/7000 [9:54:11<15:49:23, 12.92s/it]
[A

{'eval_loss': 0.47072139382362366, 'eval_runtime': 45.2615, 'eval_samples_per_second': 4.087, 'eval_steps_per_second': 0.265, 'epoch': 37.0}


 38%|███▊      | 2660/7000 [10:09:24<15:27:44, 12.83s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 38%|███▊      | 2660/7000 [10:10:09<15:27:44, 12.83s/it]
[A

{'eval_loss': 0.4623873829841614, 'eval_runtime': 45.6465, 'eval_samples_per_second': 4.053, 'eval_steps_per_second': 0.263, 'epoch': 38.0}


 39%|███▉      | 2730/7000 [10:25:20<15:08:16, 12.76s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 39%|███▉      | 2730/7000 [10:26:05<15:08:16, 12.76s/it]
[A

{'eval_loss': 0.4838006794452667, 'eval_runtime': 44.8714, 'eval_samples_per_second': 4.123, 'eval_steps_per_second': 0.267, 'epoch': 39.0}


 40%|████      | 2800/7000 [10:41:19<14:59:03, 12.84s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 40%|████      | 2800/7000 [10:42:04<14:59:03, 12.84s/it]
[A

{'eval_loss': 0.5074338912963867, 'eval_runtime': 44.8684, 'eval_samples_per_second': 4.123, 'eval_steps_per_second': 0.267, 'epoch': 40.0}


 41%|████      | 2870/7000 [10:57:16<14:38:19, 12.76s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 41%|████      | 2870/7000 [10:58:01<14:38:19, 12.76s/it]
[A

{'eval_loss': 0.46368077397346497, 'eval_runtime': 44.9264, 'eval_samples_per_second': 4.118, 'eval_steps_per_second': 0.267, 'epoch': 41.0}


 42%|████▏     | 2940/7000 [11:13:12<14:21:46, 12.74s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 42%|████▏     | 2940/7000 [11:13:57<14:21:46, 12.74s/it]
[A

{'eval_loss': 0.4885837733745575, 'eval_runtime': 44.7423, 'eval_samples_per_second': 4.135, 'eval_steps_per_second': 0.268, 'epoch': 42.0}


                                                         
 43%|████▎     | 3000/7000 [11:26:57<14:28:58, 13.03s/it]

{'loss': 0.0578, 'learning_rate': 1.1428571428571429e-05, 'epoch': 42.86}


 43%|████▎     | 3010/7000 [11:29:09<14:10:24, 12.79s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 43%|████▎     | 3010/7000 [11:29:54<14:10:24, 12.79s/it]
[A

{'eval_loss': 0.4872160255908966, 'eval_runtime': 44.8494, 'eval_samples_per_second': 4.125, 'eval_steps_per_second': 0.268, 'epoch': 43.0}


 44%|████▍     | 3080/7000 [11:45:02<13:55:14, 12.78s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 44%|████▍     | 3080/7000 [11:45:47<13:55:14, 12.78s/it]
[A

{'eval_loss': 0.48433718085289, 'eval_runtime': 44.8034, 'eval_samples_per_second': 4.129, 'eval_steps_per_second': 0.268, 'epoch': 44.0}


 45%|████▌     | 3150/7000 [12:00:56<13:42:50, 12.82s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 45%|████▌     | 3150/7000 [12:01:41<13:42:50, 12.82s/it]
[A

{'eval_loss': 0.47487321496009827, 'eval_runtime': 45.0044, 'eval_samples_per_second': 4.111, 'eval_steps_per_second': 0.267, 'epoch': 45.0}


 46%|████▌     | 3220/7000 [12:16:48<13:25:53, 12.79s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 46%|████▌     | 3220/7000 [12:17:32<13:25:53, 12.79s/it]
[A

{'eval_loss': 0.4620359241962433, 'eval_runtime': 44.5343, 'eval_samples_per_second': 4.154, 'eval_steps_per_second': 0.269, 'epoch': 46.0}


 47%|████▋     | 3290/7000 [12:32:41<13:12:11, 12.81s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 47%|████▋     | 3290/7000 [12:33:25<13:12:11, 12.81s/it]
[A

{'eval_loss': 0.4715520441532135, 'eval_runtime': 44.5443, 'eval_samples_per_second': 4.153, 'eval_steps_per_second': 0.269, 'epoch': 47.0}


 48%|████▊     | 3360/7000 [12:48:40<13:00:54, 12.87s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 48%|████▊     | 3360/7000 [12:49:25<13:00:54, 12.87s/it]
[A

{'eval_loss': 0.48065799474716187, 'eval_runtime': 44.7183, 'eval_samples_per_second': 4.137, 'eval_steps_per_second': 0.268, 'epoch': 48.0}


 49%|████▉     | 3430/7000 [13:04:31<12:38:45, 12.75s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 49%|████▉     | 3430/7000 [13:05:16<12:38:45, 12.75s/it]
[A

{'eval_loss': 0.47871407866477966, 'eval_runtime': 45.2365, 'eval_samples_per_second': 4.09, 'eval_steps_per_second': 0.265, 'epoch': 49.0}


                                                         
 50%|█████     | 3500/7000 [13:21:03<15:53:35, 16.35s/it]

{'loss': 0.0553, 'learning_rate': 1e-05, 'epoch': 50.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 50%|█████     | 3500/7000 [13:21:52<15:53:35, 16.35s/it]
[A

{'eval_loss': 0.4823940098285675, 'eval_runtime': 46.4467, 'eval_samples_per_second': 3.983, 'eval_steps_per_second': 0.258, 'epoch': 50.0}


 51%|█████     | 3570/7000 [13:36:34<11:08:38, 11.70s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 51%|█████     | 3570/7000 [13:37:15<11:08:38, 11.70s/it]
[A

{'eval_loss': 0.489362508058548, 'eval_runtime': 40.8684, 'eval_samples_per_second': 4.527, 'eval_steps_per_second': 0.294, 'epoch': 51.0}


 52%|█████▏    | 3640/7000 [13:51:32<11:54:08, 12.75s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 52%|█████▏    | 3640/7000 [13:52:18<11:54:08, 12.75s/it]
[A

{'eval_loss': 0.49476033449172974, 'eval_runtime': 45.6906, 'eval_samples_per_second': 4.049, 'eval_steps_per_second': 0.263, 'epoch': 52.0}


 53%|█████▎    | 3710/7000 [14:08:27<12:38:00, 13.82s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 53%|█████▎    | 3710/7000 [14:09:16<12:38:00, 13.82s/it]
[A

{'eval_loss': 0.4974142014980316, 'eval_runtime': 49.1572, 'eval_samples_per_second': 3.763, 'eval_steps_per_second': 0.244, 'epoch': 53.0}


 54%|█████▍    | 3780/7000 [14:24:41<11:25:30, 12.77s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 54%|█████▍    | 3780/7000 [14:25:26<11:25:30, 12.77s/it]
[A

{'eval_loss': 0.4843025803565979, 'eval_runtime': 45.1894, 'eval_samples_per_second': 4.094, 'eval_steps_per_second': 0.266, 'epoch': 54.0}


 55%|█████▌    | 3850/7000 [14:40:34<11:13:47, 12.83s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 55%|█████▌    | 3850/7000 [14:41:19<11:13:47, 12.83s/it]
[A

{'eval_loss': 0.49399083852767944, 'eval_runtime': 44.9014, 'eval_samples_per_second': 4.12, 'eval_steps_per_second': 0.267, 'epoch': 55.0}


 56%|█████▌    | 3920/7000 [14:56:28<10:51:41, 12.70s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 56%|█████▌    | 3920/7000 [14:57:13<10:51:41, 12.70s/it]
[A

{'eval_loss': 0.49456408619880676, 'eval_runtime': 45.2154, 'eval_samples_per_second': 4.092, 'eval_steps_per_second': 0.265, 'epoch': 56.0}


 57%|█████▋    | 3990/7000 [15:12:20<10:36:27, 12.69s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 57%|█████▋    | 3990/7000 [15:13:05<10:36:27, 12.69s/it]
[A

{'eval_loss': 0.48707738518714905, 'eval_runtime': 44.8734, 'eval_samples_per_second': 4.123, 'eval_steps_per_second': 0.267, 'epoch': 57.0}


                                                         
 57%|█████▋    | 4000/7000 [15:15:15<11:16:38, 13.53s/it]

{'loss': 0.0538, 'learning_rate': 8.571428571428571e-06, 'epoch': 57.14}


 58%|█████▊    | 4060/7000 [15:28:16<10:27:34, 12.81s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 58%|█████▊    | 4060/7000 [15:29:01<10:27:34, 12.81s/it]
[A

{'eval_loss': 0.4683845341205597, 'eval_runtime': 44.6693, 'eval_samples_per_second': 4.142, 'eval_steps_per_second': 0.269, 'epoch': 58.0}


 59%|█████▉    | 4130/7000 [15:44:11<10:09:12, 12.74s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                         
[A                                                    
 59%|█████▉    | 4130/7000 [15:44:56<10:09:12, 12.74s/it]
[A

{'eval_loss': 0.4915618598461151, 'eval_runtime': 44.7873, 'eval_samples_per_second': 4.131, 'eval_steps_per_second': 0.268, 'epoch': 59.0}


 60%|██████    | 4200/7000 [16:00:02<9:58:44, 12.83s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 60%|██████    | 4200/7000 [16:00:47<9:58:44, 12.83s/it]
[A

{'eval_loss': 0.5130019783973694, 'eval_runtime': 44.9724, 'eval_samples_per_second': 4.114, 'eval_steps_per_second': 0.267, 'epoch': 60.0}


 61%|██████    | 4270/7000 [16:15:52<9:37:52, 12.70s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 61%|██████    | 4270/7000 [16:16:37<9:37:52, 12.70s/it]
[A

{'eval_loss': 0.5216244459152222, 'eval_runtime': 45.4665, 'eval_samples_per_second': 4.069, 'eval_steps_per_second': 0.264, 'epoch': 61.0}


 62%|██████▏   | 4340/7000 [16:31:46<9:28:59, 12.83s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 62%|██████▏   | 4340/7000 [16:32:31<9:28:59, 12.83s/it]
[A

{'eval_loss': 0.46468687057495117, 'eval_runtime': 44.9864, 'eval_samples_per_second': 4.112, 'eval_steps_per_second': 0.267, 'epoch': 62.0}


 63%|██████▎   | 4410/7000 [16:47:39<9:12:06, 12.79s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 63%|██████▎   | 4410/7000 [16:48:25<9:12:06, 12.79s/it]
[A

{'eval_loss': 0.4915112853050232, 'eval_runtime': 45.3615, 'eval_samples_per_second': 4.078, 'eval_steps_per_second': 0.265, 'epoch': 63.0}


 64%|██████▍   | 4480/7000 [17:03:36<8:56:17, 12.77s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 64%|██████▍   | 4480/7000 [17:04:21<8:56:17, 12.77s/it]
[A

{'eval_loss': 0.4784536063671112, 'eval_runtime': 45.0814, 'eval_samples_per_second': 4.104, 'eval_steps_per_second': 0.266, 'epoch': 64.0}


                                                         
 64%|██████▍   | 4500/7000 [17:08:41<9:01:56, 13.01s/it]

{'loss': 0.0548, 'learning_rate': 7.1428571428571436e-06, 'epoch': 64.29}


 65%|██████▌   | 4550/7000 [17:19:34<8:44:29, 12.84s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 65%|██████▌   | 4550/7000 [17:20:19<8:44:29, 12.84s/it]
[A

{'eval_loss': 0.48833951354026794, 'eval_runtime': 45.1494, 'eval_samples_per_second': 4.098, 'eval_steps_per_second': 0.266, 'epoch': 65.0}


 66%|██████▌   | 4620/7000 [17:35:31<8:27:55, 12.80s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 66%|██████▌   | 4620/7000 [17:36:17<8:27:55, 12.80s/it]
[A

{'eval_loss': 0.47547245025634766, 'eval_runtime': 45.4385, 'eval_samples_per_second': 4.071, 'eval_steps_per_second': 0.264, 'epoch': 66.0}


 67%|██████▋   | 4690/7000 [17:51:28<8:14:13, 12.84s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 67%|██████▋   | 4690/7000 [17:52:14<8:14:13, 12.84s/it]
[A

{'eval_loss': 0.47920849919319153, 'eval_runtime': 45.2385, 'eval_samples_per_second': 4.089, 'eval_steps_per_second': 0.265, 'epoch': 67.0}


 68%|██████▊   | 4760/7000 [18:07:27<7:59:43, 12.85s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 68%|██████▊   | 4760/7000 [18:08:12<7:59:43, 12.85s/it]
[A

{'eval_loss': 0.4810597598552704, 'eval_runtime': 45.1974, 'eval_samples_per_second': 4.093, 'eval_steps_per_second': 0.266, 'epoch': 68.0}


 69%|██████▉   | 4830/7000 [18:23:24<7:41:10, 12.75s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 69%|██████▉   | 4830/7000 [18:24:09<7:41:10, 12.75s/it]
[A

{'eval_loss': 0.4805644750595093, 'eval_runtime': 44.8124, 'eval_samples_per_second': 4.128, 'eval_steps_per_second': 0.268, 'epoch': 69.0}


 70%|███████   | 4900/7000 [18:39:23<7:30:33, 12.87s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 70%|███████   | 4900/7000 [18:40:08<7:30:33, 12.87s/it]
[A

{'eval_loss': 0.48750215768814087, 'eval_runtime': 44.7483, 'eval_samples_per_second': 4.134, 'eval_steps_per_second': 0.268, 'epoch': 70.0}


 71%|███████   | 4970/7000 [18:55:22<7:17:35, 12.93s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 71%|███████   | 4970/7000 [18:56:07<7:17:35, 12.93s/it]
[A

{'eval_loss': 0.4919893741607666, 'eval_runtime': 45.2405, 'eval_samples_per_second': 4.089, 'eval_steps_per_second': 0.265, 'epoch': 71.0}


                                                         
 71%|███████▏  | 5000/7000 [19:02:39<7:18:33, 13.16s/it]

{'loss': 0.0538, 'learning_rate': 5.7142857142857145e-06, 'epoch': 71.43}


 72%|███████▏  | 5040/7000 [19:11:25<6:58:28, 12.81s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 72%|███████▏  | 5040/7000 [19:12:10<6:58:28, 12.81s/it]
[A

{'eval_loss': 0.48913368582725525, 'eval_runtime': 45.1454, 'eval_samples_per_second': 4.098, 'eval_steps_per_second': 0.266, 'epoch': 72.0}


 73%|███████▎  | 5110/7000 [19:27:27<6:44:25, 12.84s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 73%|███████▎  | 5110/7000 [19:28:12<6:44:25, 12.84s/it]
[A

{'eval_loss': 0.47810909152030945, 'eval_runtime': 44.7853, 'eval_samples_per_second': 4.131, 'eval_steps_per_second': 0.268, 'epoch': 73.0}


 74%|███████▍  | 5180/7000 [19:43:28<6:32:14, 12.93s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 74%|███████▍  | 5180/7000 [19:44:14<6:32:14, 12.93s/it]
[A

{'eval_loss': 0.4892573654651642, 'eval_runtime': 45.3915, 'eval_samples_per_second': 4.076, 'eval_steps_per_second': 0.264, 'epoch': 74.0}


 75%|███████▌  | 5250/7000 [19:59:30<6:12:52, 12.78s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 75%|███████▌  | 5250/7000 [20:00:16<6:12:52, 12.78s/it]
[A

{'eval_loss': 0.4916624426841736, 'eval_runtime': 45.6145, 'eval_samples_per_second': 4.056, 'eval_steps_per_second': 0.263, 'epoch': 75.0}


 76%|███████▌  | 5320/7000 [20:15:34<6:01:54, 12.93s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 76%|███████▌  | 5320/7000 [20:16:19<6:01:54, 12.93s/it]
[A

{'eval_loss': 0.5005748867988586, 'eval_runtime': 45.4855, 'eval_samples_per_second': 4.067, 'eval_steps_per_second': 0.264, 'epoch': 76.0}


 77%|███████▋  | 5390/7000 [20:31:36<5:48:01, 12.97s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 77%|███████▋  | 5390/7000 [20:32:22<5:48:01, 12.97s/it]
[A

{'eval_loss': 0.48802894353866577, 'eval_runtime': 45.5215, 'eval_samples_per_second': 4.064, 'eval_steps_per_second': 0.264, 'epoch': 77.0}


 78%|███████▊  | 5460/7000 [20:47:41<5:34:01, 13.01s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 78%|███████▊  | 5460/7000 [20:48:26<5:34:01, 13.01s/it]
[A

{'eval_loss': 0.49634701013565063, 'eval_runtime': 45.1594, 'eval_samples_per_second': 4.097, 'eval_steps_per_second': 0.266, 'epoch': 78.0}


                                                         
 79%|███████▊  | 5500/7000 [20:57:10<5:27:15, 13.09s/it]

{'loss': 0.0528, 'learning_rate': 4.2857142857142855e-06, 'epoch': 78.57}


 79%|███████▉  | 5530/7000 [21:03:46<5:16:44, 12.93s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 79%|███████▉  | 5530/7000 [21:04:31<5:16:44, 12.93s/it]
[A

{'eval_loss': 0.4939378798007965, 'eval_runtime': 45.0784, 'eval_samples_per_second': 4.104, 'eval_steps_per_second': 0.266, 'epoch': 79.0}


 80%|████████  | 5600/7000 [21:19:46<5:02:08, 12.95s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 80%|████████  | 5600/7000 [21:20:32<5:02:08, 12.95s/it]
[A

{'eval_loss': 0.5015149116516113, 'eval_runtime': 45.7396, 'eval_samples_per_second': 4.045, 'eval_steps_per_second': 0.262, 'epoch': 80.0}


 81%|████████  | 5670/7000 [21:35:50<4:46:06, 12.91s/it] 
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 81%|████████  | 5670/7000 [21:36:36<4:46:06, 12.91s/it]
[A

{'eval_loss': 0.48450812697410583, 'eval_runtime': 45.3765, 'eval_samples_per_second': 4.077, 'eval_steps_per_second': 0.264, 'epoch': 81.0}


 82%|████████▏ | 5740/7000 [21:51:54<4:32:11, 12.96s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 82%|████████▏ | 5740/7000 [21:52:39<4:32:11, 12.96s/it]
[A

{'eval_loss': 0.4912053644657135, 'eval_runtime': 45.5085, 'eval_samples_per_second': 4.065, 'eval_steps_per_second': 0.264, 'epoch': 82.0}


 83%|████████▎ | 5810/7000 [22:07:57<4:15:44, 12.89s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 83%|████████▎ | 5810/7000 [22:08:43<4:15:44, 12.89s/it]
[A

{'eval_loss': 0.4865064024925232, 'eval_runtime': 45.9326, 'eval_samples_per_second': 4.028, 'eval_steps_per_second': 0.261, 'epoch': 83.0}


 84%|████████▍ | 5880/7000 [22:24:03<4:01:40, 12.95s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 84%|████████▍ | 5880/7000 [22:24:48<4:01:40, 12.95s/it]
[A

{'eval_loss': 0.49206721782684326, 'eval_runtime': 45.5245, 'eval_samples_per_second': 4.064, 'eval_steps_per_second': 0.264, 'epoch': 84.0}


 85%|████████▌ | 5950/7000 [22:40:08<3:48:22, 13.05s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 85%|████████▌ | 5950/7000 [22:40:53<3:48:22, 13.05s/it]
[A

{'eval_loss': 0.5010748505592346, 'eval_runtime': 45.7316, 'eval_samples_per_second': 4.045, 'eval_steps_per_second': 0.262, 'epoch': 85.0}


                                                        
 86%|████████▌ | 6000/7000 [22:51:49<3:36:12, 12.97s/it]

{'loss': 0.0491, 'learning_rate': 2.8571428571428573e-06, 'epoch': 85.71}


 86%|████████▌ | 6020/7000 [22:56:19<3:33:03, 13.04s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 86%|████████▌ | 6020/7000 [22:57:05<3:33:03, 13.04s/it]
[A

{'eval_loss': 0.49535807967185974, 'eval_runtime': 45.4785, 'eval_samples_per_second': 4.068, 'eval_steps_per_second': 0.264, 'epoch': 86.0}


 87%|████████▋ | 6090/7000 [23:12:27<3:15:04, 12.86s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 87%|████████▋ | 6090/7000 [23:13:13<3:15:04, 12.86s/it]
[A

{'eval_loss': 0.49294793605804443, 'eval_runtime': 45.5915, 'eval_samples_per_second': 4.058, 'eval_steps_per_second': 0.263, 'epoch': 87.0}


 88%|████████▊ | 6160/7000 [23:28:36<3:01:56, 13.00s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 88%|████████▊ | 6160/7000 [23:29:22<3:01:56, 13.00s/it]
[A

{'eval_loss': 0.4960065186023712, 'eval_runtime': 45.7546, 'eval_samples_per_second': 4.043, 'eval_steps_per_second': 0.262, 'epoch': 88.0}


 89%|████████▉ | 6230/7000 [23:44:45<2:45:55, 12.93s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 89%|████████▉ | 6230/7000 [23:45:31<2:45:55, 12.93s/it]
[A

{'eval_loss': 0.49731165170669556, 'eval_runtime': 45.6225, 'eval_samples_per_second': 4.055, 'eval_steps_per_second': 0.263, 'epoch': 89.0}


 90%|█████████ | 6300/7000 [24:00:51<2:31:03, 12.95s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 90%|█████████ | 6300/7000 [24:01:37<2:31:03, 12.95s/it]
[A

{'eval_loss': 0.4934477210044861, 'eval_runtime': 45.5855, 'eval_samples_per_second': 4.058, 'eval_steps_per_second': 0.263, 'epoch': 90.0}


 91%|█████████ | 6370/7000 [24:16:58<2:16:19, 12.98s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 91%|█████████ | 6370/7000 [24:17:44<2:16:19, 12.98s/it]
[A

{'eval_loss': 0.490345299243927, 'eval_runtime': 45.4405, 'eval_samples_per_second': 4.071, 'eval_steps_per_second': 0.264, 'epoch': 91.0}


 92%|█████████▏| 6440/7000 [24:33:05<2:00:48, 12.94s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 92%|█████████▏| 6440/7000 [24:33:50<2:00:48, 12.94s/it]
[A

{'eval_loss': 0.4954295754432678, 'eval_runtime': 45.0474, 'eval_samples_per_second': 4.107, 'eval_steps_per_second': 0.266, 'epoch': 92.0}


                                                        
 93%|█████████▎| 6500/7000 [24:47:02<1:50:01, 13.20s/it]

{'loss': 0.049, 'learning_rate': 1.4285714285714286e-06, 'epoch': 92.86}


 93%|█████████▎| 6510/7000 [24:49:14<1:44:58, 12.85s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 93%|█████████▎| 6510/7000 [24:49:59<1:44:58, 12.85s/it]
[A

{'eval_loss': 0.4952269196510315, 'eval_runtime': 45.3435, 'eval_samples_per_second': 4.08, 'eval_steps_per_second': 0.265, 'epoch': 93.0}


 94%|█████████▍| 6580/7000 [25:05:19<1:30:46, 12.97s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 94%|█████████▍| 6580/7000 [25:06:06<1:30:46, 12.97s/it]
[A

{'eval_loss': 0.4966422915458679, 'eval_runtime': 46.5538, 'eval_samples_per_second': 3.974, 'eval_steps_per_second': 0.258, 'epoch': 94.0}


 95%|█████████▌| 6650/7000 [25:21:19<1:13:31, 12.60s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                        
[A                                                    
 95%|█████████▌| 6650/7000 [25:22:03<1:13:31, 12.60s/it]
[A

{'eval_loss': 0.49447327852249146, 'eval_runtime': 44.5591, 'eval_samples_per_second': 4.152, 'eval_steps_per_second': 0.269, 'epoch': 95.0}


 96%|█████████▌| 6720/7000 [25:36:59<58:25, 12.52s/it]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                      
[A                                                    
 96%|█████████▌| 6720/7000 [25:37:44<58:25, 12.52s/it]
[A

{'eval_loss': 0.49710920453071594, 'eval_runtime': 44.7742, 'eval_samples_per_second': 4.132, 'eval_steps_per_second': 0.268, 'epoch': 96.0}


 97%|█████████▋| 6790/7000 [25:52:42<45:20, 12.95s/it]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                      
[A                                                    
 97%|█████████▋| 6790/7000 [25:53:28<45:20, 12.95s/it]
[A

{'eval_loss': 0.49652624130249023, 'eval_runtime': 46.1767, 'eval_samples_per_second': 4.006, 'eval_steps_per_second': 0.26, 'epoch': 97.0}


 98%|█████████▊| 6860/7000 [26:09:18<33:18, 14.27s/it]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                      
[A                                                    
 98%|█████████▊| 6860/7000 [26:10:09<33:18, 14.27s/it]
[A

{'eval_loss': 0.4962505102157593, 'eval_runtime': 51.0995, 'eval_samples_per_second': 3.62, 'eval_steps_per_second': 0.235, 'epoch': 98.0}


 99%|█████████▉| 6930/7000 [26:25:58<15:26, 13.24s/it]  
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                      
[A                                                    
 99%|█████████▉| 6930/7000 [26:26:43<15:26, 13.24s/it]
[A

{'eval_loss': 0.49633529782295227, 'eval_runtime': 44.6846, 'eval_samples_per_second': 4.14, 'eval_steps_per_second': 0.269, 'epoch': 99.0}


                                                      
100%|██████████| 7000/7000 [26:42:10<00:00, 13.37s/it] 

{'loss': 0.0465, 'learning_rate': 0.0, 'epoch': 100.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

                                                      
[A                                                    
100%|██████████| 7000/7000 [26:43:01<00:00, 13.37s/it]
                                                      
100%|██████████| 7000/7000 [26:43:01<00:00, 13.74s/it] 

{'eval_loss': 0.49616506695747375, 'eval_runtime': 47.5344, 'eval_samples_per_second': 3.892, 'eval_steps_per_second': 0.252, 'epoch': 100.0}
{'train_runtime': 96181.9408, 'train_samples_per_second': 1.163, 'train_steps_per_second': 0.073, 'train_loss': 0.06898140634809222, 'epoch': 100.0}





TrainOutput(global_step=7000, training_loss=0.06898140634809222, metrics={'train_runtime': 96181.9408, 'train_samples_per_second': 1.163, 'train_steps_per_second': 0.073, 'train_loss': 0.06898140634809222, 'epoch': 100.0})

In [118]:
trainer.save_model()

In [126]:
text = "di chúc bằng văn bản có giá trị như di chúc được công chứng hoặc chứng thực: di chúc của quân nhân tại ngũ có xác nhận của thủ trưởng đơn vị từ cấp đại đội trở lên, nếu quân nhân không thể yêu cầu công chứng hoặc chứng thực, di chúc của người đang đi trên tàu biển, máy bay có xác nhận của người chỉ huy phương tiện đó, di chúc của người đang điều trị tại bệnh viện, cơ sở chữa bệnh, điều dưỡng khác có xác nhận của người phụ trách bệnh viện, cơ sở đó, di chúc của người đang làm công việc khảo sát, thăm dò, nghiên cứu ở vùng rừng núi, hải đảo có xác nhận của người phụ trách đơn vị, di chúc của công dân việt nam đang ở nước ngoài có chứng nhận của cơ quan lãnh sự, đại diện ngoại giao việt nam ở nước đó, di chúc của người đang bị tạm giam, tạm giữ, đang chấp hành hình phạt tù, người đang chấp hành biện pháp xử lý hành chính tại cơ sở giáo dục, cơ sở chữa bệnh có xác nhận của người phụ trách cơ sở đó ; điều 638."
question = "Quyền dân sự được công nhận ở đâu ?"
question2= "di chúc của người đang đi trên tàu biển, máy bay có xác nhận của ai ?"

In [120]:
inputs = tokenizer(question2, text, return_tensors="pt")

In [121]:
model = RobertaForQuestionAnswering.from_pretrained("phobert_law")
tokenizer = RobertaTokenizerFast.from_pretrained("phobert_law")



In [125]:
with torch.no_grad():
    outputs = model(**inputs)

RuntimeError: The expanded size of the tensor (261) must match the existing size (258) at non-singleton dimension 1.  Target sizes: [1, 261].  Tensor sizes: [1, 258]

In [68]:
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

In [69]:
answer_start_index

tensor(248)

In [70]:
answer_end_index

tensor(248)

In [71]:
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'đó'