In [1]:
!pip install transformers[torch] datasets evaluate optuna



In [2]:
!pip install -U datasets



In [3]:
import pandas as pd
import numpy as np
import json
from joblib import dump, load

import os
import random
from tqdm.notebook  import tqdm

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader

from transformers import (AutoConfig,
                          AutoTokenizer,
                          DefaultDataCollator,
                          AutoModelForQuestionAnswering,
                          TrainingArguments,
                          Trainer,
                          default_data_collator,
                          get_scheduler)

from datasets import load_dataset
import evaluate
import collections
import optuna



In [4]:
def seed_everything(seed):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = True

seed_everything(42)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

In [8]:
dataset = load_dataset(path='/kaggle/input',data_files='./custom_data.json',split='train')
dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['context', 'answers', 'id', 'question'],
    num_rows: 28024
})

In [9]:
squad = dataset.train_test_split(.2)
squad

DatasetDict({
    train: Dataset({
        features: ['context', 'answers', 'id', 'question'],
        num_rows: 22419
    })
    test: Dataset({
        features: ['context', 'answers', 'id', 'question'],
        num_rows: 5605
    })
})

In [10]:
squad["train"][0]

{'context': '경제협력개발기구(OECD)가 발표한 ‘더 나은 삶 지수(Better Life Index, BLI)’ 중 하나인 ‘공동체(Community)’ 지수에서 대한민국은 러시아와 브라질을 포함한 36개 국가 중 최하위를 기록했다. 한국 사회에서 공동체가 파괴되어온 주요인으로 다양한 의견이 개진되었으나, 한국도시의 오래된 특징인 아파트와 고립된 생활 공간, 차도와 인도의 구분이 없는 길과 건물 배치가 이웃 간 소통을 가로막고 갈등이 생겨도 이를 원활하게 해결할 수 없는 환경을 조성했다는 지적이 주목을 받았다(김재형, 2016). 도움이 필요할 때 이웃의 관심과 손길을 기대하기 어려운 환경은 고독사, 아동학대, 가정폭력, 자살 등 다양한 사회문제를 방치하고 악화시키는 데 일조한다. 이러한 공동체 와해와 지역현안 문제를 주민들의 참여를 통해 적극적으로 해결하고자 시행된 정책사업 중 하나가 2010년부터 각 지자체에 본격적으로 도입된 마을 만들기 사업이다. 마을 만들기 사업의 유형은 주체와 내용에 따라 다양한 형태로 발현되지만, 그 핵심은 주민참여를 주요 동력으로 공동의 지역문제와 욕구를 해결하고, 공동체 회복을 통해 주민의 삶의 질을 향상시킨다는 정책적 목적으로 귀결된다(박선희, 2014).',
 'answers': {'answer_start': [84], 'text': ['대한민국']},
 'id': 'QUES_UniBPiuRnf',
 'question': '더 나은 삶 지수 중 하나인 공동체 지수에서 어느 나라가 최하위를 기록했니'}

In [11]:
print("Context: ", squad["train"][0]["context"])
print("Question: ", squad["train"][0]["question"])
print("Answer: ", squad["train"][0]["answers"])

Context:  경제협력개발기구(OECD)가 발표한 ‘더 나은 삶 지수(Better Life Index, BLI)’ 중 하나인 ‘공동체(Community)’ 지수에서 대한민국은 러시아와 브라질을 포함한 36개 국가 중 최하위를 기록했다. 한국 사회에서 공동체가 파괴되어온 주요인으로 다양한 의견이 개진되었으나, 한국도시의 오래된 특징인 아파트와 고립된 생활 공간, 차도와 인도의 구분이 없는 길과 건물 배치가 이웃 간 소통을 가로막고 갈등이 생겨도 이를 원활하게 해결할 수 없는 환경을 조성했다는 지적이 주목을 받았다(김재형, 2016). 도움이 필요할 때 이웃의 관심과 손길을 기대하기 어려운 환경은 고독사, 아동학대, 가정폭력, 자살 등 다양한 사회문제를 방치하고 악화시키는 데 일조한다. 이러한 공동체 와해와 지역현안 문제를 주민들의 참여를 통해 적극적으로 해결하고자 시행된 정책사업 중 하나가 2010년부터 각 지자체에 본격적으로 도입된 마을 만들기 사업이다. 마을 만들기 사업의 유형은 주체와 내용에 따라 다양한 형태로 발현되지만, 그 핵심은 주민참여를 주요 동력으로 공동의 지역문제와 욕구를 해결하고, 공동체 회복을 통해 주민의 삶의 질을 향상시킨다는 정책적 목적으로 귀결된다(박선희, 2014).
Question:  더 나은 삶 지수 중 하나인 공동체 지수에서 어느 나라가 최하위를 기록했니
Answer:  {'answer_start': [84], 'text': ['대한민국']}


In [12]:
train_dataset = squad["train"]
valid_dataset = squad["test"]

In [13]:
# select samples
train_dataset = train_dataset.select(range(1000))
valid_dataset = valid_dataset.select(range(100))

In [14]:
MODEL_NAME ='monologg/kobigbird-bert-base'
config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/241k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/492k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

In [15]:
max_length = 1024
stride = 128

def preprocess_training_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride = stride,
        return_offsets_mapping=True,
        return_overflowing_tokens=True,
        padding="max_length",
    )

    #the offset mapping gives us a tuple indicating the sub-token’s start position and end position relative to the original token it was split from
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
      sample_idx = sample_map[i]
      answer = answers[sample_idx]
      start_char = answer["answer_start"][0]
      end_char = answer["answer_start"][0] + len(answer["text"][0])
      sequence_ids = inputs.sequence_ids(i)

      # Find the start and end of the context
      idx = 0
      while sequence_ids[idx] != 1:
          idx += 1
      context_start = idx
      while sequence_ids[idx] == 1:
          idx += 1
      context_end = idx - 1

      # If the answer is not fully inside the context, label it (0, 0)
      if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
          start_positions.append(0)
          end_positions.append(0)
      else:
          # Otherwise it's the start and end token positions
          idx = context_start
          while idx <= context_end and offset[idx][0] <= start_char:
              idx += 1
          start_positions.append(idx - 1)

          idx = context_end
          while idx >= context_start and offset[idx][1] >= end_char:
              idx -= 1
          end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [16]:
def preprocess_validation_function(exmaples):
    questions = [q.strip() for q in exmaples["question"]]
    inputs = tokenizer(
        questions,
        exmaples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(exmaples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [17]:
train_set = train_dataset.map(preprocess_training_function,
                              batched=True,
                              remove_columns= train_dataset.column_names,
                                    load_from_cache_file=False,)

valid_set = valid_dataset.map(preprocess_validation_function,
                              batched=True,
                              remove_columns= valid_dataset.column_names,
                                    load_from_cache_file=False,)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [18]:
train_set

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 1000
})

In [19]:
valid_set

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 100
})

In [20]:
valid_modelset =valid_set.remove_columns(["example_id", "offset_mapping"])

In [21]:
train_set.set_format("torch")
valid_modelset.set_format("torch")

In [22]:
def get_loader(batch_size):
  trainloader = DataLoader(train_set, shuffle=True, batch_size=batch_size, collate_fn=default_data_collator)
  validloader = DataLoader(valid_modelset, shuffle=False, batch_size=batch_size, collate_fn=default_data_collator)
  return trainloader, validloader

In [23]:
metric = evaluate.load("squad")

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [24]:
n_best = 20
max_answer_length = 30

def compute_metrics(start_logits, end_logits,features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    if (end_index < start_index
                        or end_index - start_index +1 > max_answer_length):
                      continue
                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],}
                    answers.append(answer)

        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append({"id": example_id, "prediction_text": best_answer["text"]})
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=references)

In [25]:
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)

model.safetensors:   0%|          | 0.00/458M [00:00<?, ?B/s]

Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
accumulation_steps = 8

def train(trainloader, model, optimizer, lr_scheduler, device):
    model.train()
    running_loss = 0.0
    for idx, batch in enumerate(trainloader):
      batch ={k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      loss = outputs.loss
      running_loss += loss.item()
      if (idx +1) % accumulation_steps == 0:
          loss.backward()
          optimizer.step()
          optimizer.zero_grad()
          lr_scheduler.step()
    train_loss = running_loss / len(trainloader)
    return train_loss

In [27]:
def validate(validloader, model, device):
    model.eval()
    start_logits =[]
    end_logits = []
    with torch.no_grad():
        for idx, batch in enumerate(validloader):
            batch ={k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            start_logits.append(outputs.start_logits.detach().cpu().numpy())
            end_logits.append(outputs.end_logits.detach().cpu().numpy())

        start_logits = np.concatenate(start_logits, axis=0)
        end_logits = np.concatenate(end_logits, axis=0)
        results = compute_metrics(start_logits, end_logits, valid_set, valid_dataset)
    return  results["exact_match"], results["f1"]

In [28]:
torch.cuda.empty_cache()

def train_mrc(trial):
    Learning_rate=trial.suggest_float('learning_rate', 5e-7, 5e-3, log=True)
    Weight_decay = trial.suggest_float('weight_decay', 0.0, 0.3)
    num_epochs = 10
    Batch_size = 1
    accumulation_steps = 32
    trainloader, validloader = get_loader(Batch_size)
    num_update_steps_per_epoch = len(trainloader)
    num_training_steps = num_epochs * num_update_steps_per_epoch

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
    model = model.to(device)

    optimizer = AdamW(model.parameters(), lr=Learning_rate, weight_decay=Weight_decay)
    lr_scheduler = get_scheduler(
      trial.suggest_categorical("SchedulerType", ['linear', 'cosine', 'constant']),
      optimizer=optimizer,
      num_warmup_steps=0,
      num_training_steps=num_training_steps,
      )
    for epoch in tqdm(range(1, num_epochs+1)):
        train_loss = train(trainloader, model, optimizer, lr_scheduler, device)
        em, f1 = validate(validloader, model, device)
        print("Epoch:{:1d}/{:1d}, Train Loss:{:.4f}, EM:{:.2f}, F1:{:.2f}".format(epoch, num_epochs, train_loss, em, f1 ))

    return em


In [29]:
study = optuna.create_study(sampler=optuna.samplers.TPESampler(), direction="maximize")
study.optimize(train_mrc, n_trials=20)

[I 2023-12-17 00:41:16,098] A new study created in memory with name: no-name-1b8823dc-360c-455b-90e3-2d2d7260d0a7
Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:5.6749, EM:0.00, F1:2.38


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:5.6107, EM:1.00, F1:2.62


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:5.6159, EM:0.00, F1:1.09


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:5.6151, EM:0.00, F1:1.44


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:5.6171, EM:0.00, F1:0.99


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:5.6105, EM:0.00, F1:1.87


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:5.6092, EM:0.00, F1:1.01


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:5.6222, EM:0.00, F1:2.07


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:5.6048, EM:0.00, F1:1.52


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 01:08:39,025] Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 0.000775156024954002, 'weight_decay': 0.1520120324801813, 'SchedulerType': 'cosine'}. Best is trial 0 with value: 0.0.


Epoch:10/10, Train Loss:5.6050, EM:0.00, F1:1.85


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:3.7161, EM:30.00, F1:36.53


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:2.0651, EM:61.00, F1:66.97


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:1.7177, EM:55.00, F1:61.36


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:1.5071, EM:41.00, F1:46.32


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:1.1750, EM:59.00, F1:65.20


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:0.9415, EM:67.00, F1:74.00


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:0.8130, EM:66.00, F1:73.74


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:0.7948, EM:69.00, F1:75.94


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:0.8805, EM:67.00, F1:74.09


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 01:36:42,749] Trial 1 finished with value: 69.0 and parameters: {'learning_rate': 6.553795360776687e-05, 'weight_decay': 0.016687789495253614, 'SchedulerType': 'cosine'}. Best is trial 1 with value: 69.0.


Epoch:10/10, Train Loss:0.6202, EM:69.00, F1:75.13


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:4.5623, EM:18.00, F1:23.57


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:2.8721, EM:37.00, F1:41.51


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:1.5628, EM:67.00, F1:70.57


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:1.1446, EM:68.00, F1:73.79


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:0.8891, EM:72.00, F1:77.60


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:0.7165, EM:72.00, F1:78.33


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:0.6082, EM:72.00, F1:79.26


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:0.5471, EM:74.00, F1:80.50


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:0.5243, EM:72.00, F1:77.43


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 02:04:32,639] Trial 2 finished with value: 70.0 and parameters: {'learning_rate': 1.0301526766047216e-05, 'weight_decay': 0.22873868812106432, 'SchedulerType': 'constant'}. Best is trial 2 with value: 70.0.


Epoch:10/10, Train Loss:0.5132, EM:70.00, F1:75.93


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:4.5709, EM:13.00, F1:19.28


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:3.1731, EM:26.00, F1:31.03


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:2.4868, EM:36.00, F1:40.76


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:2.0126, EM:54.00, F1:59.31


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:1.3393, EM:61.00, F1:64.63


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:1.0358, EM:63.00, F1:68.97


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:0.8992, EM:65.00, F1:71.38


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:0.7807, EM:68.00, F1:74.53


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:0.6758, EM:65.00, F1:72.45


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 02:32:31,928] Trial 3 finished with value: 68.0 and parameters: {'learning_rate': 9.190383211488997e-06, 'weight_decay': 0.2679873608563311, 'SchedulerType': 'cosine'}. Best is trial 2 with value: 70.0.


Epoch:10/10, Train Loss:0.5909, EM:68.00, F1:74.18


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:4.1296, EM:15.00, F1:18.62


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:4.9349, EM:5.00, F1:7.62


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:5.1371, EM:8.00, F1:11.85


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:4.6037, EM:9.00, F1:12.64


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:4.0676, EM:15.00, F1:18.35


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:3.8921, EM:13.00, F1:19.26


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:3.7673, EM:14.00, F1:18.13


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:3.5719, EM:7.00, F1:13.40


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:3.4848, EM:11.00, F1:14.84


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 03:00:31,746] Trial 4 finished with value: 11.0 and parameters: {'learning_rate': 0.00016538734425010913, 'weight_decay': 0.0840270929220575, 'SchedulerType': 'linear'}. Best is trial 2 with value: 70.0.


Epoch:10/10, Train Loss:3.1094, EM:11.00, F1:15.22


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:6.5670, EM:0.00, F1:2.02


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:5.8187, EM:0.00, F1:2.02


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:5.3869, EM:1.00, F1:4.69


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:5.1411, EM:4.00, F1:8.75


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:4.9554, EM:5.00, F1:11.05


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:4.7783, EM:10.00, F1:15.91


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:4.6205, EM:11.00, F1:16.69


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:4.4582, EM:11.00, F1:16.80


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:4.2950, EM:11.00, F1:17.36


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 03:28:29,136] Trial 5 finished with value: 13.0 and parameters: {'learning_rate': 6.274383561383295e-07, 'weight_decay': 0.21655546087749766, 'SchedulerType': 'linear'}. Best is trial 2 with value: 70.0.


Epoch:10/10, Train Loss:4.1479, EM:13.00, F1:19.36


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:5.1021, EM:0.00, F1:2.24


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:5.5991, EM:3.00, F1:7.57


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:5.6060, EM:0.00, F1:4.05


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:5.5929, EM:3.00, F1:6.89


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:5.5958, EM:0.00, F1:2.29


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:5.6054, EM:0.00, F1:3.19


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:5.6157, EM:0.00, F1:2.09


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:5.6017, EM:0.00, F1:0.25


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:5.6080, EM:0.00, F1:1.32


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 03:56:33,682] Trial 6 finished with value: 0.0 and parameters: {'learning_rate': 0.0002568964783181359, 'weight_decay': 0.16019929303674696, 'SchedulerType': 'constant'}. Best is trial 2 with value: 70.0.


Epoch:10/10, Train Loss:5.5952, EM:0.00, F1:1.82


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:5.0881, EM:10.00, F1:13.64


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:3.6429, EM:21.00, F1:24.19


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:2.7962, EM:37.00, F1:41.64


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:1.8637, EM:60.00, F1:66.48


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:1.3147, EM:73.00, F1:76.17


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:1.0725, EM:72.00, F1:78.74


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:0.9280, EM:72.00, F1:78.36


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:0.8223, EM:72.00, F1:77.36


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:0.7614, EM:75.00, F1:77.63


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 04:24:27,714] Trial 7 finished with value: 75.0 and parameters: {'learning_rate': 5.131905498322558e-06, 'weight_decay': 0.29532769908944934, 'SchedulerType': 'linear'}. Best is trial 7 with value: 75.0.


Epoch:10/10, Train Loss:0.6339, EM:75.00, F1:78.80


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:5.6802, EM:0.00, F1:0.45


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:5.5995, EM:0.00, F1:0.87


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:5.5897, EM:0.00, F1:2.09


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:5.5919, EM:0.00, F1:1.22


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:5.6047, EM:0.00, F1:0.78


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:5.5896, EM:0.00, F1:0.47


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:5.5940, EM:1.00, F1:1.96


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:5.6061, EM:0.00, F1:1.53


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:5.5976, EM:0.00, F1:0.79


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 04:51:34,928] Trial 8 finished with value: 0.0 and parameters: {'learning_rate': 0.0005391092357773988, 'weight_decay': 0.2736153460365248, 'SchedulerType': 'linear'}. Best is trial 7 with value: 75.0.


Epoch:10/10, Train Loss:5.5944, EM:0.00, F1:1.03


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:4.9833, EM:18.00, F1:20.57


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:3.3202, EM:40.00, F1:43.96


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:2.1490, EM:64.00, F1:68.88


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:1.5295, EM:67.00, F1:73.60


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:1.2240, EM:72.00, F1:76.93


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:1.0940, EM:67.00, F1:73.75


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:0.9208, EM:68.00, F1:75.38


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:0.7781, EM:70.00, F1:74.68


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:0.7023, EM:69.00, F1:76.05


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 05:18:44,712] Trial 9 finished with value: 68.0 and parameters: {'learning_rate': 6.681311198155504e-06, 'weight_decay': 0.25819356337192756, 'SchedulerType': 'linear'}. Best is trial 7 with value: 75.0.


Epoch:10/10, Train Loss:0.6538, EM:68.00, F1:77.02


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:6.3734, EM:0.00, F1:3.20


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:5.5593, EM:0.00, F1:3.23


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:5.2507, EM:1.00, F1:4.23


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:5.0333, EM:4.00, F1:7.19


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:4.8223, EM:7.00, F1:10.89


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:4.6499, EM:9.00, F1:13.23


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:4.4873, EM:11.00, F1:15.67


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:4.3189, EM:14.00, F1:18.67


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:4.1628, EM:13.00, F1:17.65


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 05:45:51,353] Trial 10 finished with value: 13.0 and parameters: {'learning_rate': 7.378501873241227e-07, 'weight_decay': 0.09778585158481679, 'SchedulerType': 'linear'}. Best is trial 7 with value: 75.0.


Epoch:10/10, Train Loss:4.0393, EM:13.00, F1:17.65


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:5.3185, EM:7.00, F1:11.51


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:3.9270, EM:16.00, F1:22.28


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:3.1961, EM:24.00, F1:30.86


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:2.6780, EM:31.00, F1:37.72


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:2.2202, EM:43.00, F1:49.88


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:1.9320, EM:50.00, F1:55.34


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:1.3789, EM:59.00, F1:65.45


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:1.2365, EM:54.00, F1:59.99


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:1.1219, EM:61.00, F1:66.22


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 06:12:54,947] Trial 11 finished with value: 67.0 and parameters: {'learning_rate': 4.264506153788281e-06, 'weight_decay': 0.21172669428984298, 'SchedulerType': 'constant'}. Best is trial 7 with value: 75.0.


Epoch:10/10, Train Loss:0.9723, EM:67.00, F1:71.33


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:3.8977, EM:23.00, F1:27.82


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:2.0617, EM:58.00, F1:65.96


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:1.2750, EM:64.00, F1:69.63


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:0.9971, EM:66.00, F1:73.70


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:0.8585, EM:65.00, F1:69.53


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:0.6797, EM:73.00, F1:79.02


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:0.6294, EM:70.00, F1:76.01


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:0.5259, EM:74.00, F1:80.47


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:0.4671, EM:70.00, F1:78.38


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 06:40:02,737] Trial 12 finished with value: 72.0 and parameters: {'learning_rate': 2.187560976851187e-05, 'weight_decay': 0.29827308174861383, 'SchedulerType': 'constant'}. Best is trial 7 with value: 75.0.


Epoch:10/10, Train Loss:0.4386, EM:72.00, F1:80.03


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:5.8801, EM:0.00, F1:1.90


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:5.6577, EM:0.00, F1:1.04


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:5.6744, EM:0.00, F1:0.84


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:5.6455, EM:1.00, F1:1.29


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:5.6295, EM:0.00, F1:0.64


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:5.6115, EM:0.00, F1:1.20


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:5.6063, EM:0.00, F1:1.46


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:5.6061, EM:0.00, F1:1.07


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:5.6066, EM:0.00, F1:0.45


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 07:07:15,447] Trial 13 finished with value: 0.0 and parameters: {'learning_rate': 0.0037744645222162142, 'weight_decay': 0.298267946119185, 'SchedulerType': 'constant'}. Best is trial 7 with value: 75.0.


Epoch:10/10, Train Loss:5.5954, EM:0.00, F1:2.15


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:3.4838, EM:48.00, F1:54.69


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:1.6878, EM:66.00, F1:71.92


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:1.4684, EM:65.00, F1:72.03


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:1.1483, EM:71.00, F1:76.57


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:0.8040, EM:78.00, F1:82.63


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:0.6811, EM:74.00, F1:78.56


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:0.7185, EM:76.00, F1:81.20


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:0.6294, EM:78.00, F1:82.00


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:0.5050, EM:76.00, F1:82.03


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 07:34:24,197] Trial 14 finished with value: 76.0 and parameters: {'learning_rate': 3.338101823677286e-05, 'weight_decay': 0.2955579988933377, 'SchedulerType': 'constant'}. Best is trial 14 with value: 76.0.


Epoch:10/10, Train Loss:0.5322, EM:76.00, F1:80.60


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:5.9150, EM:2.00, F1:3.69


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:5.0372, EM:6.00, F1:8.77


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:4.5825, EM:8.00, F1:9.63


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:4.1854, EM:17.00, F1:19.38


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:3.8334, EM:15.00, F1:18.78


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:3.5631, EM:18.00, F1:21.15


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:3.3323, EM:15.00, F1:19.33


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:3.1559, EM:18.00, F1:22.66


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:2.9909, EM:22.00, F1:26.25


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 08:01:29,296] Trial 15 finished with value: 24.0 and parameters: {'learning_rate': 1.8170376234006457e-06, 'weight_decay': 0.191228951638161, 'SchedulerType': 'linear'}. Best is trial 14 with value: 76.0.


Epoch:10/10, Train Loss:2.8410, EM:24.00, F1:29.52


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:3.6047, EM:42.00, F1:45.60


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:1.7068, EM:64.00, F1:71.62


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:1.0935, EM:65.00, F1:73.76


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:0.9071, EM:69.00, F1:76.37


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:0.7725, EM:70.00, F1:77.57


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:0.7709, EM:71.00, F1:78.48


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:0.5814, EM:67.00, F1:75.09


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:0.5252, EM:66.00, F1:75.88


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:0.4847, EM:75.00, F1:80.75


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 08:28:44,336] Trial 16 finished with value: 72.0 and parameters: {'learning_rate': 3.854111240306663e-05, 'weight_decay': 0.24719281184084396, 'SchedulerType': 'constant'}. Best is trial 14 with value: 76.0.


Epoch:10/10, Train Loss:0.4729, EM:72.00, F1:78.53


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:5.7501, EM:1.00, F1:3.38


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:4.7731, EM:8.00, F1:13.03


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:4.2026, EM:11.00, F1:17.02


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:3.7185, EM:17.00, F1:22.03


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:3.2453, EM:23.00, F1:26.63


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:2.8491, EM:27.00, F1:31.94


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:2.3807, EM:43.00, F1:48.30


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:1.8487, EM:57.00, F1:62.33


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:1.5341, EM:56.00, F1:61.85


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 08:56:04,744] Trial 17 finished with value: 64.0 and parameters: {'learning_rate': 2.457883553207727e-06, 'weight_decay': 0.11206405423327809, 'SchedulerType': 'constant'}. Best is trial 14 with value: 76.0.


Epoch:10/10, Train Loss:1.3042, EM:64.00, F1:69.88


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:3.4673, EM:52.00, F1:59.13


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:2.1526, EM:57.00, F1:61.70


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:1.4485, EM:61.00, F1:68.61


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:1.3257, EM:64.00, F1:72.05


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:1.3061, EM:57.00, F1:65.24


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:1.1012, EM:58.00, F1:64.27


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:0.9697, EM:71.00, F1:75.90


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:1.0479, EM:68.00, F1:73.52


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:0.7195, EM:72.00, F1:78.16


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 09:23:15,862] Trial 18 finished with value: 47.0 and parameters: {'learning_rate': 8.348165030061685e-05, 'weight_decay': 0.041001947693927815, 'SchedulerType': 'linear'}. Best is trial 14 with value: 76.0.


Epoch:10/10, Train Loss:0.8933, EM:47.00, F1:52.53


Some weights of BigBirdForQuestionAnswering were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['qa_classifier.output.dense.bias', 'qa_classifier.output.dense.weight', 'qa_classifier.intermediate.dense.weight', 'qa_classifier.output.LayerNorm.bias', 'qa_classifier.intermediate.dense.bias', 'qa_classifier.qa_outputs.weight', 'qa_classifier.qa_outputs.bias', 'qa_classifier.output.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:1/10, Train Loss:4.1519, EM:26.00, F1:32.04


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:2/10, Train Loss:2.3789, EM:54.00, F1:59.74


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:3/10, Train Loss:1.3511, EM:67.00, F1:72.75


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:4/10, Train Loss:0.9373, EM:71.00, F1:78.05


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:5/10, Train Loss:0.7847, EM:65.00, F1:71.02


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:6/10, Train Loss:0.6653, EM:74.00, F1:79.73


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:7/10, Train Loss:0.6155, EM:74.00, F1:78.90


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:8/10, Train Loss:0.6073, EM:78.00, F1:83.41


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch:9/10, Train Loss:0.5133, EM:70.00, F1:77.51


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2023-12-17 09:51:15,398] Trial 19 finished with value: 77.0 and parameters: {'learning_rate': 1.8581330170625473e-05, 'weight_decay': 0.1764120796395892, 'SchedulerType': 'cosine'}. Best is trial 19 with value: 77.0.


Epoch:10/10, Train Loss:0.4567, EM:77.00, F1:81.01


In [35]:
dump(study, '/kaggle/working/aiconnect_mrc_optuna.pkl')

['/kaggle/working/aiconnect_mrc_optuna.pkl']

In [36]:
study = load('/kaggle/working/aiconnect_mrc_optuna.pkl')
df = study.trials_dataframe()
df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_SchedulerType,params_learning_rate,params_weight_decay,state
0,0,0.0,2023-12-17 00:41:16.100126,2023-12-17 01:08:39.024829,0 days 00:27:22.924703,cosine,0.000775,0.152012,COMPLETE
1,1,69.0,2023-12-17 01:08:39.026562,2023-12-17 01:36:42.748698,0 days 00:28:03.722136,cosine,6.6e-05,0.016688,COMPLETE
2,2,70.0,2023-12-17 01:36:42.750412,2023-12-17 02:04:32.639160,0 days 00:27:49.888748,constant,1e-05,0.228739,COMPLETE
3,3,68.0,2023-12-17 02:04:32.640706,2023-12-17 02:32:31.928501,0 days 00:27:59.287795,cosine,9e-06,0.267987,COMPLETE
4,4,11.0,2023-12-17 02:32:31.930029,2023-12-17 03:00:31.746113,0 days 00:27:59.816084,linear,0.000165,0.084027,COMPLETE


In [37]:
df.sort_values(by='value',  ascending=False)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_SchedulerType,params_learning_rate,params_weight_decay,state
19,19,77.0,2023-12-17 09:23:15.862934,2023-12-17 09:51:15.397950,0 days 00:27:59.535016,cosine,1.858133e-05,0.176412,COMPLETE
14,14,76.0,2023-12-17 07:07:15.448408,2023-12-17 07:34:24.196894,0 days 00:27:08.748486,constant,3.338102e-05,0.295558,COMPLETE
7,7,75.0,2023-12-17 03:56:33.683078,2023-12-17 04:24:27.713754,0 days 00:27:54.030676,linear,5.131905e-06,0.295328,COMPLETE
16,16,72.0,2023-12-17 08:01:29.297012,2023-12-17 08:28:44.336121,0 days 00:27:15.039109,constant,3.854111e-05,0.247193,COMPLETE
12,12,72.0,2023-12-17 06:12:54.948784,2023-12-17 06:40:02.737127,0 days 00:27:07.788343,constant,2.187561e-05,0.298273,COMPLETE
2,2,70.0,2023-12-17 01:36:42.750412,2023-12-17 02:04:32.639160,0 days 00:27:49.888748,constant,1.030153e-05,0.228739,COMPLETE
1,1,69.0,2023-12-17 01:08:39.026562,2023-12-17 01:36:42.748698,0 days 00:28:03.722136,cosine,6.553795e-05,0.016688,COMPLETE
3,3,68.0,2023-12-17 02:04:32.640706,2023-12-17 02:32:31.928501,0 days 00:27:59.287795,cosine,9.190383e-06,0.267987,COMPLETE
9,9,68.0,2023-12-17 04:51:34.929503,2023-12-17 05:18:44.711575,0 days 00:27:09.782072,linear,6.681311e-06,0.258194,COMPLETE
11,11,67.0,2023-12-17 05:45:51.355071,2023-12-17 06:12:54.947071,0 days 00:27:03.592000,constant,4.264506e-06,0.211727,COMPLETE


In [55]:
print(f"Number of trials on the Pareto front: {len(study.best_trials)}")

trial_with_highest_em = max(study.best_trials, key=lambda t: t.values)
print(f"Trial with highest accuracy: ")
print(f"\tnumber: {trial_with_highest_em.number}")
print(f"\tparams: {trial_with_highest_em.params}")
print(f"\tvalues: {trial_with_highest_em.values}")

Number of trials on the Pareto front: 1
Trial with highest accuracy: 
	number: 19
	params: {'learning_rate': 1.8581330170625473e-05, 'weight_decay': 0.1764120796395892, 'SchedulerType': 'cosine'}
	values: [77.0]


In [58]:
optuna.visualization.plot_parallel_coordinate(study, target_name="EM Score")

In [70]:
optuna.visualization.plot_optimization_history(study)

In [66]:
optuna.visualization.plot_param_importances(
    study, target_name="EM"
)

In [69]:
optuna.visualization.plot_slice(study, params=["learning_rate", "weight_decay", "SchedulerType"])