<a href="https://colab.research.google.com/github/ttogle918/AI_practice/blob/main/QA%20task/03_BERT_QA_korsquad_BertModel%EB%A1%9C%EA%B5%AC%ED%98%84.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# korsquad를 사용하여 QA task 풀기(BertModel fine-tuning)

참고 사이트

[huggingface QA 설명](https://huggingface.co/course/chapter7/7?fw=tf)

[huggingface git : ModelOutput](https://github.com/huggingface/transformers/blob/v4.21.0/src/transformers/utils/generic.py#L147)

[huggingface git : QuestionAnsweringModelOutput](https://github.com/huggingface/transformers/blob/a9eee2ffecc874df7dd635b2c6abb246fdb318cc/src/transformers/modeling_outputs.py#L764)

[huggingface docs](https://huggingface.co/docs/transformers/model_doc/bert)

In [None]:
# !pip install transformers
# !pip install datasets

In [1]:
from datasets import load_dataset, load_metric#, list_metrics

from transformers import BertModel, AutoTokenizer, BertConfig, BertPreTrainedModel
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import get_linear_schedule_with_warmup
from transformers.modeling_outputs import QuestionAnsweringModelOutput

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
import torch.nn.functional as F
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
from tqdm import tqdm, tqdm_notebook

import time
import matplotlib.pyplot as plt

In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
# gpu 연산이 가능하면 'cuda:0', 아니면 'cpu' 출력
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device, torch.cuda.device_count()

(device(type='cuda', index=0), 1)

# Data Load

In [4]:
# https://huggingface.co/datasets/squad_kor_v1/blob/main/squad_kor_v1.py
# squad_kor_v2
from datasets import load_dataset
dataset = load_dataset('squad_kor_v1')
dataset, dataset['train'][0]

Reusing dataset squad_kor_v1 (/root/.cache/huggingface/datasets/squad_kor_v1/squad_kor_v1/1.0.0/18d4f44736b8ee85671f63cb84965bfb583fa0a4ff2df3c2e10eee9693796725)


  0%|          | 0/2 [00:00<?, ?it/s]

(DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 60407
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5774
    })
}),
 {'answers': {'answer_start': [54], 'text': ['교향곡']},
  'context': '1839년 바그너는 괴테의 파우스트을 처음 읽고 그 내용에 마음이 끌려 이를 소재로 해서 하나의 교향곡을 쓰려는 뜻을 갖는다. 이 시기 바그너는 1838년에 빛 독촉으로 산전수전을 다 걲은 상황이라 좌절과 실망에 가득했으며 메피스토펠레스를 만나는 파우스트의 심경에 공감했다고 한다. 또한 파리에서 아브네크의 지휘로 파리 음악원 관현악단이 연주하는 베토벤의 교향곡 9번을 듣고 깊은 감명을 받았는데, 이것이 이듬해 1월에 파우스트의 서곡으로 쓰여진 이 작품에 조금이라도 영향을 끼쳤으리라는 것은 의심할 여지가 없다. 여기의 라단조 조성의 경우에도 그의 전기에 적혀 있는 것처럼 단순한 정신적 피로나 실의가 반영된 것이 아니라 베토벤의 합창교향곡 조성의 영향을 받은 것을 볼 수 있다. 그렇게 교향곡 작곡을 1839년부터 40년에 걸쳐 파리에서 착수했으나 1악장을 쓴 뒤에 중단했다. 또한 작품의 완성과 동시에 그는 이 서곡(1악장)을 파리 음악원의 연주회에서 연주할 파트보까지 준비하였으나, 실제로는 이루어지지는 않았다. 결국 초연은 4년 반이 지난 후에 드레스덴에서 연주되었고 재연도 이루어졌지만, 이후에 그대로 방치되고 말았다. 그 사이에 그는 리엔치와 방황하는 네덜란드인을 완성하고 탄호이저에도 착수하는 등 분주한 시간을 보냈는데, 그런 바쁜 생활이 이 곡을 잊게 한 것이 아닌가 하는 의

In [19]:
class CustomDataset(Dataset):
    def __init__(self, dataset):
        self.question, self.context, self.answer_start, self.answer_text = self.make_dataset(dataset)

    def make_dataset(self, dataset):
        context, question, answer_start, answer_text = [], [], [], []
        for i, data in enumerate(dataset) :
          start = data['answers']['answer_start']
          if len(start) != 1 : # 답이 없을 때
            print(i, data)
            continue
          text, start = self.get_text(data['context'], start[0])
          answer_start.append(start)
          answer_text.append(data['answers']['text'])
          # answers.append([answer_start[0], answer_start[0] + len(text)])  # 정답의 시작과 끝 index
          context.append(data['context'])
          question.append(data['question'])
        return question, context, answer_start, answer_text
        
    def __len__(self):
        return len(self.question)

    def __getitem__(self, idx):
        return self.question[idx], self.context[idx], self.answer_start[idx], self.answer_text[idx]

    def get_text(self, text, start_loc) :
        text_splited = text.split('. ')
        length_text_answer_idx = -1
        length_text = [0]

        for i, t in enumerate(text_splited) :
          length_text.append(length_text[-1]+len(t)+2)
          
          if length_text_answer_idx == -1 and start_loc > length_text[-1] :
            length_text_answer_idx = i
        length_text[-1] -= 2

        start, end = 0, len(length_text)-1
        while length_text[end] - length_text[start] > 512 :
            if start_loc - length_text[start] > length_text[end] - start_loc :
              start += 1
            else :
              end -= 1
        
        return text[text_splited[start]:text_splited[end]], start_loc - text_splited[start]

In [17]:
text = tokenizer('하나의 교향곡을 쓰려는 뜻을 갖는다.', add_special_tokens=False).input_ids
answer = tokenizer('하나의 교향곡', add_special_tokens=False).input_ids
tokens = tokenizer(['1839년 바그너는 괴테의 파우스트을 처음 읽고 그 내용에 마음이 끌려 이를 소재로 해서 하나의 교향곡을 쓰려는 뜻을 갖는다.', '이를 소재로 해서 하나의 교향곡을 쓰려는 뜻을 갖는다'], 
                   ['이를 소재로 해서 하나의 교향곡을 쓰려는 뜻을 갖는다', '이를 소재로 해서 하나의 교향곡을 쓰려는 뜻을 갖는다'], 
                   return_tensors='pt', truncation=True, padding=True, max_length=512)
text, answer, tokens

([3657, 2079, 19282, 2069, 1363, 2370, 2259, 936, 2069, 554, 2259, 2062, 18],
 [3657, 2079, 19282],
 {'input_ids': tensor([[    2, 13934,  2236,  2440, 27982,  2259, 21310,  2079, 11994,  3791,
           2069,  3790,  1508,  2088,   636,  3800,  2170,  3717,  2052,  9001,
           8345,  4642,  2200,  3689,  3657,  2079, 19282,  2069,  1363,  2370,
           2259,   936,  2069,   554,  2259,  2062,    18,     3,  8345,  4642,
           2200,  3689,  3657,  2079, 19282,  2069,  1363,  2370,  2259,   936,
           2069,   554,  2259,  2062,     3],
         [    2,  8345,  4642,  2200,  3689,  3657,  2079, 19282,  2069,  1363,
           2370,  2259,   936,  2069,   554,  2259,  2062,     3,  8345,  4642,
           2200,  3689,  3657,  2079, 19282,  2069,  1363,  2370,  2259,   936,
           2069,   554,  2259,  2062,     3,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

In [18]:
tensorized_label = np.zeros(tokens.input_ids.shape)
print(answer)
for i in [0,1] :
  padding_start = (tokens['attention_mask'][i] == 1).nonzero()[-1].item()+1
  print(padding_start)
  tensorized_label[i, padding_start-len(text): padding_start-len(text)+len(answer)] = 1
  print(tensorized_label)
  print(tokens['input_ids'][i])
  print(tokens['input_ids'][i, padding_start-len(text): padding_start-len(text)+len(answer)])
  print('\n\n')

[3657, 2079, 19282]
55
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0.]]
tensor([    2, 13934,  2236,  2440, 27982,  2259, 21310,  2079, 11994,  3791,
         2069,  3790,  1508,  2088,   636,  3800,  2170,  3717,  2052,  9001,
         8345,  4642,  2200,  3689,  3657,  2079, 19282,  2069,  1363,  2370,
         2259,   936,  2069,   554,  2259,  2062,    18,     3,  8345,  4642,
         2200,  3689,  3657,  2079, 19282,  2069,  1363,  2370,  2259,   936,
         2069,   554,  2259,  2062,     3])
tensor([ 3657,  2079, 19282])



35
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0.
  0. 0. 0. 0. 

In [7]:
def custom_collate_fn(batch):
    global tokenizer
    question_list, context_list, answer_start, answer_text, after_list = [], [], [], [], []

    for _question, _context, _start, _text in batch:
        question_list.append(_question)
        context_list.append(_context)
        after_list.append(_context[_start:])
        answer_start.append(_start)
        answer_text.append(_text)
    
    tensorized_input = tokenizer(
        question_list, context_list,
        add_special_tokens=True,
        padding="longest",  # 배치내 가장 긴 문장을 기준으로 부족한 문장은 [PAD] 토큰을 추가
        return_tensors='pt',
        max_length=512,
        truncation=True
    )
    # answer_start token의 위치를 찾기 위해 list로 반환
    after_text = tokenizer(
        after_list,
        add_special_tokens=False,
        return_tensors=None
    ).input_ids

    # answer text token만 변환
    answer_tokens = tokenizer(
        after_list,
        add_special_tokens=False,
        return_tensors=None
    ).input_ids

    tensorized_label = np.zeros(tensorized_input.input_ids.shape)    # input_ids만큼의 길이이고 0으로 된 array

    for i, text, answer in enumerate(zip(after_text, answer_tokens)) :
        padding_start = (tensorized_input['attention_mask'][i] == 1).nonzero()[-1].item()+1
        tensorized_label[i, padding_start-len(text): padding_start-len(text)+len(answer)] = 1

    return tensorized_input, torch.from_numpy(tensorized_label)

In [8]:
def make_dataloader(dataset, tokenizer, batch_size, s='train') :
  dataloader = DataLoader(
      dataset,
      batch_size =batch_size,
      sampler = RandomSampler(dataset) if s == 'train' else SequentialSampler(dataset),
      collate_fn = custom_collate_fn
  )
  print(f'batch_size : {batch_size}')
  return dataloader

# 모델 설명


In [9]:
class CustomBertForQuestionAnswering(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config, add_pooling_layer=False)
        self.qa_output = nn.Linear(config.hidden_size, config.num_labels)
        self.loss_fct = CrossEntropyLoss()

        self.post_init()
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
                positions=None, output_attentions=None, output_hidden_states=None):
        
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids,
                            head_mask=head_mask, inputs_embeds=inputs_embeds,
                  output_attentions=output_attentions, output_hidden_states=output_hidden_states)
        
        sequence_output = outputs[0]
        logits = self.qa_output(sequence_output)    # linear 통과해서 num_label로 분류
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        start_positions, end_positions = positions.split(1, dim=-1)
        if start_positions is not None and end_positions is not None:
          if len(start_positions.size()) > 1:
              start_positions = start_positions.squeeze(-1)
          if len(end_positions.size()) > 1:
              end_positions = end_positions.squeeze(-1)

          ignored_index = start_logits.size(1)
          start_positions = start_positions.clamp(0, ignored_index)
          end_positions = end_positions.clamp(0, ignored_index)

          start_logits_idx, end_logits_idx = torch.argmax(start_logits, dim=-1).float(), torch.tensor(end_logits.argmax(dim=-1), dtype=torch.float16)
          start_loss = self.loss_fct(start_logits_idx, start_positions.float())
          end_loss = self.loss_fct(end_logits_idx, end_positions.float())
          total_loss = (start_loss + end_loss) / 2
          total_loss.requires_grad_(True)

        return QuestionAnsweringModelOutput(loss=total_loss, 
                                            start_logits=start_logits_idx, 
                                            end_logits=start_logits_idx,
                                            hidden_states=outputs.hidden_states, 
                                            attentions=outputs.attentions)


# train

In [10]:
def initializer(train_dataloader, epochs=2, model_name='klue/bert-base', lr=4e-5, wd=4e-5):
    """
    모델, 옵티마이저, 스케쥴러 초기화
    """
    config = BertConfig.from_pretrained(model_name)
    config.max_length = 512
    model = CustomBertForQuestionAnswering(config)

    optimizer = AdamW(
        model.parameters(), # update 대상 파라미터를 입력
        lr=lr,    # 2e-5
        eps=1e-8,
        weight_decay=wd
    )
    
    total_steps = len(train_dataloader) * epochs
    print(f"Total train steps with {epochs} epochs: {total_steps}")

    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps = 0, # 여기서는 warmup을 사용하지 않는다.
        num_training_steps = total_steps
    )
    print(f'model_name : {model_name}, lr : {lr}, weight_decay : {wd}, epochs : {epochs}')
    return model, optimizer, scheduler

In [11]:
def save_checkpoint(path, model, optimizer, scheduler, epoch, loss, f1, model_name=''):
    file_name = f'{path}/epoch:{epoch}_loss:{loss:.4f}_f1:{f1:.4f}.ckpt'
    
    torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss' : loss,
            'f1' : f1
        }, 
        file_name
    )
    
    print(f"Saving epoch {epoch} checkpoint at {file_name}")

### train code

In [12]:
def train(model, optimizer, scheduler, train_dataloader, valid_dataloader=None, epochs=1):
  loss_fct = nn.MSELoss()
  em_fct = load_metric('exact_match')
  f1_fct = load_metric('f1')


  train_dict = {'loss' : [], 'f1' : []}
  valid_dict = {'loss' : [], 'f1' : [], 'em' : []}

  for epoch in range(epochs) :

    print(f"*****Epoch {epoch} Train Start*****")
    total_loss, total_f1, batch_f1, batch_em, total_em, batch_loss, batch_count = 0,0,0,0,0,0,0
    
    model.train()
    model.to(device)
    
    for step, batch in enumerate(train_dataloader):
      batch_count+=1
      
      batch = tuple(item.to(device) for item in batch)
      batch_input, batch_label = batch
      
      model.zero_grad()
      
      outputs = model(**batch_input, positions=batch_label)  # forward
      # start_logits_idx, end_logits_idx = torch.argmax(outputs.start_logits, dim=-1).float16(), torch.argmax(outputs.end_logits, dim=-1).float16()

      start_pos, end_pos = batch_label.split(1, dim=-1)
      if start_pos is not None and end_pos is not None:
        if len(start_pos.size()) > 1:
            start_pos = start_pos.squeeze(-1)
        if len(end_pos.size()) > 1:
            end_pos = end_pos.squeeze(-1)

        ignored_index = outputs.start_logits.size(0)
        start_pos = start_pos.clamp(0, ignored_index)
        end_pos = end_pos.clamp(0, ignored_index)

      # start_loss = loss_fct(start_logits_idx, start_pos.float())
      # end_loss = loss_fct(end_logits_idx, end_pos.float())
      # loss = (start_loss + end_loss) / 2
      
      loss = outputs.loss
      batch_loss += loss.item()
      total_loss += loss.item()


      # start_pos, end_pos = batch_label.split(1, dim=-1)
      # if start_pos is not None and end_pos is not None:
      #   if len(start_pos.size()) > 1:
      #       start_pos = start_pos.squeeze(-1)
      #   if len(end_pos.size()) > 1:
      #       end_pos = end_pos.squeeze(-1)
      # ignored_index = probs['start_logits_idx'].size(0)
      # start_pos = start_pos.clamp(0, ignored_index)
      # end_pos = end_pos.clamp(0, ignored_index)

      # probs_idx = torch.stack([probs['start_logits_idx'], probs['end_logits_idx']], dim=-1)
      # em = em_fct.compute(predictions=torch.stack([start_logits_idx, end_logits_idx], dim=-1),
      #                       references=batch_label)['exact_match']

      # batch_em += em
      # total_em += em
      
      start_results = f1_fct.compute(predictions=outputs.start_logits, references=start_pos, average='micro')['f1']
      end_results = f1_fct.compute(predictions=outputs.end_logits, references=end_pos, average='micro')['f1']

      f1 = (start_results + end_results)/2
      batch_f1 += f1
      total_f1 += f1

      # backward -> 파라미터의 미분(gradient)를 자동으로 계산
      loss.backward()

      # gradient clipping 적용 
      clip_grad_norm_(model.parameters(), 1.0)
      
      # optimizer & scheduler 업데이트
      optimizer.step()
      scheduler.step()

      # 그래디언트 초기화
      model.zero_grad()

      if (step % 128 == 0 and step != 0):
          learning_rate = optimizer.param_groups[0]['lr']
          print(f"Epoch: {epoch}, Step : {step}, LR : {learning_rate:.10f}, Avg Loss : {batch_loss / batch_count:.4f}, f1 score : {batch_f1 / batch_count:.4f}")
          
          if (round(batch_f1 / batch_count, 5) == 0) and (round(learning_rate, 10) == 0) :
              print("Train Finished, learning_rate is 0 and train_f1 is 0")
              return train_dict, valid_dict

          batch_loss, batch_f1, batch_count = 0,0,0


    print(f"Epoch {epoch} Total Mean Loss : {total_loss/(step+1):.4f}")
    print(f"Epoch {epoch} Total Mean f1 : {total_f1/(step+1):.4f}")
    print(f"*****Epoch {epoch} Train Finish*****\n")

    train_dict['f1'].append(total_f1/(step+1))
    train_dict['loss'].append(total_loss/(step+1))
    # train_dict['em'].append(total_em/(step+1))
    
    if valid_dataloader is not None:
        print(f"*****Epoch {epoch} Valid Start*****")
        valid_loss, valid_em, valid_f1 = validate(model, valid_dataloader, f1_fct, em_fct)
        print(f"Epoch {epoch} Valid Loss : {valid_loss:.4f} Valid f1 : {valid_f1:.4f} Valid em : {valid_em:.4f}")
        print(f"*****Epoch {epoch} Valid Finish*****\n")

    valid_dict['f1'].append(valid_f1)
    valid_dict['loss'].append(valid_loss)
    valid_dict['em'].append(valid_em)
    if round(valid_f1, 4) == 0 :
        break
    # if before_loss > valid_loss :
    #     before_loss = valid_loss
    #     save_checkpoint("/content/drive/MyDrive/Colab Notebooks/nlp/qa", model, optimizer, scheduler, epoch, valid_loss, valid_f1, model_name)

    # elif before_f1 < valid_f1  :
    #     before_f1 = valid_f1
    #     save_checkpoint("/content/drive/MyDrive/Colab Notebooks/nlp/qa", model, optimizer, scheduler, epoch, valid_loss, valid_f1, model_name)

  print("Train Finished")
  return train_dict, valid_dict

### validation code

In [13]:
def validate(model, valid_dataloader, f1_fct, em_fct):
    loss_fct = nn.MSELoss()
    model.eval()
    model.to(device)
    
    total_loss, total_em, total_f1= 0,0, 0
        
    for step, batch in enumerate(valid_dataloader):
        
        batch = tuple(item.to(device) for item in batch)
            
        batch_input, batch_label = batch
            
        # gradient 계산하지 않음
        with torch.no_grad():
            outputs = model(**batch_input, positions=batch_label)
            # start_logit_idx, end_logit_idx = torch.argmax(outputs.start_logits, dim=-1).float16(), torch.argmax(outputs.end_logits, dim=-1).float16()
            
        start_pos, end_pos = batch_label.split(1, dim=-1)
        if start_pos is not None and end_pos is not None:
          if len(start_pos.size()) > 1:
              start_pos = start_pos.squeeze(-1)
          if len(end_pos.size()) > 1:
              end_pos = end_pos.squeeze(-1)
        ignored_index = outputs.end_logits.size(0)
        start_pos = start_pos.clamp(0, ignored_index)
        end_pos = end_pos.clamp(0, ignored_index)
        
        loss = outputs.loss
        total_loss += loss.item()
        
        em = em_fct.compute(predictions=torch.stack([outputs.start_logits, outputs.end_logits], dim=-1), 
                            references=batch_label)
        total_em += em['exact_match']
        
        start_results = f1_fct.compute(predictions=outputs.start_logits, references=start_pos, average='micro')['f1']
        end_results = f1_fct.compute(predictions=outputs.end_logits, references=end_pos, average='micro')['f1']
        f1 = (start_results + end_results)/2
        total_f1 += f1

    total_loss = total_loss/(step+1)
    total_em = total_em/(step+1)
    total_f1 = total_f1/(step+1)
    return total_loss, total_em, total_f1

### draw_plot

In [14]:
# loss와 f1-score의 변화를 epoch마다 보기 위한 plot
def draw_plot(train_dict, valid_dict, i) :
  print('green is loss, gray is f1')
  plt.subplot(1, 2, 1)
  plt.xlabel('Epochs')
  plt.title('Loss and F1 of Train data')
  x_values= [n for n in range(len(train_dict['loss']))]
  plt.plot(x_values, train_dict['loss'], color='green', marker='o')  # loss
  plt.plot(x_values, train_dict['f1'], color='#AAAAAA', marker='*')  # f1

  plt.subplot(1, 2, 2)
  plt.xlabel('Epochs')
  plt.title('Loss and F1 of Validation data')
  x_values= [n for n in range(len(valid_dict['loss']))]
  plt.plot(x_values, valid_dict['loss'], color='green', marker='o')  # loss
  plt.plot(x_values, valid_dict['f1'], color='#AAAAAA', marker='*')  # f1

  plt.show()
  plt.savefig(f'figure_{i}.png')

In [16]:
model_name = 'klue/bert-base'   # 다시 설정 필요
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
train_dataset = CustomDataset(dataset['train'])
valid_dataset = CustomDataset(dataset['validation'])

del dataset

In [None]:
import gc
gc.collect()

In [None]:
train_dataloader = make_dataloader(train_dataset, model_name, 16, 'train')
valid_dataloader = make_dataloader(valid_dataset, model_name, 8, 'valid')

learning_rate = 5e-5
weight_decay = 4e-5
model, optimizer, scheduler = initializer(train_dataloader, 4, model_name, learning_rate, weight_decay)
start = time.time()


In [None]:
del train_dataset
del valid_dataset

In [None]:
import gc
gc.collect()

88

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
train_dict, valid_dict = train(model, optimizer, scheduler, train_dataloader, valid_dataloader, 4)
end = time.time()
print(f"time : {(end - start)//60}분 {(end - start)%60}초")

# draw_plot(train_dict, valid_dict, 0)

*****Epoch 0 Train Start*****
Epoch: 0, Step : 128, LR : 0.0000370730, Avg Loss : 559556.4972, f1 score : 0.0000
Epoch: 0, Step : 256, LR : 0.0000366492, Avg Loss : 531714.2206, f1 score : 0.0005
Epoch: 0, Step : 384, LR : 0.0000362255, Avg Loss : 568919.5496, f1 score : 0.0022
Epoch: 0, Step : 512, LR : 0.0000358018, Avg Loss : 544838.4856, f1 score : 0.0007
Epoch: 0, Step : 640, LR : 0.0000353780, Avg Loss : 584851.4816, f1 score : 0.0005
Epoch: 0, Step : 768, LR : 0.0000349543, Avg Loss : 551967.8732, f1 score : 0.0002
Epoch: 0, Step : 896, LR : 0.0000345306, Avg Loss : 544566.5043, f1 score : 0.0005
Epoch: 0, Step : 1024, LR : 0.0000341069, Avg Loss : 559160.8174, f1 score : 0.0007
Epoch: 0, Step : 1152, LR : 0.0000336831, Avg Loss : 561093.9429, f1 score : 0.0010
Epoch: 0, Step : 1280, LR : 0.0000332594, Avg Loss : 523954.7056, f1 score : 0.0024
Epoch: 0, Step : 1408, LR : 0.0000328357, Avg Loss : 566050.6188, f1 score : 0.0000
Epoch: 0, Step : 1536, LR : 0.0000324119, Avg Loss : 

KeyboardInterrupt: ignored

전혀 수렴하지 못하고 있다.

start_idx와 end_idx가 영역에 존재하는지 여부를 나타내는 평가지표를 추가해야겠다.

(end_idx - start_idx)/(예측된 end_idx - 예측된 start_idx)

그리고, target인 start_idx가 tokenizer된 이후를 나타내지 않는다. 이 부분도 수정해야한다.

start_idx 이전까지 tokenizing, 정답 text tokenizing하면 동일할 것이다.