In [22]:
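# Training example for an extractive QA model on a KLUE-style QA dataset.
# NOTE: the original title mentions RoBERTa-large, but the checkpoint actually loaded in this notebook is "klue/bert-base".
# This notebook contains the code that builds (fine-tunes and saves) the model.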
import json
import random
import collections
import gc

import torch
import numpy as np
from tqdm.notebook import tqdm
from torch.utils.data import Dataset,DataLoader
from transformers import AutoModel, AutoTokenizer
from transformers import AutoTokenizer,AutoModelForQuestionAnswering,AdamW

In [23]:
# Read a SQuAD-style JSON file and flatten it into per-answer training examples.
def readTrainData(path, trainRatio=0.8):
    """Load a SQuAD-style MRC file and flatten it into parallel lists.

    The file is expected to have the layout
    ``{"data": [{"paragraphs": [{"context": ..., "qas": [{"question": ...,
    "answers": [...]}]}]}]}``.

    Args:
        path: Path to the JSON file.
        trainRatio: Fraction (0.0-1.0) of the leading entries of ``"data"``
            to keep. Defaults to 0.8, the split that was previously
            hard-coded.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists with
        one entry per answer. ``answers`` holds the answer objects exactly as
        they appear in the file (not copies of their text).

    Raises:
        ValueError: If ``trainRatio`` is outside ``[0, 1]``.
    """
    if not 0 <= trainRatio <= 1:
        raise ValueError("trainRatio must be between 0 and 1, got %r" % (trainRatio,))

    with open(path, 'rb') as file:
        MRCdata = json.load(file)

    # Keep only the leading share of the articles.
    articles = MRCdata["data"]
    articles = articles[:int(len(articles) * trainRatio)]

    contexts = []
    questions = []
    answers = []

    # A context can carry several questions and a question several answers,
    # so every (context, question, answer) combination becomes one example.
    for item in tqdm(articles):
        for passage in item['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for ans in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(ans)
    return contexts, questions, answers

In [24]:
# Add the (exclusive) end character index of each answer to the dataset.
def endIdx(answers,contexts):
    """Annotate every answer dict in place with an ``'answer_end'`` offset.

    ``'answer_end'`` is the exclusive character index where the answer text
    ends inside its context. The recorded ``'answer_start'`` is checked
    against the context first: if the answer text is not found at that
    offset but is found one or two characters earlier, ``'answer_start'`` is
    corrected as well. If the text cannot be located at any of those
    offsets, the recorded start is trusted and
    ``answer_end = answer_start + len(text)``.

    Args:
        answers: List of dicts with at least ``'text'`` and ``'answer_start'``.
        contexts: List of context strings, aligned index-by-index with
            ``answers``.

    Returns:
        None. The dicts in ``answers`` are modified in place.
    """
    for answer,context in zip(answers,contexts):
        ansText=answer['text']
        startIdx=answer['answer_start']
        ansLen=len(ansText)

        # Try the recorded offset first, then offsets that are 1 or 2
        # characters earlier.
        for shift in (0,1,2):
            begin=startIdx-shift
            if begin>=0 and context[begin:begin+ansLen]==ansText:
                answer['answer_start']=begin
                answer['answer_end']=begin+ansLen
                break
        else:
            # Could not verify the span; fall back to the recorded offset.
            answer['answer_end']=startIdx+ansLen
        

In [25]:
class KlueMRCDataset(Dataset):
    """Torch dataset of tokenized (context, question) pairs with answer token spans.

    All examples are tokenized once in the constructor, truncated and padded
    to ``modelMaxPositionEmbedings`` tokens. Character-level answer offsets
    are then converted to token-level ``startPositions`` / ``endPositions``.

    Args:
        contexts: List of context strings.
        questions: List of question strings, aligned with ``contexts``.
        answers: List of dicts with ``'answer_start'`` and ``'answer_end'``
            character offsets into the matching context.
        modelMaxPositionEmbedings: Maximum sequence length in tokens. Used
            both as the tokenizer's ``max_length`` and as the sentinel label
            for answers lost to truncation.
        tokenizer: Callable tokenizer whose result supports
            ``char_to_token``, ``update``, ``items`` and key lookup.
    """

    def __init__(self, contexts, questions, answers, modelMaxPositionEmbedings, tokenizer):
        self.tokenizer = tokenizer
        self.answers = answers
        self.questions = questions
        self.contexts = contexts
        self.model_max_position_embedings = modelMaxPositionEmbedings
        # The tokenizer length must match the truncation sentinel used in
        # addTokenPositions(); a hard-coded length would make the sentinel a
        # valid token index whenever a different limit is passed in.
        self.encodings = self.tokenizer(self.contexts,
                                        self.questions,
                                        max_length=self.model_max_position_embedings,
                                        truncation=True,
                                        padding="max_length",
                                        return_token_type_ids=False)
        self.addTokenPositions()

    def addTokenPositions(self):
        """Convert character answer offsets to token indices and store them.

        Adds ``'startPositions'`` and ``'endPositions'`` to ``self.encodings``.
        """
        sentinel = self.model_max_position_embedings
        startPositions = []
        endPositions = []
        for i, answer in enumerate(self.answers):
            startToken = self.encodings.char_to_token(i, answer['answer_start'])
            # 'answer_end' is exclusive, so the last answer character is at end - 1.
            endToken = self.encodings.char_to_token(i, answer['answer_end'] - 1)

            # A None position means the part of the context holding the
            # answer was truncated away. Use the sequence length as the
            # label: it lies one past the last valid token index.
            # NOTE(review): Hugging Face QA heads are documented to ignore
            # positions outside the sequence when computing the loss - confirm
            # for the model in use.
            startPositions.append(sentinel if startToken is None else startToken)
            endPositions.append(sentinel if endToken is None else endToken)

        self.encodings.update({'startPositions': startPositions, 'endPositions': endPositions})

    def get_data(self):
        """Return the raw contexts, questions and answers as a dict."""
        return {"contexts":self.contexts, 'questions':self.questions, 'answers':self.answers}

    def get_encodings(self):
        """Return the tokenizer output, including the added position lists."""
        return self.encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding key."""
        return {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.encodings['input_ids'])

In [26]:
# Load the tokenizer for the pretrained checkpoint named below.
tokenizerName="klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(tokenizerName)

In [27]:
# Load the pretrained checkpoint named below as a question-answering model.
modelName="klue/bert-base"
model = AutoModelForQuestionAnswering.from_pretrained(modelName)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:73] data. DefaultCPUAllocator: not enough memory: you tried to allocate 2359296 bytes. Buy new RAM!

In [19]:
# Training data preprocessing: choose the input file.
# AI Hub data (currently active)
dataName="./dataset/ko_nia_normal_squad_all.json"

# KLUE data (alternative; uncomment to use it instead)
#dataName="./dataset/klue-mrc-v1.1_train.json"

In [20]:
# Flatten the training file into parallel context/question/answer lists,
# then add an 'answer_end' entry to every answer dict in place.
contexts,questions,answers=readTrainData(dataName)
endIdx(answers,contexts)

  0%|          | 0/37851 [00:00<?, ?it/s]

In [21]:
# Build the training dataset; the whole set is tokenized in the constructor.
# 512 is passed as the modelMaxPositionEmbedings argument.
trainDataset=KlueMRCDataset(contexts,questions,answers,512,tokenizer)

MemoryError: 

In [None]:
# Hyperparameter definitions
EPOCH=10            # number of training epochs
LEARNING_RATE=5e-5  # optimizer learning rate
BATCH_SIZE=8        # examples per training batch

In [None]:
# Training entry point (uses AdamW).
# Several models are going to be trained on different datasets, so the saved
# checkpoint is named from the module-level modelName / tokenizerName variables.
def train_runner(model, dataset, batch_size, num_train_epochs, learning_rate):
    """Fine-tune ``model`` on ``dataset`` and save it under ``./outputs/``.

    Args:
        model: Question-answering model that accepts ``start_positions`` /
            ``end_positions`` and returns an object with a ``.loss``.
        dataset: Dataset whose items are dicts with ``'input_ids'``,
            ``'attention_mask'``, ``'startPositions'`` and ``'endPositions'``.
        batch_size: Number of examples per batch.
        num_train_epochs: Number of passes over ``dataset``.
        learning_rate: AdamW learning rate.

    Returns:
        None. The trained model is written to
        ``./outputs/<modelName>_<tokenizerName>_epoch-<num_train_epochs>``
        with every ``/`` in that name replaced by ``-``.
    """
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    model.to(device)
    model.train()
    # Shuffle each epoch so consecutive batches are not dominated by
    # questions that share one context.
    train_dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
    total_steps = len(train_dataloader) * num_train_epochs
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0)

    seen_examples = 0
    running_loss = 0
    with tqdm(total=total_steps, unit='step') as t:
        for _ in range(num_train_epochs):
            for batch in train_dataloader:
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                startPositions = batch['startPositions'].to(device)
                endPositions = batch['endPositions'].to(device)
                outputs = model(input_ids,
                                attention_mask=attention_mask,
                                start_positions=startPositions,
                                end_positions=endPositions)
                loss = outputs.loss
                loss.backward()
                optimizer.step()

                # Weight the loss by batch size so the running average is
                # per example even when the last batch is smaller.
                batch_loss = loss.item() * len(input_ids)
                seen_examples += len(input_ids)
                running_loss += batch_loss
                t.set_postfix(loss="{:.6f}".format(running_loss / seen_examples), batch_loss="{:.6f}".format(batch_loss))
                t.update(1)

                # Drop references to this step's tensors before the next
                # batch is moved to the device.
                del input_ids, attention_mask, startPositions, endPositions, outputs, loss

    # Name the checkpoint after the epochs actually trained (the argument),
    # not the module-level EPOCH constant.
    outputName=(modelName+"_"+tokenizerName+"_epoch-"+str(num_train_epochs)).replace("/","-")
    model.save_pretrained("./outputs/"+outputName)


In [None]:
# Run model training.
# Free memory first: run the garbage collector and release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()
train_runner(model,trainDataset, BATCH_SIZE, EPOCH, LEARNING_RATE)