In [16]:
# Evaluation example: measures the exact-match (EM) score of a fine-tuned QA model (klue/bert-base below) on the KLUE QA dataset.

import json
import random
import collections

import torch
import numpy as np
from tqdm.notebook import tqdm
from torch.utils.data import Dataset,DataLoader
from transformers import AutoModel, AutoTokenizer
from transformers import AutoTokenizer,AutoModelForQuestionAnswering,AdamW

In [17]:
def readDevData(path, train_ratio=0.8):
    """Load the evaluation portion of a SQuAD-style JSON file.

    The file must hold a top-level "data" list whose items contain
    "paragraphs"; each paragraph has a "context" and a list of "qas", and
    each qa has a "question" and a list of "answers" with a "text" field.

    Args:
        path: Path of the JSON file to read.
        train_ratio: Fraction (0.0 - 1.0) of the leading "data" entries to
            skip. The default 0.8 keeps only the last 20 % of the entries,
            which is what a single, un-split dataset file needs. Pass 0.0
            when the file is already a dedicated dev split so that every
            entry is used.

    Returns:
        A tuple ``(contexts, questions, answers)`` of three parallel lists.
        ``answers[i]`` is the list of every answer text given for
        ``questions[i]``. Questions that have no answers are dropped.

    Raises:
        ValueError: If ``train_ratio`` is outside the range [0, 1].
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError("train_ratio must be between 0 and 1, got %r" % (train_ratio,))

    with open(path, 'rb') as f:
        groups = json.load(f)["data"]

    # Skip the leading part of the file that is treated as training data.
    groups = groups[int(len(groups) * train_ratio):]

    contexts = []
    questions = []
    answers = []

    for group in tqdm(groups):
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                answer_texts = [answer['text'] for answer in qa['answers']]
                # Questions without any reference answer cannot be scored.
                if answer_texts:
                    contexts.append(context)
                    questions.append(qa['question'])
                    answers.append(answer_texts)

    return contexts, questions, answers

In [18]:
def prediction(contexts, questions):
    """Predict an answer string for every (context, question) pair.

    Uses the module-level ``model``, ``tokenizer`` and ``device``. Each pair
    is encoded on its own (context first, question second, truncated/padded
    to 512 tokens), and the answer is the token span between the arg-max of
    the start logits and the arg-max of the end logits.

    Args:
        contexts: Sequence of context passages.
        questions: Sequence of questions, parallel to ``contexts``.

    Returns:
        A list of decoded answer strings, one per pair. An empty string is
        returned for a pair whose predicted end index precedes the start.
    """
    model.to(device)
    model.eval()

    result = []

    with torch.no_grad():
        for context, question in tqdm(zip(contexts, questions), total=len(contexts)):
            encodings = tokenizer(context, question, max_length=512, truncation=True,
                                  padding="max_length", return_token_type_ids=False)

            # Add the batch dimension the model expects (batch size 1).
            input_ids = torch.tensor([encodings["input_ids"]]).to(device)
            attention_mask = torch.tensor([encodings["attention_mask"]]).to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            start_index = outputs.start_logits.argmax(dim=-1).item()
            end_index = outputs.end_logits.argmax(dim=-1).item()

            if end_index < start_index:
                # Inverted span: there is no valid answer to decode.
                result.append("")
                continue

            pred_ids = input_ids[0, start_index:end_index + 1].tolist()
            # Drop [CLS]/[SEP]/[PAD] so that a span touching a special token
            # does not leak the marker text into the answer string.
            result.append(tokenizer.decode(pred_ids, skip_special_tokens=True))

    return result

In [19]:
#정확도 점수 측정
def emEvalutate(preds, reals):
    """Compute the exact-match (EM) score as a percentage.

    A prediction counts as a match when it is equal to one of the reference
    answers for the same question.

    Args:
        preds: Sequence of predicted answer strings.
        reals: Sequence parallel to ``preds``; each item is the collection of
            acceptable answer strings for that question.

    Returns:
        Percentage (0 - 100) of predictions found in their reference
        answers, relative to ``len(preds)``. Returns 0.0 when ``preds`` is
        empty instead of dividing by zero.
    """
    total = len(preds)
    if total == 0:
        return 0.0

    exact_match = sum(1 for pred, real in zip(preds, reals) if pred in real)
    return (exact_match / total) * 100

In [20]:
# Load and flatten the evaluation data
# AI Hub data
#dataName="./dataset/ko_nia_normal_squad_all.json"

# KLUE data
# NOTE(review): as called here, readDevData keeps only the last 20% of the
# file's "data" entries even though this file is named as a dev split -
# confirm that evaluating on that subset is intended.
dataName="./dataset/klue-mrc-v1.1_dev.json"
dev_contexts ,dev_questions ,dev_answers =readDevData(dataName)

  0%|          | 0/1015 [00:00<?, ?it/s]

In [21]:
# Load the tokenizer and the QA model that finished training in MRCModel
modelName = "klue-bert-base_klue-bert-base_epoch-10"

# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
model = AutoModelForQuestionAnswering.from_pretrained("outputs/" + modelName)

In [22]:
# Predicted answer string for every dev (context, question) pair
preds = prediction(dev_contexts, dev_questions)

  0%|          | 0/737 [00:00<?, ?it/s]

In [23]:
# Exact-match score (percentage) of the predictions against the reference answers
emScore=emEvalutate(preds,dev_answers)
emScore

26.45861601085482