# Implement Retro Reader

## Load Dataset

In [1]:
import datasets

# !git clone https://huggingface.co/datasets/jinmang2/aihub-mrc
aihub = datasets.load_from_disk("./data/aihub-mrc")
klue = datasets.load_dataset("klue", "mrc")
# !git clone https://huggingface.co/datasets/jinmang2/KorQuAD-v1.0
korquad = datasets.load_from_disk("./data/KorQuAD-v1.0")

Reusing dataset klue (/opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
aihub

DatasetDict({
    normal: Dataset({
        features: ['answers', 'context', 'id', 'question', 'title', 'classtype'],
        num_rows: 243425
    })
    noanswer: Dataset({
        features: ['context', 'id', 'question', 'title', 'classtype'],
        num_rows: 100244
    })
    clue: Dataset({
        features: ['answers', 'clue', 'context', 'id', 'question', 'title', 'classtype'],
        num_rows: 96663
    })
})

In [3]:
klue

DatasetDict({
    train: Dataset({
        features: ['title', 'context', 'news_category', 'source', 'guid', 'is_impossible', 'question_type', 'question', 'answers'],
        num_rows: 17554
    })
    validation: Dataset({
        features: ['title', 'context', 'news_category', 'source', 'guid', 'is_impossible', 'question_type', 'question', 'answers'],
        num_rows: 5841
    })
})

In [4]:
korquad

DatasetDict({
    train: Dataset({
        features: ['answers', 'context', 'id', 'question', 'title'],
        num_rows: 60407
    })
    validation: Dataset({
        features: ['answers', 'context', 'id', 'question', 'title'],
        num_rows: 5774
    })
})

## Incorporate data schema

In [5]:
from datasets import Sequence, Value, Features
from datasets import Dataset, DatasetDict

features = Features(
    {
        "guid": Value(dtype="string", id=None),
        "question": Value(dtype="string", id=None),
        "context": Value(dtype="string", id=None),
        "answers": Sequence(
            feature={
                "text": Value(dtype="string", id=None),
                "answer_start": Value(dtype="int32", id=None),
            },
            length=-1,
            id=None,
        ),
        "is_impossible": Value(dtype="bool", id=None),
        "title": Value(dtype="string", id=None),
        "classtype": Value(dtype="string", id=None),
        "source": Value(dtype="string", id=None),
        "dataset": Value(dtype="string", id=None),
    }
)

In [6]:
def schema_integration(dataset_name):
    
    def func(example):
        title = example["title"]
        classtype = ""
        question = example["question"]
        context = example["context"]
        if dataset_name == "klue":
            guid = example["guid"]
            source = example["source"] + "-" + example["news_category"]
            is_impossible = example["is_impossible"]
            answers = example["answers"]
        elif dataset_name == "aihub":
            guid = example["id"]
            source = "aihub"
            classtype = example["classtype"]
            try:
                answers = example["answers"]
                is_impossible = False
            except:
                answers = {"text": "", "answer_start": [-1]}
                is_impossible = True
        elif dataset_name == "korquad":
            guid = example["id"]
            source = "korquad"
            answers = example["answers"]
            is_impossible = False
        # The feature names must be sorted.
        return {
            "guid": guid,
            "question": question,
            "context": context,
            "answers": answers,
            "title": title,
            "classtype": classtype,
            "source": source,
            "is_impossible": is_impossible,
            "dataset": dataset_name,
        }
    
    return func

In [7]:
klue_mrc = klue.map(
    schema_integration("klue"), 
    remove_columns=klue.column_names["train"],
    features=features,
)

Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90/cache-c9ca67749382b418.arrow
Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90/cache-264fefb4af162854.arrow


In [8]:
aihub_normal = aihub["normal"].map(
    schema_integration("aihub"), 
    remove_columns=aihub.column_names["normal"],
    features=features,
)
aihub_noanswer = aihub["noanswer"].map(
    schema_integration("aihub"), 
    remove_columns=aihub.column_names["noanswer"],
    features=features,
)

Loading cached processed dataset at data/aihub-mrc/normal/cache-55298e4e92311701.arrow
Loading cached processed dataset at data/aihub-mrc/noanswer/cache-426c086f90c0cedc.arrow


In [9]:
korquad = korquad.map(
    schema_integration("korquad"),
    remove_columns=korquad.column_names["train"],
    features=features,
)

Loading cached processed dataset at data/KorQuAD-v1.0/train/cache-18ffad6d3b577588.arrow
Loading cached processed dataset at data/KorQuAD-v1.0/validation/cache-26d68b8a8c1d0516.arrow


In [10]:
klue_valid_noanswer = klue_mrc["validation"].filter(lambda ex: ex["is_impossible"])
klue_valid_normal = klue_mrc["validation"].filter(lambda ex: not ex["is_impossible"])

Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90/cache-f1184c422f03e3ce.arrow
Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90/cache-377288fdb5733764.arrow


In [11]:
aihub_test_size = len(korquad["validation"]) - len(klue_valid_noanswer)
aihub_noanswer = aihub_noanswer.train_test_split(test_size=aihub_test_size)
aihub_valid_noanswer = aihub_noanswer["test"]
aihub_noanswer = aihub_noanswer["train"]

Loading cached split indices for dataset at data/aihub-mrc/noanswer/cache-4e9b40c82ec5c010.arrow and data/aihub-mrc/noanswer/cache-28dbe05943c95b98.arrow


In [12]:
train = datasets.concatenate_datasets(
    [
        klue_mrc["train"], klue_valid_normal,
        korquad["train"],
        aihub_normal, aihub_noanswer,
    ]
)
valid = datasets.concatenate_datasets(
    [klue_valid_noanswer, aihub_valid_noanswer, korquad["validation"]]
)
mrc_data = datasets.DatasetDict(
    {"train": train, "valid": valid}
)

In [13]:
mrc_data

DatasetDict({
    train: Dataset({
        features: ['guid', 'question', 'context', 'answers', 'is_impossible', 'title', 'classtype', 'source', 'dataset'],
        num_rows: 421697
    })
    valid: Dataset({
        features: ['guid', 'question', 'context', 'answers', 'is_impossible', 'title', 'classtype', 'source', 'dataset'],
        num_rows: 11548
    })
})

## Convert example to features

### Preprocessing

In [14]:
mrc_data["train"][0]

{'guid': 'klue-mrc-v1_train_12759',
 'question': '북태평양 기단과 오호츠크해 기단이 만나 국내에 머무르는 기간은?',
 'context': '올여름 장마가 17일 제주도에서 시작됐다. 서울 등 중부지방은 예년보다 사나흘 정도 늦은 이달 말께 장마가 시작될 전망이다.17일 기상청에 따르면 제주도 남쪽 먼바다에 있는 장마전선의 영향으로 이날 제주도 산간 및 내륙지역에 호우주의보가 내려지면서 곳곳에 100㎜에 육박하는 많은 비가 내렸다. 제주의 장마는 평년보다 2~3일, 지난해보다는 하루 일찍 시작됐다. 장마는 고온다습한 북태평양 기단과 한랭 습윤한 오호츠크해 기단이 만나 형성되는 장마전선에서 내리는 비를 뜻한다.장마전선은 18일 제주도 먼 남쪽 해상으로 내려갔다가 20일께 다시 북상해 전남 남해안까지 영향을 줄 것으로 보인다. 이에 따라 20~21일 남부지방에도 예년보다 사흘 정도 장마가 일찍 찾아올 전망이다. 그러나 장마전선을 밀어올리는 북태평양 고기압 세력이 약해 서울 등 중부지방은 평년보다 사나흘가량 늦은 이달 말부터 장마가 시작될 것이라는 게 기상청의 설명이다. 장마전선은 이후 한 달가량 한반도 중남부를 오르내리며 곳곳에 비를 뿌릴 전망이다. 최근 30년간 평균치에 따르면 중부지방의 장마 시작일은 6월24~25일이었으며 장마기간은 32일, 강수일수는 17.2일이었다.기상청은 올해 장마기간의 평균 강수량이 350~400㎜로 평년과 비슷하거나 적을 것으로 내다봤다. 브라질 월드컵 한국과 러시아의 경기가 열리는 18일 오전 서울은 대체로 구름이 많이 끼지만 비는 오지 않을 것으로 예상돼 거리 응원에는 지장이 없을 전망이다.',
 'answers': {'text': ['한 달가량', '한 달'], 'answer_start': [478, 478]},
 'is_impossible': False,
 'title': '제주도 장마 시작 … 중부는 이달 말부터',
 'classtype': '',
 'source': 'hankyung-

In [15]:
# data augmentation for multiple answers
def data_aug_for_multiple_answers(examples):
    result = {key: [] for key in examples.keys()}
    
    def update(i, answers=None):
        for key in result.keys():
            if key == "answers" and answers is not None:
                result[key].append(answers)
            else:
                result[key].append(examples[key][i])
                
    for i, (answers, unanswerable) in enumerate(
        zip(examples["answers"], examples["is_impossible"])
    ):
        answerable = not unanswerable
        assert (
            len(answers["text"]) == len(answers["answer_start"]) or
            answers["answer_start"][0] == -1
        )
        if answerable and len(answers["text"]) > 1:
            for n_ans in range(len(answers["text"])):
                ans = {
                    "text": [answers["text"][n_ans]],
                    "answer_start": [answers["answer_start"][n_ans]],
                }
                update(i, ans)
        elif not answerable:
            update(i, {"text": [], "answer_start": []})
        else:
            update(i)
            
    return result

In [16]:
mrc_data = mrc_data.map(
    data_aug_for_multiple_answers,
    batched=True,
    batch_size=1000,
    num_proc=5,
)
mrc_data

Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90/cache-f928b1bcb7f89aee.arrow
Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90/cache-550b0408183d5b44.arrow
Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90/cache-715fc98b25a3b767.arrow
Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90/cache-4ad91b7f61e87343.arrow
Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90/cache-63981d3d2570cac6.arrow
Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92

DatasetDict({
    train: Dataset({
        features: ['answers', 'classtype', 'context', 'dataset', 'guid', 'is_impossible', 'question', 'source', 'title'],
        num_rows: 429126
    })
    valid: Dataset({
        features: ['answers', 'classtype', 'context', 'dataset', 'guid', 'is_impossible', 'question', 'source', 'title'],
        num_rows: 11548
    })
})

In [17]:
# import re

# def preprocess(text):
#     text = re.sub(r'\n', ' ', text)
#     text = re.sub(r"\\n", " ", text)
#     text = re.sub(r"\s+", " ", text)
#     text = re.sub(r'#', ' ', text)
#     text = re.sub(r"[^a-zA-Z0-9가-힣ㄱ-ㅎㅏ-ㅣぁ-ゔァ-ヴー々〆〤一-龥<>()\s\.\?!》《≪≫\'<>〈〉:‘’%,『』「」＜＞・\"-“”∧㎜]", "", text)
#     return text


# def apply_prep(examples):
#     context = []
#     answers = []
#     for ans, ctxt in zip(examples["answers"], examples["context"]):
#         answer_start = ans["answer_start"]
#         if answer_start:
#             left_ctxt = ctxt[:answer_start[0]]
#             right_ctxt = ctxt[answer_start[0]:]
#             ctxt = preprocess(left_ctxt) + preprocess(right_ctxt)
#             ans = {"text": ans["text"],
#                    "answer_start": [len(preprocess(left_ctxt))]}
#         else:
#             ctxt = preprocess(ctxt)
#             ans = {"text": [], "answer_start": []}
#         context.append(ctxt)
#         answers.append(ans)
#     return {"context": context, "answers": answers}

## Train a sketchy reader

In [18]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs",
    per_gpu_train_batch_size=32,
    per_gpu_eval_batch_size=32,
    warmup_steps=814,
    learning_rate=2e-5,
    num_train_epochs=2.0,
    save_steps=2500,
    fp16=True,
)

In [19]:
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
from datasets import load_dataset, load_metric

import transformers
from solution.reader.trainers.qa import QuestionAnsweringTrainer
from solution.reader.architectures.models.modeling_electra import ElectraForQuestionAnsweringSquad
from transformers import (
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    TrainingArguments,
    XLNetConfig,
    XLNetForQuestionAnswering,
    XLNetTokenizerFast,
    default_data_collator,
    set_seed,
    AutoConfig,
    AutoTokenizer,
    AutoModelForQuestionAnswering
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version

In [20]:
from transformers import Trainer, is_torch_tpu_available
from transformers.trainer_utils import PredictionOutput


if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met


class QuestionAnsweringTrainer(Trainer):
    def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_examples = eval_examples
        self.post_process_function = post_process_function

    def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"):
        eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        eval_examples = self.eval_examples if eval_examples is None else eval_examples

        # Temporarily disable metric computation, we will do it in the loop here.
        compute_metrics = self.compute_metrics
        self.compute_metrics = None
        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        try:
            output = eval_loop(
                eval_dataloader,
                description="Evaluation",
                # No point gathering the predictions if there are no metrics, otherwise we defer to
                # self.args.prediction_loss_only
                prediction_loss_only=True if compute_metrics is None else None,
                ignore_keys=ignore_keys,
            )
        finally:
            self.compute_metrics = compute_metrics

        if self.post_process_function is not None and self.compute_metrics is not None:
            eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions)
            metrics = self.compute_metrics(eval_preds)

            # Prefix all keys with metric_key_prefix + '_'
            for key in list(metrics.keys()):
                if not key.startswith(f"{metric_key_prefix}_"):
                    metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

            self.log(metrics)
        else:
            metrics = {}

        if self.args.tpu_metrics_debug or self.args.debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())

        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
        return metrics

    def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"):
        predict_dataloader = self.get_test_dataloader(predict_dataset)

        # Temporarily disable metric computation, we will do it in the loop here.
        compute_metrics = self.compute_metrics
        self.compute_metrics = None
        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        try:
            output = eval_loop(
                predict_dataloader,
                description="Prediction",
                # No point gathering the predictions if there are no metrics, otherwise we defer to
                # self.args.prediction_loss_only
                prediction_loss_only=True if compute_metrics is None else None,
                ignore_keys=ignore_keys,
            )
        finally:
            self.compute_metrics = compute_metrics

        if self.post_process_function is None or self.compute_metrics is None:
            return output

        predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict")
        metrics = self.compute_metrics(predictions)

        # Prefix all keys with metric_key_prefix + '_'
        for key in list(metrics.keys()):
            if not key.startswith(f"{metric_key_prefix}_"):
                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

        return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics)

In [21]:
set_seed(42)

In [22]:
# def _set_attr(main: Any, attrname: str, getfrom: Any):
#     """ set value on configuration using other model_args """
#     args_value = getattr(getfrom, attrname, None)
#     args_value = getattr(main, attrname, args_value)
#     setattr(main, attrname, args_value)

config = AutoConfig.from_pretrained(
    'monologg/koelectra-small-v3-discriminator',
)

setattr(config, 'start_n_top', 5)
setattr(config, 'end_n_top', 5)

In [23]:
tokenizer = AutoTokenizer.from_pretrained(
    'monologg/koelectra-small-v3-discriminator',
)
# model = ElectraForQuestionAnsweringSquad.from_pretrained(
#     'monologg/koelectra-small-v3-discriminator',
#     config=config
# )
model = AutoModelForQuestionAnswering.from_pretrained(
    'monologg/koelectra-small-v3-discriminator',
    config=config
)

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized: ['qa_outputs.bias'

In [24]:
do_train = True
do_eval = False

In [25]:
if do_train:
    column_names = mrc_data["train"].column_names
question_column_name = "question"
context_column_name = "context"
answer_column_name = "answers"

In [26]:
column_names

['answers',
 'classtype',
 'context',
 'dataset',
 'guid',
 'is_impossible',
 'question',
 'source',
 'title']

In [27]:
max_seq_length = 384

In [28]:
pad_on_right = tokenizer.padding_side == "right"

if max_seq_length > tokenizer.model_max_length:
    logger.warning(
        f"The max_seq_length passed ({max_seq_length}) is larger than the maximum length for the"
        f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
    )
max_seq_length = min(max_seq_length, tokenizer.model_max_length)

In [29]:
doc_stride = 128
def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples[question_column_name if pad_on_right else context_column_name],
        examples[context_column_name if pad_on_right else question_column_name],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_seq_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
        return_token_type_ids=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")
    # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
    special_tokens = tokenized_examples.pop("special_tokens_mask")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    tokenized_examples["is_impossible"] = []
    tokenized_examples["cls_index"] = []
    tokenized_examples["p_mask"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        tokenized_examples["cls_index"].append(cls_index)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples["token_type_ids"][i]
        for k, s in enumerate(special_tokens[i]):
            if s:
                sequence_ids[k] = 3
        context_idx = 1 if pad_on_right else 0

        # Build the p_mask: non special tokens and context gets 0.0, the others get 1.0.
        # The cls token gets 1.0 too (for predictions of empty answers).
        tokenized_examples["p_mask"].append(
            [
                0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0
                for k, s in enumerate(sequence_ids)
            ]
        )

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples[answer_column_name][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            tokenized_examples["is_impossible"].append(1.0)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_idx:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_idx:
                token_end_index -= 1
            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                tokenized_examples["is_impossible"].append(1.0)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)
                tokenized_examples["is_impossible"].append(0.0)

    return tokenized_examples

In [30]:
mrc_data

DatasetDict({
    train: Dataset({
        features: ['answers', 'classtype', 'context', 'dataset', 'guid', 'is_impossible', 'question', 'source', 'title'],
        num_rows: 429126
    })
    valid: Dataset({
        features: ['answers', 'classtype', 'context', 'dataset', 'guid', 'is_impossible', 'question', 'source', 'title'],
        num_rows: 11548
    })
})

In [31]:
max_train_samples = 1000
if do_train:
    if "train" not in mrc_data:
        raise ValueError("--do_train requires a train dataset")
    train_dataset = mrc_data["train"]
    if max_train_samples is not None:
        # Select samples from Dataset, This will help to decrease processing time
        train_dataset = train_dataset.select(range(max_train_samples))
    # Create Training Features
    train_dataset = train_dataset.map(
        prepare_train_features,
        batched=True,
        remove_columns=column_names,
    )
    if max_train_samples is not None:
        # Select samples from dataset again since Feature Creation might increase number of features
        train_dataset = train_dataset.select(range(max_train_samples))

Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90/cache-4124e99b954f16ce.arrow


In [32]:
train_dataset

Dataset({
    features: ['attention_mask', 'cls_index', 'end_positions', 'input_ids', 'is_impossible', 'p_mask', 'start_positions', 'token_type_ids'],
    num_rows: 1000
})

In [33]:
def prepare_validation_features(examples):
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples[question_column_name if pad_on_right else context_column_name],
        examples[context_column_name if pad_on_right else question_column_name],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_seq_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
        return_token_type_ids=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers).
    special_tokens = tokenized_examples.pop("special_tokens_mask")

    # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
    # corresponding example_id and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label.
    tokenized_examples["cls_index"] = []
    tokenized_examples["p_mask"] = []

    for i, input_ids in enumerate(tokenized_examples["input_ids"]):
        # Find the CLS token in the input ids.
        cls_index = input_ids.index(tokenizer.cls_token_id)
        tokenized_examples["cls_index"].append(cls_index)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples["token_type_ids"][i]
        for k, s in enumerate(special_tokens[i]):
            if s:
                sequence_ids[k] = 3
        context_idx = 1 if pad_on_right else 0

        # Build the p_mask: non special tokens and context gets 0.0, the others 1.0.
        tokenized_examples["p_mask"].append(
            [
                0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0
                for k, s in enumerate(sequence_ids)
            ]
        )

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["guid"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_idx else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [34]:
do_eval = True

if do_eval:
    column_names = mrc_data["valid"].column_names

In [35]:
max_eval_samples = 200

In [36]:
mrc_data

DatasetDict({
    train: Dataset({
        features: ['answers', 'classtype', 'context', 'dataset', 'guid', 'is_impossible', 'question', 'source', 'title'],
        num_rows: 429126
    })
    valid: Dataset({
        features: ['answers', 'classtype', 'context', 'dataset', 'guid', 'is_impossible', 'question', 'source', 'title'],
        num_rows: 11548
    })
})

In [37]:
if do_eval:
    if "valid" not in mrc_data:
        raise ValueError("--do_eval requires a validation dataset")
    eval_examples = mrc_data["valid"]
    if max_eval_samples is not None:
        # Selecting Eval Samples from Dataset
        eval_examples = eval_examples.select(range(max_eval_samples))
    # Create Features from Eval Dataset
    eval_dataset = eval_examples.map(
        prepare_validation_features,
        batched=True,
        remove_columns=column_names,
    )
    if max_eval_samples is not None:
        # Selecting Samples from Dataset again since Feature Creation might increase samples size
        eval_dataset = eval_dataset.select(range(max_eval_samples))


Loading cached processed dataset at /opt/ml/.cache/huggingface/datasets/klue/mrc/1.0.0/55ff8f92b7a4b9842be6514ce0b4b5295b46d5e493f8bb5760da4be717018f90/cache-4dcc3e2df38b52c5.arrow


In [38]:
# if training_args.do_predict:
#     if "test" not in raw_datasets:
#         raise ValueError("--do_predict requires a test dataset")
#     predict_examples = raw_datasets["test"]
#     if data_args.max_predict_samples is not None:
#         # We will select sample from whole data
#         predict_examples = predict_examples.select(range(data_args.max_predict_samples))
#     # Test Feature Creation
#     with training_args.main_process_first(desc="prediction dataset map pre-processing"):
#         predict_dataset = predict_examples.map(
#             prepare_validation_features,
#             batched=True,
#             num_proc=data_args.preprocessing_num_workers,
#             remove_columns=column_names,
#             load_from_cache_file=not data_args.overwrite_cache,
#             desc="Running tokenizer on prediction dataset",
#         )
#     if data_args.max_predict_samples is not None:
#         # During Feature creation dataset samples might increase, we will select required samples again
#         predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))

In [39]:
data_collator = DataCollatorWithPadding(
            tokenizer=tokenizer,
            pad_to_multiple_of=8 if True else None,
        )

In [40]:
from solution.data.processors.post import postprocess_qa_predictions_with_beam_search

In [41]:
n_best_size = 20
max_answer_length = 30
output_dir = './outputs'
version_2_with_negative = True

In [42]:
def post_processing_function(examples, features, predictions, stage="eval"):
    # Post-processing: we match the start logits and end logits to answers in the original context.
    predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search(
        examples=examples,
        features=features,
        predictions=predictions,
        version_2_with_negative=version_2_with_negative,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        start_n_top=model.config.start_n_top,
        end_n_top=model.config.end_n_top,
        output_dir=output_dir,
        log_level=log_level,
        prefix=stage,
    )
    # Format the result to the format the metric expects.
    if version_2_with_negative:
        formatted_predictions = [
            {"id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k]}
            for k, v in predictions.items()
        ]
    else:
        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

    references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
    return EvalPrediction(predictions=formatted_predictions, label_ids=references)

metric = load_metric("squad_v2" if version_2_with_negative else "squad")

def compute_metrics(p: EvalPrediction):
    return metric.compute(predictions=p.predictions, references=p.label_ids)

In [43]:
import os
from solution.args import HfArgumentParser
from solution.args import (
    MrcDataArguments,
    MrcModelArguments,
    MrcTrainingArguments,
    MrcProjectArguments,
)

parser = HfArgumentParser(
    [MrcDataArguments,
     MrcModelArguments,
     MrcTrainingArguments,
     MrcProjectArguments]
)
args = parser.parse_yaml_file(yaml_file="configs/example.yaml")
data_args, model_args, training_args, project_args = args

In [44]:
# # HfTrainingArguments
# report_to: wandb
# run_name: koelectra_v3_test2
# output_dir: outputs/koelectra_v3_test2
# overwrite_output_dir: False
# learning_rate: 5e-5
# do_train: True
# do_eval: True
# do_predict: False            # when True, must set eval_retrieval = True
# evaluation_strategy: epoch  # when do_eval == False, set 'no'
# save_strategy: epoch        # when do_eval == False, set 'no'
# per_device_train_batch_size: 32
# per_device_eval_batch_size: 32
# num_train_epochs: 20
# do_pos_ensemble: False
# # eval_steps: 100
# # save_steps: 100
# save_total_limit: 5
# fp16: True
# weight_decay: 0.0
# warmup_steps: 0
# load_best_model_at_end: True
# metric_for_best_model: exact_match
# logging_dir: logs
# lr_scheduler_type: cosine

In [45]:
training_args.run_name = "noanswer"
training_args.output_dir = "outputs/noanswer1"
training_args.num_train_epochs = 10
training_args.per_device_train_batch_size = 16
training_args.per_device_eval_batch_size = 16
training_args.gradient_accumulation_steps = 1
training_args.learning_rate = 5e-5
training_args.fp16 = True
training_args.do_eval = False
training_args.max_answer_length = 30

In [46]:
trainer = QuestionAnsweringTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset if training_args.do_train else None,
    eval_dataset=eval_dataset if training_args.do_eval else None,
    eval_examples=eval_examples if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    post_process_function=post_processing_function,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


In [47]:
train_dataset

Dataset({
    features: ['attention_mask', 'cls_index', 'end_positions', 'input_ids', 'is_impossible', 'p_mask', 'start_positions', 'token_type_ids'],
    num_rows: 1000
})

In [47]:
from torch.utils.data import DataLoader

In [56]:
train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=16
    )

In [57]:
import torch

In [58]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

ElectraForQuestionAnswering(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_featu

In [59]:
for step, batch in enumerate(train_dataloader):
    outputs = model(**batch)
    break

TypeError: forward() got an unexpected keyword argument 'cls_index'

In [48]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["TRANSFORMERS_VERBOSITY"] = "info"


In [49]:
if training_args.do_train:
    # checkpoint = None
    # if training_args.resume_from_checkpoint is not None:
    #     checkpoint = training_args.resume_from_checkpoint
    # elif last_checkpoint is not None:
    #     checkpoint = last_checkpoint
    train_result = trainer.train()
    trainer.save_model()  # Saves the tokenizer too for easy upload

    metrics = train_result.metrics

    max_train_samples = (
        data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
    )
    metrics["train_samples"] = min(max_train_samples, len(train_dataset))

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

The following columns in the training set  don't have a corresponding argument in `ElectraForQuestionAnswering.forward` and have been ignored: cls_index, is_impossible, p_mask.
***** Running training *****
  Num examples = 1000
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 630
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkiyoung2[0m (use `wandb login --relogin` to force relogin)


/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [339,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [339,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [339,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [339,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [339,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: indexSelectLargeIndex: block: [339,0,0], thread: [69,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:702: 

RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`