In [1]:
import os
from solution.args import HfArgumentParser
from solution.args import (
    MrcDataArguments,
    MrcModelArguments,
    MrcTrainingArguments,
    MrcProjectArguments,
)

# Build a single parser over the four argument dataclasses and read all of
# them from one YAML config file.
parser = HfArgumentParser([
    MrcDataArguments,
    MrcModelArguments,
    MrcTrainingArguments,
    MrcProjectArguments,
])
args = parser.parse_yaml_file(yaml_file="configs/example.yaml")
# Unpacked in the same order the dataclasses were handed to the parser.
(data_args, model_args, training_args, project_args) = args

In [2]:
# Display the parsed model arguments to sanity-check the YAML config.
# NOTE(review): the captured output shows use_auth_token='Fasle' and
# model_cache_dir / model_head as the *string* 'None'. These look like typos
# in the YAML config (any non-empty string is truthy) -- confirm and fix there.
model_args

MrcModelArguments(model_name_or_path='monologg/koelectra-small-v3-discriminator', reader_type='extractive', architectures='AutoModelForQuestionAnswering', config_name=None, tokenizer_name=None, model_cache_dir='None', model_init='basic', use_auth_token='Fasle', revision='main', model_head='None', qa_conv_out_channel=1024, qa_conv_input_size=384, qa_conv_n_layers=5)

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer from a local output directory.
# NOTE(review): relative path -- assumes the notebook is run from the project
# root and that this directory already exists; confirm.
tokenizer = AutoTokenizer.from_pretrained("./outputs/koelectra_v3_test2")

In [4]:
from datasets import load_from_disk

# Load the dataset saved on local disk; its "train" and "validation" splits
# are the ones preprocessed later in this notebook.
noanswer_data = load_from_disk("./data/aihub/temp")

In [5]:
# Override the parsed config in place so padding is enabled.
# tokenize_fn in this notebook maps this flag to padding="max_length";
# presumably the PREP_PIPELINE preprocessing does the same -- confirm.
data_args.pad_to_max_length = True

In [6]:
from solution.data.processors.prep import PREP_PIPELINE

# Build the preprocessing function for the "train" mode of the extractive
# pipeline; the factory also reports whether it expects batched input.
prep_fn, is_batched = PREP_PIPELINE["extractive"](tokenizer, "train", data_args)

In [7]:
# Dataset column names read by tokenize_fn / prepare_train_features.
QUESTION_COLUMN_NAME, CONTEXT_COLUMN_NAME, ANSWER_COLUMN_NAME = (
    "question",
    "context",
    "answers",
)

In [8]:
def tokenize_fn(examples):
    """Tokenize question/context pairs into (possibly overlapping) spans.

    Args:
        examples: Batch indexed by column name. The QUESTION_COLUMN_NAME and
            CONTEXT_COLUMN_NAME entries are passed to the tokenizer as the
            two sequences of each pair.

    Returns:
        The tokenizer output, including overflowing spans and the offset
        mapping of every token.
    """

    question_first = tokenizer.padding_side == "right"
    length_limit = min(data_args.max_seq_length,
                       tokenizer.model_max_length)

    # The context is always the side that gets truncated; which argument slot
    # it occupies depends on the tokenizer's padding side.
    if question_first:
        first_column, second_column = QUESTION_COLUMN_NAME, CONTEXT_COLUMN_NAME
        truncation_side = "only_second"
    else:
        first_column, second_column = CONTEXT_COLUMN_NAME, QUESTION_COLUMN_NAME
        truncation_side = "only_first"

    first_texts = examples[first_column]
    second_texts = examples[second_column]

    # A long context is split into several spans that overlap by `stride`
    # tokens, so the call may return more rows than it was given.
    return tokenizer(
        first_texts,
        second_texts,
        truncation=truncation_side,
        max_length=length_limit,
        stride=data_args.doc_stride,
        # Keeps the mapping from each produced span back to its source row.
        return_overflowing_tokens=True,
        # Character offsets per token, needed to locate answer start/end.
        return_offsets_mapping=True,
        return_token_type_ids=data_args.return_token_type_ids,
        padding="max_length" if data_args.pad_to_max_length else False,
    )
    
def prepare_train_features(examples):
    """
    Tokenize a batch and attach start/end token labels for extractive QA.

    Spans with no answer, or whose answer falls outside the span after
    truncation, are labeled with the CLS token index for both positions.
    Only the first answer of each example is used.

    Args:
        examples (Dict): Batch indexed by column name. Each ANSWER_COLUMN_NAME
            entry is read as a dict with "answer_start" and "text" lists.

    Returns:
        Dict: Tokenized examples with "start_positions" and "end_positions"
        added; "overflow_to_sample_mapping" and "offset_mapping" are removed.
    """

    pad_on_right = tokenizer.padding_side == "right"

    tokenized_examples = tokenize_fn(examples)
    
    # Span -> source-example index, and per-token character offsets.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Attach "start position" / "end position" labels to the dataset.
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)  # cls index

        # Sequence ids tell question tokens apart from context tokens.
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # Long contexts are truncated into several spans, so one example
        # can map to multiple spans.
        sample_index = sample_mapping[i]
        answers = examples[ANSWER_COLUMN_NAME][sample_index]

        # If there is no answer, use cls_index as the answer.
        # Some examples may legitimately have no answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Character start/end index of the (first) answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # sequence_ids only takes the three values 0, 1 and None:
            # None 0 0 ... 0 None 1 1 ... 1 None

            # Move to the first token of the context.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_index:
                token_start_index += 1

            # Move to the last token of the context.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_index:
                token_end_index -= 1

            # Check whether the answer lies outside this span.
            # If so, label with the CLS index (handled differently for Retro).
            if not (
                offsets[token_start_index][0] <= start_char and
                offsets[token_end_index][1] >= end_char
            ):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Move token_start_index and token_end_index to the two ends
                # of the answer.
                # Note: if the answer is the last word, the scan can run past
                # the last offset (hence the bounds check below).

                # Advance past the last token starting at or before
                # start_char; the label is the token just before that.
                while (
                    token_start_index < len(offsets) and
                    offsets[token_start_index][0] <= start_char
                ):
                    token_start_index += 1

                tokenized_examples["start_positions"].append(
                    token_start_index - 1)

                # Walk back past every token ending at or after end_char;
                # the label is the token just after that.
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1

                tokenized_examples["end_positions"].append(
                    token_end_index + 1)

    return tokenized_examples

In [12]:
# Tokenize the training split with the train-mode preprocessing function.
# The raw columns are removed so only the preprocessing output is kept.
train_features = noanswer_data["train"].map(
    prep_fn,
    remove_columns=noanswer_data["train"].column_names,
    batched=is_batched,
    num_proc=data_args.preprocessing_num_workers,
    load_from_cache_file=not data_args.overwrite_cache,
)

  0%|          | 0/297 [00:00<?, ?ba/s]

In [27]:
train_features

Dataset({
    features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions'],
    num_rows: 410964
})

In [None]:
# Keep a handle on the raw (untokenized) training split.
train_datasets = noanswer_data["train"]

In [None]:
# Rebuild the preprocessing function for "eval" mode.
# NOTE(review): this rebinds prep_fn / is_batched, so re-running the training
# map cell after this one would presumably apply eval preprocessing -- confirm.
prep_fn, is_batched = PREP_PIPELINE["extractive"](tokenizer, "eval", data_args)

In [None]:
# Tokenize the validation split with the eval-mode preprocessing function.
valid_features = noanswer_data["validation"].map(
    prep_fn,
    batched=is_batched,
    num_proc=data_args.preprocessing_num_workers,
    # Fixed: this previously read `korquad_data`, a name that is not defined
    # anywhere in the visible notebook (NameError on a fresh kernel). The
    # columns to drop must come from the dataset actually being mapped.
    remove_columns=noanswer_data["validation"].column_names,
    load_from_cache_file=not data_args.overwrite_cache,
)

In [None]:
# Raw (untokenized) validation split; passed to the trainer as eval_examples.
valid_datasets = noanswer_data["validation"]

In [None]:
from solution.reader import READER_HOST
from solution.data.metrics import compute_metrics

from solution.reader.architectures import MODEL_INIT

In [None]:
# Look up the reader class registered under the "extractive" key.
reader_cls = READER_HOST["extractive"]

In [None]:
# Point the model at the same local directory the tokenizer was loaded from,
# overriding the value parsed from the YAML config.
model_args.model_name_or_path = "./outputs/koelectra_v3_test2"
# Alternative conv-head configuration, currently disabled:
# model_args.architectures = "RobertaForQAWithConvSDSHead"
# model_args.model_init = "qaconv_head"
# model_args.use_auth_token = True
# model_args.reader_type = "extractive"
# model_args.model_head = "sds_conv"

In [None]:
from solution.data.processors.post import post_processing_function

In [None]:
# Process-wide environment settings for CUDA and the tokenizers library.
# NOTE(review): these are set after the tokenizer was loaded and the datasets
# were mapped; presumably they belong in the first cell -- confirm.
os.environ.update({
    "CUDA_LAUNCH_BLOCKING": "1",
    "CUDA_VISIBLE_DEVICES": "0",
    "TOKENIZERS_PARALLELISM": "true",
})

In [None]:
# Name of the Weights & Biases project, supplied through the environment.
os.environ.update(WANDB_PROJECT="korquadv1.0")

In [None]:
# Override selected training options parsed from the YAML config for this
# run; entries are applied in the order listed.
for _option, _value in {
    "run_name": "korquad",
    "output_dir": "outputs/korquad",
    "num_train_epochs": 10,
    "per_device_train_batch_size": 12,
    "per_device_eval_batch_size": 12,
    "gradient_accumulation_steps": 1,
    "learning_rate": 5e-5,
    "fp16": True,
    "max_answer_length": 50,
}.items():
    setattr(training_args, _option, _value)

In [None]:
# Instantiate the reader and wire up its trainer.
# NOTE(review): the log captured under this cell shows klue/roberta-large with
# RobertaForQAWithConvSDSHead being loaded, which does not match the
# model_name_or_path configured in this notebook; the output is presumably
# stale from an earlier session -- confirm with Restart & Run All.
reader = reader_cls(model_args, tokenizer)
reader.set_trainer(
    # Model and run configuration.
    args=training_args,
    model_init=reader.model_init,
    tokenizer=tokenizer,
    # Tokenized features, plus the raw validation examples.
    train_dataset=train_features,
    eval_dataset=valid_features,
    eval_examples=valid_datasets,
    # Evaluation hooks.
    post_process_function=post_processing_function,
    compute_metrics=compute_metrics,
)

loading configuration file https://huggingface.co/klue/roberta-large/resolve/main/config.json from cache at /opt/ml/.cache/huggingface/transformers/571e05a2160c18c93365862223c4dae92bbd1b41464a4bd5f372ad703dba6097.ae5b7f8d8a28a3ff0b1560b4d08c6c3bd80f627288eee2024e02959dd60380d0
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "tokenizer_class": "BertTokenizer",
  "transformers_version": "4.12.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file https://huggingface.co/klue/roberta-large/resolve/main/pytorch_model.bin from cache at /opt/ml/.cache/huggingface/transformers/fd91c85effc137c99cd14cfe5c3459faa223c005b1577dc2c5aa48f6b2c4fbb1.3d5d467e78cd19d9a87029910ed83289edde0111a75a41e0cc79ad3fc06e4a51
Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForQAWithConvSDSHead: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForQAWithConvSDSHead from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQAWithConvSDSHead from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQAWithConvSDSHead were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['qa_outputs.convs.1.conv2.weight', 'qa_outputs.convs.4.conv1.weight', 'qa_outputs.convs.1.layer_norm.bias', 'qa_outputs.convs.0.layer_norm.bias', 'qa_outputs.convs.4.conv1.bias', 'qa_outputs.convs.2.layer_norm.weight', 'qa_outputs.convs.1.conv1.weight', 'qa_outputs.convs.0.conv2.weight', 'qa_outputs.convs.2.conv1.weight', 'qa_outputs.convs.1.conv1.bias', 'qa_outputs.convs.0.conv1.bias', 'qa_outputs.convs.0.conv1.weight', 'qa_outputs.convs.0.conv2.bias', 'qa_outputs.convs.2.conv1.bias', 'qa_outputs.qa_output.weight', 'qa_outputs.convs.3.conv1.bias', 'qa_outputs.convs.2.layer_norm.bias', 'qa_outputs.convs.2.conv2.bias', 'qa_outputs.convs.1.layer_norm.weight', 'qa_outputs.convs.3.conv2.bias', 'qa_outputs.convs.0.layer_norm.weight', 'qa_outputs.convs.3.layer_norm.weight', 'qa_outputs.convs.4.conv2.bias', 'qa_outputs.convs.3.layer_norm.bias', 'qa_outputs.convs.4.conv2.weight', 'qa_outputs.qa_output.bias', 'qa_outputs.convs.3.conv1.weight', 'qa_outputs.convs.3.conv2.weight', 'qa_outputs.convs.4.layer_norm.weight', 'qa_outputs.convs.4.layer_norm.bias', 'qa_outputs.convs.2.conv2.weight', 'qa_outputs.convs.1.conv2.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using amp fp16 backend

In [None]:
# Switch the reader into "train" mode for the duration of the call and run it
# from scratch (no checkpoint is resumed).
with reader.mode_change(mode="train"):
    train_results = reader.read(resume_from_checkpoint=None)

```
Epoch	Training Loss	Validation Loss	Exact Match	F1
1	0.690800	No log	79.425009	86.229322
2	0.543700	No log	79.736751	86.170849
3	0.409500	No log	79.857984	86.449094
4	0.310300	No log	80.498788	86.746011
5	0.186200	No log	80.689297	87.123838
6	0.111100	No log	82.213370	88.365081
7	0.063800	No log	83.304468	89.064874
8	0.022400	No log	83.200554	88.944423
9	0.008600	No log	84.187738	89.685415
10	0.007800	No log	84.447523	89.923963
```

In [None]:
# Take the model object held by the reader's trainer.
# NOTE(review): reaches into the private `_trainer` attribute; prefer a public
# accessor if the reader exposes one -- confirm.
model = reader._trainer.model

In [None]:
# Register the extra architecture name and set the Hub repo id on the config.
# Guarded so the cell is idempotent: re-running it no longer appends a
# duplicate entry, and a config whose `architectures` is None no longer fails.
if model.config.architectures is None:
    model.config.architectures = []
if "RobertaForQuestionAnswering" not in model.config.architectures:
    model.config.architectures.append("RobertaForQuestionAnswering")
model.config._name_or_path = "jinmang2/roberta-large-qaconv-sds-korquad"

In [None]:
# Set the tokenizer's name to the Hub repo id (presumably so the uploaded
# files reference the Hub repo rather than the local path -- confirm).
tokenizer.name_or_path = "jinmang2/roberta-large-qaconv-sds-korquad"

In [None]:
# Upload the tokenizer to the Hub repo, authenticating with the stored token.
tokenizer.push_to_hub("jinmang2/roberta-large-qaconv-sds-korquad", use_auth_token=True)

In [None]:
# Upload the model to the same Hub repo, authenticating with the stored token.
model.push_to_hub("jinmang2/roberta-large-qaconv-sds-korquad", use_auth_token=True)