# DistilBERT のノートブック

## 下準備

In [198]:
import os
import random
import numpy as np
import pandas as pd
import string
import unicodedata
from typing import Any, Dict, Iterator, List, Tuple, Union
import torch
import datasets


import transformers 
from transformers import AutoTokenizer,AlbertTokenizer,BertJapaneseTokenizer,AutoModelForQuestionAnswering,TrainingArguments,Trainer

from pytorch_lightning import LightningModule, seed_everything
from pytorch_lightning.loggers import WandbLogger

# Use the first CUDA device when one is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [69]:
# Experiment configuration read by the later preprocessing and training cells.
args = dict(
    random_seed=42,  # Random seed
    # Name of the pretrained Transformers model.
    pretrained_model='bandainamco-mirai/distilbert-base-japanese',
    # Optional Transformers tokenizer name; takes precedence over
    # `pretrained_model` when loading the tokenizer.
    pretrained_tokenizer='cl-tohoku/bert-base-japanese-whole-word-masking',
    batch_size=8,  # <=32 for TPUv2-8
    lr=2e-5,  # Learning rate
    max_length=384,  # Maximum input length in tokens
    doc_stride=128,  # Interval between context windows when splitting
    epochs=3,  # Maximum number of epochs
    dataset='SkelterLabsInc/JaQuAD',
    optimizer='AdamW',
    norm_form='NFKC',
    weight_decay=0.01,  # Weight decay parameter for AdamW
    lr_scheduler='warmup_lin',
    warmup_ratio=0.1,
    note="リクルートベースライン",
)
args

# Fix the seed values
def set_seed(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and configures cuDNN for repeatable runs.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the running interpreter
    # was already fixed when it started.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one. This call is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels between runs, so switch it off.
    torch.backends.cudnn.benchmark = False
# Apply the seed taken from the experiment configuration.
set_seed(seed=args["random_seed"])

In [186]:
# Show the installed transformers version so the run can be reproduced.
print(transformers.__version__)

4.12.5


## データセットの準備

### datasetdict

In [65]:
# Load the configured dataset, flatten the nested `answers` feature and give
# the resulting `answers.*` columns shorter top-level names.
datasetdict = (
    datasets.load_dataset(args['dataset'])
    .flatten()
    .rename_column('answers.text', 'answer')
    .rename_column('answers.answer_start', 'answer_start')
    .rename_column('answers.answer_type', 'answer_type')
)

Using custom data configuration default
Reusing dataset ja_qu_ad (/home/s16991/.cache/huggingface/datasets/SkelterLabsInc___ja_qu_ad/default/0.1.0/5847b2e2ab5e02de284395bb15f87f13eae8f6f6ff1f01e4ee9c5c0dcf8ef8eb)


  0%|          | 0/2 [00:00<?, ?it/s]

In [132]:
# Load the dedicated tokenizer checkpoint when one is configured; otherwise
# fall back to the tokenizer that ships with the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    args["pretrained_tokenizer"] or args["pretrained_model"]
)
# Records whether the tokenizer pads on the right-hand side.
pad_on_right = (tokenizer.padding_side == "right")

In [172]:
def preprocess_function(examples):
    """Tokenize a batch of QA examples and cut them into training spans.

    Every question/context pair is tokenized once, the character span of its
    first answer is converted to token positions, and ``make_spans`` slices
    the sequence into windows of ``args["max_length"]`` tokens whose starts
    are ``args["doc_stride"]`` tokens apart.

    Relies on the module-level ``tokenizer``, ``pad_on_right``, ``args``,
    ``get_offsets`` and ``make_spans``.

    Args:
        examples: Batch with the columns ``question``, ``context``,
            ``answer`` and ``answer_start``. The last two hold one list per
            example, of which only the first element is used.

    Returns:
        Dict with the keys ``input_ids``, ``attention_mask``,
        ``start_positions`` and ``end_positions``; each value is a list with
        one entry per generated span.
    """
    # No max_length/truncation is given, so nothing is cut off here and the
    # padding target is the tokenizer's own maximum length; the windowing is
    # done by make_spans below.
    tokenized_examples = tokenizer(
        examples['question' if pad_on_right else "context"],
        examples['context' if pad_on_right else "question"],
        return_overflowing_tokens=True,
        padding="max_length",
    )

    inputs = {
        'input_ids': [],
        'attention_mask': [],
        'start_positions': [],
        'end_positions': [],
    }
    # NOTE(review): the separator-based bookkeeping below treats the segment
    # after the first separator as the context, i.e. it assumes
    # pad_on_right is True -- confirm before using a left-padding tokenizer.
    for tokens, att_mask, type_ids, context, answer, start_char in zip(
            tokenized_examples['input_ids'],
            tokenized_examples['attention_mask'],
            tokenized_examples['token_type_ids'],
            examples['context'],
            examples['answer'],
            examples['answer_start']):

        answer = answer[0]
        start_char = start_char[0]

        # get_offsets brackets the per-token offsets with (0, 0) sentinels.
        # Strip them so that index k is the k-th context token, which sits
        # at position ctx_start + k of the full sequence.
        context_offsets = get_offsets(tokens, context, tokenizer,
                                      args["norm_form"])[1:-1]
        ctx_start = tokens.index(tokenizer.sep_token_id) + 1
        # Token offsets are inclusive, so compare against the answer's last
        # character rather than the position one past it.
        answer_last_char = start_char + len(answer) - 1

        # First context token that reaches the answer's first character.
        answer_start_index = 0
        while (answer_start_index < len(context_offsets) - 1
               and context_offsets[answer_start_index][1] < start_char):
            answer_start_index += 1
        # Last context token that begins inside the answer; never moves in
        # front of the start token.
        answer_end_index = len(context_offsets) - 1
        while (answer_end_index > answer_start_index
               and context_offsets[answer_end_index][0] > answer_last_char):
            answer_end_index -= 1
        answer_start_index += ctx_start
        answer_end_index += ctx_start

        span_inputs = {
            'input_ids': tokens,
            'attention_mask': att_mask,
            'token_type_ids': type_ids,
        }
        for span, answer_idx in make_spans(
                span_inputs,
                question_len=ctx_start,
                max_seq_len=args["max_length"],
                stride=args["doc_stride"],
                answer_start_position=answer_start_index,
                answer_end_position=answer_end_index):
            inputs['input_ids'].append(span['input_ids'])
            inputs['attention_mask'].append(span['attention_mask'])
            inputs['start_positions'].append(answer_idx[0])
            inputs['end_positions'].append(answer_idx[1])
    return inputs


def make_spans(
    inputs: Dict[str, Union[int, List[int]]],
    question_len: int,
    max_seq_len: int,
    stride: int,
    answer_start_position: int = -1,
    answer_end_position: int = -1
) -> Iterator[Tuple[Dict[str, List[int]], Tuple[int, int]]]:
    """Slice one tokenized question+context sequence into fixed-size windows.

    Each window keeps the first ``question_len`` entries of every input list,
    followed by a slice of the remaining entries starting ``i`` positions
    further in, where ``i`` advances by ``stride``. Windows are padded with
    zeros to ``max_seq_len`` and their last non-padding entry is overwritten
    with the final entry of the original list.

    Args:
        inputs: Mapping from feature name to a per-token list; must contain
            ``'input_ids'``. All lists are sliced the same way.
        question_len: Number of leading entries repeated in every window.
        max_seq_len: Length of every produced window.
        stride: Distance between the starts of consecutive windows.
        answer_start_position: Answer start index in the full sequence.
        answer_end_position: Answer end index in the full sequence.

    Yields:
        ``(span, (answer_start, answer_end))`` where ``span`` maps every key
        of ``inputs`` to its windowed list. The answer indices are relative
        to the window, or ``(0, 0)`` when the answer starts before the
        window's context part or does not end before its last position.
    """
    input_len = len(inputs['input_ids'])
    context_len = input_len - question_len
    # Number of context entries that fit into one window.
    window_len = max_seq_len - question_len

    def make_value(input_list, i, padding=0):
        # Build one window of `input_list` whose context part starts at `i`.
        context_end = min(window_len, context_len - i)
        pad_len = window_len - context_end
        val = input_list[:question_len]
        val += input_list[question_len + i:question_len + i + context_end]
        # Close the window with the sequence's final entry.
        val[-1] = input_list[-1]
        val += [padding] * pad_len
        return val

    # Sequences of at most `max_seq_len - stride` entries used to produce an
    # empty range and were dropped silently; always emit the window at 0.
    stop = max(input_len - max_seq_len + stride, 1)
    for i in range(0, stop, stride):
        span = {key: make_value(val, i) for key, val in inputs.items()}
        answer_start = answer_start_position - i
        answer_end = answer_end_position - i
        if answer_start < question_len or answer_end >= max_seq_len - 1:
            # Answer is not fully inside this window: use position 0.
            answer_start = answer_end = 0
        yield span, (answer_start, answer_end)
        

def get_offsets(input_ids: List[int],
                context: str,
                tokenizer: AutoTokenizer,
                norm_form='NFKC') -> List[Tuple[int, int]]:
    """Map every context token in ``input_ids`` to a character span of ``context``.

    The context tokens are taken to be the ids between the first separator
    token and the next one. They are aligned against the Unicode-normalized
    context, and the resulting positions are translated back to indices of
    the original ``context`` string.

    Args:
        input_ids: Token ids of one encoded example.
        context: Original (un-normalized) context text.
        tokenizer: Tokenizer providing ``sep_token_id``, ``unk_token`` and
            ``convert_ids_to_tokens``.
        norm_form: Unicode normalization form passed to
            ``unicodedata.normalize``.

    Returns:
        One ``(start, end)`` pair per context token, with ``end`` inclusive,
        preceded and followed by a ``(0, 0)`` entry.

    Raises:
        AssertionError: If the tokens cannot be aligned with the context.
        ValueError: If a separator token or an expected token text is not
            found by ``index``.
    """
    # The context is the segment between the first separator and the next.
    cxt_start = input_ids.index(tokenizer.sep_token_id) + 1
    cxt_end = cxt_start + input_ids[cxt_start:].index(tokenizer.sep_token_id)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[cxt_start:cxt_end])
    # Strip the '##' prefix so tokens can be compared with raw text.
    tokens = [tok[2:] if tok.startswith('##') else tok for tok in tokens]
    # ASCII whitespace plus the ideographic space U+3000.
    whitespace = string.whitespace + '\u3000'

    # 1 . Make offsets of normalized context within the original context.
    # offsets_norm_context[k] is the index in `context` of the character that
    # produced the k-th character of the normalized context.
    offsets_norm_context = []
    norm_context = ''
    for idx, char in enumerate(context):
        norm_char = unicodedata.normalize(norm_form, char)
        norm_context += norm_char
        offsets_norm_context.extend([idx] * len(norm_char))
    # Per-character normalization must agree with whole-string normalization,
    # otherwise the index mapping built above would be invalid.
    norm_context_org = unicodedata.normalize(norm_form, context)
    assert norm_context == norm_context_org, \
        'Normalized contexts are not the same: ' \
        + f'{norm_context} != {norm_context_org}'
    assert len(norm_context) == len(offsets_norm_context), \
        'Normalized contexts have different numbers of tokens: ' \
        + f'{len(norm_context)} != {len(offsets_norm_context)}'

    # 2. Make offsets of tokens (input_ids) within the normalized context.
    offsets_token = []   # [start, end] per token, end inclusive
    unk_pointer = None   # index of the most recent unknown token
    cid = 0              # cursor into norm_context
    tid = 0              # cursor into tokens
    while tid < len(tokens):
        cur_token = tokens[tid]
        if cur_token == tokenizer.unk_token:
            # First guess: the unknown token covers exactly one character.
            unk_pointer = tid
            offsets_token.append([cid, cid])
            cid += 1
        elif norm_context[cid:cid + len(cur_token)] != cur_token:
            # Wrong offsets of the previous UNK token
            assert unk_pointer is not None, \
                'Normalized context and tokens are not matched'
            # Same list object as offsets_token[unk_pointer]; edited in place.
            prev_unk_expected = offsets_token[unk_pointer]
            # Grow the unknown token so that it ends right before the next
            # occurrence of the token following it, searching from two
            # characters past its current end.
            prev_unk_expected[1] += norm_context[prev_unk_expected[1] + 2:]\
                .index(tokens[unk_pointer + 1]) + 1
            # Rewind to the unknown token and drop every offset after it; the
            # `tid += 1` below then resumes with the token that follows it.
            tid = unk_pointer
            offsets_token = offsets_token[:tid] + [prev_unk_expected]
            cid = prev_unk_expected[1] + 1
        else:
            start_pos = norm_context[cid:].index(cur_token)
            # NOTE(review): the elif above already established that the text
            # at `cid` equals `cur_token`, so `start_pos` looks like it is
            # always 0 here and this branch unreachable -- confirm.
            if start_pos > 0 and tokens[tid - 1] == tokenizer.unk_token:
                offsets_token[-1][1] += start_pos
                cid += start_pos
                start_pos = 0
            assert start_pos == 0, f'{start_pos} != 0 (cur: {cur_token}'
            offsets_token.append([cid, cid + len(cur_token) - 1])
            cid += len(cur_token)
            # Whitespace after a token is folded into that token's span.
            while cid < len(norm_context) and norm_context[cid] in whitespace:
                offsets_token[-1][1] += 1
                cid += 1
        tid += 1
    if tokens[-1] == tokenizer.unk_token:
        # A trailing unknown token takes the rest of the context.
        offsets_token[-1][1] = len(norm_context) - 1
    else:
        assert cid == len(norm_context) == offsets_token[-1][1] + 1, \
            'Offsets do not include all characters'
    assert len(offsets_token) == len(tokens), \
        'The numbers of tokens and offsets are different'

    # Translate normalized-context positions back to original-context indices.
    offsets_mapping = [(offsets_norm_context[start], offsets_norm_context[end])
                       for start, end in offsets_token]
    # Bracket the result with a (0, 0) entry on each side.
    return [(0, 0)] + offsets_mapping+[(0, 0)]

In [167]:
# Direct call on the whole train split in one batch (no `map`); the recorded
# run of this cell ended with a KeyboardInterrupt.
features = preprocess_function(datasetdict['train'])

KeyboardInterrupt: 

In [173]:
# Turn every split into model-ready spans. The raw columns are removed
# because one example can expand into several spans.
tokenized_datasets = datasetdict.map(
    preprocess_function,
    batched=True,
    remove_columns=datasetdict["train"].column_names,
)

  0%|          | 0/32 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [175]:
# Load the configured pretrained checkpoint as a question-answering model.
model = AutoModelForQuestionAnswering.from_pretrained(args['pretrained_model'])

Some weights of the model checkpoint at bandainamco-mirai/distilbert-base-japanese were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at bandainamco-mirai/distilbert-base-japanese and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight

In [179]:
# Output directory / Hub repository name is derived from the checkpoint name.
model_name = args['pretrained_model'].split("/")[-1]
# Hyperparameters come from the experiment configuration instead of being
# repeated as literals, so the two cannot drift apart.
# NOTE: this rebinds `args` from the configuration dict to a
# TrainingArguments object (the name the Trainer cell expects). Cells that
# index the dict, including this one, must run before this assignment.
args = TrainingArguments(
    f"{model_name}-finetuned-squad",
    evaluation_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy.
    save_strategy="epoch",
    learning_rate=args['lr'],
    per_device_train_batch_size=args['batch_size'],
    per_device_eval_batch_size=args['batch_size'],
    num_train_epochs=args['epochs'],
    weight_decay=args['weight_decay'],
    seed=args['random_seed'],
    load_best_model_at_end=True,
    push_to_hub=True,
)

In [180]:
from transformers import default_data_collator

# Preprocessing already pads every span to the same length, so the stock
# collator is used as-is.
data_collator = default_data_collator

In [202]:
# `load_best_model_at_end` is a TrainingArguments field, not a Trainer
# constructor parameter; passing it here raises a TypeError. Set it on the
# TrainingArguments object if that behaviour is wanted.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Cloning https://huggingface.co/chiba/distilbert-base-japanese-finetuned-squad into local empty directory.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [201]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; the training arguments set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
# Start fine-tuning. The cell that builds `trainer` must have been run in the
# current kernel first; the recorded NameError comes from a fresh kernel.
trainer.train()

NameError: name 'trainer' is not defined

In [196]:
!sudo apt install git-lfs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  python3-crcmod
Use 'sudo apt autoremove' to remove it.
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 11 not upgraded.
Need to get 2899 kB of archives.
After this operation, 9731 kB of additional disk space will be used.
Get:1 http://deb.debian.org/debian buster/main amd64 git-lfs amd64 2.7.1-1+deb10u1 [2899 kB]
Fetched 2899 kB in 0s (58.2 MB/s)[33m

7[0;23r8[1ASelecting previously unselected package git-lfs.
(Reading database ... 305475 files and directories currently install

In [195]:
!sudo apt update

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Hit:1 http://security.debian.org/debian-security buster/updates InRelease
Hit:2 http://deb.debian.org/debian buster InRelease                            [0m
Hit:3 http://deb.debian.org/debian buster-updates InRelease                    
Hit:4 http://deb.debian.org/debian buster-backports InRelease                  
Get:5 https://nvidia.github.io/libnvidia-container/stable/debian10/amd64  InRelease [1484 B]
Get:6 https://nvidia.github.io/nvidia-container-runtime/stable/debian10/amd64  InRelease [1481 B]
Get:7 https://nvidia.github.io/nvidia-docker/debian10/amd64  InRelease [1474 B][0m[33m
Get:8 https://download.docker.com/linux/debian buster InRelease [54.0 kB]      [0m[33m
Hit:9 http://packages.cloud.go