# Install

In [1]:
!pip install -U \
     datasets==2.20.0 \
     evaluate==0.4.0 \
     scikit-learn==1.4.2

Collecting datasets==2.20.0
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate==0.4.0
  Downloading evaluate-0.4.0-py3-none-any.whl.metadata (9.4 kB)
Collecting scikit-learn==1.4.2
  Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pyarrow-hotfix (from datasets==2.20.0)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==2.20.0)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets==2.20.0)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets==2.20.0)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets==2.20.0)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting response

# 5.2.2 Sequence Classification

### 모델

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [3]:
model.config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

In [4]:
inputs = tokenizer("안녕? 내 강아지는 귀여워.", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'LABEL_1'

### 데이터셋

In [5]:
from datasets import load_dataset

dataset = load_dataset("klue", "sts")
dataset["train"]

Downloading readme:   0%|          | 0.00/22.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.52M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/519 [00:00<?, ? examples/s]

Dataset({
    features: ['guid', 'source', 'sentence1', 'sentence2', 'labels'],
    num_rows: 11668
})

In [6]:
def process_data(batch):
  result = tokenizer(batch["sentence1"], text_pair=batch["sentence2"])
  result["labels"] = [x["binary-label"] for x in batch["labels"]]
  return result

dataset = dataset.map(
    process_data,
    batched=True,
    remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/11668 [00:00<?, ? examples/s]

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorWithPadding

collator = DataCollatorWithPadding(tokenizer)
batch = collator([dataset["train"][i] for i in range(10)])

In [8]:
with torch.no_grad():
  logits = model(**batch).logits

logits

tensor([[-0.0214, -0.1048],
        [-0.3059, -0.1155],
        [ 0.2583,  0.2576],
        [-0.1867,  0.3904],
        [ 0.1820, -0.1036],
        [-0.3989, -0.2500],
        [ 0.0538,  0.1395],
        [-0.1382,  0.2030],
        [-0.3700, -0.1376],
        [ 0.1899, -0.0550]])

### 평가 지표

In [9]:
pred_labels = logits.argmax(dim=1).cpu().numpy()
true_labels = batch["labels"].numpy()
print(pred_labels)
print(true_labels)

[0 1 0 1 0 1 1 1 1 0]
[1 0 0 0 1 0 1 0 0 1]


In [10]:
import evaluate

f1 = evaluate.load("f1")
f1.compute(predictions=pred_labels, references=true_labels, average="micro")

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

{'f1': 0.2}

### 회귀

In [11]:
import torch
from transformers import AutoTokenizer, BertForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")
model = BertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=1)
print(model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
with torch.no_grad():
  logits = model(**batch).logits

logits

tensor([[-0.2084],
        [-0.0209],
        [-0.3917],
        [-0.5131],
        [-0.1470],
        [ 0.1152],
        [-0.2188],
        [ 0.3465],
        [-0.1856],
        [-0.1182]])

# 5.2.3 Multiple Choice

### 모델

In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForMultipleChoice

model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMultipleChoice.from_pretrained(model_name)
model

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForMultipleChoice(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

### 데이터셋

In [14]:
from datasets import load_dataset

dataset = load_dataset("HAERAE-HUB/csatqa", "full")
print(dataset["test"][0])

ending_names = ["option#1", "option#2", "option#3", "option#4", "option#5"]

def preprocess_function(examples):
  first_sentences = [
      [context] * 5 for context in examples["context"]
  ]
  question_headers = examples["question"]
  second_sentences = [
      [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
  ]
  # 토큰화를 위해 1차원으로 평활화
  first_sentences = sum(first_sentences, [])
  second_sentences = sum(second_sentences, [])

  # None 데이터 처리
  first_sentences = [i if i else "" for i in first_sentences]
  second_sentences = [i if i else "" for i in second_sentences]

  tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

  # 토큰화 후 다시 2차원으로 재배열
  result = {
      k: [v[i:i+5] for i in range(0, len(v), 5)] for k, v in tokenized_examples.items()
  }
  result["labels"] = [i-1 for i in examples["gold"]]  # 원활한 collator 사용을 위한 변수명 이동, 레이블 0번부터 시작하게 변경

  return result

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["test"].column_names)

Downloading data:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/936 [00:00<?, ? examples/s]

{'question': ' 이 이야기에서 얻을 수 있는 교훈으로 가장 적절한 것은?', 'context': '이제 한 편의 이야기를 들려 드립니다. 잘 듣고 물음에 답하십시오.\n자, 여러분! 안녕하십니까? 오늘은 제가 어제 꾼 꿈 이야기 하날 들려 드리겠습니다. 전 꿈속에서 낯선 거리를 걷고 있었습니다. 그러다가 홍미로운 간판을 발견했답니다. 행 복을 파는 가게. 그렇게 쓰여 있었습니다. 전 호기심으로 문을 열고 들어갔답니다. 그곳 에서는 한 노인이 물건을 팔고 있었습니다. 전 잠시 머뭇거리다가 노인에게 다가가서 물 었습니다. 여기서는 무슨 물건을 파느냐고요. 노인은 미소를 지으며, 원하는 것은 뭐든 다 살 수 있다고 말했습니다. 저는 제 귀를 의심했습니다. \'무엇이든 다?\' 전 무엇을 사야 할까 생각하다가 말했답니다. "사랑, 부귀 그리고 지혜하고 건강도 사고 싶습니다. 저 자신뿐 아니라 우리 가족 모두 를 위해서요. 지금 바로 살 수 있나요?" 그러자 노인은 빙긋이 웃으며 대답했습니다. "젊은이, 한번 잘 보게나. 여기에서 팔고 있는 것은 무르익은 과일이 아니라 씨앗이라 네. 앞으로 좋은 열매를 맺으려면 이 씨앗들을 잘 가꾸어야 할 걸세."', 'option#1': '새로운 세계에 대한 열망을 가져야 한다.', 'option#2': '주어진 기회를 능동적으로 활용해야 한다.', 'option#3': '큰 것을 얻으려면 작은 것은 버려야 한다.', 'option#4': '물질적 가치보다 정신적 가치를 중시해야 한다.', 'option#5': '소망하는 바를 성취하기 위해서는 노력을 해야 한다.', 'gold': 5, 'category': 'N/A', 'human_performance': 0.0}


Map:   0%|          | 0/936 [00:00<?, ? examples/s]

In [15]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch


@dataclass
class DataCollatorForMultipleChoice:
  tokenizer: PreTrainedTokenizerBase
  padding: Union[bool, str, PaddingStrategy] = True
  max_length: Optional[int] = None
  pad_to_multiple_of: Optional[int] = None

  def __call__(self, features):
    label_name = "label" if "label" in features[0].keys() else "labels"
    labels = [feature.pop(label_name) for feature in features]

    batch_size = len(features)
    num_choices = len(features[0]["input_ids"])

    flattened_features = [
        [
            {k: v[i] for k, v in feature.items()}
            for i in range(num_choices)
        ]
        for feature in features
    ]
    flattened_features = sum(flattened_features, [])

    batch = self.tokenizer.pad(
        flattened_features,
        padding=self.padding,
        max_length=self.max_length,
        pad_to_multiple_of=self.pad_to_multiple_of,
        return_tensors="pt",
    )

    batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
    batch["labels"] = torch.tensor(labels, dtype=torch.int64)
    return batch

collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
batch = collator([tokenized_dataset["test"][i] for i in range(5)])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [16]:
with torch.no_grad():
  logits = model(**batch).logits

logits

tensor([[ 0.2072,  0.1837,  0.1191,  0.2381,  0.2946],
        [ 0.1826,  0.2586,  0.1497,  0.2627,  0.2450],
        [ 0.0198, -0.0332,  0.0022, -0.0012,  0.0795],
        [ 0.0885,  0.1150,  0.0509,  0.0649, -0.0134],
        [ 0.3183,  0.3543,  0.3392,  0.3629,  0.3864]])

### 평가 지표

In [17]:
import evaluate

pred_labels = logits.argmax(dim=1).cpu().numpy()
true_labels = batch["labels"].numpy()
print(pred_labels)
print(true_labels)

f1 = evaluate.load("f1")
f1.compute(predictions=pred_labels, references=true_labels, average="micro")

[4 3 4 1 4]
[4 4 0 3 1]


{'f1': 0.2}

# 5.2.4 Token Classification

### 모델

In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
model

Some weights of BertForTokenClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

### 데이터셋

In [19]:
from datasets import load_dataset

dataset = load_dataset("klue", "ner")

sample = dataset["train"][0]
print("tokens : ", sample["tokens"][: 20])
print("ner tags : ", sample["ner_tags"][: 20])
print((len(sample["tokens"]), len(sample["tokens"])))

Downloading data:   0%|          | 0.00/4.21M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/21008 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

tokens :  ['특', '히', ' ', '영', '동', '고', '속', '도', '로', ' ', '강', '릉', ' ', '방', '향', ' ', '문', '막', '휴', '게']
ner tags :  [12, 12, 12, 2, 3, 3, 3, 3, 3, 12, 2, 3, 12, 12, 12, 12, 2, 3, 3, 3]
(66, 66)


In [20]:
for i in range(len(sample["ner_tags"])):
  print(sample["tokens"][i], "\t", sample["ner_tags"][i])

특 	 12
히 	 12
  	 12
영 	 2
동 	 3
고 	 3
속 	 3
도 	 3
로 	 3
  	 12
강 	 2
릉 	 3
  	 12
방 	 12
향 	 12
  	 12
문 	 2
막 	 3
휴 	 3
게 	 3
소 	 3
에 	 12
서 	 12
  	 12
만 	 2
종 	 3
분 	 3
기 	 3
점 	 3
까 	 12
지 	 12
  	 12
5 	 8
㎞ 	 9
  	 12
구 	 12
간 	 12
에 	 12
는 	 12
  	 12
승 	 12
용 	 12
차 	 12
  	 12
전 	 12
용 	 12
  	 12
임 	 12
시 	 12
  	 12
갓 	 12
길 	 12
차 	 12
로 	 12
제 	 12
를 	 12
  	 12
운 	 12
영 	 12
하 	 12
기 	 12
로 	 12
  	 12
했 	 12
다 	 12
. 	 12


In [21]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # 토큰을 해당 단어에 매핑
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # 스페셜 토큰을 -100으로 세팅
            if word_idx is None:
                label_ids.append(12)
                # label_ids.append(-100)
            elif word_idx != previous_word_idx:  # 주어진 단어의 첫 번째 토큰에만 레이블을 지정
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/21008 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [22]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
batch = data_collator([tokenized_dataset["train"][i] for i in range(10)])

In [23]:
id2label = {
    0: "B-DT",
    1: "I-DT",
    2: "B-LC",
    3: "I-LC",
    4: "B-OG",
    5: "I-OG",
    6: "B-PS",
    7: "I-PS",
    8: "B-QT",
    9: "I-QT",
    10: "B-TI",
    11: "I-TI",
    12: "O",
}
label2id = {
    "B-DT": 0,
    "I-DT": 1,
    "B-LC": 2,
    "I-LC": 3,
    "B-OG": 4,
    "I-OG": 5,
    "B-PS": 6,
    "I-PS": 7,
    "B-QT": 8,
    "I-QT": 9,
    "B-TI": 10,
    "I-TI": 11,
    "O": 12,
}

In [24]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    "klue/bert-base", num_labels=13, id2label=id2label, label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
with torch.no_grad():
  logits = model(**batch).logits

predictions = torch.argmax(logits, dim=2)
predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
predicted_token_class

['B-PS',
 'O',
 'B-TI',
 'B-OG',
 'I-DT',
 'I-QT',
 'O',
 'B-PS',
 'B-TI',
 'O',
 'I-QT',
 'I-LC',
 'B-TI',
 'O',
 'I-QT',
 'I-TI',
 'B-QT',
 'I-DT',
 'B-QT',
 'B-QT',
 'I-DT',
 'O',
 'I-TI',
 'I-DT',
 'O',
 'I-TI',
 'B-PS',
 'I-QT',
 'B-TI',
 'B-PS',
 'B-PS',
 'B-TI',
 'B-QT',
 'B-QT',
 'I-QT',
 'B-DT',
 'I-TI',
 'B-QT',
 'O',
 'B-TI',
 'B-TI',
 'B-PS',
 'B-PS',
 'I-QT',
 'B-PS',
 'B-PS',
 'I-QT',
 'B-DT',
 'B-QT',
 'I-DT',
 'B-TI',
 'I-PS',
 'I-DT',
 'O',
 'O',
 'B-PS',
 'B-PS',
 'B-TI',
 'B-TI',
 'I-TI',
 'O',
 'I-QT',
 'O',
 'O',
 'O',
 'I-QT',
 'I-QT',
 'I-QT',
 'B-PS',
 'I-QT',
 'I-TI',
 'I-QT',
 'I-QT',
 'B-PS',
 'B-PS',
 'O',
 'O',
 'B-TI',
 'B-PS',
 'O',
 'O',
 'B-TI',
 'B-TI',
 'I-OG',
 'B-TI',
 'B-TI',
 'B-TI',
 'B-PS',
 'B-TI',
 'B-TI',
 'I-QT',
 'I-QT',
 'I-QT',
 'I-QT',
 'B-TI',
 'B-TI',
 'I-QT',
 'I-QT',
 'I-QT',
 'I-QT',
 'I-QT',
 'B-TI',
 'O',
 'O',
 'B-TI',
 'B-TI',
 'B-PS',
 'B-QT',
 'I-QT',
 'O',
 'O',
 'I-QT',
 'I-TI',
 'B-TI',
 'I-QT',
 'B-TI',
 'I-QT']

### 평가 지표

In [26]:
import evaluate

pred_labels = logits.argmax(dim=-1).view(-1).cpu().numpy()
true_labels = batch["labels"].view(-1).numpy()
pred_labels.shape, true_labels.shape

f1 = evaluate.load("f1")
f1.compute(predictions=pred_labels, references=true_labels, average="micro")

{'f1': 0.023076923076923078}

# 5.2.5 Question Answering

### 모델

In [27]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
model

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

### 데이터셋

In [28]:
from datasets import load_dataset

dataset = load_dataset("klue", "mrc")
sample = dataset["train"][0]

print(f"내용 : {sample['context'][:50]}")
print(f"질문 : {sample['question']}")
print(f"답변 : {sample['answers']}")

Downloading data:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17554 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5841 [00:00<?, ? examples/s]

내용 : 올여름 장마가 17일 제주도에서 시작됐다. 서울 등 중부지방은 예년보다 사나흘 정도 늦은 
질문 : 북태평양 기단과 오호츠크해 기단이 만나 국내에 머무르는 기간은?
답변 : {'answer_start': [478, 478], 'text': ['한 달가량', '한 달']}


In [29]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/17554 [00:00<?, ? examples/s]

Map:   0%|          | 0/5841 [00:00<?, ? examples/s]

In [30]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()
batch = data_collator([tokenized_dataset["train"][i] for i in range(10)])
batch

{'input_ids': tensor([[    2,  1174, 18956,  ...,  2170,  2259,     3],
         [    2,  3920, 31221,  ...,  8055,  2867,     3],
         [    2,  8813,  2444,  ...,  3691,  4538,     3],
         ...,
         [    2,  6860, 19364,  ...,  2532,  6370,     3],
         [    2, 27463, 23413,  ..., 21786,  2069,     3],
         [    2,  3659,  2170,  ...,  2470,  3703,     3]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         ...,
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 1, 1, 1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'start_positions': tensor([260,  31,   0,  80,  72,  81, 216, 348, 323, 348]),
 'end_positions': tensor([263,  33,   0,  81,  78

In [31]:
with torch.no_grad():
    outputs = model(**batch)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

predict_answer_tokens = batch["input_ids"][0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'##한 오호츠크해 기단이 만나 형성되는 장마전선에서 내리는 비를 뜻한다. 장마전선은 18일 제주도 먼 남쪽 해상으로 내려갔다가 20일께 다시 북상해 전남 남해안까지 영향을 줄 것으로 보인다. 이에 따라 20 ~ 21일 남부지방에도 예년보다 사흘 정도 장마가 일찍 찾아올 전망이다. 그러나 장마전선을 밀어올리는 북태평양 고기압 세력이 약해 서울 등 중부지방은 평년보다 사나흘가량 늦은 이달 말부터 장마가 시작될 것이라는 게 기상청의 설명이다. 장마전선은 이후 한 달가량 한반도 중남부를 오르내리며 곳곳에 비를 뿌릴 전망이다. 최근 30년간 평균치에 따르면 중부지방의 장마 시작일은 6월24 ~ 25일이었으며 장마기간은 32일, 강수일수는 17. 2일이었다. 기상청은 올해 장마기간의 평균 강수량이 350 ~ 400㎜로 평년과 비슷하거나 적을 것으로 내다봤다. 브라질 월드컵 한국과 러시아의 경기가 열리는 18일 오전 서울은 대체로 구름이 많이 끼지만 비는 오지 않을 것으로 예상돼 거리 응원에는 [SEP]'