In [1]:
# 여러 개 입력이 주어졌을 때 주어진 문장 중 옳은 것을 고르는 문제 -> 객관식
# 프롬프트+답변 쌍을 여러개 받아 그중 정답을 출력함
# 문장이 길수록 효과적임 -> 각 문장이 짧다면 모든 선택지를 한문장에 넣고 문장 분류를 하는 것이 더 효율적일 수 있음

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForMultipleChoice

model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMultipleChoice.from_pretrained(model_name) # num_labels 설정하지 않음

model

2025-06-12 05:00:42.781631: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-12 05:00:42.790423: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749704442.799006   24547 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749704442.801507   24547 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749704442.807815   24547 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

BertForMultipleChoice(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [5]:
from datasets import load_dataset

dataset = load_dataset("HAERAE-HUB/csatqa", "full") # 수능 국어문제 - 오지선다형
print(dataset["test"][10])

{'question': '<보기>를 참고하여 밑줄 친 두 단어의 의미를 한 단어에 담아\n표현한 것으로 적절하지 않은 것은?\n<보기>\n그는 손으로 방문을 <word start>세게 밀었다.<word end> (⇒밀쳤다)\n어제는 서쪽 하늘이 <word start>몹시 붉었다.<word end> (⇒붉디붉었다)', 'context': None, 'option#1': '그 집은 <word start>매우 크다. <word end>(⇒커다랗다)', 'option#2': '그는 건강을 <word start>다시 찾았다. <word end>(⇒되찾았다)', 'option#3': '그는 남의 말을 <word start>몰래 들었다. <word end>(⇒엿들었다)', 'option#4': '그는 계단에서 발을 <word start>잘못 디뎠다. <word end>(⇒헛디뎠다)', 'option#5': '그는 오늘 친구와 <word start>심하게 싸웠다. <word end>(⇒싸움질했다)', 'gold': 5, 'category': 'N/A', 'human_performance': 0.0}


In [8]:
ending_names = ["option#1", "option#2", "option#3", "option#4", "option#5"]

def preprocess_function(examples):
    first_sentences=[
        [context] * 5 for context in examples["context"]   # 모든 문제가 공유하는 보기 항목인 context칼럼을 first_sentences로 선언
    ]
    question_headers = examples["question"]  # 질문인 question 칼럼을 question_headers변수로 옮기기
    second_sentences = [
        [f"{header} {examples[end][i]}" for end in ending_names] for i,header in enumerate(question_headers) # 질문-답변 5개 결합 세팅
    ]

    # 토큰화를 위한 1차 flatten
    first_sentences = sum(first_sentences,[])
    second_sentences = sum(second_sentences, [])

    # None 데이터 처리
    first_sentences = [i if i else '' for i in first_sentences]
    second_sentences = [i if i else '' for i in second_sentences]

    # 토큰화
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # 토큰화 후 2차원 재배열
    result = {
        k: [v[i:i+5] for i in range(0,len(v),5)] for k,v in tokenized_examples.items()
    }

    # collator 사용 편하게 하기 위한 변수명 이동 -> 0번부터 시작하도록 변경
    result['labels'] = [i-1 for i in examples['gold']]

    return result



tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['test'].column_names
)

Map:   0%|          | 0/936 [00:00<?, ? examples/s]

In [16]:
# 전처리후 결과
dataset["test"][10]

{'question': '<보기>를 참고하여 밑줄 친 두 단어의 의미를 한 단어에 담아\n표현한 것으로 적절하지 않은 것은?\n<보기>\n그는 손으로 방문을 <word start>세게 밀었다.<word end> (⇒밀쳤다)\n어제는 서쪽 하늘이 <word start>몹시 붉었다.<word end> (⇒붉디붉었다)',
 'context': None,
 'option#1': '그 집은 <word start>매우 크다. <word end>(⇒커다랗다)',
 'option#2': '그는 건강을 <word start>다시 찾았다. <word end>(⇒되찾았다)',
 'option#3': '그는 남의 말을 <word start>몰래 들었다. <word end>(⇒엿들었다)',
 'option#4': '그는 계단에서 발을 <word start>잘못 디뎠다. <word end>(⇒헛디뎠다)',
 'option#5': '그는 오늘 친구와 <word start>심하게 싸웠다. <word end>(⇒싸움질했다)',
 'gold': 5,
 'category': 'N/A',
 'human_performance': 0.0}

In [28]:
# 다중 분류 태스크에서는 DataCollatorWithPadding을 사용하기 어려우므로 Collator 직접 작성

from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
        tokenizer: PreTrainedTokenizerBase
        padding: Union[bool, str, PaddingStrategy] = True
        max_length: Optional[int] = None
        pad_to_multiple_of: Optional[int] = None
        
        def __call__(self, features):
          label_name = "label" if "label" in features[0].keys() else "labels"
          labels = [feature.pop(label_name) for feature in features]
        
          batch_size = len(features)
          num_choices = len(features[0]["input_ids"])
        
          flattened_features = [
              [
                  {k: v[i] for k, v in feature.items()}
                  for i in range(num_choices)
              ]
              for feature in features
          ]
          flattened_features = sum(flattened_features, [])
        
          batch = self.tokenizer.pad(
              flattened_features,
              padding=self.padding,
              max_length=self.max_length,
              pad_to_multiple_of=self.pad_to_multiple_of,
              return_tensors="pt",
          )
        
          batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
          batch["labels"] = torch.tensor(labels, dtype=torch.int64)
          return batch
        
collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
batch = collator([tokenized_dataset["test"][i] for i in range(5)])
    

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [29]:
# 추론
with torch.no_grad():
    logits = model(**batch).logits

logits

tensor([[ 0.1068,  0.1716,  0.2760,  0.1267,  0.0006],
        [ 0.3336,  0.3597,  0.3490,  0.3621,  0.3288],
        [ 0.3613,  0.3684,  0.3666,  0.3498,  0.3922],
        [ 0.4193,  0.4347,  0.4649,  0.4608,  0.1217],
        [-0.0868, -0.0586, -0.1727, -0.1443, -0.1152]])

In [30]:
# 평가 지표
import evaluate

pred_labels = logits.argmax(dim=1).cpu().numpy()
print(pred_labels)
true_labels = batch["labels"].numpy()
print(true_labels)

f1 = evaluate.load("f1")
f1.compute(predictions=pred_labels, references=true_labels, average='micro')

[2 3 4 2 1]
[4 4 0 3 1]


{'f1': 0.2}

In [None]:



??????????????????????//









class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [ feature.pop(label_name) for feature in features ]
                   
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        flattened_features = [
            [
                {k: v[i] for k,v in feature.item()}
                for i in range(num_choices)
            ]
            for feature in features
        ]
        
        flattened_features = sum(flattened_features, [])

        batch= self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt"
        )

        batch = {k: v.view(batch_size, num_choices, -a) for k,v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)

        return batch


collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
batch = collator([tokenized_dataset['test'][i] for i in range(5)])


      

In [25]:
dataset["test"][10].item()

AttributeError: 'dict' object has no attribute 'item'