In [1]:
# 문장 분류 Sequence Classification
# 보통 2개로 분류하는 것은 이진 분류(binary classification), 3개 이상으로 분류하는 것은 다중 분류(multiclass classification)라고 부름

# 2차원 문장 임베딩에서 1개 클래스를 고르는 1차원 확률분포를 반환함
# 이를 위해 모델은 고차원->저차원 데이터 압축하는 풀링pooling 작업 진행 -> reduce-sum,reduce-mean 등

# transformer encoder 모델은 문장의 모든 정보를 하나의 토큰 벡터에 저장하도록 학습 -> 문장 맨 앞의 [CLS] 토큰 -> classification을 뜻함
# [CLS]는 문장의 맨 앞에 삽입되어 분류 태스크 또는 BOS(Begin Of Sentence) 표시에 사용되고 나머지 태스크에서는 대부분 무시됨
# [CLS] 토큰 벡터를 입력으로 받는 FFNN(FeedForward Neural Network)을 추가로 부착하여 미세조정 진행

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint name shared by the tokenizer and the model.
model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The checkpoint is not fine-tuned for classification yet, so the number of
# output labels has to be passed explicitly when the model is built.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Display the module tree (embedding, encoder, pooling, classifier layers, ...).
model

2025-06-12 04:29:40.828111: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-12 04:29:40.835027: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749702580.843338   23332 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749702580.845812   23332 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749702580.852134   23332 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [4]:
# Show the id -> label mapping stored in the model configuration.
print(model.config.id2label)

# Smoke test: tokenize one sentence and run it through the model.
inputs = tokenizer("안녕? 내 강아지는 귀여워.", return_tensors="pt")

# Gradients are not needed for inference.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Index of the largest logit, then its label name (shown as the cell result).
predicted_class_id = torch.argmax(logits).item()
model.config.id2label[predicted_class_id]

{0: 'LABEL_0', 1: 'LABEL_1'}


'LABEL_1'

In [6]:
# 데이터셋 로드
from datasets import load_dataset

# Load the "sts" configuration of the "klue" dataset.
dataset = load_dataset("klue", name="sts")

# Show the summary of the train split as the cell result.
dataset["train"]

train-00000-of-00001.parquet:   0%|          | 0.00/1.52M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/68.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/519 [00:00<?, ? examples/s]

Dataset({
    features: ['guid', 'source', 'sentence1', 'sentence2', 'labels'],
    num_rows: 11668
})

In [8]:
# Preprocessing function
def process_data(batch):
    """Tokenize a batch of sentence pairs and attach their binary labels.

    Args:
        batch: A batch of examples, read through the "sentence1",
            "sentence2" and "labels" keys. Every entry of "labels" must
            provide a "binary-label" item.

    Returns:
        The tokenizer output with an added "labels" entry holding one
        binary label per example.
    """
    # truncation=True clips pairs that are longer than the tokenizer's maximum
    # length, so an over-long example cannot overflow the model's position
    # embeddings at training/inference time.
    result = tokenizer(
        batch["sentence1"],
        text_pair=batch["sentence2"],
        truncation=True,
    )
    result["labels"] = [x["binary-label"] for x in batch["labels"]]
    return result

# Run the preprocessing batch by batch; the original columns are removed so
# that only the fields returned by process_data remain.
original_columns = dataset["train"].column_names
dataset = dataset.map(process_data, batched=True, remove_columns=original_columns)

Map:   0%|          | 0/11668 [00:00<?, ? examples/s]

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

In [15]:
# 문장 길이 통일 with padding - 테스트용 10개 입력
from transformers import DataCollatorWithPadding

# Collator used to pad the examples of a batch to a common length.
collator = DataCollatorWithPadding(tokenizer)

# Trial batch built from the first 10 training examples.
train_split = dataset["train"]
batch = collator([train_split[idx] for idx in range(10)])

# Forward pass over the 10 examples; gradients are not needed here.
with torch.no_grad():
    outputs = model(**batch)
    logits = outputs.logits

# Display the raw logits as the cell result.
logits

tensor([[ 0.0021, -0.2343],
        [ 0.1941,  0.1877],
        [-0.0738, -0.3436],
        [-0.2128,  0.2313],
        [ 0.2154, -0.1252],
        [ 0.0528,  0.3561],
        [ 0.2005, -0.3857],
        [ 0.0593,  0.0934],
        [ 0.4533, -0.3088],
        [ 0.0271, -0.4454]])

In [16]:
# Predicted label per example: index of the largest logit in each row.
pred_labels = torch.argmax(logits, dim=1).cpu().numpy()
print(pred_labels)

# Reference labels taken from the collated batch.
true_labels = batch["labels"].numpy()
print(true_labels)

[0 0 0 1 0 1 0 1 0 0]
[1 0 0 0 1 0 1 0 0 1]


In [17]:
# 평가지표
import evaluate

# Load the F1 metric and score the predictions with micro averaging;
# the resulting dict is displayed as the cell result.
f1 = evaluate.load("f1")
f1.compute(
    references=true_labels,
    predictions=pred_labels,
    average="micro",
)

{'f1': 0.3}

In [19]:
################################################################
# 분류 모델 헤더는 softmax 함수 적용 전의 logit값을 반환함. 즉, 스케일링 적용 안되어있음
# logit 값은 무한대 범위를 가지므로, 이 특성을 이용하면 SequenceClassification 모델을 회귀 모델로 사용 가능
# num_labels 값을 1로 설정하면 모델은 회귀 태스크로 인식 -> 손실함수를 CE(Cross-Entropy) 대신 MSE(Mean Squared Error)를 사용함 -> 출력차원은 1로 설정됨
# 회귀 문제는 주로 MSE나 MAE(Mean Absolute Error)를 손실함수로 사용

In [20]:
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, BertModel

# Reload the tokenizer and build the model with a single output
# (num_labels=1); this rebinds the `tokenizer` and `model` variables.
tokenizer = AutoTokenizer.from_pretrained(
    "klue/bert-base",
)
model = BertForSequenceClassification.from_pretrained(
    "klue/bert-base",
    num_labels=1,
)

# Print the module tree of the rebuilt model.
print(model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e