In [None]:
#################################
# 학습 이후 평가 과정에서는 학습 단위마다 모델 손실이 출력됨
# 그러나 이것은 prediction과 label의 차이를 계산한 것일 뿐, 일정한 범위로 scaling(정규화)되지는 않음
# 객관적 평가를 위해서는 평가 지표를 사용
#    -> 태스크별로 다른 평가 지표를 사용하며, 각각 범위가 정해져 있으므로 객관화 점수 평가가 용이함

In [None]:
##############################
# Evaluate는 허깅페이스에서 제공하는 평가 지표 사용을 위한 라이브러리임
# 일반적으로 사용가능한 평가 지표들
#   > 분류 task : accuracy, f1 score, precision, recall
#   > 생성 task : BLEU(BiLingual Evaluation Understudy), ROUGE(Recall-Oriented Understudy for Gisting Evaluation)

In [1]:
import evaluate

# Load a single metric by name.
acc = evaluate.load("accuracy")
# Bundle several metrics so that one compute() call reports all of them.
metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

# 1) One-shot: hand every prediction/reference to compute() at once.
print(metrics.compute(references=[0, 1, 0, 1], predictions=[1, 0, 0, 1]))

# 2) Incremental: buffer one (prediction, reference) pair per add() call,
#    then evaluate everything that was buffered with a bare compute().
for pred, y in zip([1, 0, 0, 1], [0, 1, 0, 1]):
    metrics.add(references=y, predictions=pred)
print(metrics.compute())

# 3) Batched: same idea as add(), but each add_batch() call buffers a whole
#    batch of predictions and references.
for preds, y in zip([[1, 0], [0, 1]], [[0, 1], [0, 1]]):
    metrics.add_batch(references=y, predictions=preds)
print(metrics.compute())

2025-06-11 07:17:14.867253: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-11 07:17:14.874002: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749626234.882345   21447 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749626234.884808   21447 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749626234.891133   21447 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

{'accuracy': 0.5, 'f1': 0.5, 'precision': 0.5, 'recall': 0.5}
{'accuracy': 0.5, 'f1': 0.5, 'precision': 0.5, 'recall': 0.5}
{'accuracy': 0.5, 'f1': 0.5, 'precision': 0.5, 'recall': 0.5}


In [2]:
# 커스텀 메트릭 만들기
# 커스텀 메트릭 예시
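# 예측 결과 객체(EvalPrediction) 하나를 입력받아 dictionary 형태로 반환하는 함수라면 Trainer 클래스 매개변수 중 compute_metrics에 입력하여 사용 가능
#   (아래 simple_accuracy는 (preds, labels)를 직접 받으므로 Trainer에 넘기려면 한 번 감싸야 함)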
def simple_accuracy(preds, labels):
    """Return the fraction of predictions that equal their labels.

    Args:
        preds: Predicted class ids. May be a torch tensor, a NumPy array or a
            (nested) Python list; non-tensor inputs are converted to tensors.
        labels: Reference class ids, comparable element-wise with ``preds``.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is a Python float.

    Note:
        This takes ``(preds, labels)`` directly. The ``compute_metrics`` hook
        used elsewhere in this notebook (``custom_metrics``) receives a single
        prediction object instead, so wrap this function before passing it to
        a Trainer.
    """
    # Local import keeps the helper self-contained; the original body already
    # required torch tensors because it called ``.to(float)`` on the result.
    import torch

    # as_tensor returns an existing tensor unchanged, so tensor inputs behave
    # exactly as before; arrays and lists are converted instead of failing.
    preds = torch.as_tensor(preds)
    # Put converted labels on the same device as the predictions.
    labels = torch.as_tensor(labels, device=preds.device)

    return {"accuracy": (preds == labels).to(float).mean().item()}

In [7]:
# Custom metric example - micro-averaged F1: true positives, false positives and
# false negatives are pooled over all classes before F1 is computed. (Averaging
# the per-class F1 scores would be macro F1, i.e. average="macro".)
def custom_metrics(pred):
    """Compute micro-averaged F1 for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (per-class scores;
            the arg-max over the last axis is taken as the predicted class)
            and ``label_ids`` (reference class ids).

    Returns:
        The dict produced by the F1 metric's ``compute`` call; the training
        log in this notebook reports it under an ``F1`` column.
    """
    # Load the metric once and reuse it: this function runs on every
    # evaluation pass, and re-loading the metric each time is wasted work.
    # The instance is cached on the function object to avoid a new global.
    f1 = getattr(custom_metrics, "_f1_metric", None)
    if f1 is None:
        f1 = custom_metrics._f1_metric = evaluate.load("f1")

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    return f1.compute(predictions=preds, references=labels, average='micro')

    

In [8]:
from datasets import load_dataset
from transformers import (
                            AutoTokenizer, 
                            AutoModelForSequenceClassification,
                            Trainer,
                            TrainingArguments,
                            default_data_collator
                        )

# Checkpoint shared by the tokenizer and the classifier.
model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The classification head is newly initialised (see the "Some weights ... are
# newly initialized" message this cell prints), so the model must be fine-tuned.
# NOTE(review): num_labels=7 is presumably the number of YNAT topic classes - confirm.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7)

# KLUE "ynat" configuration; the DatasetDict printed by this cell shows
# `train` and `validation` splits.
dataset = load_dataset("klue", "ynat")

def tokenize_function(sample):
    """Tokenize the ``title`` field of a batch of examples.

    Args:
        sample: Batch passed in by ``Dataset.map(batched=True)``; only its
            ``"title"`` entry is read.

    Returns:
        The tokenizer output for the batch, padded to a fixed length.
    """
    return tokenizer(
        sample["title"],
        # Fixed-length padding: the Trainer in this notebook uses
        # default_data_collator, which does no padding of its own.
        padding="max_length",
        # Cut over-long titles down to the same limit so that every row has
        # the same length and never exceeds what the model can accept.
        truncation=True,
    )

# Tokenize every split in batches of 1000 examples and drop the raw columns
# that are not needed as model inputs (the label column is kept).
datasets = dataset.map(
    tokenize_function,
    remove_columns=["guid", "title", "url", "date"],
    batch_size=1000,
    batched=True,
)
# Show the resulting splits, columns and row counts.
print(datasets)

# Training configuration: 500 training steps, with logging and checkpointing
# every 50 steps.
args = TrainingArguments(
    # Where checkpoints and TensorBoard logs are written.
    output_dir="/home/ubuntu/model_path/evaluate/outputs",
    logging_dir="/home/ubuntu/model_path/evaluate/logs",
    report_to="tensorboard",
    # Optimisation settings.
    learning_rate=5e-5,
    max_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Step-based evaluation / logging / saving. No eval_steps is given; the
    # training log in this notebook shows an evaluation row every 50 steps.
    eval_strategy="steps",
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="steps",
    save_steps=50,
)

# Wire the model, data, collator and metric function into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated in recent transformers releases, and the warning this cell used
# to emit pointed at this call.
# NOTE(review): requires a transformers version that accepts
# `processing_class` - confirm against the installed release.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    processing_class=tokenizer,
    # Inputs are already padded to a fixed length during tokenization, so the
    # default collator is sufficient.
    data_collator=default_data_collator,
    # Called on each evaluation pass; its result appears as the F1 column.
    compute_metrics=custom_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 45678
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9107
    })
})


  trainer = Trainer(


In [10]:
# Run fine-tuning with the configured Trainer. The recorded output shows an
# evaluation row (validation loss and F1) every 50 steps up to step 500, and
# the returned TrainOutput summary is displayed as the cell's value.
trainer.train()

Step,Training Loss,Validation Loss,F1
50,1.0935,0.761791,0.764687
100,0.5373,0.560452,0.831009
150,0.4374,0.516203,0.838146
200,0.4674,0.501851,0.837268
250,0.433,0.483207,0.849017
300,0.4214,0.496619,0.837048
350,0.3862,0.503887,0.83639
400,0.4756,0.458163,0.849566
450,0.4149,0.438374,0.85286
500,0.3636,0.425681,0.855935


TrainOutput(global_step=500, training_loss=0.5030346031188965, metrics={'train_runtime': 577.7653, 'train_samples_per_second': 13.846, 'train_steps_per_second': 0.865, 'total_flos': 2104982937600000.0, 'train_loss': 0.5030346031188965, 'epoch': 0.17513134851138354})