In [18]:
import numpy as np
from datasets import load_dataset
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.metrics import accuracy_score, f1_score

# Show which transformers release is installed in this kernel.
print(transformers.__version__)


4.56.1


In [19]:
# Print the built-in documentation of TrainingArguments (exploratory lookup; the output is very long).
help(TrainingArguments)


Help on class TrainingArguments in module transformers.training_args:

class TrainingArguments(builtins.object)
 |
 |  TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
 |  itself**.
 |
 |  Using [`HfArgumentParser`] we can turn this class into
 |  [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
 |  command line.
 |
 |  Parameters:
 |      output_dir (`str`, *optional*, defaults to `"trainer_output"`):
 |          The output directory where the model predictions and checkpoints will be written.
 |      overwrite_output_dir (`bool`, *optional*, defaults to `False`):
 |          If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir`
 |          points to a checkpoint directory.
 |      do_train (`bool`, *optional*, defaults to `False`):
 |          Whether to run training or not. This argument is not directly used 

In [4]:
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

--2025-09-16 23:52:10--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4893335 (4.7M) [application/octet-stream]
Saving to: ‘ratings_test.txt’


2025-09-16 23:52:10 (97.4 MB/s) - ‘ratings_test.txt’ saved [4893335/4893335]

--2025-09-16 23:52:10--  https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14628807 (14M) [text/plain]
Saving to: ‘ratings_train.txt’


2025-09-16 23:52:11 (167 MB/s) - ‘ratings_

In [5]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Read the two tab-separated rating files from the working directory.
train_df = pd.read_csv("ratings_train.txt", sep="\t")
test_df = pd.read_csv("ratings_test.txt", sep="\t")

# Convert the training frame to a Dataset and hold out 10% of it for
# validation; the fixed seed keeps the split the same on every run.
train_valid = Dataset.from_pandas(train_df).train_test_split(test_size=0.1, seed=42)
train_ds, val_ds = train_valid["train"], train_valid["test"]

# The test file is converted as-is.
test_ds = Dataset.from_pandas(test_df)

# Bundle the three splits so they can be inspected together.
dataset = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 135000
    })
    validation: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 15000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})


In [12]:
# Remove rows with missing values: keep only examples whose "document" is not None.
train_ds, val_ds, test_ds = (
    split.filter(lambda row: row["document"] is not None)
    for split in (train_ds, val_ds, test_ds)
)


Filter:   0%|          | 0/135000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
# 2. Prepare the model and the tokenizer
# Both are loaded from the same pretrained checkpoint name.
MODEL_NAME = "klue/bert-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# num_labels=2 requests a two-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import AutoTokenizer

# Load the tokenizer from MODEL_NAME (defined in the model-setup cell) instead of
# repeating the checkpoint string, so the tokenizer and the model cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(batch):
    """Tokenize the ``document`` field of a batch of examples.

    Texts longer than 128 tokens are truncated. No padding is applied here:
    the ``DataCollatorWithPadding`` passed to the ``Trainer`` pads each batch
    to the length of its own longest sequence. Padding every example to 128
    tokens up front would make that dynamic padding a no-op and force the
    model to process padding tokens for nothing.

    Args:
        batch: Mapping with a ``"document"`` entry holding the texts to
            tokenize (this function is applied with ``Dataset.map`` and
            ``batched=True``).

    Returns:
        The tokenizer output for the batch, with unpadded, variable-length
        sequences.
    """
    return tokenizer(
        batch["document"],
        truncation=True,
        max_length=128,
    )

# Tokenize every split in batched mode (train, then validation, then test).
train_ds, val_ds, test_ds = (
    split.map(preprocess, batched=True)
    for split in (train_ds, val_ds, test_ds)
)



Map:   0%|          | 0/134995 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/49997 [00:00<?, ? examples/s]

In [15]:
# 4. Metric definition
def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted),
    }

In [20]:
# 5. Training hyperparameters
training_arguments = TrainingArguments(
    output_dir="klue_nsmc_output",
    report_to="none",  # turn off external loggers such as wandb
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate and save once per epoch; "accuracy" is the key returned by compute_metrics
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [21]:
# 6. Build the Trainer
# The collator pads each batch at the moment it is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # `tokenizer=` is deprecated on Trainer and emits a FutureWarning;
    # `processing_class=` is the replacement argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# 7. Training
# Runs the training loop with the arguments and datasets given to the Trainer above.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2541,0.253286,0.905467,0.904525


In [None]:
# 8. Evaluation (validation + test)
# Evaluate each split and print its metrics right away, validation first.
val_metrics = trainer.evaluate(eval_dataset=val_ds)
print("Validation 결과:", val_metrics)

test_metrics = trainer.evaluate(eval_dataset=test_ds)
print("Test 결과:", test_metrics)

https://medium.com/data-science/divide-hugging-face-transformers-training-time-by-2-or-more-21bf7129db9q-21bf7129db9e  
를 읽어보면, Dynamic Padding과 Bucketing을 적용했을 때 훈련 시간이 단축되고 모델 성능도 향상되었다고 합니다. 배치 크기 64 기준으로 훈련 시간은 48분에서 30분으로 줄었고, 정확도는 81.0%에서 81.7%로 향상되었습니다.