In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification

def preprocess_data(example, tokenizer):
    """Tokenize the ``"document"`` entry of ``example`` with truncation enabled.

    Args:
        example: Mapping that has a ``"document"`` key. Its value is handed to
            the tokenizer unchanged.
        tokenizer: Callable invoked as ``tokenizer(documents, truncation=True)``.

    Returns:
        Whatever the tokenizer call returns.
    """
    documents = example["document"]
    encoded = tokenizer(documents, truncation=True)
    return encoded

# One checkpoint name drives both the tokenizer and the 2-label classifier.
model_name = "google-bert/bert-base-multilingual-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

dataset = load_dataset("nsmc", trust_remote_code=True)

# Step 1: tokenize in batches, dropping the "id" and "document" columns.
tokenized_dataset = dataset.map(
    lambda batch: preprocess_data(batch, tokenizer),
    batched=True,
    remove_columns=["id", "document"],
)
# Step 2: rename the target column from "label" to "labels".
processed_dataset = tokenized_dataset.rename_column("label", "labels")

# Show the raw and processed datasets, then the first training row of each.
print(dataset, processed_dataset, sep="\n")
print(dataset["train"][0], processed_dataset["train"][0], sep="\n")

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Compare two padding strategies on the same first batch of four training rows.
# shuffle=False keeps both loaders on identical rows, so only the padding differs.

# Strategy 1: padding="max_length". No explicit max_length is passed here, so the
# target length is left to the tokenizer/collator defaults
# (NOTE(review): presumably the tokenizer's model_max_length — confirm in the docs).
max_length_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="max_length"
)
max_length_dataloader = DataLoader(
    processed_dataset["train"],
    collate_fn=max_length_collator,
    batch_size=4,
    shuffle=False
)
max_length_iterator = iter(max_length_dataloader)
max_length_batch = next(max_length_iterator)
# Backward-compatible alias for the original misspelled name, in case another
# cell still refers to it.
max_lnegth_batch = max_length_batch
print("max_length 패딩 입력 id shape :", max_length_batch["input_ids"].shape)

# Strategy 2: padding="longest", i.e. pad relative to the sequences in the batch.
# This collator is also the one handed to the Trainer later in the notebook.
longest_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest"
)
longest_dataloader = DataLoader(
    processed_dataset["train"],
    collate_fn=longest_collator,
    batch_size=4,
    shuffle=False
)
longest_iterator = iter(longest_dataloader)
longest_batch = next(longest_iterator)
print("longest 패딩 입력 id shape :", longest_batch["input_ids"].shape)

In [None]:
from transformers import TrainingArguments, Trainer

# `eval_steps` only takes effect when the evaluation strategy is "steps"; the
# default strategy is "no", which would leave `eval_dataset` unused during
# training. The keyword for the strategy was renamed between transformers
# releases (`evaluation_strategy` -> `eval_strategy`), so pick whichever name
# the installed TrainingArguments dataclass declares.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="text-classification",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    num_train_epochs=1,
    eval_steps=200,
    logging_steps=200,
    seed=42,
    **{eval_strategy_key: "steps"},  # evaluate every `eval_steps` steps
)

# Train on the first 10,000 training rows and evaluate on the first 100 test
# rows; batches are padded by `longest_collator`.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=longest_collator,
    train_dataset=processed_dataset["train"].select(range(10000)),
    eval_dataset=processed_dataset["test"].select(range(100))
)

trainer.train()

In [None]:
import torch

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.eval()

# Single example sentence to classify.
text = "진짜 재밌었어요. 또 보러 갈거에요"
inputs = tokenizer(text, return_tensors="pt")

# Forward pass without gradient tracking; inputs are moved to the model's device.
with torch.no_grad():
    outputs = model(**inputs.to(device))

# Raw logits, then the index of the largest logit.
logits = outputs.logits
print(logits)
print(logits.argmax())

In [None]:
import evaluate

# Run the trainer over the whole test split and turn its outputs into the
# predicted class index (argmax along axis 1) and the reference labels.
yhat = trainer.predict(processed_dataset["test"])
references = yhat.label_ids
predictions = yhat.predictions.argmax(axis=1)

# Both metrics are computed from the same prediction/reference pair.
scoring_inputs = {"predictions": predictions, "references": references}

metric = evaluate.load("accuracy")
accuracy = metric.compute(**scoring_inputs)
print(accuracy)

metric = evaluate.load("f1")
f1 = metric.compute(**scoring_inputs)
print(f1)