In [1]:
from transformers import AutoModel

# Hugging Face Hub checkpoint id of the pretrained Korean RoBERTa model.
model_id = "klue/roberta-base"
# Load the bare encoder through AutoModel (no task-specific head is requested here).
# The warning in the cell output reports that the pooler weights are newly initialized.
model = AutoModel.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
from transformers import AutoTokenizer

# Load the tokenizer published under the same checkpoint id as the model.
model_id = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [16]:
# Encode one sentence and show the raw encoding
# (input_ids, token_type_ids and attention_mask, as seen in the cell output).
tokenized = tokenizer("토크나이저는 텍스트를 토큰 단위로 나눈다")
print(tokenized)

# Render the same ids three ways: as sub-word tokens, as decoded text with the
# special tokens kept, and as decoded text with the special tokens stripped.
for rendered in (
    tokenizer.convert_ids_to_tokens(tokenized["input_ids"]),
    tokenizer.decode(tokenized["input_ids"]),
    tokenizer.decode(tokenized["input_ids"], skip_special_tokens=True),
):
    print(rendered)

{'input_ids': [0, 9157, 7461, 2190, 2259, 8509, 2138, 1793, 2855, 5385, 2200, 20950, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', '토크', '##나이', '##저', '##는', '텍스트', '##를', '토', '##큰', '단위', '##로', '나눈다', '[SEP]']
[CLS] 토크나이저는 텍스트를 토큰 단위로 나눈다 [SEP]
토크나이저는 텍스트를 토큰 단위로 나눈다


In [4]:
from datasets import load_dataset

# Load the "mrc" configuration of the KLUE dataset and print its split/feature summary.
klue_mrc_dataset = load_dataset("klue", "mrc")
print(klue_mrc_dataset)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


DatasetDict({
    train: Dataset({
        features: ['title', 'context', 'news_category', 'source', 'guid', 'is_impossible', 'question_type', 'question', 'answers'],
        num_rows: 17554
    })
    validation: Dataset({
        features: ['title', 'context', 'news_category', 'source', 'guid', 'is_impossible', 'question_type', 'question', 'answers'],
        num_rows: 5841
    })
})


In [5]:
# Load the train and validation splits of the KLUE "ynat" configuration.
klue_tc_train = load_dataset("klue", "ynat", split="train")
klue_tc_eval = load_dataset("klue", "ynat", split="validation")

# Only the last expression of a cell is rendered automatically, so the dataset
# summary has to be printed explicitly to show up next to the sample row.
print(klue_tc_train)
klue_tc_train[0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [6]:
# Metadata columns that are dropped before training.
unused_columns = ("guid", "url", "date")


def _drop_present(dataset, columns):
    """Return ``dataset`` without those ``columns`` that it still contains.

    Filtering against ``dataset.column_names`` keeps this cell idempotent:
    running it a second time is a no-op instead of an error about columns
    that were already removed.
    """
    present = [name for name in columns if name in dataset.column_names]
    return dataset.remove_columns(present) if present else dataset


klue_tc_train = _drop_present(klue_tc_train, unused_columns)
klue_tc_eval = _drop_present(klue_tc_eval, unused_columns)
klue_tc_train

Dataset({
    features: ['title', 'label'],
    num_rows: 45678
})

In [7]:
# Feature object describing the "label" column; it is reused below to turn ids into names.
klue_tc_label = klue_tc_train.features["label"]

# Expressions in the middle of a cell are evaluated but never displayed,
# so the inspection results are printed explicitly.
print(klue_tc_label)
print(klue_tc_label.int2str(1))


def make_str_label(batch):
    """Add a ``label_str`` entry holding the class name for every label id.

    Args:
        batch: Batch mapping supplied by ``Dataset.map(..., batched=True)``;
            its ``"label"`` entry is passed to ``klue_tc_label.int2str``.

    Returns:
        The same ``batch`` object, now also carrying ``"label_str"``.
    """
    label_names = klue_tc_label.int2str(batch["label"])
    batch["label_str"] = label_names
    return batch


# Add the label_str column batch-wise (1000 rows per call to make_str_label).
klue_tc_train = klue_tc_train.map(make_str_label, batched=True, batch_size=1000)

# Show one row to confirm the new column is present.
klue_tc_train[0]

{'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영', 'label': 3, 'label_str': '생활문화'}

In [8]:
# Every split below is shuffled with the same fixed seed.
split_options = {"shuffle": True, "seed": 42}

# Fine-tune on a 10,000-example sample of the original train split: the
# sampled side of train_test_split is the one returned under "test".
train_dataset = klue_tc_train.train_test_split(test_size=10000, **split_options)["test"]

# Take 1,000 test examples out of the original validation split, then draw
# 1,000 validation examples from the remainder so the two sets are disjoint.
dataset = klue_tc_eval.train_test_split(test_size=1000, **split_options)
test_dataset = dataset["test"]
valid_dataset = dataset["train"].train_test_split(test_size=1000, **split_options)["test"]

In [13]:
import torch
import numpy as np
from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)


def tokenize_function(examples):
    """Tokenize the ``title`` field of a batch of examples.

    Titles are truncated to the model limit but deliberately left unpadded.
    Padding with ``padding="max_length"`` would stretch every short headline
    to the tokenizer's maximum length, so most of each batch would be padding
    and training would be needlessly slow. Because the ``Trainer`` is given
    the tokenizer, its default collator pads each batch to the longest
    sequence in that batch instead.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(..., batched=True)``
            containing a ``"title"`` entry.

    Returns:
        The tokenizer output for the batch, one variable-length encoding per title.
    """
    return tokenizer(examples["title"], truncation=True)


model_id = "klue/roberta-base"

# Size the classification head from the class names of the "label" feature.
num_labels = len(train_dataset.features["label"].names)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tokenize the three splits identically, in train / valid / test order.
train_dataset, valid_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 10000/10000 [00:01<00:00, 9019.12 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 9908.54 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 10061.78 examples/s]


In [14]:
# Fine-tuning configuration: a single epoch with batches of 8 per device.
training_args = TrainingArguments(
    # Output location; nothing is pushed to the Hugging Face Hub.
    output_dir="./results",
    push_to_hub=False,
    # Optimisation schedule.
    learning_rate=5e-5,
    num_train_epochs=1,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate once per epoch.
    evaluation_strategy="epoch",
)


def compute_metrics(evel_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        evel_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the arg-max over the last axis of ``logits``.

    Returns:
        A dict with a single ``"accuracy"`` entry: the fraction of examples
        whose predicted class equals the label.
    """
    scores, gold = evel_pred
    predicted = np.argmax(scores, axis=-1)
    hits = predicted == gold
    return {"accuracy": hits.mean()}



- M2 Mac에서는 학습 시간이 너무 오래 걸린다.
- TODO: CUDA 환경에서는 학습에 얼마나 걸리는지 확인해 볼 것.


In [None]:
# Wire the model, training configuration, data splits and metric together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Fine-tune; the validation split is scored on the schedule set in training_args.
trainer.train()

# Final metrics on the held-out test split, shown as the cell output.
trainer.evaluate(eval_dataset=test_dataset)