In [3]:
from transformers import AutoModel

model_id = "klue/roberta-base"
model = AutoModel.from_pretrained(model_id)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import AutoTokenizer

model_id = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [5]:
tokenized = tokenizer("토크나이저는 텍스트를 토큰 단위로 나눈다")
print(tokenized)

print(tokenizer.convert_ids_to_tokens(tokenized["input_ids"]))

print(tokenizer.decode(tokenized["input_ids"]))

print(tokenizer.decode(tokenized["input_ids"], skip_special_tokens=True))

{'input_ids': [0, 9157, 7461, 2190, 2259, 8509, 2138, 1793, 2855, 5385, 2200, 20950, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['[CLS]', '토크', '##나이', '##저', '##는', '텍스트', '##를', '토', '##큰', '단위', '##로', '나눈다', '[SEP]']
[CLS] 토크나이저는 텍스트를 토큰 단위로 나눈다 [SEP]
토크나이저는 텍스트를 토큰 단위로 나눈다


In [6]:
from datasets import load_dataset

klue_mrc_dataset = load_dataset("klue", "mrc")
print(klue_mrc_dataset)

DatasetDict({
    train: Dataset({
        features: ['title', 'context', 'news_category', 'source', 'guid', 'is_impossible', 'question_type', 'question', 'answers'],
        num_rows: 17554
    })
    validation: Dataset({
        features: ['title', 'context', 'news_category', 'source', 'guid', 'is_impossible', 'question_type', 'question', 'answers'],
        num_rows: 5841
    })
})


In [7]:
klue_tc_train = load_dataset("klue", "ynat", split="train")
klue_tc_eval = load_dataset("klue", "ynat", split="validation")
klue_tc_train
klue_tc_train[0]

{'guid': 'ynat-v1_train_00000',
 'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영',
 'label': 3,
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=227&oid=001&aid=0008508947',
 'date': '2016.06.30. 오전 10:36'}

In [8]:
klue_tc_train = klue_tc_train.remove_columns(["guid", "url", "date"])
klue_tc_eval = klue_tc_eval.remove_columns(["guid", "url", "date"])
klue_tc_train

Dataset({
    features: ['title', 'label'],
    num_rows: 45678
})

In [9]:
klue_tc_train.features["label"]

klue_tc_train.features["label"].int2str(1)

klue_tc_label = klue_tc_train.features["label"]


def make_str_label(batch):
    batch["label_str"] = klue_tc_label.int2str(batch["label"])
    return batch


klue_tc_train = klue_tc_train.map(make_str_label, batched=True, batch_size=1000)

klue_tc_train[0]

{'title': '유튜브 내달 2일까지 크리에이터 지원 공간 운영', 'label': 3, 'label_str': '생활문화'}

In [10]:
train_dataset = klue_tc_train.train_test_split(test_size=10000, shuffle=True, seed=42)[
    "test"
]
dataset = klue_tc_eval.train_test_split(test_size=1000, shuffle=True, seed=42)
test_dataset = dataset["test"]
valid_dataset = dataset["train"].train_test_split(
    test_size=1000, shuffle=True, seed=42
)["test"]

In [11]:
import torch
import numpy as np
from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)


def tokenize_function(examples):
    return tokenizer(examples["title"], padding="max_length", truncation=True)


model_id = "klue/roberta-base"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=len(train_dataset.features["label"].names)
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1000/1000 [00:00<00:00, 3532.22 examples/s]


In [10]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    push_to_hub=False,
)


def compute_metrics(evel_pred):
    logits, labels = evel_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}



- M2 Mac 에서 시간이 너무 오래 걸린다.
- cuda (RTX3060) 에서 11m 걸림


In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.evaluate(test_dataset)

  trainer = Trainer(
 40%|████      | 500/1250 [03:59<06:01,  2.08it/s]

{'loss': 0.6781, 'grad_norm': 5.031581401824951, 'learning_rate': 3e-05, 'epoch': 0.4}


 80%|████████  | 1000/1250 [08:09<02:01,  2.06it/s]

{'loss': 0.5295, 'grad_norm': 39.430267333984375, 'learning_rate': 1e-05, 'epoch': 0.8}


                                                   
100%|██████████| 1250/1250 [10:40<00:00,  1.95it/s]


{'eval_loss': 0.5222100615501404, 'eval_accuracy': 0.844, 'eval_runtime': 17.9256, 'eval_samples_per_second': 55.786, 'eval_steps_per_second': 6.973, 'epoch': 1.0}
{'train_runtime': 640.5821, 'train_samples_per_second': 15.611, 'train_steps_per_second': 1.951, 'train_loss': 0.5787640991210937, 'epoch': 1.0}


100%|██████████| 125/125 [00:17<00:00,  7.06it/s]


{'eval_loss': 0.5008647441864014,
 'eval_accuracy': 0.842,
 'eval_runtime': 17.8595,
 'eval_samples_per_second': 55.993,
 'eval_steps_per_second': 6.999,
 'epoch': 1.0}

In [12]:
import torch
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from transformers import AdamW

def tokenize_function(examples):
    return tokenizer(examples["title"], padding="max_length", truncation=True)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model_id = "klue/roberta-base"
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=len(train_dataset.features["label"].names)
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [13]:
def make_dataloader(dataset, batch_size, shuffle=True):
    dataset = dataset.map(tokenize_function, batched=True).with_format("torch")
    # 데이터셋에 토큰화 수행
    dataset = dataset.rename_column("label", "labels")
    dataset = dataset.remove_columns(column_names=["title"])
    return DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle
    )
    

# 데이터로더 만들기
train_dataloader = make_dataloader(train_dataset, batch_size=8, shuffle=True)
valid_dataloader = make_dataloader(valid_dataset, batch_size=8, shuffle=False)
test_dataloader = make_dataloader(test_dataset, batch_size=8, shuffle=False)

Map: 100%|██████████| 10000/10000 [00:01<00:00, 6513.23 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 6022.73 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 6410.29 examples/s]


In [14]:
def train_epoch(model, dataloader, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)               # 모델에 입력할 토큰 아이디
        attention_mask = batch["attention_mask"].to(device)     # 모델에 입력할 어텐션 마스크
        labels = batch["labels"].to(device)                     # 모델에 입력할 레이블
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)    # 모델 계산
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [15]:
def evaluate(model, dataloader):
    model.eval()
    total_loss = 0
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            loss = outputs.loss
            total_loss += loss.item()
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    avg_loss = total_loss / len(dataloader)
    accuracy = np.mean(np.array(predictions) == np.array(true_labels))
    return avg_loss, accuracy

In [16]:
num_epochs = 1
optimizer = AdamW(model.parameters(), lr=5e-5)

# 학습 루프
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train_epoch(model, train_dataloader, optimizer)
    print(f"Train loss: {train_loss}")
    valid_loss, valid_accuracy = evaluate(model, valid_dataloader)
    print(f"Valid loss: {valid_loss}, Valid accuracy: {valid_accuracy}")

# 테스트
_, test_accuracy = evaluate(model, test_dataloader)
print(f"Test accuracy: {test_accuracy}")



Epoch 1/1


100%|██████████| 1250/1250 [09:35<00:00,  2.17it/s]


Train loss: 0.6366165441691876


100%|██████████| 125/125 [00:16<00:00,  7.46it/s]


Valid loss: 0.9518906944990158, Valid accuracy: 0.711


100%|██████████| 125/125 [00:16<00:00,  7.46it/s]

Test accuracy: 0.694





In [None]:
from huggingface_hub import login

login(token="...")
repo_id = f"zzingo5/roberta-base-klue-ynat-classification"
# Trainer를 사용한 경우
#trainer.push_to_hub(repo_id)
# 직접 학습한 경우
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

model.safetensors: 100%|██████████| 443M/443M [00:20<00:00, 21.6MB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/zzingo5/roberta-base-klue-ynat-classification/commit/1e91ce5ee18874abc18613dd0c38ecc185df394a', commit_message='Upload tokenizer', commit_description='', oid='1e91ce5ee18874abc18613dd0c38ecc185df394a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/zzingo5/roberta-base-klue-ynat-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='zzingo5/roberta-base-klue-ynat-classification'), pr_revision=None, pr_num=None)

In [19]:
from transformers import pipeline

model_id = f"zzingo5/roberta-base-klue-ynat-classification"

model_pipeline = pipeline("text-classification", model=model_id)

model_pipeline("부천에 눈이 많이 내렸습니다.")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'label': 'LABEL_3', 'score': 0.9630207419395447}]

In [20]:
import torch
from torch.nn.functional import softmax
from transformers import AutoModelForSequenceClassification, AutoTokenizer

class CustomPipeline:
    def __init__(self, model_id):
        self.model = AutoModelForSequenceClassification.from_pretrained(model_id)
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.model.to(self.device)
        
    def __call__(self, texts):
        tokenized = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

        with torch.no_grad():
            outputs = self.model(**tokenized.to(self.device))
            logits = outputs.logits

        probabilities = softmax(logits, dim=-1)
        scores, labels = torch.max(probabilities, dim=-1)
        labels_str = [self.model.config.id2label[label_idx] for label_idx in labels.tolist()]

        return [{"label": label, "score": score} for label, score in zip(labels_str, scores.tolist())]
    
custom_pipeline = CustomPipeline(model_id)
custom_pipeline("부천에 눈이 많이 내렸습니다.")

[{'label': 'LABEL_3', 'score': 0.9630207419395447}]