# **BERT 베이스라인 모델**

In [3]:
# 환경설정

!pip install -q transformers datasets accelerate scikit-learn

In [4]:
import json
import re
import pandas as pd
import numpy as np
import torch

In [5]:
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [9]:
# 파일 로드

df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,idx,class,conversation
0,0,협박 대화,지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...
1,1,협박 대화,길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...
2,2,기타 괴롭힘 대화,너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...
3,3,갈취 대화,어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...
4,4,갈취 대화,저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...


In [17]:
df = pd.DataFrame({
    "text": ["안녕!!!!!   뭐해??", "이거  진짜야!!!"],
    "label": ["normal", "threat"]
})

label2id = {"normal": 0, "threat": 1}

In [18]:
# 데이터 로드 및 전처리

  # 최소로만 전처리 적용

def normalize_text(text):
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"([!?])\1+", r"\1", text)
    return text.strip()

df["text"] = df["text"].apply(normalize_text)
df["label_id"] = df["label"].map(label2id)

In [19]:
# 토크나이저

from transformers import AutoTokenizer

MODEL_NAME = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [20]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

In [21]:
# 데이터셋 구성

from datasets import Dataset

dataset = Dataset.from_pandas(
    df[["text", "label_id"]].rename(columns={"label_id": "labels"})
)

dataset = dataset.map(tokenize, batched=True)
dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [22]:
# BERT 베이스라인 모델

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=5
)



model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: klue/bert-base
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
bert.embeddings.position_ids               | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on you

In [24]:
# 학습 설정

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert_baseline",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)


In [29]:
# 평가 메트릭스

from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "macro_f1": f1_score(labels, preds, average="macro")
    }

In [30]:
# 트레이너 실행

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    compute_metrics=compute_metrics
)

In [31]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,No log,0.845135,1.0,1.0
2,No log,0.661023,1.0,1.0
3,No log,0.569709,1.0,1.0


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

TrainOutput(global_step=3, training_loss=0.9351138273874918, metrics={'train_runtime': 37.604, 'train_samples_per_second': 0.16, 'train_steps_per_second': 0.08, 'total_flos': 394677213696.0, 'train_loss': 0.9351138273874918, 'epoch': 3.0})

In [32]:
results = trainer.evaluate()
print(results)



{'eval_loss': 0.8439077734947205, 'eval_accuracy': 1.0, 'eval_macro_f1': 1.0, 'eval_runtime': 0.7926, 'eval_samples_per_second': 2.523, 'eval_steps_per_second': 1.262, 'epoch': 3.0}
