In [1]:

# Transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
# Setting Library
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import tensorflow as tf
import numpy as np
from tqdm import tqdm, tqdm_notebook
import pandas as pd

In [2]:
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder


In [3]:
device = torch.device("cuda:0")

In [4]:
train = pd.read_csv('/content/drive/MyDrive/toy_project/data/train_1.csv')

In [5]:
val = pd.read_csv('/content/drive/MyDrive/toy_project/data/val_1.csv')

In [6]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")
model = BertForSequenceClassification.from_pretrained("beomi/kcbert-base", num_labels= 6)
model.classifier = torch.nn.Linear(model.config.hidden_size, 6)  # 6은 클래스 개수


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from sklearn.preprocessing import LabelEncoder

# 라벨 인코더 생성 및 학습
encoder = LabelEncoder()
train['label'] = encoder.fit_transform(train['감정_대분류'])  # emotion 컬럼이 감정 라벨을 포함한다고 가정
val['label'] = encoder.transform(val['감정_대분류'])

# 인코딩된 라벨 확인
print(train[['감정_대분류', 'label']].drop_duplicates())


   감정_대분류  label
0      분노      2
7      기쁨      0
8      불안      3
14     당황      1
18     슬픔      5
25     상처      4


In [8]:
from transformers import BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

class CustomDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = str(self.sentences[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',  # Return PyTorch tensors
            truncation=True
        )

        return {
            'sentence': sentence,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 파라미터 설정
MAX_LEN = 128
BATCH_SIZE = 16

# 데이터셋 생성
train_dataset = CustomDataset(
    sentences=train['사람문장1'].to_numpy(),
    labels=train['label'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
val_dataset = CustomDataset(
    sentences=val['사람문장1'].to_numpy(),
    labels=val['label'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)
# DataLoader 준비
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)
val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)


In [None]:
from transformers import BertForSequenceClassification, AdamW
from torch.utils.data import random_split
import torch

model.train()
model.to(device)
# 옵티마이저 설정
optimizer = AdamW(model.parameters(), lr=2e-5)

# 훈련 설정
epochs = 10


for epoch in range(epochs):
    # 훈련 단계
    train_losses = []
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs[0]
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    avg_train_loss = sum(train_losses) / len(train_losses)
    print(f'Epoch {epoch+1}, Average Training Loss: {avg_train_loss}')

    # 검증 단계
    model.eval()
    total_loss, total_accuracy = 0, 0
    for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} Validation"):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

    avg_val_loss = total_loss / len(val_loader)
    print(f'Validation Loss: {avg_val_loss}')

    model.train()

# 모델 저장
model_path = "/content/drive/MyDrive/toy_project/bert_6_emotions.pt"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")


Epoch 1 Training: 100%|██████████| 3227/3227 [18:42<00:00,  2.87it/s]


Epoch 1, Average Training Loss: 1.1415191013948962


Epoch 1 Validation: 100%|██████████| 416/416 [00:50<00:00,  8.17it/s]


Validation Loss: 0.8604460125789046


Epoch 2 Training: 100%|██████████| 3227/3227 [18:41<00:00,  2.88it/s]


Epoch 2, Average Training Loss: 0.9915996131316627


Epoch 2 Validation: 100%|██████████| 416/416 [00:50<00:00,  8.20it/s]


Validation Loss: 0.8274232852272689


Epoch 3 Training:  77%|███████▋  | 2484/3227 [14:23<04:18,  2.87it/s]