In [None]:
import numpy as np
import pandas as pd
from Korpora import Korpora

corpus = Korpora.load("nsmc")
# Draw a reproducible 20,000-row sample from `corpus.test`.
df = pd.DataFrame(corpus.test).sample(20000, random_state=42)

# Shuffle once, then cut at the 60% and 80% marks for a 6:2:2 split.
# Positional slicing replaces np.split(df, ...): np.split on a DataFrame goes
# through DataFrame.swapaxes, which pandas has deprecated.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(df))
valid_end = int(0.8 * len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

print(train.head(5).to_markdown())
print(f"Traning Data Size : {len(train)}")
print(f"Validation Data Size : {len(valid)}")
print(f"Testing Data Size : {len(test)}")

In [None]:
"""네이버 영화 리뷰 데이터 세트 전처리"""


import torch
from transformers import ElectraTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler

def make_dataset(data, tokenizer, device):
    """Tokenize a labelled text table and pack it into a ``TensorDataset``.

    Args:
        data: Object exposing ``text`` and ``label`` columns as attributes
            (a pandas DataFrame in this notebook).
        tokenizer: Callable invoked with ``text``, ``padding="longest"``,
            ``truncation=True`` and ``return_tensors="pt"``; its result is
            indexed by ``"input_ids"`` and ``"attention_mask"``.
        device: Target handed to ``Tensor.to`` for every tensor produced.

    Returns:
        ``TensorDataset`` whose items are ``(input_ids, attention_mask, label)``.
    """
    encoded = tokenizer(
        text=data.text.tolist(),
        padding="longest",
        truncation=True,
        return_tensors="pt",
    )
    targets = torch.tensor(data.label.values, dtype=torch.long)

    # All tensors are moved to the device up front, so batches drawn from the
    # dataset are already located there.
    tensors = [encoded[key].to(device) for key in ("input_ids", "attention_mask")]
    tensors.append(targets.to(device))
    return TensorDataset(*tensors)

# The sampler class decides the order in which samples are drawn.
def get_dataloader(dataset, sampler, batch_size):
    """Wrap ``dataset`` in a ``DataLoader`` driven by the given sampler class.

    Args:
        dataset: Dataset to iterate over.
        sampler: Sampler class (not an instance); it is called with ``dataset``.
        batch_size: Number of samples per batch.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler(dataset))

epochs = 5
batch_size = 32

# Pick the first available backend: CUDA, then Apple MPS, then CPU.
# Previously CUDA was never selected, so a CUDA host silently ran on the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = "mps"
else:
    device = "cpu"

tokenizer = ElectraTokenizer.from_pretrained(
    pretrained_model_name_or_path="monologg/koelectra-base-v3-discriminator",
    # Both ELECTRA (built for English text classification) and KoELECTRA
    # (built for Korean text classification) are available.
    # ELECTRA uses only the discriminator for downstream tasks, so the
    # discriminator checkpoint is the one loaded here.
    do_lower_case=False
)

# Build one dataset and one dataloader per split. The validation and test
# loaders must wrap their own split; previously all three were built from the
# training frame, so validation and test metrics were measured on training data.
# NOTE: `train` has to be the DataFrame from the split cell here. The training
# cell later rebinds `train` to a function, so re-run the split cell before
# re-running this one in the same kernel session.
train_dataset = make_dataset(train, tokenizer, device)
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size)  # random order

valid_dataset = make_dataset(valid, tokenizer, device)
valid_dataloader = get_dataloader(valid_dataset, SequentialSampler, batch_size)  # fixed order

test_dataset = make_dataset(test, tokenizer, device)
test_dataloader = get_dataloader(test_dataset, SequentialSampler, batch_size)  # fixed order

print(train_dataset[0])

In [None]:
from torch import optim
from transformers import ElectraForSequenceClassification

# Load the pretrained discriminator with a two-label classification head,
# then move it to the selected device.
model = ElectraForSequenceClassification.from_pretrained(
    "monologg/koelectra-base-v3-discriminator", num_labels=2
)
model = model.to(device)

# The optimizer is built from this model's parameters.
optimizer = optim.AdamW(params=model.parameters(), lr=1e-5, eps=1e-8)

In [None]:
"""ELECTRA 모델 구조 출력"""


from transformers import ElectraForSequenceClassification

# Load a separate copy with from_pretrained purely for inspection. It is bound
# to its own name because rebinding `model` here would replace the model whose
# parameters `optimizer` was created from, leaving the training loop to step an
# optimizer that no longer tracks the model it runs. It is left on the CPU since
# only the module tree is read.
structure_model = ElectraForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="monologg/koelectra-base-v3-discriminator"
)

# Print the module tree down to four levels of nested children.
for main_name, main_module in structure_model.named_children():
    print(main_name)
    for sub_name, sub_module in main_module.named_children():
        print("└",sub_name)
        for ssub_name, ssub_module in sub_module.named_children():
            print("│  └", ssub_name)
            for sssub_name, sssub_module in ssub_module.named_children():
                print("│  │  └",sssub_name)

# Release the inspection copy so it does not stay resident during training.
del structure_model
# TODO(review): the original note here asked about the `dense` / `out_proj`
# entries in the printed tree -- confirm which parent module they belong to.

In [None]:
"""ELECTRA 모델 학습 및 검증"""

import numpy as np
from torch import nn

def calc_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores; the arg-max is taken over axis 1.
        labels: Array of integer class ids; it is flattened before comparison.

    Returns:
        Number of matches divided by the number of labels.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

def train(model, optimizer, dataloader):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose a ``loss`` attribute.
        optimizer: Optimizer stepped once per batch.
        dataloader: Sized iterable of ``(input_ids, attention_mask, labels)``.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_total = 0.0

    for batch in dataloader:
        ids, mask, targets = batch
        result = model(input_ids=ids, attention_mask=mask, labels=targets)
        step_loss = result.loss
        running_total += step_loss.item()

        # Clear stale gradients, backpropagate this batch, then update weights.
        optimizer.zero_grad()
        step_loss.backward()
        optimizer.step()

    return running_total / len(dataloader)

def evaluation(model, dataloader):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``logits``.
        dataloader: Sized iterable of ``(input_ids, attention_mask, labels)``.

    Returns:
        ``(val_loss, val_accuracy)`` as plain Python floats, each the mean of
        the per-batch values over ``len(dataloader)`` batches.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()
    val_loss, val_accuracy = 0.0, 0.0

    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask,
                labels = labels
            )
            logits = outputs.logits

            # .item() turns the loss into a Python float, matching train().
            # Accumulating the tensor itself made this function return a
            # device tensor, which the caller then stored as best_loss.
            val_loss += criterion(logits, labels).item()

            # calc_accuracy works on NumPy arrays, so move both to the CPU.
            batch_logits = logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()
            val_accuracy += calc_accuracy(batch_logits, label_ids)

    val_loss = val_loss / len(dataloader)
    val_accuracy = float(val_accuracy / len(dataloader))
    return val_loss, val_accuracy

import os

checkpoint_path = "models/ElectraForSequenceClassification.pt"
# torch.save does not create missing parent directories; create the directory
# up front so the first save cannot fail after a full epoch of training.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Start from +inf so the first epoch always counts as an improvement,
# whatever the scale of the loss.
best_loss = float("inf")
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader)
    val_loss, val_accuracy = evaluation(model, valid_dataloader)
    print(f"Epoch:{epoch+1}, Train loss:{train_loss: .4f}, Val loss : {val_loss:.4f}, Val accuracy: {val_accuracy:.4f}")

    # Save the weights only when the validation loss improves.
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), checkpoint_path)
        print("Saved the model weights")