In [1]:
"""네이버 영화 리뷰 데이터 불러오기"""

import numpy as np
import pandas as pd
from Korpora import Korpora

corpus = Korpora.load("nsmc")
df = pd.DataFrame(corpus.test).sample(20000, random_state=42) #2만개의 데이터세트
train, valid, test = np.split(
    df.sample(frac=1, random_state=42), [int(0.6*len(df)), int(0.8*len(df))]
) #6:2:2로 데이터세트를 분리한다. 0.6지점에서 분리, 0.8지점에서 분리하여 3개로 나눈다.

print(train.head(5).to_markdown())
print(f"Traning Data Size : {len(train)}")
print(f"Validation Data Size : {len(valid)}")
print(f"Testing Data Size : {len(test)}")


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at /Users/yeeun/Korpora/nsmc/ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at /Users/yeeun/K

In [2]:
"""BERT 입력 텐서 생성"""

import torch
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler

def make_dataset(data,tokenizer,device):
    tokenized = tokenizer(
        text = data.text.tolist(),
        padding = "longest", #패딩 : 가장 긴 시퀀스에 대해 패딩을 적용
        truncation = True, #절사 : 입력 시퀀스 길이가 최대 길이를 초과하는 경우 해당 시퀀스를 자른다.
        return_tensors = "pt" #반환 형식 설정 : 파이토치 텐서로 결과를 반환
    )
    input_ids = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device)
    labels = torch.tensor(data.label.values, dtype=torch.long).to(device)
    return TensorDataset(input_ids, attention_mask, labels)

#샘플러 클래스를 활용해 데이터를 목적에 따라 샘플링
def get_dataloader(dataset, sampler, batch_size):
    data_sampler = sampler(dataset)
    dataloader = DataLoader(dataset, sampler=data_sampler, batch_size=batch_size)
    return dataloader

epochs = 5
batch_size = 32
device = "mps" if torch.backends.mps.is_available() and torch.backends.mps.is_built() else "cpu"
# device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-multilingual-cased",
    do_lower_case=False #소문자 유지 매개변수를 False로 설정(True이면 apple과 Apple을 다른 단어로 인식)
)

train_dataset = make_dataset(train, tokenizer, device)
train_dataloader = get_dataloader(train_dataset, RandomSampler, batch_size) #무작위로 샘플링

valid_dataset = make_dataset(train, tokenizer, device)
valid_dataloader = get_dataloader(train_dataset, SequentialSampler, batch_size) #고정된 순서대로 샘플링

test_dataset = make_dataset(train, tokenizer, device)
test_dataloader = get_dataloader(train_dataset, SequentialSampler, batch_size)

print(train_dataset[0])

(tensor([   101,  58466,   9812, 118956, 119122,  59095,  10892,   9434, 118888,
           117,   9992,  40032,  30005,    117,   9612,  37824,   9410,  12030,
         42337,  10739,  83491,  12508,    106,    106,    102,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,

In [3]:
"""BERT 모델 선언"""


from torch import optim
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-multilingual-cased",
    num_labels = 2
).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8)

  torch.utils._pytree._register_pytree_node(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
"""BERT 모델 구조 출력"""


from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path="bert-base-multilingual-cased").to(device)
#사전 학습된 모델(pretrained_model_name_or_path)을 from_pretrained 메서드로 불러옴

for main_name, main_module in model.named_children():
    print(main_name)
    for sub_name, sub_module in main_module.named_children():
        print("└",sub_name)
        for ssub_name, ssub_module in sub_module.named_children():
            print("│  └", ssub_name)
            for sssub_name, sssub_module in ssub_module.named_children():
                print("│  │  └",sssub_name)

# pooler : CLS 토큰 벡터를 한 번 더 비선형 변환을 수행하기 위해 선형 변환과 비선형 변환인 Tanh함수를 사용한다. 
# 이후 드롭아웃이 적용 돼 모델의 과대적합을 방지한다.
# classifier는 BERT 모델에서 수행해야하는 작업으로, [CLS] 토큰 벡터를 활용해 결과를 예측한다.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


bert
└ embeddings
│  └ word_embeddings
│  └ position_embeddings
│  └ token_type_embeddings
│  └ LayerNorm
│  └ dropout
└ encoder
│  └ layer
│  │  └ 0
│  │  └ 1
│  │  └ 2
│  │  └ 3
│  │  └ 4
│  │  └ 5
│  │  └ 6
│  │  └ 7
│  │  └ 8
│  │  └ 9
│  │  └ 10
│  │  └ 11
└ pooler
│  └ dense
│  └ activation
dropout
classifier


In [5]:
"""BERT 모델 학습 및 검증"""

import numpy as np
from torch import nn

def calc_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat)/len(labels_flat)

def train(model, optimizer, dataloader):
    model.train()
    train_loss = 0.0
    for input_ids, attention_mask, labels in dataloader:
        outputs = model(
            input_ids = input_ids,
            attention_mask = attention_mask,
            labels = labels
        )

        loss = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    return train_loss

def evaluation(model, dataloader):
    with torch.no_grad():
        model.eval()
        criterion = nn.CrossEntropyLoss()
        val_loss, val_accuracy = 0.0, 0.0

        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids = input_ids,
                attention_mask = attention_mask,
                labels = labels
            )
            logits = outputs.logits

            loss = criterion(logits, labels)
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()
            accuracy = calc_accuracy(logits, label_ids)

            val_loss += loss
            val_accuracy += accuracy

    val_loss = val_loss/len(dataloader)
    val_accuracy = val_accuracy/len(dataloader)
    return val_loss, val_accuracy

best_loss = 10000
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader)
    val_loss, val_accuracy = evaluation(model, valid_dataloader)
    print(f"Epoch:{epoch+1}, Train loss:{train_loss: .4f}, Val loss : {val_loss:.4f}, Val accuracy: {val_accuracy:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(),"models/BERTForSequenceClassification.pt")
        print("Saved the model weights")

Epoch:1, Train loss: 0.6957, Val loss : 0.6961, Val accuracy: 0.4565
Saved the model weights
Epoch:2, Train loss: 0.6963, Val loss : 0.6961, Val accuracy: 0.4565
Epoch:3, Train loss: 0.6964, Val loss : 0.6961, Val accuracy: 0.4565
Epoch:4, Train loss: 0.6968, Val loss : 0.6961, Val accuracy: 0.4565
Epoch:5, Train loss: 0.6970, Val loss : 0.6961, Val accuracy: 0.4565


In [6]:
"""BERT 모델 평가 결과"""

#최적의 모델을 저장해서 테스트 데이터세트로 모델을 평가

model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-multilingual-cased",
    num_labels=2
).to(device)
model.config.pad_token_id = model.config.eos_token_id
model.load_state_dict(torch.load("models/BERTForSequenceClassification.pt"))

test_loss, test_accuracy = evaluation(model, test_dataloader)
print(f"Test_loss : {test_loss : .4f}")
print(f"Test Accuracy : {test_accuracy : .4f}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test_loss :  0.6961
Test Accuracy :  0.4565
