# KoBERT 버전
- https://github.com/monologg/KoBERT-Transformers 참고

In [17]:
from KoBERT_Transformers.kobert_transformers.tokenization_kobert import KoBertTokenizer
from KoBERT_Transformers.kobert_transformers import get_kobert_model, get_distilkobert_model
from transformers import BertForSequenceClassification

import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report

import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import time, datetime

In [14]:
# Load the review dataset from a parquet file located relative to this notebook.
# Later cells read its 'Review' and 'Rating' columns.
data = pd.read_parquet("../data/dataset_grammar.parquet")

In [15]:
# Remove every row whose index label belongs to a Rating == 2 review,
# then renumber the remaining rows from 0.
data = (
    data
    .drop(index=data.index[data.Rating == 2])
    .reset_index(drop=True)
)

In [18]:
# Load the KoBERT tokenizer and the pretrained 'monologg/kobert' weights wrapped in a
# sequence-classification model. As the warning in this cell's output states, the
# classifier weights are not part of the checkpoint and are newly initialized, so the
# model has to be fine-tuned before its predictions are meaningful.
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
model = BertForSequenceClassification.from_pretrained('monologg/kobert')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'KoBertTokenizer'.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Display the tokenizer's repr (vocab size, max length, special tokens) as a sanity check.
tokenizer

PreTrainedTokenizer(name_or_path='monologg/kobert', vocab_size=8002, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [19]:
# Hold out 20% of the rows (stratified on Rating, fixed seed) as a pool that the
# next cell divides into validation and test sets; the remaining 80% is training data.
train_data, valid_data = train_test_split(
    data,
    test_size=0.2,
    random_state=217,
    shuffle=True,
    stratify=data.Rating,
)

In [20]:
# Cut the held-out pool in half (stratified on Rating, fixed seed):
# one half stays as the validation set, the other becomes the test set.
valid_data, test_data = train_test_split(
    valid_data,
    test_size=0.5,
    random_state=217,
    stratify=valid_data.Rating,
)

In [21]:
def kobert_tokenizer(sent, MAX_LEN):
    """Encode a single sentence with the notebook-level `tokenizer`.

    Special tokens are added and the sequence is truncated / padded to exactly
    `MAX_LEN` tokens.

    Args:
        sent: The text to encode.
        MAX_LEN: Target sequence length used for both truncation and padding.

    Returns:
        A 3-tuple ``(input_ids, attention_mask, token_type_ids)`` taken from the
        tokenizer's output dictionary.
    """
    encoding = tokenizer.encode_plus(
        text=sent,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    return encoding['input_ids'], encoding['attention_mask'], encoding['token_type_ids']

In [22]:
def make_dataset(dataset, MAX_LEN):
    """Tokenize every review in `dataset` and bundle the result as a TensorDataset.

    Args:
        dataset: Table providing a 'Review' column (texts) and a 'Rating' column (labels).
        MAX_LEN: Sequence length forwarded to `kobert_tokenizer`.

    Returns:
        A TensorDataset whose items are
        ``(input_ids, attention_mask, token_type_ids, label)``.
    """
    # One (input_ids, attention_mask, token_type_ids) triple per review, with a progress bar.
    encoded_reviews = [kobert_tokenizer(review, MAX_LEN) for review in tqdm(dataset['Review'])]

    ids_tensor = torch.tensor([triple[0] for triple in encoded_reviews])
    mask_tensor = torch.tensor([triple[1] for triple in encoded_reviews])
    type_tensor = torch.tensor([triple[2] for triple in encoded_reviews])
    # unsqueeze(dim=1) turns the flat label vector into a column (one label per row).
    label_tensor = torch.LongTensor(dataset['Rating'].values).unsqueeze(dim=1)

    return TensorDataset(ids_tensor, mask_tensor, type_tensor, label_tensor)

In [23]:
# Tokenize the three splits in order (train, valid, test), each padded/truncated to 256 tokens.
train_dataset, valid_dataset, test_dataset = (
    make_dataset(split, 256) for split in (train_data, valid_data, test_data)
)

  0%|          | 0/367365 [00:00<?, ?it/s]

  0%|          | 0/45921 [00:00<?, ?it/s]

  0%|          | 0/45921 [00:00<?, ?it/s]

In [24]:
# Training batches: 32 shuffled examples each, loaded by 4 worker processes.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)
# Evaluation loaders use the DataLoader defaults, written out explicitly:
# one example per batch, in dataset order.
valid_dataloader = DataLoader(valid_dataset, batch_size=1, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [25]:
class EarlyStopping:
    """Stop training once the validation loss has stopped improving.

    Each call receives the latest validation loss. When the loss improves, the
    model's ``state_dict`` is written to `path` and the patience counter resets;
    otherwise the counter is incremented, and `early_stop` becomes True once it
    reaches `patience`.

    Args:
        patience: Number of consecutive non-improving calls tolerated before stopping.
        verbose: If True, print a message every time a checkpoint is saved.
        delta: Offset added to the best score in the improvement comparison.
        path: File the best model's ``state_dict`` is saved to.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint_testing.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one validation result, checkpointing `model` if it is the best so far."""
        # Higher score == lower loss.
        score = -val_loss

        # NaN is the only value that differs from itself. A diverged (NaN) loss must
        # never count as an improvement, otherwise it would overwrite the best checkpoint.
        loss_is_nan = bool(val_loss != val_loss)
        improved = (not loss_is_nan) and (
            self.best_score is None or score >= self.best_score + self.delta
        )

        if improved:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0
        else:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        """Write `model.state_dict()` to `self.path` and remember `val_loss` as the new minimum."""
        if self.verbose:
            print("")
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [26]:
def format_time(elapsed):
    """Render a duration in seconds as a ``timedelta`` string such as ``1:01:01``.

    The input is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [27]:
# Upper bound on epochs; in practice the early-stopping helper ends training sooner.
epochs = 1000

# Adam over all model parameters with a small fine-tuning learning rate and light weight decay.
# No separate loss function is created here.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=1e-5,
    weight_decay=5e-6,
)

# Stops after 10 non-improving validation checks. Despite the "f1" in the file name,
# the checkpoint is selected by the validation loss that is passed to this object.
early_stopping = EarlyStopping(
    patience=10,
    verbose=True,
    path='./kobert_best_f1.pt',
)

In [28]:
# Count the trainable parameters (those with requires_grad set); the trailing
# comment records the counts the author noted for other models, for comparison.
sum(p.numel() for p in model.parameters() if p.requires_grad) # in BERT, 177,854,978 / in KoElectra, 112,922,882

92188418

In [29]:
# Prefer GPU index 2 (the device this notebook was written against). If fewer than
# three GPUs are visible, fall back to the default CUDA device, and to the CPU when
# CUDA is unavailable, so the notebook does not crash on smaller machines.
if torch.cuda.is_available() and torch.cuda.device_count() > 2:
    device = torch.device("cuda:2")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [30]:
# Move the model to the selected device. As the last expression in the cell,
# the returned module is also displayed, which produces the long architecture dump below.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementw

In [31]:
# Per-epoch history, kept for later inspection / plotting.
train_loss_list, valid_loss_list, valid_accuracy_list, valid_f1_list = [], [], [], []

for epoch in range(epochs):
    t0 = time.time()

    # ---------------- training pass ----------------
    model.train()
    epoch_loss = 0.0
    for batch in train_dataloader:
        b_input_ids, b_attention_masks, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)

        # Clear gradients once per step, right before the forward/backward pass.
        optimizer.zero_grad()
        out = model(
            b_input_ids,
            attention_mask=b_attention_masks,
            token_type_ids=b_token_type_ids,
            labels=b_labels,
        )
        out.loss.backward()
        optimizer.step()
        epoch_loss += out.loss.item()

    # Mean of the per-batch losses.
    train_loss = float(epoch_loss / len(train_dataloader))
    train_loss_list.append(train_loss)

    # ---------------- validation pass ----------------
    model.eval()
    valid_pred_list, valid_real_list = [], []
    epoch_loss, n_examples = 0.0, 0
    with torch.no_grad():
        for batch in valid_dataloader:
            b_input_ids, b_attention_masks, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)
            out = model(
                b_input_ids,
                attention_mask=b_attention_masks,
                token_type_ids=b_token_type_ids,
                labels=b_labels,
            )

            # Weight each batch's mean loss by its size so the epoch average is
            # per-example regardless of the loader's batch size.
            batch_size = b_labels.size(0)
            epoch_loss += out.loss.item() * batch_size
            n_examples += batch_size

            # argmax over the class dimension gives one prediction per example.
            # Without `dim`, argmax flattens the logits and is only right for batch size 1.
            # Plain Python ints are collected so the metric functions get ordinary lists.
            valid_pred_list.extend(out.logits.argmax(dim=1).cpu().tolist())
            valid_real_list.extend(b_labels.reshape(-1).cpu().tolist())

    valid_loss = float(epoch_loss / n_examples)
    valid_accuracy = accuracy_score(valid_real_list, valid_pred_list)
    valid_f1 = f1_score(valid_real_list, valid_pred_list)

    valid_loss_list.append(valid_loss)
    valid_accuracy_list.append(valid_accuracy)
    valid_f1_list.append(valid_f1)

    print(f"EPOCH: {epoch}  ||  Elapsed: {format_time(time.time()-t0)}.")
    print(f"   Train_loss: {train_loss:.4f}  ||  Valid_acc: {valid_accuracy:.4f} | Valid_f1: {valid_f1:.4f} | Valid_loss: {valid_loss:.4f}")

    # Checkpoints on improved validation loss and signals when patience runs out.
    early_stopping(valid_loss, model)
    print("")
    if early_stopping.early_stop:
        print("Early stopping")
        break

EPOCH: 0  ||  Elapsed: 1:52:52.
   Train_loss: 0.1220  ||  Valid_acc: 0.9641 | Valid_f1: 0.9789 | Valid_loss: 0.1010

Validation loss decreased (inf --> 0.100996).  Saving model ...

EPOCH: 1  ||  Elapsed: 1:53:13.
   Train_loss: 0.0876  ||  Valid_acc: 0.9640 | Valid_f1: 0.9787 | Valid_loss: 0.1033
EarlyStopping counter: 1 out of 10

EPOCH: 2  ||  Elapsed: 1:53:21.
   Train_loss: 0.0676  ||  Valid_acc: 0.9668 | Valid_f1: 0.9805 | Valid_loss: 0.0995

Validation loss decreased (0.100996 --> 0.099489).  Saving model ...

EPOCH: 3  ||  Elapsed: 1:53:17.
   Train_loss: 0.0513  ||  Valid_acc: 0.9661 | Valid_f1: 0.9799 | Valid_loss: 0.1086
EarlyStopping counter: 1 out of 10

EPOCH: 4  ||  Elapsed: 1:53:18.
   Train_loss: 0.0394  ||  Valid_acc: 0.9664 | Valid_f1: 0.9802 | Valid_loss: 0.1278
EarlyStopping counter: 2 out of 10

EPOCH: 5  ||  Elapsed: 1:53:12.
   Train_loss: 0.0319  ||  Valid_acc: 0.9661 | Valid_f1: 0.9800 | Valid_loss: 0.1335
EarlyStopping counter: 3 out of 10

EPOCH: 6  ||  Ela

In [32]:
print(classification_report(valid_real_list, valid_pred_list)) # validation results of the 12th epoch, i.e. what the lists held when the training loop exited, not the best checkpoint

              precision    recall  f1-score   support

           0       0.91      0.86      0.89      7092
           1       0.97      0.99      0.98     38829

    accuracy                           0.97     45921
   macro avg       0.94      0.92      0.93     45921
weighted avg       0.97      0.97      0.97     45921



In [33]:
# Restore the best checkpoint written during training. map_location places the
# tensors on the active device even if they were saved from a different one.
model.load_state_dict(torch.load('./kobert_best_f1.pt', map_location=device))
model.eval()

test_pred_list, test_real_list = [], []
with torch.no_grad():
    for batch in test_dataloader:
        b_input_ids, b_attention_masks, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)
        # Labels are not passed to the model: only the logits are needed here.
        out = model(
            b_input_ids,
            attention_mask=b_attention_masks,
            token_type_ids=b_token_type_ids,
        )

        # argmax over the class dimension gives one prediction per example.
        # Without `dim`, argmax flattens the logits and is only right for batch size 1.
        test_pred_list.extend(out.logits.argmax(dim=1).cpu().tolist())
        test_real_list.extend(b_labels.reshape(-1).cpu().tolist())

test_accuracy = accuracy_score(test_real_list, test_pred_list)
test_f1 = f1_score(test_real_list, test_pred_list)

In [34]:
 print(f"Test_acc: {test_accuracy:.4f} | Test_f1: {test_f1:.4f}") # test-set results of the best epoch (=2)

Test_acc: 0.9655 | Test_f1: 0.9797


In [35]:
# Per-class precision / recall / F1 on the test set, using the lists filled by the test loop.
print(classification_report(test_real_list, test_pred_list))

              precision    recall  f1-score   support

           0       0.90      0.87      0.89      7093
           1       0.98      0.98      0.98     38828

    accuracy                           0.97     45921
   macro avg       0.94      0.93      0.93     45921
weighted avg       0.97      0.97      0.97     45921



 11722MiB / 12066MiB 