In [1]:
import pandas as pd
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import time, datetime

In [2]:
# Load the review dataset from a local parquet file; later cells read its
# `Review` (text) and `Rating` (label) columns.
data = pd.read_parquet("./dataset_grammar.parquet")

In [3]:
# Discard every row whose Rating is 2 and renumber the remaining rows from 0.
# A boolean mask is used instead of a label-based `drop(index=...)` so that
# exactly the matching rows are removed even if the index holds duplicate labels.
data = data.loc[data.Rating != 2].reset_index(drop=True)

In [4]:
# Show how many rows carry each remaining Rating value.
data.Rating.value_counts()

1    388281
0     70926
Name: Rating, dtype: int64

In [5]:
# Summary statistics of the review length, measured in characters.
data.Review.str.len().describe()

count    459207.000000
mean        138.518812
std         188.789379
min           0.000000
25%          39.000000
50%          76.000000
75%         164.000000
max        3971.000000
Name: Review, dtype: float64

In [6]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint; sharing one name keeps the two in sync.
PRETRAINED_MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
model = ElectraForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [7]:
# Display the tokenizer's configuration.
tokenizer

PreTrainedTokenizer(name_or_path='monologg/koelectra-base-v3-discriminator', vocab_size=35000, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [8]:
# Stratified split on Rating: 80% of the rows go to training, the remaining
# 20% are held out.
train_data, valid_data = train_test_split(
    data,
    test_size=0.2,
    shuffle=True,
    stratify=data.Rating,
    random_state=217,
)

In [9]:
# Halve the held-out rows into a validation set and a test set, again
# stratified on Rating.
valid_data, test_data = train_test_split(
    valid_data,
    test_size=0.5,
    stratify=valid_data.Rating,
    random_state=217,
)

In [10]:
# Row/column counts of the three splits.
train_data.shape, valid_data.shape, test_data.shape

((367365, 2), (45921, 2), (45921, 2))

In [11]:
# Per-class row counts (index = Rating value) for each split.
np.bincount(train_data.Rating), np.bincount(valid_data.Rating), np.bincount(test_data.Rating)

(array([ 56741, 310624]), array([ 7092, 38829]), array([ 7093, 38828]))

In [13]:
def electra_tokenizer(sent, MAX_LEN):
    """Encode one piece of text into fixed-length model inputs.

    Uses the module-level ``tokenizer`` with special tokens added, padding to
    ``max_length`` and truncation enabled, so every returned sequence is
    sized by ``MAX_LEN``.

    Args:
        sent: Text to encode.
        MAX_LEN: Maximum length passed to the tokenizer for padding/truncation.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids)`` read from the
        tokenizer's output.
    """
    encoding = tokenizer.encode_plus(
        text=sent,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    wanted_fields = ('input_ids', 'attention_mask', 'token_type_ids')
    return tuple(encoding[field] for field in wanted_fields)

In [14]:
def make_dataset(dataset, MAX_LEN):
    """Tokenize every review in a DataFrame and pack the result as a TensorDataset.

    Args:
        dataset: DataFrame with a ``Review`` column (text) and a ``Rating``
            column (labels).
        MAX_LEN: Length forwarded to ``electra_tokenizer`` for each review.

    Returns:
        ``TensorDataset`` of (input ids, attention masks, token type ids,
        labels); the label tensor has a trailing singleton axis.
    """
    # One (input_ids, attention_mask, token_type_ids) triple per review,
    # with a progress bar over the rows.
    encoded_reviews = [
        electra_tokenizer(review_text, MAX_LEN)
        for review_text in tqdm(dataset['Review'])
    ]

    ids_tensor = torch.tensor([triple[0] for triple in encoded_reviews])
    mask_tensor = torch.tensor([triple[1] for triple in encoded_reviews])
    type_tensor = torch.tensor([triple[2] for triple in encoded_reviews])
    label_tensor = torch.LongTensor(dataset['Rating'].values).unsqueeze(dim=1)

    return TensorDataset(ids_tensor, mask_tensor, type_tensor, label_tensor)

In [15]:
# Tokenize the training split with a maximum sequence length of 256.
train_dataset = make_dataset(train_data, 256)

  0%|          | 0/367365 [00:00<?, ?it/s]

In [16]:
# Tokenize the validation and test splits with the same maximum length (256)
# as the training split.
valid_dataset = make_dataset(valid_data, 256)
test_dataset = make_dataset(test_data, 256)

  0%|          | 0/45921 [00:00<?, ?it/s]

  0%|          | 0/45921 [00:00<?, ?it/s]

In [17]:
# Training batches of 32 are reshuffled every epoch and loaded by 4 workers.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)
# Evaluation loaders yield one sample at a time in dataset order
# (the DataLoader defaults, written out explicitly).
valid_dataloader = DataLoader(valid_dataset, batch_size=1, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [18]:
class EarlyStopping:
    """Track a validation loss and signal when training should stop.

    Call the instance once per epoch with the current validation loss and the
    model. Whenever the loss improves, the model's ``state_dict`` is saved to
    ``path`` and the patience counter is reset; otherwise the counter is
    incremented and ``early_stop`` becomes True once it reaches ``patience``.

    Args:
        patience: Number of consecutive non-improving calls tolerated before
            ``early_stop`` is set.
        verbose: If True, print a message every time a checkpoint is saved.
        delta: A call counts as an improvement only when the negated loss is
            at least ``best_score + delta``.
        path: File the best model's ``state_dict`` is written to.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint_testing.pt'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0          # consecutive calls without improvement
        self.best_score = None    # best negated validation loss seen so far
        self.early_stop = False
        # `np.inf` instead of the `np.Inf` alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one validation loss; checkpoint on improvement, else count towards stopping."""
        # Negate so that a larger score always means a better model.
        score = -val_loss

        if self.best_score is None:
            # First call: always take it as the baseline and checkpoint.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Save ``model.state_dict()`` to ``self.path`` and remember ``val_loss`` as the minimum."""
        if self.verbose:
            print("")
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [19]:
def format_time(elapsed):
    """Format a duration in seconds as a ``timedelta`` string, e.g. ``'0:30:16'``.

    The duration is rounded to the nearest whole second before formatting.
    """
    return str(datetime.timedelta(seconds=int(round(elapsed))))

In [20]:
# Adam over all model parameters with a small learning rate and light weight decay.
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, weight_decay=5e-6)
#loss_fn = torch.nn.CrossEntropyLoss(reduction='sum')
# Upper bound on the number of epochs; the training loop breaks out earlier
# once early stopping triggers.
epochs = 1000
# The training loop calls this object with the validation loss, so the
# checkpoint at this path tracks the lowest validation loss (despite the
# "best_f1" file name).
early_stopping = EarlyStopping(patience = 10, verbose = True, path='./koelectra_best_f1.pt')

In [21]:
# Total number of trainable parameters in the model.
sum(p.numel() for p in model.parameters() if p.requires_grad) # in BERT, 177854978

112922882

In [22]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [23]:
# Move the model to the selected device (the returned module is echoed as the cell output).
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [24]:
# Fine-tune the model with early stopping on the validation loss.
# Per-epoch history of the training loss and the validation metrics.
train_loss_list, valid_loss_list, valid_accuracy_list, valid_f1_list = [], [], [], []
for epoch in range(epochs):
    t0 = time.time()

    # ---- training pass ----
    model.train()
    epoch_loss = 0.0
    for batch in train_dataloader:
        b_input_ids, b_attention_masks, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)
        optimizer.zero_grad()
        out = model(
            input_ids=b_input_ids,
            attention_mask=b_attention_masks,
            token_type_ids=b_token_type_ids,
            labels=b_labels,
        )
        out.loss.backward()
        optimizer.step()
        epoch_loss += out.loss.item()

    # Mean of the per-batch training losses.
    train_loss = float(epoch_loss / len(train_dataloader))
    train_loss_list.append(train_loss)

    # ---- validation pass ----
    # These two lists are intentionally left in scope after the loop so the
    # last epoch's predictions can be inspected afterwards.
    valid_pred_list, valid_real_list = [], []
    epoch_loss = 0.0
    model.eval()
    with torch.no_grad():
        for batch in valid_dataloader:
            b_input_ids, b_attention_masks, b_token_type_ids, b_labels = tuple(t.to(device) for t in batch)
            out = model(
                input_ids=b_input_ids,
                attention_mask=b_attention_masks,
                token_type_ids=b_token_type_ids,
                labels=b_labels,
            )
            # Weight the batch loss by the batch size so the epoch value is a
            # per-sample mean for any validation batch size.
            epoch_loss += out.loss.item() * b_labels.size(0)

            # argmax over the class axis (dim=1) gives one prediction per
            # sample; without `dim` it is only correct for batch size 1.
            valid_pred_list.extend(torch.argmax(out.logits, dim=1).cpu().tolist())
            valid_real_list.extend(b_labels.view(-1).cpu().tolist())

    valid_loss = float(epoch_loss / len(valid_real_list))
    valid_accuracy = accuracy_score(valid_real_list, valid_pred_list)
    valid_f1 = f1_score(valid_real_list, valid_pred_list)

    valid_loss_list.append(valid_loss)
    valid_accuracy_list.append(valid_accuracy)
    valid_f1_list.append(valid_f1)

    print(f"EPOCH: {epoch}  ||  Elapsed: {format_time(time.time()-t0)}.")
    print(f"   Train_loss: {train_loss:.4f}  ||  Valid_acc: {valid_accuracy:.4f} | Valid_f1: {valid_f1:.4f} | Valid_loss: {valid_loss:.4f}")

    # Checkpoints on improvement and counts epochs without improvement.
    early_stopping(valid_loss, model)
    print("")
    if early_stopping.early_stop:
        print("Early stopping")
        break

EPOCH: 0  ||  Elapsed: 0:30:16.
   Train_loss: 0.1023  ||  Valid_acc: 0.9698 | Valid_f1: 0.9821 | Valid_loss: 0.0857

Validation loss decreased (inf --> 0.085708).  Saving model ...

EPOCH: 1  ||  Elapsed: 0:30:21.
   Train_loss: 0.0723  ||  Valid_acc: 0.9703 | Valid_f1: 0.9824 | Valid_loss: 0.0834

Validation loss decreased (0.085708 --> 0.083367).  Saving model ...

EPOCH: 2  ||  Elapsed: 0:30:23.
   Train_loss: 0.0549  ||  Valid_acc: 0.9705 | Valid_f1: 0.9826 | Valid_loss: 0.0903
EarlyStopping counter: 1 out of 10

EPOCH: 3  ||  Elapsed: 0:30:23.
   Train_loss: 0.0411  ||  Valid_acc: 0.9691 | Valid_f1: 0.9817 | Valid_loss: 0.1057
EarlyStopping counter: 2 out of 10

EPOCH: 4  ||  Elapsed: 0:30:23.
   Train_loss: 0.0326  ||  Valid_acc: 0.9707 | Valid_f1: 0.9827 | Valid_loss: 0.1178
EarlyStopping counter: 3 out of 10

EPOCH: 5  ||  Elapsed: 0:30:24.
   Train_loss: 0.0265  ||  Valid_acc: 0.9704 | Valid_f1: 0.9825 | Valid_loss: 0.1172
EarlyStopping counter: 4 out of 10

EPOCH: 6  ||  Ela

In [25]:
print(classification_report(valid_real_list, valid_pred_list)) # these are the validation results of the 11th epoch

              precision    recall  f1-score   support

           0       0.92      0.87      0.89      7092
           1       0.98      0.99      0.98     38829

    accuracy                           0.97     45921
   macro avg       0.95      0.93      0.94     45921
weighted avg       0.97      0.97      0.97     45921



In [26]:
# Restore the checkpoint written by early stopping, then score the test split.
# `map_location` lets the checkpoint load on whatever device is in use.
model.load_state_dict(torch.load('./koelectra_best_f1.pt', map_location=device))
model.eval()

test_pred_list, test_real_list = [], []
with torch.no_grad():
    for batch in test_dataloader:
        b_input_ids, b_attention_masks, b_token_type_ids, b_labels = batch
        # Labels are not passed to the model: only the logits are needed here.
        out = model(
            input_ids=b_input_ids.to(device),
            attention_mask=b_attention_masks.to(device),
            token_type_ids=b_token_type_ids.to(device),
        )
        # argmax over the class axis (dim=1) gives one prediction per sample;
        # without `dim` it is only correct for batch size 1.
        test_pred_list.extend(torch.argmax(out.logits, dim=1).cpu().tolist())
        test_real_list.extend(b_labels.view(-1).tolist())

test_accuracy = accuracy_score(test_real_list, test_pred_list)
test_f1 = f1_score(test_real_list, test_pred_list)

In [27]:
 print(f"Test_acc: {test_accuracy:.4f} | Test_f1: {test_f1:.4f}") # these are the test results of the best epoch (=2)

Test_acc: 0.9696 | Test_f1: 0.9820


In [28]:
# Per-class precision/recall/F1 on the test split.
print(classification_report(test_real_list, test_pred_list))

              precision    recall  f1-score   support

           0       0.90      0.91      0.90      7093
           1       0.98      0.98      0.98     38828

    accuracy                           0.97     45921
   macro avg       0.94      0.94      0.94     45921
weighted avg       0.97      0.97      0.97     45921



train 시 vram 13785 소요