# OUI

### package load 및 device 지정

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd

In [2]:
from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

In [3]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [4]:
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
#os.environ["CUDA_VISIBLE_DEVICES"] = "3"
#os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
#os.environ["TORCH_USE_CUDA_DSA"] = '1'
#device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
#torch.cuda.device_count()
device = torch.device("cuda:3")
print(device)

cuda:3


### 데이터세트 구성

In [5]:
train_dataset = nlp.data.TSVDataset(os.path.join("../dataset/train", "train.tsv"), num_discard_samples=1)
test_dataset = nlp.data.TSVDataset(os.path.join("../dataset/test", "valid.tsv"), num_discard_samples=1)

In [6]:
bertmodel, vocab = get_pytorch_kobert_model(cachedir=".cache")

using cached model. /home/j-j10a506/oui/KoBERT/.cache/kobert_v1.zip
using cached model. /home/j-j10a506/oui/KoBERT/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [7]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /home/j-j10a506/oui/KoBERT/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [8]:
batch_size = 128
max_len = 100

In [9]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = []
        self.labels = []
        for i in dataset:
            if len(i)!=2:
                continue

            self.sentences.append(transform([i[sent_idx]]))
            self.labels.append(np.int32(i[label_idx]))
        

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [10]:
data_train = BERTDataset(train_dataset, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(test_dataset, 0, 1, tok, max_len, True, False)

In [11]:
train_dataloader = DataLoader(data_train, batch_size=batch_size, num_workers=5)
test_dataloader = DataLoader(data_test, batch_size=batch_size, num_workers=5)

### kobert load
* 하이퍼파라미터 조정
  * dropout 비율
  * learning rate
  * warmup_ratio
  * learning_rate
  * max_grad_norm

In [12]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=6,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return self.classifier(out)

In [13]:
model = BERTClassifier(bertmodel,  dr_rate=0.3).to(device)

### 학습
* best 모델 저장
* epoch 수: 최대 20회

In [14]:
## Setting parameters
warmup_ratio = 0.1 # 학습률 warm up 비율로 학습률을 초기에 점진적으로 높이는데 사용
num_epochs = 10
max_grad_norm = 1 # Gradient clipping에 사용됨 
log_interval = 200
learning_rate = 5e-5

In [15]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [16]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [17]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [18]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [19]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [20]:
best = 0.0
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()

    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
    if best <  (test_acc / (batch_id+1)):
        MODEL_PATH = "../myModel"
        torch.save(model.state_dict(), os.path.join(MODEL_PATH, "oui_20240327_acc.pt"))
        best = test_acc / (batch_id+1)

  0%|          | 0/671 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.7913718223571777 train acc 0.171875
epoch 1 batch id 201 loss 1.4582903385162354 train acc 0.32097325870646765
epoch 1 batch id 401 loss 1.2914812564849854 train acc 0.37833151496259354
epoch 1 batch id 601 loss 1.1051937341690063 train acc 0.42906353993344426
epoch 1 train acc 0.44174698102042603


  0%|          | 0/168 [00:00<?, ?it/s]

epoch 1 test acc 0.5532987809849967


  0%|          | 0/671 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.1495869159698486 train acc 0.578125
epoch 2 batch id 201 loss 1.0727790594100952 train acc 0.5518501243781094
epoch 2 batch id 401 loss 1.171034574508667 train acc 0.5584865960099751
epoch 2 batch id 601 loss 0.9547896385192871 train acc 0.5680506447587355
epoch 2 train acc 0.5703885223985272


  0%|          | 0/168 [00:00<?, ?it/s]

epoch 2 test acc 0.5795385630707762


  0%|          | 0/671 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 1.0698013305664062 train acc 0.578125
epoch 3 batch id 201 loss 0.9172974228858948 train acc 0.5991915422885572
epoch 3 batch id 401 loss 1.0798733234405518 train acc 0.6036666147132169
epoch 3 batch id 601 loss 0.7843312621116638 train acc 0.6122348169717138
epoch 3 train acc 0.6148253265538705


  0%|          | 0/168 [00:00<?, ?it/s]

epoch 3 test acc 0.5875714744373777


  0%|          | 0/671 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 0.9815123081207275 train acc 0.578125
epoch 4 batch id 201 loss 0.8199601173400879 train acc 0.6440065298507462
epoch 4 batch id 401 loss 0.9948267936706543 train acc 0.6485349127182045
epoch 4 batch id 601 loss 0.7149920463562012 train acc 0.6590188227953411
epoch 4 train acc 0.6618112781625318


  0%|          | 0/168 [00:00<?, ?it/s]

epoch 4 test acc 0.581595523483366


  0%|          | 0/671 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 0.8731215596199036 train acc 0.6328125
epoch 5 batch id 201 loss 0.7259531021118164 train acc 0.6933302238805971
epoch 5 batch id 401 loss 0.8998918533325195 train acc 0.6962281795511222
epoch 5 batch id 601 loss 0.6099551916122437 train acc 0.7071677412645591
epoch 5 train acc 0.7103375394494609


  0%|          | 0/168 [00:00<?, ?it/s]

epoch 5 test acc 0.5750615368150684


  0%|          | 0/671 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.7810043692588806 train acc 0.6875
epoch 6 batch id 201 loss 0.6549832820892334 train acc 0.7320429104477612
epoch 6 batch id 401 loss 0.8338111639022827 train acc 0.7368882481296758
epoch 6 batch id 601 loss 0.4921830892562866 train acc 0.7498310108153078
epoch 6 train acc 0.7524505238011747


  0%|          | 0/168 [00:00<?, ?it/s]

epoch 6 test acc 0.5764101231245923


  0%|          | 0/671 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.7086150646209717 train acc 0.7265625
epoch 7 batch id 201 loss 0.5344706773757935 train acc 0.7709499378109452
epoch 7 batch id 401 loss 0.7054784893989563 train acc 0.7774119389027432
epoch 7 batch id 601 loss 0.4144899845123291 train acc 0.789595465890183
epoch 7 train acc 0.7920342224072938


  0%|          | 0/168 [00:00<?, ?it/s]

epoch 7 test acc 0.5779561878261579


  0%|          | 0/671 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 0.5745186805725098 train acc 0.7734375
epoch 8 batch id 201 loss 0.4147394299507141 train acc 0.8069029850746269
epoch 8 batch id 401 loss 0.5518259406089783 train acc 0.8129675810473815
epoch 8 batch id 601 loss 0.3388662040233612 train acc 0.8209234608985025
epoch 8 train acc 0.8224082526080477


  0%|          | 0/168 [00:00<?, ?it/s]

epoch 8 test acc 0.5773631156229615


  0%|          | 0/671 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 0.47457626461982727 train acc 0.8203125
epoch 9 batch id 201 loss 0.41131624579429626 train acc 0.8310012437810945
epoch 9 batch id 401 loss 0.5447052717208862 train acc 0.834612687032419
epoch 9 batch id 601 loss 0.3327123820781708 train acc 0.8405912021630616
epoch 9 train acc 0.8420617548435171


  0%|          | 0/168 [00:00<?, ?it/s]

epoch 9 test acc 0.5789442168134377


  0%|          | 0/671 [00:00<?, ?it/s]

epoch 10 batch id 1 loss 0.5277906060218811 train acc 0.796875
epoch 10 batch id 201 loss 0.4204730987548828 train acc 0.8415345149253731
epoch 10 batch id 401 loss 0.48248612880706787 train acc 0.8460099750623441
epoch 10 batch id 601 loss 0.2946419417858124 train acc 0.8519004783693843
epoch 10 train acc 0.8523775149031296


  0%|          | 0/168 [00:00<?, ?it/s]

epoch 10 test acc 0.5800023187785388


In [None]:
MODEL_PATH = "../myModel"
torch.save(model.state_dict(), os.path.join(MODEL_PATH, "oui_20240327_acc.pt"))

In [None]:
# https://velog.io/@fhflwhwl5/Python-KoBERT-7%EA%B0%80%EC%A7%80-%EA%B0%90%EC%A0%95%EC%9D%98-%EB%8B%A4%EC%A4%91%EA%B0%90%EC%84%B1%EB%B6%84%EB%A5%98%EB%AA%A8%EB%8D%B8-%EA%B5%AC%ED%98%84%ED%95%98%EA%B8%B0