# package load 및 device 지정

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd

In [2]:
from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

In [3]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [4]:
import os

# Ask CUDA to enumerate GPUs by PCI bus ID, so the index chosen below follows
# the bus order instead of the driver's default ordering.
os.environ.update(CUDA_DEVICE_ORDER="PCI_BUS_ID")
# All tensors and models in this notebook are placed on the GPU with index 3.
device = torch.device("cuda", 3)
print(device)

cuda:3


# 데이터세트

In [5]:
class BERTDataset(Dataset):
    """Dataset pairing BERT-transformed sentences with integer labels.

    Each row of ``dataset`` is indexed with ``sent_idx`` and ``label_idx``.
    The sentence is passed (wrapped in a one-element list) through
    ``nlp.data.BERTSentenceTransform`` and the label is converted with
    ``np.int32``.  Rows whose length is not exactly 2 are skipped silently.

    Args:
        dataset: iterable of indexable rows.
        sent_idx: position of the sentence inside a row.
        label_idx: position of the label inside a row.
        bert_tokenizer: tokenizer handed to ``BERTSentenceTransform``.
        max_len: forwarded as ``max_seq_length``.
        pad: forwarded as ``pad``.
        pair: forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences, self.labels = [], []
        for row in dataset:
            # Keep only well-formed two-field rows; anything else is dropped.
            if len(row) == 2:
                self.sentences.append(to_bert_input([row[sent_idx]]))
                self.labels.append(np.int32(row[label_idx]))

    def __getitem__(self, i):
        # The transform output is extended with the label as its last element.
        label_tail = (self.labels[i],)
        return self.sentences[i] + label_tail

    def __len__(self):
        return len(self.labels)

In [6]:
class OuiDatasetLoader:
    """Builds KoBERT-tokenized train/test ``DataLoader``s for one data split.

    Both TSV files are read with their first line discarded
    (``num_discard_samples=1``).  Column 0 is used as the sentence and column 1
    as the label, sentences are padded (``pad=True``) and treated as single
    sentences (``pair=False``).

    The pretrained KoBERT model obtained while building the tokenizer is kept
    on the instance and exposed through ``get_pretrained_model``.

    Args:
        train_path: directory containing the training TSV.
        train_name: file name of the training TSV.
        test_path: directory containing the evaluation TSV.
        test_name: file name of the evaluation TSV.
        batch_size: batch size used by both loaders.
        max_len: maximum sequence length passed to ``BERTDataset``.
        shuffle_train: reshuffle the training samples every epoch.  Defaults to
            ``True``; pass ``False`` to iterate in file order.
        num_workers: worker processes per ``DataLoader`` (default 5).

    Attributes:
        train_dataloader: loader over the training split.
        test_dataloader: loader over the evaluation split (never shuffled).
    """

    def __init__(self, train_path, train_name, test_path, test_name, batch_size, max_len,
                 shuffle_train=True, num_workers=5):
        train_dataset = nlp.data.TSVDataset(os.path.join(train_path, train_name), num_discard_samples=1)
        test_dataset = nlp.data.TSVDataset(os.path.join(test_path, test_name), num_discard_samples=1)

        self.bertmodel, vocab = get_pytorch_kobert_model(cachedir=".cache")
        tokenizer = get_tokenizer()
        tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

        self.batch_size = batch_size
        self.max_len = max_len

        data_train = BERTDataset(train_dataset, 0, 1, tok, max_len, True, False)
        data_test = BERTDataset(test_dataset, 0, 1, tok, max_len, True, False)

        # Without shuffling, every epoch would present the training rows in the
        # same file order; evaluation order does not matter, so it stays fixed.
        self.train_dataloader = DataLoader(data_train, batch_size=batch_size,
                                           shuffle=shuffle_train, num_workers=num_workers)
        self.test_dataloader = DataLoader(data_test, batch_size=batch_size,
                                          shuffle=False, num_workers=num_workers)

    def get_pretrained_model(self):
        """Return the pretrained model object loaded in ``__init__`` (not a copy)."""
        return self.bertmodel

In [7]:
# Layout of the pre-split data: fold k is read from
# <train_path>/split_k/<train_name> and <test_path>/split_k/<test_name>.
train_path = "../dataset/train"
train_name = "train.tsv"
test_path = "../dataset/test"
test_name = "valid.tsv"
batch_size = 128
max_len = 100

# One OuiDatasetLoader per fold (5 folds).  Each loader reads its own TSV files
# and also loads its own pretrained-model handle, so this cell is slow.
dataloader = []
for i in range(5):
    split_dir = "split_{}".format(i)
    fold_loader = OuiDatasetLoader(
        os.path.join(train_path, split_dir), train_name,
        os.path.join(test_path, split_dir), test_name,
        batch_size, max_len)
    dataloader.append(fold_loader)

using cached model. /home/j-j10a506/oui/KoBERT/.cache/kobert_v1.zip
using cached model. /home/j-j10a506/oui/KoBERT/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /home/j-j10a506/oui/KoBERT/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /home/j-j10a506/oui/KoBERT/.cache/kobert_v1.zip
using cached model. /home/j-j10a506/oui/KoBERT/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece
using cached model. /home/j-j10a506/oui/KoBERT/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


KeyboardInterrupt: 

# 하이퍼파라미터 튜닝

In [None]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: encoder module.  It is called with ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` keyword arguments and must
            return a 2-tuple whose second element is the pooled representation.
        hidden_size: feature size of the pooled output (default 768).
        num_classes: number of output classes (default 6).
        dr_rate: dropout probability applied to the pooled output; a falsy
            value (``None`` or ``0``) disables dropout entirely.
        params: accepted for interface compatibility; not used.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=6,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask with 1.0 on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: tensor of shape ``(batch, seq_len)``; only its shape and
                device are used.
            valid_length: one non-negative length per batch row (tensor or
                sequence of ints).

        Returns:
            Float tensor of shape ``(batch, seq_len)`` on ``token_ids.device``.
        """
        # Built with a single broadcast comparison instead of one indexed
        # write per batch row.
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Compute class logits of shape ``(batch, num_classes)``."""
        # The mask is already float and already lives on token_ids.device.
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask)
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
class Oui:
    """Fine-tunes a ``BERTClassifier`` on one train/test split and records per-epoch accuracy.

    Args:
        bertmodel: pretrained encoder wrapped by ``BERTClassifier`` (wrapped as
            is, not copied, so training updates it in place).
        device: device the classifier is moved to.
        train_dataloader: iterable of ``(token_ids, valid_length, segment_ids, label)`` batches.
        test_dataloader: same batch layout, used for evaluation after every epoch.
        fixed: if true, use the hard-coded hyper-parameters below; otherwise
            sample every entry of ``hp`` at random.
        hp: mapping ``name -> (values, kind)``.  ``kind == 'float'`` draws
            uniformly between ``min(values)`` and ``max(values)``, ``'int'``
            draws an integer in that inclusive range, and ``'max_grad_norm'``
            picks one of the listed values.  Entries with any other kind are
            ignored.
        log_interval: print running loss/accuracy every this many batches.
        num_epochs: number of training epochs.
    """

    def __init__(self, bertmodel, device, train_dataloader, test_dataloader, fixed, hp, log_interval, num_epochs):
        self.num_epochs = num_epochs
        self.log_interval = log_interval

        # Hyper-parameter random sampling
        params = dict()
        if not fixed:
            for name, value_info in hp.items():
                candidates, kind = value_info[0], value_info[1]
                if kind == 'max_grad_norm':
                    params[name] = np.random.choice(candidates)
                elif kind == 'int':
                    params[name] = np.random.randint(min(candidates), max(candidates)+1)
                elif kind == 'float':
                    params[name] = np.random.uniform(min(candidates), max(candidates))
        else:
            # Hand-picked configuration; the trailing comments keep the
            # previously used values for reference.
            params['dr_rate']=0.2282168316541426 #0.3
            params['learning_rate']=4.4397570365495904e-05 #5e-5
            params['max_grad_norm']=3 #1
            params['warmup_ratio']=0.2503085518907766 #0.1
            params['weight_decay']=0.009198865305723895 #0.01

        self.params = params
        print(self.params)

        self.model=BERTClassifier(bertmodel,  dr_rate=params["dr_rate"]).to(device)
        ## weight decay: parameters whose name contains 'bias' or
        ## 'LayerNorm.weight' are excluded from decay.
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': params["weight_decay"]},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        ## optimizer & loss
        self.optimizer = AdamW(optimizer_grouped_parameters, lr=params["learning_rate"])
        self.loss_fn = nn.CrossEntropyLoss()

        ## learning rate scheduler: one scheduler step per training batch,
        ## with the first warmup_ratio fraction of all steps used as warmup.
        self.train_dataloader = train_dataloader
        self.test_dataloader = test_dataloader
        t_total = len(self.train_dataloader) * num_epochs
        warmup_step = int(t_total * params["warmup_ratio"])
        self.scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)


    def calc_accuracy(self, X,y):
        """Return the fraction of rows of ``X`` whose arg-max equals ``y`` (as a NumPy scalar)."""
        max_vals, max_indices = torch.max(X, 1)
        train_acc = (max_indices == y).sum().data.cpu().numpy()/max_indices.size()[0]
        return train_acc

    def train(self, device):
        """Run the training/evaluation loop and return the best epoch's results.

        After every epoch the model is evaluated on ``test_dataloader``.  The
        weights of the epoch with the highest test accuracy are snapshotted
        and, once training finishes, loaded back into ``self.model`` so that
        the returned model really is the best one rather than the last one.

        Accuracies are the mean of per-batch accuracies.

        Args:
            device: device the input batches are moved to.

        Returns:
            ``(best_model, train_accuracy, test_accuracy)`` for the best epoch
            (``self.be``).  ``best_model`` is ``self.model``.
        """
        self.train_verbose_dict = {}
        self.train_score_dict = {}
        self.test_score_dict = {}
        self.params_list = []

        max_acc = 0
        self.be = 0
        self.bestmodel = self.model
        best_state = None  # CPU copy of the weights at the best epoch so far
        # Divisors for the epoch averages; guarded so an empty loader does not
        # fail on an undefined loop variable.
        num_train_batches = max(len(self.train_dataloader), 1)
        num_test_batches = max(len(self.test_dataloader), 1)
        print(self.params)
        for e in range(self.num_epochs):
            train_acc = 0.0
            test_acc = 0.0

            train_loss = []
            train_scores = []
            self.model.train()
            for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(self.train_dataloader), total=len(self.train_dataloader)):
                self.optimizer.zero_grad()
                token_ids = token_ids.long().to(device)
                segment_ids = segment_ids.long().to(device)
                label = label.long().to(device)

                out = self.model(token_ids, valid_length, segment_ids)
                loss = self.loss_fn(out, label)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.params["max_grad_norm"])

                self.optimizer.step()
                self.scheduler.step()  # Update learning rate schedule
                train_acc += self.calc_accuracy(out, label)

                if batch_id % self.log_interval == 0:
                    print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
                    train_loss.append(loss.data.cpu().numpy())
                    train_scores.append(train_acc / (batch_id+1))

            print("epoch {} train acc {}".format(e+1, train_acc / num_train_batches))
            self.train_score_dict[e] = train_acc / num_train_batches
            self.train_verbose_dict[e] = (train_loss, train_scores)

            self.model.eval()
            # No gradients are needed for evaluation; skipping the autograd
            # graph saves memory and time.
            with torch.no_grad():
                for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(self.test_dataloader), total=len(self.test_dataloader)):
                    token_ids = token_ids.long().to(device)
                    segment_ids = segment_ids.long().to(device)
                    label = label.long().to(device)

                    out = self.model(token_ids, valid_length, segment_ids)
                    test_acc += self.calc_accuracy(out, label)

            print("epoch {} test acc {}".format(e+1, test_acc / num_test_batches))
            self.test_score_dict[e] = test_acc / num_test_batches
            if max_acc < self.test_score_dict[e]:
                max_acc = self.test_score_dict[e]
                self.be = e
                # Keeping only a reference to self.model would not preserve
                # these weights, because later epochs update them in place.
                best_state = {name: tensor.detach().cpu().clone()
                              for name, tensor in self.model.state_dict().items()}

        if best_state is not None:
            self.model.load_state_dict(best_state)
        self.bestmodel = self.model

        return self.bestmodel, self.train_score_dict[self.be], self.test_score_dict[self.be]

In [None]:
# Keyword arguments forwarded to Oui(...) via **config.
# Each 'hp' entry is (values, kind).  Oui.__init__ samples 'float' entries
# uniformly between min(values) and max(values) and, for the 'max_grad_norm'
# kind, picks one of the listed values.
config = dict(
    hp=dict(
        dr_rate=([1e-1, 5e-1], 'float'),
        learning_rate=([1e-5, 1e-3], 'float'),
        max_grad_norm=([1, 3, 5], 'max_grad_norm'),
        warmup_ratio=([1e-2, 3e-1], 'float'),
        weight_decay=([1e-4, 1e-2], 'float'),
    ),
    log_interval=200,  # print running metrics every 200 batches
    num_epochs=10,
)

In [None]:
# Single training run on fold 0 with the fixed hyper-parameters (fixed=True).
# The reported scores are those of the epoch with the best test accuracy.
model = dataloader[0].get_pretrained_model()
oui = Oui(
    bertmodel=model,
    device=device,
    train_dataloader=dataloader[0].train_dataloader,
    test_dataloader=dataloader[0].test_dataloader,
    fixed=True,
    **config
)
_, train_score, test_score = oui.train(device=device)

print("train_score: {}, test_score: {}".format(train_score, test_score))

In [None]:
# Random-search run: every fold trains with its own freshly sampled
# hyper-parameters (fixed=False) and prints the best-epoch scores.
for i in range(5):
    model = dataloader[i].get_pretrained_model()
    oui = Oui(
        bertmodel=model,
        device=device,
        train_dataloader=dataloader[i].train_dataloader,
        test_dataloader=dataloader[i].test_dataloader,
        fixed=False,
        **config
    )
    _, train_score, test_score = oui.train(device=device)

    print("K {} / train_score: {}, test_score: {}".format(i, train_score, test_score))

In [55]:
# Repeat of the fixed-hyper-parameter run on fold 0.
# NOTE(review): get_pretrained_model() hands back the same module object on
# every call and training updates it in place, so if an earlier cell already
# trained on dataloader[0] this run starts from those fine-tuned weights
# rather than from the pretrained checkpoint. Confirm this is intended.
model = dataloader[0].get_pretrained_model()
oui = Oui(
    bertmodel=model,
    device=device,
    train_dataloader=dataloader[0].train_dataloader,
    test_dataloader=dataloader[0].test_dataloader,
    fixed=True,
    **config
)
_, train_score, test_score = oui.train(device=device)

print("train_score: {}, test_score: {}".format(train_score, test_score))

{'dr_rate': 0.2282168316541426, 'learning_rate': 4.4397570365495904e-05, 'max_grad_norm': 3, 'warmup_ratio': 0.2503085518907766, 'weight_decay': 0.009198865305723895}
{'dr_rate': 0.2282168316541426, 'learning_rate': 4.4397570365495904e-05, 'max_grad_norm': 3, 'warmup_ratio': 0.2503085518907766, 'weight_decay': 0.009198865305723895}


  0%|          | 0/537 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.8984081745147705 train acc 0.046875
epoch 1 batch id 201 loss 1.6685549020767212 train acc 0.31082866915422885
epoch 1 batch id 401 loss 1.3760613203048706 train acc 0.34435395885286785
epoch 1 train acc 0.38195953169184077


  0%|          | 0/135 [00:00<?, ?it/s]

epoch 1 test acc 0.5363756613756614


  0%|          | 0/537 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.4331247806549072 train acc 0.4609375
epoch 2 batch id 201 loss 1.2579668760299683 train acc 0.5280628109452736
epoch 2 batch id 401 loss 1.1521053314208984 train acc 0.5401340399002493
epoch 2 train acc 0.5485604610662804


  0%|          | 0/135 [00:00<?, ?it/s]

epoch 2 test acc 0.5829034391534392


  0%|          | 0/537 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 1.2082579135894775 train acc 0.5234375
epoch 3 batch id 201 loss 1.212241768836975 train acc 0.5775031094527363
epoch 3 batch id 401 loss 1.0927236080169678 train acc 0.582333229426434
epoch 3 train acc 0.5874990301055245


  0%|          | 0/135 [00:00<?, ?it/s]

epoch 3 test acc 0.5885168650793651


  0%|          | 0/537 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 1.0979316234588623 train acc 0.5859375
epoch 4 batch id 201 loss 1.1112815141677856 train acc 0.6110851990049752
epoch 4 batch id 401 loss 0.9950424432754517 train acc 0.6185512780548629
epoch 4 train acc 0.6245495378991655


  0%|          | 0/135 [00:00<?, ?it/s]

epoch 4 test acc 0.5858878968253968


  0%|          | 0/537 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 1.1435667276382446 train acc 0.5859375
epoch 5 batch id 201 loss 1.0327448844909668 train acc 0.6517412935323383
epoch 5 batch id 401 loss 0.8891943097114563 train acc 0.6623675187032418
epoch 5 train acc 0.6699368275398304


  0%|          | 0/135 [00:00<?, ?it/s]

epoch 5 test acc 0.5676421957671958


  0%|          | 0/537 [00:00<?, ?it/s]

epoch 6 batch id 1 loss 0.9127681851387024 train acc 0.6875
epoch 6 batch id 201 loss 0.8819690942764282 train acc 0.7002487562189055
epoch 6 batch id 401 loss 0.7399062514305115 train acc 0.7127688591022444
epoch 6 train acc 0.7227545865232086


  0%|          | 0/135 [00:00<?, ?it/s]

epoch 6 test acc 0.5739252645502645


  0%|          | 0/537 [00:00<?, ?it/s]

epoch 7 batch id 1 loss 0.8440873026847839 train acc 0.7109375
epoch 7 batch id 201 loss 0.7750044465065002 train acc 0.7526819029850746
epoch 7 batch id 401 loss 0.6879240870475769 train acc 0.7609491895261845
epoch 7 train acc 0.767971066970136


  0%|          | 0/135 [00:00<?, ?it/s]

epoch 7 test acc 0.5681712962962963


  0%|          | 0/537 [00:00<?, ?it/s]

epoch 8 batch id 1 loss 0.8340594172477722 train acc 0.703125
epoch 8 batch id 201 loss 0.6033045053482056 train acc 0.7923662935323383
epoch 8 batch id 401 loss 0.5552154183387756 train acc 0.8002454800498753
epoch 8 train acc 0.8046875


  0%|          | 0/135 [00:00<?, ?it/s]

epoch 8 test acc 0.5893931878306878


  0%|          | 0/537 [00:00<?, ?it/s]

epoch 9 batch id 1 loss 0.7865308523178101 train acc 0.703125
epoch 9 batch id 201 loss 0.6974712014198303 train acc 0.822255907960199
epoch 9 batch id 401 loss 0.4908873736858368 train acc 0.8280665523690773
epoch 9 train acc 0.8318057495344506


  0%|          | 0/135 [00:00<?, ?it/s]

epoch 9 test acc 0.580406746031746


  0%|          | 0/537 [00:00<?, ?it/s]

epoch 10 batch id 1 loss 0.7151694893836975 train acc 0.7578125
epoch 10 batch id 201 loss 0.6126967072486877 train acc 0.841728855721393
epoch 10 batch id 401 loss 0.47557756304740906 train acc 0.8441591334164589
epoch 10 train acc 0.8469361033519553


  0%|          | 0/135 [00:00<?, ?it/s]

epoch 10 test acc 0.5832423941798941
train_score: 0.8046875, test_score: 0.5893931878306878
