In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder

In [None]:
#GPU 사용
device = torch.device("cuda:0")

In [None]:
#BERT 모델, Vocabulary 불러오기
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /workspace/Juwan/UROP_help/.cache/kobert_v1.zip
using cached model. /workspace/Juwan/UROP_help/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [None]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))
    
    def get_labels(self):
        return self.labels

In [None]:
## Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 50 
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [None]:
import pandas as pd

In [None]:
data = pd.read_csv('train.csv', encoding = 'cp949')

In [None]:
data.head(10)

Unnamed: 0,번호,연령,성별,상황키워드,신체질환,감정_대분류,감정_소분류,사람문장1
0,44164,청년,남성,"연애, 결혼, 출산",해당없음,기쁨,신이 난,아내가 드디어 출산하게 되어서 정말 신이 나.
1,3926,노년,남성,"건강, 죽음",만성질환 유,불안,스트레스 받는,당뇨랑 합병증 때문에 먹어야 할 약이 열 가지가 넘어가니까 스트레스야.
2,50882,청소년,여성,학업 및 진로,해당없음,당황,당황,고등학교에 올라오니 중학교 때보다 수업이 갑자기 어려워져서 당황스러워.
3,31303,노년,남성,재정,만성질환 무,기쁨,신이 난,재취업이 돼서 받게 된 첫 월급으로 온 가족이 외식을 할 예정이야. 너무 행복해.
4,47200,노년,여성,재정,만성질환 유,기쁨,안도,빚을 드디어 다 갚게 되어서 이제야 안도감이 들어.
5,48610,중년,여성,"재정, 은퇴, 노후준비",해당없음,불안,취약한,이제 곧 은퇴할 시기가 되었어. 내가 먼저 은퇴를 하고 육 개월 후에 남편도 은퇴를...
6,17198,중년,남성,건강,해당없음,슬픔,우울한,사십 대에 접어들면서 머리카락이 많이 빠져 고민이야.
7,12186,노년,남성,재정,만성질환 무,분노,구역질 나는,이제 돈이라면 지긋지긋해.
8,35975,청소년,남성,학교폭력/따돌림,해당없음,분노,좌절한,친구들이 나를 괴롭혀. 부모님과 선생님께 얘기했는데도 믿어주지 않아.
9,12551,노년,여성,대인관계,만성질환 무,슬픔,눈물이 나는,친구 때문에 눈물 나.


In [None]:
preprocessor = LabelEncoder()
preprocessor.fit(data['감정_소분류'])
data['감정_소분류'] = preprocessor.transform(data['감정_소분류'])
preprocessor.classes_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57])

In [None]:
data_list = []
for q, label in zip(data['사람문장1'], data['감정_소분류'])  :
    element = []
    element.append(q)
    element.append(label)
    data_list.append(element)

In [None]:
#train & test 데이터로 나누기
from sklearn.model_selection import train_test_split
                                                         
dataset_train, dataset_test = train_test_split(data_list, test_size=0.2, random_state=99, shuffle = True)

In [None]:
#토큰화
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model. /workspace/Juwan/UROP_help/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [None]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

In [None]:
import numpy as np

max(np.array(list(data_train.sentences))[:,1])

  This is separate from the ipykernel package so we can avoid doing imports until


array(64, dtype=int32)

In [None]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size,sampler=ImbalancedDatasetSampler(data_train), num_workers=5)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size,sampler=ImbalancedDatasetSampler(data_test), num_workers=5)

In [None]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=len(preprocessor.classes_), 
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)

            
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [None]:
model = BERTClassifier(bertmodel,  dr_rate=0.3).to(device)


In [None]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [None]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()



In [None]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [None]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [None]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(train_dataloader):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
      
#         if batch_id % log_interval == 0:
        print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    model.eval()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

epoch 1 batch id 1 loss 4.1547369956970215 train acc 0.015625
epoch 1 batch id 2 loss 4.133150100708008 train acc 0.015625
epoch 1 batch id 3 loss 4.11295747756958 train acc 0.010416666666666666
epoch 1 batch id 4 loss 4.086395740509033 train acc 0.01171875
epoch 1 batch id 5 loss 4.146578788757324 train acc 0.015625
epoch 1 batch id 6 loss 4.087337970733643 train acc 0.018229166666666668
epoch 1 batch id 7 loss 4.039854526519775 train acc 0.020089285714285716
epoch 1 batch id 8 loss 4.121728420257568 train acc 0.021484375
epoch 1 batch id 9 loss 4.182212829589844 train acc 0.022569444444444444
epoch 1 batch id 10 loss 4.10418701171875 train acc 0.0203125
epoch 1 batch id 11 loss 4.168939113616943 train acc 0.019886363636363636
epoch 1 batch id 12 loss 4.092654705047607 train acc 0.01953125
epoch 1 batch id 13 loss 4.117129802703857 train acc 0.019230769230769232
epoch 1 batch id 14 loss 4.163125514984131 train acc 0.020089285714285716
epoch 1 batch id 15 loss 4.1105265617370605 train 

KeyboardInterrupt: 