# load library

In [None]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [3]:
#util
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from tqdm.notebook import tqdm

#torch
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp

#numpy
import numpy as np

#model
from kobert import get_tokenizer
from kobert import get_pytorch_kobert_model

#transformer
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# hyperparameter

In [4]:
is_cuda = torch.cuda.is_available()

if is_cuda:
    device = torch.device("cuda:0")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


In [5]:
#setting

max_len = 128
batch_size = 32
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

# define matrix

In [6]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

# load pretrained model

In [7]:
bertmodel, vocab = get_pytorch_kobert_model(cachedir = ".cache")

/content/.cache/kobert_v1.zip[██████████████████████████████████████████████████]
/content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece[██████████████████████████████████████████████████]


In [8]:
#tokenizer
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer,vocab,lower=False)

using cached model. /content/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


# define BERTDataset

In [9]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform([i[sent_idx]]) for i in dataset]
        self.labels = [np.int32(i[label_idx])-1 for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

# load dataset

In [None]:
with open("/content/drive/MyDrive/text_dataset/review_data.pickle", 'rb') as f:
    review_data = pickle.load(f)

with open("/content/drive/MyDrive/text_dataset/star_data.pickle", 'rb') as f:
    star_data = pickle.load(f)

In [None]:
for review in review_data:
    
    print(review)
    break

선생님 항상 존경합니다. 저도 한 때는 16만원 들고 가출한 적이 있으나, 선생님의 뼈아픈 글들을 읽고 노력하며, 16만원부터 시작하여 누구의 도움없이, 현재는 작은 사업체를 3개 운영하고 있습니다. 저도 훗날, 선생님처럼 우여곡절을 겪고 있는 젊은이들에게 도움을 주는 일을 하고 싶습니다.


In [None]:
for star in star_data:
    
    print(star)
    break

5


In [None]:
#unbalanced
Counter(star_data)

Counter({5: 4528, 1: 213, 2: 207, 3: 577, 4: 1483})

In [None]:
data = []

for review,star in zip(review_data,star_data):
    
    data.append([review,str(star)])

print(len(data))
print(len(review_data))
print(len(star_data))

7008
7008
7008


In [None]:
with open('/content/drive/MyDrive/text_dataset/data.pickle', 'wb') as f:
    pickle.dump(data, f)

In [None]:
train_ind = set(np.random.choice(range(len(data)),6300,replace=False))
test_ind = set(range(len(data))) - train_ind

print(len(train_ind))
print(len(test_ind))

6300
708


In [None]:
dataset_train = []
dataset_test = []

for t in train_ind:
    
    dataset_train.append(data[t])

for t in test_ind:
    
    dataset_test.append(data[t])

print(len(dataset_train))
print(len(dataset_test))

6300
708


In [None]:
with open('/content/drive/MyDrive/text_dataset/train_data.pickle', 'wb') as f:
    pickle.dump(dataset_train, f)

with open('/content/drive/MyDrive/text_dataset/test_data.pickle', 'wb') as f:
    pickle.dump(dataset_test, f)

In [10]:
with open("/content/drive/MyDrive/text_dataset/train_data.pickle", 'rb') as f:
    dataset_train = pickle.load(f)

with open("/content/drive/MyDrive/text_dataset/test_data.pickle", 'rb') as f:
    dataset_test = pickle.load(f)

In [11]:
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

In [12]:
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=0)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)

In [None]:
for d in data_train:
    
    print(d)
    break

(array([   2, 2752, 4996, 4192, 5424, 7843,  517,   54, 3990, 5859, 4955,
       1846,  545, 6153, 1802,  517, 5330, 7468, 7828, 4007, 3876,  517,
         46, 2752, 7095,  517, 6477, 6797, 7760, 1231, 5938, 3824, 5439,
       1481, 7810,  517,   46,  545, 6153, 6410, 2986, 7815, 1528, 7095,
       1717, 6883,  517,   46, 5064, 5760, 3939, 2609, 7436, 6116,  589,
       5357, 3517, 7788, 3867,  517,   54, 3990, 5859,    3], dtype=int32), array(64, dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int32), 5)


In [None]:
model

NameError: ignored

# define model

In [13]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids.to("cpu"))
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids, attention_mask = attention_mask.to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return self.classifier(out)

In [14]:
model = BERTClassifier(bertmodel, num_classes=5, dr_rate = 0.5).to(device)

# define finetune layer

In [15]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# optimizer, loss, scheduler

In [16]:
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [17]:
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [18]:
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

# training

In [21]:
save_path = "/content/drive/MyDrive/text_dataset/model.pth"

In [38]:
for token_ids,valid_length,segment_ids,label in train_dataloader:
    
    print(token_ids)
    print(valid_length)
    print(segment_ids)
    print(label)
    break

tensor([[   2, 2752, 4996,  ...,    1,    1,    1],
        [   2,  517, 6983,  ...,    1,    1,    1],
        [   2, 4299, 5064,  ...,    1,    1,    1],
        ...,
        [   2, 3489, 3647,  ...,    1,    1,    1],
        [   2, 3647, 5384,  ...,    1,    1,    1],
        [   2, 2811, 6896,  ...,    1,    1,    1]], dtype=torch.int32)
tensor([ 90,  37,  89,  87,  86,  23,  21, 110,  29,  57,  16,  57,  41,   6,
         20,  37,  91,  92,  52,  14,  22,  49,  20,  64,  29,  51,  17,  21,
         76,  37,   8, 109], dtype=torch.int32)
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32)
tensor([4, 0, 4, 1, 4, 0, 4, 0, 4, 4, 4, 4, 2, 4, 0, 4, 3, 4, 1, 4, 4, 2, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4])


In [None]:
model.to("cpu")

In [47]:
out = model(token_ids,valid_length,segment_ids)

In [48]:
out.shape

torch.Size([8, 5])

In [49]:
label.shape

torch.Size([8])

In [50]:
loss_fn(out,label.long())

tensor(1.8314, grad_fn=<NllLossBackward0>)

In [51]:
model.to(device)

token_ids = token_ids.long().to(device)
segment_ids = segment_ids.long().to(device)
#valid_length= valid_length
label = label.long().to(device)

In [52]:
out = model(token_ids,valid_length,segment_ids)

In [53]:
out

tensor([[ 0.5946, -0.3587,  0.1753,  0.1961, -0.1310],
        [ 0.3773,  0.2950,  0.1893,  0.6181,  0.3538],
        [ 0.6343,  0.1857, -0.4185, -0.0601, -0.3275],
        [ 0.2640, -0.2442,  0.2683,  0.4642,  0.2723],
        [ 0.2513, -0.5970, -0.5154,  0.6713,  0.3567],
        [ 0.3911, -0.2206,  0.3732,  0.2277, -0.2413],
        [-0.3429, -0.0183, -0.2726, -0.2314,  0.7278],
        [ 0.3198,  0.2199, -0.2292,  0.6113, -0.3891]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

In [54]:
label

tensor([4, 0, 4, 1, 4, 0, 4, 0], device='cuda:0')

In [55]:
loss_fn(out,label)

tensor(1.5961, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
bfor batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    #valid_length= valid_length
    label = label.long().to(device)
    out = model(token_ids, valid_length, segment_ids)
    print(out)
    break

tensor([[   2, 2752, 4996,  ..., 3990, 5859,    3],
        [   2,  517, 6983,  ...,    1,    1,    1],
        [   2, 4299, 5064,  ...,  517,   54,    3],
        ...,
        [   2, 3097,  517,  ...,    1,    1,    1],
        [   2, 1169, 7850,  ...,    1,    1,    1],
        [   2,  517, 6983,  ...,    1,    1,    1]], dtype=torch.int32)

In [22]:
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        #valid_length= valid_length
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
            torch.save(model.state_dict(), save_path)
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
    model.eval()
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            #valid_length= valid_length.long()
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
        print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

  0%|          | 0/197 [00:00<?, ?it/s]

epoch 1 batch id 1 loss 1.5748028755187988 train acc 0.28125
epoch 1 train acc 0.5956988759970994


  0%|          | 0/23 [00:00<?, ?it/s]

epoch 1 test acc 0.6331521739130435


  0%|          | 0/197 [00:00<?, ?it/s]

epoch 2 batch id 1 loss 1.0894697904586792 train acc 0.71875
epoch 2 train acc 0.6461430384336475


  0%|          | 0/23 [00:00<?, ?it/s]

epoch 2 test acc 0.6331521739130435


  0%|          | 0/197 [00:00<?, ?it/s]

epoch 3 batch id 1 loss 1.08392333984375 train acc 0.71875
epoch 3 train acc 0.6464602973168962


  0%|          | 0/23 [00:00<?, ?it/s]

epoch 3 test acc 0.6331521739130435


  0%|          | 0/197 [00:00<?, ?it/s]

epoch 4 batch id 1 loss 1.122100830078125 train acc 0.71875
epoch 4 train acc 0.6458031182015954


  0%|          | 0/23 [00:00<?, ?it/s]

epoch 4 test acc 0.6019021739130435


  0%|          | 0/197 [00:00<?, ?it/s]

epoch 5 batch id 1 loss 1.0216034650802612 train acc 0.65625
epoch 5 train acc 0.6463016678752719


  0%|          | 0/23 [00:00<?, ?it/s]

epoch 5 test acc 0.6331521739130435


# load pretrained model

In [31]:
resume = "/content/drive/MyDrive/text_dataset/model.pth"

checkpoint = torch.load(resume, map_location=torch.device('cpu'))

model.load_state_dict(checkpoint)

<All keys matched successfully>

# inference

In [23]:
text = """
죗값을 편견이라는 연좌제로 치르는 가해자의 가족 시점에서 서술되는 작품으로 사람들이 왜 범죄자의 가족들에게까지 차가워지는지 적나라하게 묘사되며 추리 소설이라기보다는 드라마 소설에 가까운 씁쓸한 현실을 다룬 이야기
"""

In [60]:
text2 = """
1월의 첫 책인데 감동과 재미 그리고 사유(차별과 속죄의 한계와 범위에 대한)를 충분히 제공하는 책입니다~대만족~
"""

In [26]:
text = text.strip("\n")

In [61]:
text2 = text2.strip("\n").strip(" ")

In [62]:
text2

'1월의 첫 책인데 감동과 재미 그리고 사유(차별과 속죄의 한계와 범위에 대한)를 충분히 제공하는 책입니다~대만족~'

In [28]:
transform = nlp.data.BERTSentenceTransform(
            tok, max_seq_length=max_len, pad=True, pair=False)

In [63]:
test_data = transform(text2)

In [30]:
test_data

(array([  2, 517,   0,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1], dtype=int32),
 array(4, dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0,

In [64]:
token_ids = torch.tensor(test_data[0]).unsqueeze(0).long()
valid_length = torch.tensor(test_data[1]).unsqueeze(0)
segment_ids = torch.tensor(test_data[2]).unsqueeze(0).long()

In [None]:
model

In [52]:
segment_ids

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
model.to("cpu")
model.eval()

In [65]:
output = model(token_ids, valid_length, segment_ids)

In [66]:
output

tensor([[-0.9738, -0.8896, -0.0021,  0.7148,  1.3730]],
       grad_fn=<AddmmBackward0>)

In [67]:
percentage_output = F.softmax(output, dim = 1)

print(percentage_output)
pred = output.cpu().detach().numpy()

print(pred)

sorted_pred = np.argsort(pred,axis = 1)

print(sorted_pred[0][-1])

tensor([[0.0486, 0.0528, 0.1283, 0.2628, 0.5075]], grad_fn=<SoftmaxBackward0>)
[[-0.9737689  -0.8895578  -0.00208349  0.71481854  1.3729838 ]]
4
