In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import pickle as pk

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch.autograd import Variable
import torch.autograd as autograd

from torchcrf import CRF as tcrf

from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
if torch.cuda.device_count()>1:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
elif torch.cuda.device_count()>0:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
else:
    device = 'cpu'

In [4]:
with open('data/bert_data_train.pk', 'rb') as f1:
    sentences_train, labels_train = pk.load(f1)
with open('data/bert_data_val.pk', 'rb') as f1:
    sentences_test, labels_test = pk.load(f1)
with open('data/bert_data_test.pk', 'rb') as f1:
    sentences_test2, labels_test2 = pk.load(f1)    

In [5]:
tagmap = {}
tag_values = set()
for tags_of_sent in labels_train + labels_test:
    for tag_of_word in tags_of_sent:
        tag1 = tag_of_word
        tagmap.setdefault(tag1, 0)
        tagmap[tag1] += 1
        
tag_values = list(set(tagmap))
tag_values = sorted(tag_values)
tag_values = tag_values + ["[PAD]", "[CLS]", "[SEP]"]
#tag_values = tag_values + ["<PAD>"]
tag2idx = {t: i for i, t in enumerate(tag_values)}
idx2tag = {i: t for i, t in enumerate(tag_values)}

In [6]:
idx2tag

{0: 'B-DAT',
 1: 'B-DUR',
 2: 'B-LOC',
 3: 'B-MNY',
 4: 'B-NOH',
 5: 'B-ORG',
 6: 'B-PER',
 7: 'B-PNT',
 8: 'B-POH',
 9: 'B-TIM',
 10: 'I-DAT',
 11: 'I-DUR',
 12: 'I-LOC',
 13: 'I-MNY',
 14: 'I-NOH',
 15: 'I-ORG',
 16: 'I-PER',
 17: 'I-PNT',
 18: 'I-POH',
 19: 'I-TIM',
 20: 'O',
 21: '[PAD]',
 22: '[CLS]',
 23: '[SEP]'}

In [7]:
MAX_LEN = 75
bs = 128

In [8]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig, BertForTokenClassification

from sklearn.model_selection import train_test_split
from transformers import AutoModelWithLMHead, AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoModelForTokenClassification
from transformers import pipeline, AdamW

In [9]:
from transformers import BertTokenizer, BertModel, BertConfig

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True)

In [11]:
class Bert_Vectorizer():
    def __init__(self, tokenizer, max_len=MAX_LEN):
        self.max_len = max_len
        self.tokenizer = tokenizer
        
    def __call__(self, sentences, labels):
        """
        sentences : list of list of words
        labels : list of list of labels
        """
        ret = {}
        ret['input'] = sentences
        ret['label'] = labels
        
        temp = [ self.vectorize_str(sent1) for sent1 in sentences ]
        ret['input_ids'] = [x[0] for x in temp]
        ret['attention_mask'] = [x[1] for x in temp]
        ret['type_id'] = [x[2] for x in temp]
        ret['label_ids'] = [[tag2idx[tag1] for tag1 in sent_tag] 
                            for sent_tag in self.label_pad(ret['label'])]
        ret['label_ids'] = torch.LongTensor(ret['label_ids'])
        ret['input_ids'] = torch.LongTensor(ret['input_ids'])
        ret['attention_mask'] = torch.LongTensor(ret['attention_mask'])
        ret['type_id'] = torch.LongTensor(ret['type_id'])
        
        return ret
        
    def label_pad(self, labels):
        ret = []
        for label1 in labels:
            if len(label1) < self.max_len-2:
                label2 = ["[CLS]"] + label1 + ['[SEP]'] + ['[PAD]']*(self.max_len-2-len(label1))
                #label2 = ['<PAD>'] + label1 + ['<PAD>'] + ['<PAD>']*(self.max_len-2-len(label1))
            else:
                label2 = ['[CLS]'] + label1[:self.max_len-2] + ['[SEP]']
                #label2 = ['<PAD>'] + label1[:self.max_len-2] + ['<PAD>']
            ret.append(label2)
        return ret
    
    def get_attention_mask(self, len_sent, valid_length):
        attention_mask = np.zeros(len_sent)
        attention_mask[:valid_length] = 1
        return attention_mask
    
    def vectorize_str(self, list_of_words):
        """
        tokenizer.vocab["[CLS]"] : 101
        tokenizer.vocab["[PAD]"] : 0
        tokenizer.vocab["[SEP]"] : 102
        
        tokenizer.ids_to_tokens
        tokenizer.vocab
        """
        input_ids = [ self.tokenizer.vocab[ix] for ix in list_of_words ]
        if len(input_ids) < self.max_len-2:
            length = len(input_ids) + 2
            input_ids = [101] + input_ids + [102] + [0]*(self.max_len-2-len(input_ids))
        else:
            length = self.max_len
            input_ids = [101] + input_ids[:self.max_len-2] + [102]
        attention_mask = self.get_attention_mask(len(input_ids), length)
        type_id = [0] * self.max_len
        return input_ids, attention_mask, type_id

In [12]:
vectorizer = Bert_Vectorizer(tokenizer)
data_train = vectorizer(sentences_train, labels_train)
data_test = vectorizer(sentences_test, labels_test)
data_test2 = vectorizer(sentences_test2, labels_test2)

In [13]:
ii = 5
for key, val in data_test2.items():
    print(key, len(val[ii]), val[ii])

input 36 ['삼', '##성', '##전', '##자는', '올', '##해', '2', '##분', '##기', '실', '##적', '##에', '대해', '"', '디', '##스', '##플', '##레', '##이', '관련', '일', '##회', '##성', '수', '##익', '##이', '포', '##함', '##돼', '있다', '"', '고', '설', '##명', '##했다', '.']
label 36 ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'B-NOH', 'I-NOH', 'I-NOH', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
input_ids 75 tensor([   101,   9410,  17138,  16617,  53639,   9583,  14523,    123,  37712,
         12310,   9489,  14801,  10530,  33378,    107,   9122,  12605, 119412,
         56645,  10739,  86080,   9641,  14863,  17138,   9460, 119188,  10739,
          9928,  48533, 118798,  11506,    107,   8888,   9429,  16758,  12490,
           119,    102,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0

In [14]:
dataset_train = TensorDataset(data_train['input_ids'], 
                              data_train['attention_mask'],
                              data_train['type_id'],
                              data_train['label_ids'])
dataset_test = TensorDataset(data_test['input_ids'], 
                              data_test['attention_mask'],
                              data_test['type_id'],
                              data_test['label_ids'])
dataset_test2 = TensorDataset(data_test2['input_ids'], 
                              data_test2['attention_mask'],
                              data_test2['type_id'],
                              data_test2['label_ids'])

train_sampler = RandomSampler(dataset_train)
valid_sampler = SequentialSampler(dataset_test)
valid_sampler2 = SequentialSampler(dataset_test2)

train_dataloader = DataLoader(dataset_train, sampler=train_sampler, batch_size=bs)
valid_dataloader = DataLoader(dataset_test, sampler=valid_sampler, batch_size=bs)
valid_dataloader2 = DataLoader(dataset_test2, sampler=valid_sampler2, batch_size=bs)

In [15]:
class BERT_CRF(nn.Module):
    def __init__(self, bert, n_tag):
        super().__init__()
        hidden_size = 768
        dropout_prob = 0.1
        self.bert = bert
        self.dropout = nn.Dropout(dropout_prob)
        self.ff = nn.Linear(hidden_size, n_tag)
        self.crf = tcrf(len(tag2idx), batch_first=True)
    
    def forward(self, input_ids, attention_mask, type_ids):
        """
        hidden : [batch, len_seq, dim_hidden]
        """
        hidden, _, _ = self.bert(input_ids, attention_mask=attention_mask, 
                                 token_type_ids=None)
        hidden = self.dropout(hidden)
        emissions = self.ff(hidden)
        return emissions
    
    def neg_log_likelihood(self, input_ids, attention_mask, type_ids, label_ids):
        emissions = self.forward(input_ids, attention_mask, type_ids)
        
        loss = -1*self.crf.forward(emissions, label_ids, 
                                    mask=attention_mask.to(dtype=torch.uint8),
                                    reduction='mean')
        return loss

In [16]:
#bert = BertModel.from_pretrained('bert-base-multilingual-cased', num_labels=len(tag2idx))
bert = BertModel(config)
model = BERT_CRF(bert, len(tag2idx))

In [17]:
if torch.cuda.device_count()>1:
    model = nn.DataParallel(model, device_ids=[0,1,2,3])
model = model.to(device)

In [18]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,        
    lr=3e-5,
    eps=1e-8
)

In [19]:
from transformers import get_linear_schedule_with_warmup

In [20]:
epochs = 60
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    
    num_training_steps=total_steps
)

In [21]:
from sklearn.metrics import f1_score, accuracy_score
def filtered_label(preds, labels, except_ids):
    pred_flat = np.array(preds).flatten()
    labels_flat = labels.flatten()
    mask = []
    for i, x in enumerate(labels_flat):
        if x in except_ids:
            mask.append(i)
    pred_flat = np.delete(pred_flat, mask)
    labels_flat = np.delete(labels_flat, mask)
    return pred_flat, labels_flat

def flat_accuracy(preds, labels, except_ids):
    except_ids = [tag2idx[tag] for tag in except_ids]
    pred_flat, labels_flat = filtered_label(preds, labels, except_ids)
    return accuracy_score(labels_flat, pred_flat)

In [None]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []
loss_fct = nn.CrossEntropyLoss(ignore_index = tag2idx['[PAD]'])
eff_labels = list( set(tag2idx.keys()) - set(['[PAD]', '[CLS]', '[SEP]', 'O']) )
for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    
    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_mask_id, b_type_id, b_labels_id = batch
        
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        #"""       
        # add batch to gpu
        #print(batch[0].size())
        #print(batch[1].size())
        #print(batch[2].size())
        #print(batch[3].size())
        #print(batch[0][0])
        #print('len', len(batch[0]))
        #batch = tuple(t.to(device) for t in batch)

        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        #print('b_input', b_input_ids[0])
        #print('type', b_types[0])
        #print('mask', b_mask_id[0])
        #print('label', b_labels_id[0])
        n_batch, len_sent = b_input_ids.size()
        logits = model(b_input_ids, b_mask_id, b_type_id)
                        #attention_mask=b_mask_id, labels=b_labels_id)
        #loss = loss_fct(logits.view(n_batch*len_sent, -1), b_labels_id.view(n_batch*len_sent))
        loss = model.neg_log_likelihood(b_input_ids, b_mask_id, 
                                        b_type_id, b_labels_id)
        # get the loss
        #print('output', len(outputs))
        #print('logit size', logits.size())
        
        
        #loss = loss.mean()
        #print('loss', loss)
        #"""
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()    
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        #scheduler.step()
        
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    
    
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    
    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_mask_id, b_type_id, b_labels_id = batch
        n_batch, len_sent = b_input_ids.size()
        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            #outputs = model(b_input_ids, token_type_ids=b_types,
            #outputs = model(b_input_ids, token_type_ids=None,
            #                attention_mask=b_mask_id, labels=b_labels_id)
            outputs = model(b_input_ids, b_mask_id, b_type_id)
            #print(outputs[0])
            #print(b_mask_id[0])
            #output_tags = model.crf.decode(outputs, b_mask_id.to(dtype=torch.uint8))
            output_tags = model.crf.decode(outputs)
            #print('0', len(output_tags))
            #print('1',len(output_tags[0]))
            #print()
            # Move logits and labels to CPU
            logits = outputs.detach().cpu().numpy()
            label_ids = b_labels_id.to('cpu').numpy()
        
            # Calculate the accuracy for this batch of test sentences.
            #eval_loss = loss_fct(outputs.view(n_batch*len_sent, -1), b_labels_id.view(n_batch*len_sent))
            loss = model.neg_log_likelihood(b_input_ids, b_mask_id, 
                                        b_type_id, b_labels_id)
            #print(loss.size())
            loss = loss.detach().cpu().numpy()
        eval_loss += loss
        eval_accuracy += flat_accuracy(output_tags, label_ids, ['[PAD]', '[CLS]', '[SEP]', 'O'])
        #predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        predictions.extend(output_tags)
        true_labels.append(label_ids)
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    
    eval_loss = eval_loss / nb_eval_steps
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tag_values[p_i] for p in predictions for p_i in p]
    valid_tags = [tag_values[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags,
                                                    average='macro', labels=eff_labels)))

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Average train loss: 40.44663400650025


Epoch:   2%|▏         | 1/60 [02:17<2:15:36, 137.90s/it]

Validation loss: 29.82383632659912
Validation Accuracy: 0.28015373229117563
Validation F1-Score: 0.2996547854616675


In [102]:
    # ========================================
    #               Test
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    
    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader2:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_mask_id, b_type_id, b_labels_id = batch
        n_batch, len_sent = b_input_ids.size()
        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            #outputs = model(b_input_ids, token_type_ids=b_types,
            #outputs = model(b_input_ids, token_type_ids=None,
            #                attention_mask=b_mask_id, labels=b_labels_id)
            outputs = model(b_input_ids, b_mask_id, b_type_id)
            #print(outputs[0])
            #print(b_mask_id[0])
            #output_tags = model.crf.decode(outputs, b_mask_id.to(dtype=torch.uint8))
            output_tags = model.crf.decode(outputs)
            #print('0', len(output_tags))
            #print('1',len(output_tags[0]))
            #print()
            # Move logits and labels to CPU
            logits = outputs.detach().cpu().numpy()
            label_ids = b_labels_id.to('cpu').numpy()
        
            # Calculate the accuracy for this batch of test sentences.
            #eval_loss = loss_fct(outputs.view(n_batch*len_sent, -1), b_labels_id.view(n_batch*len_sent))
            loss = model.neg_log_likelihood(b_input_ids, b_mask_id, 
                                        b_type_id, b_labels_id)
            #print(loss.size())
            loss = loss.detach().cpu().numpy()
        eval_loss += loss
        eval_accuracy += flat_accuracy(output_tags, label_ids, ['[PAD]', '[CLS]', '[SEP]', 'O'])
        #predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        predictions.extend(output_tags)
        true_labels.append(label_ids)
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    
    eval_loss = eval_loss / nb_eval_steps
    validation_loss_values.append(eval_loss)
    print("Test loss: {}".format(eval_loss))
    print("Test Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tag_values[p_i] for p in predictions for p_i in p]
    valid_tags = [tag_values[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("Test F1-Score: {}".format(f1_score(valid_tags, pred_tags,
                                                    average='macro', labels=eff_labels)))

Test loss: 28.509410858154297
Test Accuracy: 0.6725352112676056
Test F1-Score: 0.5020433605243871


  average, "true nor predicted", 'F-score is', len(true_sum)


In [99]:
import sys, importlib
importlib.reload(sys.modules['bert_decode'])
from bert_decode import *

In [100]:
decoder = DecoderFromNamedEntitySequence(vectorizer.tokenizer, idx2tag)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0], device='cuda:0')

In [104]:
for batch in valid_dataloader2:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_mask_id, b_type_id, b_labels_id = batch
    #print(b_input_ids[0])

    outputs = model(b_input_ids, b_mask_id, b_type_id)
    output_tags = model.crf.decode(outputs)
        
    n_batch = outputs.size(0)
    for ii in range(n_batch):
        in_ids = data_test2['input_ids'][ii].numpy()
        tag_ids = data_test2['label_ids'][ii].numpy()
        tag_ids2 = output_tags[ii]
        #print(tag_ids)
        #print(tag_ids2)
        ret = decoder(in_ids, tag_ids)
        ret2 = decoder(in_ids, tag_ids2)
        print(ret[-1])
        print(ret2[-1])
        print()
    break

 <삼성전자가:ORG> 디스플레이 관련 일회성 수익에 힘입어 시장 예상을 큰 폭 뛰어넘는 실적을 냈다 .
 <삼성전자가:ORG> 디스플레이 관련 일회성 수익에 힘입어 시장 예상을 큰 폭 뛰어넘는 실적을 냈다 .

 <삼성전자는:ORG> 올해 <2분기:NOH> 연결 기준 영업이익이 전년 동기 대비 <22 . 73 %:PNT> 증가한 <8조1천억원으로:MNY> 잠정 집계됐다고 <8일:DAT> 공시했다 .
 <삼성전자는:ORG> 올해 <2분기:NOH> 연결 기준 영업이익이 전년 동기 대비 <22 . 73 %:PNT> 증가한 <8조1천억원으로:MNY> 잠정 집계됐다고 <8일:DAT> 공시했다 .

 매출액은 <52조원으로:MNY> <7 . 36 %:PNT> 줄었다 .
 매출액은 <52조원으로:MNY> <7 . 36 %:PNT> 줄었다 .

 이런 실적은 시장 예상치를 크게 뛰어넘는 것이다 .
 이런 실적은 시장 예상치를 크게 뛰어넘는 것이다 .

 <연합인포맥스가:ORG> 최근 <1개월간:DUR> 실적 전망치를 발표한 <15개:NOH> 증권사를 대상으로 컨센서스를 실시한 결과 <삼성전자는:ORG> 올해 <2분기:NOH> <51조118억원의:MNY> 매출과 <6조5천384억원의:MNY> 영업이익을 거뒀을 것으로 관측
 <연합인포맥스가:ORG> 최근 <1개월간:NOH> 실적 전망치를 발표한 <15개:NOH> 증권사를 대상으로 컨센서스를 실시한 결과 <삼성전자는:ORG> 올해 <2분기:NOH> <51조118억원의:NOH> 매출과 <6조5천384억원의:MNY> 영업이익을 거뒀을 것으로 관측

 <삼성전자는:ORG> 올해 <2분기:NOH> 실적에 대해 " 디스플레이 관련 일회성 수익이 포함돼 있다 " 고 설명했다 .
 <삼성전자는:ORG> 올해 <2분기:NOH> 실적에 대해 " 디스플레이 관련 일회성 수익이 포함돼 있다 " 고 설명했다 .

 <삼성전자는:ORG> <2009년:DAT> <7월:DAT>부터 국내 기업 최초로 분기실적 예상치를 공시하고 있다 .
 <삼성전자는:OR

In [40]:
in_ids = data_test['input_ids'][0].numpy()
tag_ids = data_test['label_ids'][0].numpy()
decoder(in_ids, tag_ids)

([{'word': ' SBS', 'tag': 'ORG', 'prob': None},
  {'word': ' 이준실', 'tag': 'PER', 'prob': None},
  {'word': ' 4일', 'tag': 'DAT', 'prob': None},
  {'word': ' 스포츠서울 김영주골프 여자오픈부터', 'tag': 'POH', 'prob': None},
  {'word': ' 5', 'tag': 'NOH', 'prob': None},
  {'word': ' 6개', 'tag': 'NOH', 'prob': None},
  {'word': ' 9개', 'tag': 'NOH', 'prob': None}],
 ' [CLS] <SBS:ORG> 골프채널 <이준실:PER> 본부장은 <4일:DAT> [UNK] 여자 개막전인 <스포츠서울 김영주골프 여자오픈부터:POH> 디지털 고화질 ( HD ) 방송을 시작할 예정이며 중계 홀을 종전 <5:NOH> ~ <6개:NOH> 홀에서 <9개:NOH> 홀로 늘릴 것 [UNK] 이라고 밝혔다 . [SEP] [PAD] [PAD] [PAD] [PAD]')

array([   101,  21266,   8892,  28396, 119253,  49881,   9638,  54867,
        31503,   9358,  14646,  63671,  43494,    100,  62592,   8857,
       118907,  16617,  12030,   9477,  90578,  12424,  78123,   8935,
        30858,  16323, 118641,  28396,  62592,  28188, 119411,  17655,
       108266,   8888,  18227,  48599,    113,  18987,    114,  64002,
        10622,   9485,  38709,  14843,   9576,  98489,  21406,   9694,
        21611,   9988,  10622,   9684,  16617,    126,    198,    127,
        21789,   9988,  11489,    130,  21789,   9988,  11261,   9044,
        85836,   8870,    100,  56244,  99896,    119,    102,      0,
            0,      0,      0])

In [89]:
test_sentence = """
Mr. Trump’s tweets began just moments after a Fox News report by Mike Tobin, a 
reporter for the network, about protests in Minnesota and elsewhere. 
"""
test_sentence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
           "close to the Manhattan Bridge which is visible from the window."

In [90]:
tokenized_sentence = tokenizer.encode(test_sentence)
input_ids = torch.tensor([tokenized_sentence]).to(device)

In [91]:
with torch.no_grad():
    output = model(input_ids)
label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

In [92]:
# join bpe split tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(tag_values[label_idx])
        new_tokens.append(token)

In [93]:
for token, label in zip(new_tokens, new_labels):
    print("{}\t{}".format(label, token))

O	[CLS]
B-org	hugging
I-org	face
I-org	inc
I-org	.
O	is
O	a
O	company
O	based
O	in
B-geo	new
I-geo	york
I-geo	city
O	.
O	its
O	headquarters
O	are
O	in
B-geo	dumbo
O	,
O	therefore
O	veryclose
O	to
O	the
B-geo	manhattan
I-geo	bridge
O	which
O	is
O	visible
O	from
O	the
O	window
O	.
O	[SEP]
