In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
#os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [2]:
from collections.abc import Iterable
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
from sklearn.model_selection import train_test_split
import pickle as pk

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.autograd import Variable
import torch.autograd as autograd
from torch.utils.data import TensorDataset, random_split

from transformers import BertTokenizer, BertConfig, BertForTokenClassification
from transformers import AutoModelWithLMHead, AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoModelForTokenClassification
from transformers import pipeline, AdamW
from torchcrf import CRF as tcrf
#from keras.preprocessing.sequence import pad_sequences

import gluonnlp as nlp
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import BertTokenizer, BertModel, BertConfig

In [3]:
if torch.cuda.device_count()>1:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
elif torch.cuda.device_count()>0:
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
else:
    device = 'cpu'

In [6]:
with open('data/kobert_data_train.pk', 'rb') as f1:
    sentences_train, labels_train = pk.load(f1)
with open('data/kobert_data_val.pk', 'rb') as f1:
    sentences_test, labels_test = pk.load(f1)
with open('data/kobert_data_test.pk', 'rb') as f1:
    sentences_test2, labels_test2 = pk.load(f1)    

In [7]:
for ii in range(0, 1):
    print(sentences_test2[ii])
    print(labels_test2[ii])

['▁삼성전자', '가', '▁', '디스플레이', '▁관련', '▁일', '회', '성', '▁수익', '에', '▁힘입어', '▁시장', '▁예상', '을', '▁큰', '▁폭', '▁뛰어넘', '는', '▁실적', '을', '▁냈다', '▁', '.']
['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [8]:
tagmap = {}
tag_values = set()
for tags_of_sent in labels_train + labels_test:
    for tag_of_word in tags_of_sent:
        tag1 = tag_of_word
        tagmap.setdefault(tag1, 0)
        tagmap[tag1] += 1
        
tag_values = list(set(tagmap))
tag_values = sorted(tag_values)
tag_values = tag_values + ["<PAD>", "<CLS>", "<SEP>"]
#tag_values = tag_values + ["<PAD>"]
tag2idx = {t: i for i, t in enumerate(tag_values)}
idx2tag = {i: t for i, t in enumerate(tag_values)}

In [9]:
idx2tag

{0: 'B-DAT',
 1: 'B-DUR',
 2: 'B-LOC',
 3: 'B-MNY',
 4: 'B-NOH',
 5: 'B-ORG',
 6: 'B-PER',
 7: 'B-PNT',
 8: 'B-POH',
 9: 'B-TIM',
 10: 'I-DAT',
 11: 'I-DUR',
 12: 'I-LOC',
 13: 'I-MNY',
 14: 'I-NOH',
 15: 'I-ORG',
 16: 'I-PER',
 17: 'I-PNT',
 18: 'I-POH',
 19: 'I-TIM',
 20: 'O',
 21: '<PAD>',
 22: '<CLS>',
 23: '<SEP>'}

In [10]:
MAX_LEN = 75
bs = 128

In [11]:
kobert, vocab = get_pytorch_kobert_model()

using cached model
using cached model


In [12]:
class Kobert_Vectorizer():
    def __init__(self, vocab, max_len=MAX_LEN):
        self.max_len = max_len
        self.vocab = vocab
        self.tok = get_tokenizer()
        self.tokenizer = nlp.data.BERTSPTokenizer(self.tok, vocab, lower=False)
        self.tok_transform = nlp.data.BERTSentenceTransform(
            self.tokenizer, max_seq_length=MAX_LEN, pad=True, pair=False)
        
    def __call__(self, sentences, labels):
        """
        sentences : list of list of words
        labels : list of list of labels
        """
        ret = {}
        ret['input'] = sentences
        ret['label'] = labels
        
        temp = [ self.vectorize_str(sent1) for sent1 in sentences ]
        ret['input_ids'] = [x[0] for x in temp]
        ret['attention_mask'] = [x[1] for x in temp]
        ret['type_id'] = [x[2] for x in temp]
        ret['label_ids'] = [[tag2idx[tag1] for tag1 in sent_tag] 
                            for sent_tag in self.label_pad(ret['label'])]
        ret['label_ids'] = torch.LongTensor(ret['label_ids'])
        ret['input_ids'] = torch.LongTensor(ret['input_ids'])
        ret['attention_mask'] = torch.LongTensor(ret['attention_mask'])
        ret['type_id'] = torch.LongTensor(ret['type_id'])
        
        return ret
        
    def label_pad(self, labels):
        ret = []
        for label1 in labels:
            if len(label1) < self.max_len-2:
                label2 = ['<CLS>'] + label1 + ['<SEP>'] + ['<PAD>']*(self.max_len-2-len(label1))
                #label2 = ['<PAD>'] + label1 + ['<PAD>']*(self.max_len-1-len(label1))
            else:
                label2 = ['<CLS>'] + label1[:self.max_len-2] + ['<SEP>']
                #label2 = ['<PAD>'] + label1[:self.max_len-2] + ['<PAD>']
            ret.append(label2)
        return ret
        
    def get_attention_mask(self, len_sent, valid_length):
        attention_mask = np.zeros(len_sent)
        attention_mask[:valid_length] = 1
        return attention_mask
    
    def vectorize_str(self, list_of_words):
        input_ids = [ self.vocab.token_to_idx[ix] for ix in list_of_words ]
        if len(input_ids) < self.max_len-2:
            length = len(input_ids) + 2
            input_ids = [2] + input_ids + [3] + [1]*(self.max_len-2-len(input_ids))
        else:
            length = self.max_len
            input_ids = [2] + input_ids[:self.max_len-2] + [3]
        #print('input', input_ids)
        #print('length', length)
        attention_mask = self.get_attention_mask(len(input_ids), length)
        type_id = [0] * self.max_len
        #print('mask', attention_mask)
        return input_ids, attention_mask, type_id

In [13]:
vectorizer = Kobert_Vectorizer(vocab)
data_train = vectorizer(sentences_train, labels_train)
data_test = vectorizer(sentences_test, labels_test)
data_test2 = vectorizer(sentences_test2, labels_test2)

using cached model


In [14]:
ii = 6
#for key, val in data_train.items():
#    print(key, len(val[ii]), val[ii])
ii = 3
for key, val in data_test.items():
    print(key, len(val[ii]), val[ii])

input 20 ['▁', '●', '봉', '황', '기', '▁전국', '대회', '▁(', '▁오전', '9', '시', '▁', '·', '▁창원', '종합', '사', '격', '장', '▁', ')']
label 20 ['O', 'O', 'B-POH', 'I-POH', 'I-POH', 'I-POH', 'I-POH', 'O', 'B-TIM', 'I-TIM', 'I-TIM', 'O', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'I-LOC', 'O', 'O']
input_ids 75 tensor([   2,  517,    0, 6392, 7951, 5561, 4014, 5829,  522, 3431,  234, 6705,
         517,  478, 4441, 7270, 6493, 5412, 7178,  517,   40,    3,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1])
attention_mask 75 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0

In [15]:
dataset_train = TensorDataset(data_train['input_ids'], 
                              data_train['attention_mask'],
                              data_train['type_id'],
                              data_train['label_ids'])
dataset_test = TensorDataset(data_test['input_ids'], 
                              data_test['attention_mask'],
                              data_test['type_id'],
                              data_test['label_ids'])
dataset_test2 = TensorDataset(data_test2['input_ids'], 
                              data_test2['attention_mask'],
                              data_test2['type_id'],
                              data_test2['label_ids'])

train_sampler = RandomSampler(dataset_train)
valid_sampler = SequentialSampler(dataset_test)
valid_sampler2 = SequentialSampler(dataset_test2)

train_dataloader = DataLoader(dataset_train, sampler=train_sampler, batch_size=bs)
valid_dataloader = DataLoader(dataset_test, sampler=valid_sampler, batch_size=bs)
valid_dataloader2 = DataLoader(dataset_test2, sampler=valid_sampler2, batch_size=bs)

In [16]:
"""
input_ids = [ encoded1['input_ids'] for encoded1 in encoded ]
attention_masks = [ encoded1['attention_mask'] for encoded1 in encoded ]
token_type_ids = [ encoded1['token_type_ids'] for encoded1 in encoded ]
"""

"\ninput_ids = [ encoded1['input_ids'] for encoded1 in encoded ]\nattention_masks = [ encoded1['attention_mask'] for encoded1 in encoded ]\ntoken_type_ids = [ encoded1['token_type_ids'] for encoded1 in encoded ]\n"

In [17]:
class KoBERT_CRF(nn.Module):
    def __init__(self, kobert, n_tag):
        super().__init__()
        hidden_size = 768
        dropout_prob = 0.1
        self.kobert = kobert
        self.dropout = nn.Dropout(dropout_prob)
        self.ff = nn.Linear(hidden_size, n_tag)
        self.crf = tcrf(len(tag2idx), batch_first=True)
    
    def forward(self, input_ids, attention_mask, type_ids):
        """
        hidden : [batch, len_seq, dim_hidden]
        """
        hidden, _, _ = self.kobert(input_ids, attention_mask, type_ids)
        hidden = self.dropout(hidden)
        emissions = self.ff(hidden)
        return emissions
    
    def neg_log_likelihood(self, input_ids, attention_mask, type_ids, label_ids):
        emissions = self.forward(input_ids, attention_mask, type_ids)
        
        loss = -1*self.crf.forward(emissions, label_ids, 
                                    mask=attention_mask.to(dtype=torch.uint8),
                                    reduction='mean')
        return loss

In [18]:
model = KoBERT_CRF(kobert, len(tag2idx))

In [19]:
if torch.cuda.device_count()>1:
    model = nn.DataParallel(model, device_ids=[0,1,2,3])
model = model.to(device)

In [20]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,        
    lr=3e-5,
    eps=1e-8
)

In [21]:
from transformers import get_linear_schedule_with_warmup

In [22]:
epochs = 60
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    
    num_training_steps=total_steps
)

In [23]:
#from seqeval.metrics import f1_score
from sklearn.metrics import f1_score, accuracy_score
def filtered_label(preds, labels, except_ids):
    #pred_flat = np.argmax(preds, axis=2).flatten()
    pred_flat = np.array(preds).flatten()
    #print('pred_flat', pred_flat)
    labels_flat = labels.flatten()
    mask = []
    for i, x in enumerate(labels_flat):
        if x in except_ids:
            mask.append(i)
    #mask = np.where(labels_flat==except_ids)
    pred_flat = np.delete(pred_flat, mask)
    #print('pred_flat2', pred_flat)
    labels_flat = np.delete(labels_flat, mask)
    return pred_flat, labels_flat

def flat_accuracy(preds, labels, except_ids):
    except_ids = [tag2idx[tag] for tag in except_ids]
    pred_flat, labels_flat = filtered_label(preds, labels, except_ids)
    return accuracy_score(labels_flat, pred_flat)
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
"""
def flat_accuracy(preds, labeltttts):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
"""    

'\ndef flat_accuracy(preds, labeltttts):\n    pred_flat = np.argmax(preds, axis=2).flatten()\n    labels_flat = labels.flatten()\n    return np.sum(pred_flat == labels_flat) / len(labels_flat)\n'

In [None]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []
loss_fct = nn.CrossEntropyLoss(ignore_index = tag2idx['<PAD>'])
#eff_labels = list( set(tag2idx.keys()) - set(['<PAD>', '<CLS>', '<SEP>', 'O']) )
eff_labels = list( set(tag2idx.keys()) - set(['<PAD>', '<CLS>', '<SEP>', 'O']) )
for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    
    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_mask_id, b_type_id, b_labels_id = batch
        
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        #"""       
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        n_batch, len_sent = b_input_ids.size()
        logits = model(b_input_ids, b_mask_id, b_type_id)
                        #attention_mask=b_mask_id, labels=b_labels_id)
        #loss = loss_fct(logits.view(n_batch*len_sent, -1), b_labels_id.view(n_batch*len_sent))
        loss = model.neg_log_likelihood(b_input_ids, b_mask_id, 
                                        b_type_id, b_labels_id)
        # get the loss
        
        #loss = loss.mean()
        #print('loss', loss)
        #"""
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()    
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()
        
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    
    
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    
    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_mask_id, b_type_id, b_labels_id = batch
        n_batch, len_sent = b_input_ids.size()
        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            #outputs = model(b_input_ids, token_type_ids=b_types,
            #outputs = model(b_input_ids, token_type_ids=None,
            #                attention_mask=b_mask_id, labels=b_labels_id)
            outputs = model(b_input_ids, b_mask_id, b_type_id)
            #output_tags = model.crf.decode(outputs, b_mask_id.to(dtype=torch.uint8))
            output_tags = model.crf.decode(outputs)
            
            # Move logits and labels to CPU
            #logits = outputs.detach().cpu().numpy()
            label_ids = b_labels_id.to('cpu').numpy()
        
            # Calculate the accuracy for this batch of test sentences.
            #eval_loss = loss_fct(outputs.view(n_batch*len_sent, -1), b_labels_id.view(n_batch*len_sent))
            loss = model.neg_log_likelihood(b_input_ids, b_mask_id, 
                                        b_type_id, b_labels_id)
            #print(loss.size())
            loss = loss.detach().cpu().numpy()
        eval_loss += loss
        #eval_accuracy += flat_accuracy(logits, label_ids, len(tag2idx)-1)
        #eval_accuracy += flat_accuracy(logits, label_ids, ['<PAD>', '<CLS>', '<SEP>', 'O'])
        eval_accuracy += flat_accuracy(output_tags, label_ids, ['<PAD>', '<CLS>', '<SEP>', 'O'])
        #eval_accuracy += flat_accuracy(logits, label_ids, ['<PAD>', '<CLS>', '<SEP>'])
        #print('logits', logits[0][:10])
        #print('output', output_tags[0][:10])
        #print('label', label_ids[0][:10])
        #predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        predictions.extend(output_tags)
        true_labels.append(label_ids)
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    
    eval_loss = eval_loss / nb_eval_steps
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tag_values[p_i] for p in predictions for p_i in p]
    valid_tags = [tag_values[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("Validation F1-Score: {}".format(f1_score(valid_tags, pred_tags,
                                                    average='macro', labels=eff_labels)))

Epoch:   0%|          | 0/60 [00:00<?, ?it/s]

Average train loss: 27.294172938664754


Epoch:   2%|▏         | 1/60 [02:21<2:18:59, 141.34s/it]

Validation loss: 8.19860303401947
Validation Accuracy: 0.8656259896384498
Validation F1-Score: 0.6746000721110412


In [71]:
# ========================================
    #               Test
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    
    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader2:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_mask_id, b_type_id, b_labels_id = batch
        n_batch, len_sent = b_input_ids.size()
        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            #outputs = model(b_input_ids, token_type_ids=b_types,
            #outputs = model(b_input_ids, token_type_ids=None,
            #                attention_mask=b_mask_id, labels=b_labels_id)
            outputs = model(b_input_ids, b_mask_id, b_type_id)
            #output_tags = model.crf.decode(outputs, b_mask_id.to(dtype=torch.uint8))
            output_tags = model.crf.decode(outputs)
            
            # Move logits and labels to CPU
            #logits = outputs.detach().cpu().numpy()
            label_ids = b_labels_id.to('cpu').numpy()
        
            # Calculate the accuracy for this batch of test sentences.
            #eval_loss = loss_fct(outputs.view(n_batch*len_sent, -1), b_labels_id.view(n_batch*len_sent))
            loss = model.neg_log_likelihood(b_input_ids, b_mask_id, 
                                        b_type_id, b_labels_id)
            #print(loss.size())
            loss = loss.detach().cpu().numpy()
        eval_loss += loss
        #eval_accuracy += flat_accuracy(logits, label_ids, len(tag2idx)-1)
        #eval_accuracy += flat_accuracy(logits, label_ids, ['<PAD>', '<CLS>', '<SEP>', 'O'])
        eval_accuracy += flat_accuracy(output_tags, label_ids, ['<PAD>', '<CLS>', '<SEP>', 'O'])
        #eval_accuracy += flat_accuracy(logits, label_ids, ['<PAD>', '<CLS>', '<SEP>'])
        #print('logits', logits[0][:10])
        #print('output', output_tags[0][:10])
        #print('label', label_ids[0][:10])
        #predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        predictions.extend(output_tags)
        true_labels.append(label_ids)
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    
    eval_loss = eval_loss / nb_eval_steps
    validation_loss_values.append(eval_loss)
    print("Test loss: {}".format(eval_loss))
    print("Test Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tag_values[p_i] for p in predictions for p_i in p]
    valid_tags = [tag_values[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("Test F1-Score: {}".format(f1_score(valid_tags, pred_tags,
                                                    average='macro', labels=eff_labels)))

Test loss: 16.226093292236328
Test Accuracy: 0.7776261937244202
Test F1-Score: 0.6320725221374415


  average, "true nor predicted", 'F-score is', len(true_sum)


In [88]:
import sys, importlib
importlib.reload(sys.modules['kobert_decode'])
from kobert_decode import *

In [89]:
decoder = DecoderFromNamedEntitySequence(vectorizer.tokenizer, idx2tag)

In [91]:
data_test2['label_ids'][0]

tensor([22,  5, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
        21, 21, 21])

In [93]:
for batch in valid_dataloader2:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_mask_id, b_type_id, b_labels_id = batch

    outputs = model(b_input_ids, b_mask_id, b_type_id)
    output_tags = model.crf.decode(outputs)
        
    n_batch = outputs.size(0)
    for ii in range(n_batch):
        in_ids = data_test2['input_ids'][ii].numpy()
        tag_ids = data_test2['label_ids'][ii].numpy()
        tag_ids2 = output_tags[ii]
        #print(tag_ids)
        #print(tag_ids2)
        ret = decoder(in_ids, tag_ids)
        ret2 = decoder(in_ids, tag_ids2)
        print(ret[-1])
        print(ret2[-1])
        #print('------------------')
        print()
    break

 <삼성전자:ORG>가 디스플레이 관련 일회성 수익에 힘입어 시장 예상을 큰 폭 뛰어넘는 실적을 냈다 .
 <삼성전자:ORG>가 디스플레이 관련 일회성 수익에 힘입어 시장 예상을 큰 폭 뛰어넘는 실적을 냈다 .

 <삼성전자는:ORG> 올해 <2분기:NOH> 연결 기준 영업이익이 전년 동기 대비 <22 . 73 %:PNT> 증가한 <8조1천억원:MNY>으로 잠정 집계됐다고 <8일:DAT> 공시했다 .
 <삼성전자는:ORG> 올해 <2분기:NOH> 연결 기준 영업이익이 전년 동기 대비 <22 . 73 %:PNT> 증가한 <8조1천억원:MNY>으로 잠정 집계됐다고 <8일:DAT> 공시했다 .

 매출액은 <52조원:MNY>으로 <7 . 36 %:PNT> 줄었다 .
 매출액은 <52조원:MNY>으로 <7 . 36 %:PNT> 줄었다 .

 이런 실적은 시장 예상치를 크게 뛰어넘는 것이다 .
 이런 실적은 시장 예상치를 크게 뛰어넘는 것이다 .

 <연합인포맥스가:ORG> 최근 <1개월간:DUR> 실적 전망치를 발표한 <15개:NOH> 증권사를 대상으로 컨센서스를 실시한 결과 <삼성전자는:ORG> 올해 <2분기:NOH> <51조118억원의:MNY> 매출과 <6조5천384억원의:MNY> 영업이익을 거뒀을 것으로 관측됐다 .
 <연합인포맥스가:ORG> 최근 <1개월간:DUR> 실적 전망치를 발표한 <15개:NOH> 증권사를 대상으로 컨센서스를 실시한 결과 <삼성전자는:ORG> 올해 <2분기:NOH> <51조118억원의:MNY> 매출과 <6조5천384억원의:MNY> 영업이익을 거뒀을 것으로 관측됐다 .

 <삼성전자는:ORG> 올해 <2분기:NOH> 실적에 대해 " 디스플레이 관련 일회성 수익이 포함돼 있다 " 고 설명했다 .
 <삼성전자는:ORG> 올해 <2분기:NOH> 실적에 대해 " 디스플레이 관련 일회성 수익이 포함돼 있다 " 고 설명했다 .

 <삼성전자는:ORG> <2009년:DAT> <7월부터:DAT> 국내 기업 최초로 분기실적 예상치를 공시하고 있다 .
 <

In [54]:
in_ids.numpy()

array([   2,  688, 1019, 7407, 3745, 6738,  517, 6385, 7086,  605, 7126,
        714, 3318,  838, 7207, 7119, 2945, 6556, 1326, 7276, 5448, 3318,
       6969, 6410, 1832,  993, 7941, 7350,  522,  517,  295,  517,   40,
       2272, 7088, 2986, 7836, 3413, 7108, 4257, 5436, 5103, 7088, 4197,
       7207,  611,  517,  463,  617, 5357,  517, 7934,  627, 5357, 5103,
       6079, 1550, 6135,  905,  517,  502,  517, 7102, 2261,  517,   54,
          3,    1,    1,    1,    1,    1,    1,    1,    1])

In [44]:
ii = 0
print(data_test.keys())
print(data_test['input_ids'][ii])
print(data_test['label_ids'][ii])

dict_keys(['input', 'label', 'input_ids', 'attention_mask', 'type_id', 'label_ids'])
tensor([   2,  688, 1019, 7407, 3745, 6738,  517, 6385, 7086,  605, 7126,  714,
        3318,  838, 7207, 7119, 2945, 6556, 1326, 7276, 5448, 3318, 6969, 6410,
        1832,  993, 7941, 7350,  522,  517,  295,  517,   40, 2272, 7088, 2986,
        7836, 3413, 7108, 4257, 5436, 5103, 7088, 4197, 7207,  611,  517,  463,
         617, 5357,  517, 7934,  627, 5357, 5103, 6079, 1550, 6135,  905,  517,
         502,  517, 7102, 2261,  517,   54,    3,    1,    1,    1,    1,    1,
           1,    1,    1])
tensor([22,  5, 20, 20,  6, 16, 20, 20, 20,  0, 10, 20, 20, 20, 20, 20,  8, 18,
        18, 18, 18, 18, 18, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
        20, 20, 20, 20, 20, 20, 20, 20, 20,  4, 20, 20,  4, 14, 20, 20,  4, 14,
        20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 23, 21, 21, 21, 21, 21,
        21, 21, 21])


In [None]:
def get_max_ix(arr):
    arr = list(filter(lambda x: x[-1] in 'BI', arr))
    if len(arr) < 1:
        return 0
    elm_dic = {}
    for i, x in enumerate(arr):
        elm_dic.setdefault(i, 0)
        elm_dic[i] += 1
    counts = list(elm_dic.values())
    keys = list(elm_dic.keys())
    max_ix = counts.index(max(counts))
    return keys[max_ix]
    
def find_span_location(raw_sent, n_max):
    span_dic = {}
    count = 0
    for i, word in enumerate(raw_sent):
        tokened_word = vectorizer.tokenizer(word)
        n_subword = len(tokened_word)
        span_dic[i] = list(range(count, count+n_subword))
        count += n_subword
        if count >= n_max: break
    return span_dic

def post_process(raw_sent, label_o):
    n_max = len(label_o)
    span_dic = find_span_location(raw_sent, n_max)
    label_o2 = [None]*len(span_dic)
    
    for ix, ixs in span_dic.items():
        label_o2[ix] = []
        for i in ixs:
            if i >= n_max:
                break
            #print('i', i, 'ix', ix)
            label_o2[ix].append(label_o[i])
    final_label = []
    for wordset in label_o2:
        max_ix = get_max_ix(wordset)
        final_label.append(wordset[max_ix])
                
    return final_label, label_o2

ii = 34
sent1 = sentences_test[ii]
sent2 = data_test['input'][ii]
label_o = data_test['label'][ii]

label_dic = find_span_location(sent1, 75)
print(sent1)
print(label_dic)
print()
print(sent2)
print(label_o)
print()

final_label, label2 = post_process(sent1, label_o)
print(label2)
print(final_label)
print()
print(data_train['label_ids'][ii].detach().to('cpu').numpy())

In [None]:
"""
b_input_ids : padded and tokened
b_labels_id : padded and tokened
pred_label : padded and tokened
label_o : tokened <-- need post processing

"""
def size_limit(seq):
    if len(seq) >= MAX_LEN-2:
        return seq[:MAX_LEN-2]
    else:
        return seq

test_dataloader = DataLoader(dataset_test, sampler=valid_sampler, batch_size=1)
all_preds, all_labels = [], []

for i, batch in enumerate(test_dataloader):
    #if i>300: break
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_mask_id, b_type_id, b_labels_id = batch

    label_o = data_test['label'][i]
    label_o = size_limit(label_o)
    
    sent = sentences_test[i]
    final_label, _ = post_process(sent, label_o)
    all_labels += final_label
    
    
    #print('i', i)
    
    #print('label', final_label)
    len_label = min([len(label_o), MAX_LEN-2])
    pred_id = model(b_input_ids, b_mask_id, b_type_id)
    #pred = pred_id.detach().cpu().numpy()[0, 1:1+len_label]
    #print(i,len_label, len(label_o), len(pred_label), len(sent))
    #pred = np.argmax(pred, axis=-1)
    pred_id = model.crf.decode(pred_id)
    pred = pred_id[0][1:1+len_label]
    pred_tag = [ idx2tag[idx] for idx in pred ]
    #print('pred', pred_label_tag)
    #print(pred_label_tag.shape)

    final_pred_tag, _ = post_process(sent, pred_tag)
    all_preds += final_pred_tag
    
    if len(final_label) != len(final_pred_tag):
        print('i: ', i, )
        print(i,len_label, len(label_o), len(pred_tag), len(sent))
        print('final_label', len(final_label), 'final_pred', len(final_pred_tag))

print('f1_score', 
      f1_score(all_labels, all_preds, average='macro', labels=eff_labels))

In [16]:
data_train.keys()

dict_keys(['input', 'label', 'input_ids', 'attention_mask', 'type_id', 'label_ids'])

In [89]:
test_sentence = """
Mr. Trump’s tweets began just moments after a Fox News report by Mike Tobin, a 
reporter for the network, about protests in Minnesota and elsewhere. 
"""
test_sentence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
           "close to the Manhattan Bridge which is visible from the window."

In [90]:
tokenized_sentence = tokenizer.encode(test_sentence)
input_ids = torch.tensor([tokenized_sentence]).to(device)

In [91]:
with torch.no_grad():
    output = model(input_ids)
label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

In [92]:
# join bpe split tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(tag_values[label_idx])
        new_tokens.append(token)

In [93]:
for token, label in zip(new_tokens, new_labels):
    print("{}\t{}".format(label, token))

O	[CLS]
B-org	hugging
I-org	face
I-org	inc
I-org	.
O	is
O	a
O	company
O	based
O	in
B-geo	new
I-geo	york
I-geo	city
O	.
O	its
O	headquarters
O	are
O	in
B-geo	dumbo
O	,
O	therefore
O	veryclose
O	to
O	the
B-geo	manhattan
I-geo	bridge
O	which
O	is
O	visible
O	from
O	the
O	window
O	.
O	[SEP]
