In [1]:
# %load example.py
from __future__ import absolute_import, division, print_function, unicode_literals
import json
import pickle
import torch

from gluonnlp.data import SentencepieceTokenizer
from model.net import KobertCRF
from data_utils.utils import Config
from data_utils.vocab_tokenizer import Tokenizer
from data_utils.pad_sequence import keras_pad_fn
from pathlib import Path

  warn('"Twitter" has changed to "Okt" since KoNLPy v0.4.5.')


In [2]:
# Experiment directory that holds config.json, vocab.pkl and ner_to_index.json.
model_dir = Path('./experiments/base_model_with_crf')
model_config = Config(json_path=model_dir.joinpath('config.json'))

# SentencePiece model; the resulting tokenizer is handed to Tokenizer as split_fn.
tok_path = "./ptr_lm_model/tokenizer_78b3253a26.model"
ptr_tokenizer = SentencepieceTokenizer(tok_path)

# NOTE: pickle.load can execute arbitrary code - only load a vocab file you trust.
with (model_dir / "vocab.pkl").open('rb') as vocab_file:
    vocab = pickle.load(vocab_file)

tokenizer = Tokenizer(
    vocab=vocab,
    split_fn=ptr_tokenizer,
    pad_fn=keras_pad_fn,
    maxlen=model_config.maxlen,
)

# NER tag -> index mapping, plus the inverse used to turn predictions back into tags.
with (model_dir / "ner_to_index.json").open('rb') as mapping_file:
    ner_to_index = json.load(mapping_file)

index_to_ner = {idx: tag for tag, idx in ner_to_index.items()}

# Build the tagger with one output class per entry of ner_to_index.
model = KobertCRF(config=model_config, num_classes=len(ner_to_index), vocab=vocab)

# Read the checkpoint onto the CPU; the model is moved to its final device below.
model_dict = model.state_dict()
checkpoint = torch.load(
    "./checkpoints/base_model_with_crf/best-epoch-16-step-1500-acc-0.993.bin",
    map_location=torch.device('cpu'),
)
# Alternative checkpoint:
# checkpoint = torch.load("./experiments/base_model_with_crf_val/best-epoch-12-step-1000-acc-0.960.bin", map_location=torch.device('cpu'))

# Remove every "module." fragment from the saved parameter names and keep only
# the entries that exist in the freshly built model; anything else is reported
# and skipped.
# NOTE(review): presumably "module." is a DataParallel prefix - confirm against
# the training script. str.replace also removes occurrences in the middle of a name.
convert_keys = {}
for saved_name, weights in checkpoint['model_state_dict'].items():
    stripped_name = saved_name.replace("module.", '')
    if stripped_name in model_dict:
        convert_keys[stripped_name] = weights
    else:
        print("{} is not int model_dict".format(stripped_name))

model.load_state_dict(convert_keys)
model.eval()  # evaluation mode for inference

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Sample Korean sentence to tag.
input_text = "전 세계 최고의 기대작 <어벤져스> 시리즈의 압도적 대미를 장식할 <어벤져스: 엔드게임>이 지난 4월 14일(일)과 15일(월) 양일간 진행된 대한민국 내한 행사를 성공적으로 마무리 지었다. <어벤져스: 엔드게임>의 주역 로버트 다우니 주니어, 제레미 레너, 브리 라슨, 안소니 루소&조 루소 감독, 트린 트랜 프로듀서, 케빈 파이기 마블 스튜디오 대표까지 방문하여 특별한 대한민국 사랑을 뽐냈다."
# To tag text typed interactively instead, use:
# input_text = input('input> ')

# Turn the single input string into token ids and move them to the model's device.
list_of_input_ids = tokenizer.list_of_string_to_list_of_cls_sep_token_ids([input_text])
x_input = torch.tensor(list_of_input_ids).long().to(device)

# Inference only: disable autograd so the forward pass does not record a
# computation graph (saves memory and time; predictions are unchanged).
with torch.no_grad():
    raw_predictions = model(x_input)

# Keep the first element of each entry returned by the model.
list_of_pred_ids = [entry[0] for entry in raw_predictions]

In [3]:
import pandas as pd

# Decode the ids back to tokens and translate predicted ids into NER tag names.
input_tokens = tokenizer.decode_token_ids(list_of_input_ids)[0]
pred_ner_tag = [index_to_ner[tag_id] for tag_id in list_of_pred_ids]

# One row per view of the sentence, one column per token position.
row_labels = ['input tokens', 'input IDs', 'pred IDs', 'pred NER']
row_values = [input_tokens, list_of_input_ids[0], list_of_pred_ids, pred_ner_tag]
pd.DataFrame(row_values, index=row_labels)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,124,125,126,127,128,129,130,131,132,133
input tokens,[CLS],▁전,▁세계,▁최고의,▁기대,작,▁<,어,벤,져,...,▁대표,까지,▁방문,하여,▁특별한,▁대한민국,▁사랑을,▁뽐냈다,.,[SEP]
input IDs,2,4012,2802,4524,1267,7170,630,6855,6347,7245,...,1674,5592,2268,7815,4780,1683,2591,2567,54,3
pred IDs,0,4,4,4,4,4,4,5,6,6,...,4,4,4,1,4,19,4,4,4,1
pred NER,[CLS],O,O,O,O,O,O,B-POH,I-POH,I-POH,...,O,O,O,[SEP],O,B-ORG,O,O,O,[SEP]
