In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import itertools
import random
import json
from tqdm import tqdm
import numpy as np
import unicodedata

import torch
from torch.utils.data import DataLoader
from transformers import BertJapaneseTokenizer, BertForTokenClassification
import pytorch_lightning as pl

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Name of the pretrained checkpoint passed to every from_pretrained() call in this notebook.
MODEL_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"

In [4]:
# Representation of named-entity annotations: each entity records its surface
# string, its [start, end) character span inside `text`, and its type name/id.
text = "AさんはBCD株式会社を起業した。"
entities = [
    {"name": "A", "span": [0, 1], "type": "人名", "type_id": 1},
    # Each distinct entity type gets its own type_id.
    {"name": "BCD株式会社", "span": [4, 11], "type": "組織名", "type_id": 2},
]

In [24]:
class NER_tokenizer(BertJapaneseTokenizer):
    """BertJapaneseTokenizer extended with helpers for span-based NER.

    Labels are plain integer type ids attached to every token
    (0 means "not part of an entity"); no B-/I- prefixes are used.
    """

    def encode_plus_tagged(self, text, entities, max_length):
        """Encode ``text`` and build the matching per-token label sequence.

        Args:
            text: The sentence to encode.
            entities: Iterable of dicts, each holding a ``'span'``
                (``[start, end)`` character offsets into ``text``) and an
                integer ``'type_id'``.
            max_length: Length the encoding and the labels are padded or
                truncated to.

        Returns:
            The encoding returned by ``prepare_for_model`` with an additional
            ``'labels'`` entry of length ``max_length``.
        """
        # Cut the text at every entity boundary and remember the label of
        # each piece (0 for the text between entities).
        # Original author's note: this splitting scheme cannot cope with
        # entities that directly follow one another. (Presumably because two
        # adjacent entities of the same type produce one unbroken run of
        # identical labels -- TODO confirm.)
        entities = sorted(entities, key=lambda x: x['span'][0])
        splitted = []
        position = 0
        for entity in entities:
            start = entity['span'][0]
            end = entity["span"][1]
            label = entity['type_id']
            # Text preceding the entity: not an entity.
            splitted.append({'text': text[position:start], 'label': 0})
            # The entity itself carries its type id.
            splitted.append({'text': text[start:end], 'label': label})
            position = end
        splitted.append({'text': text[position:], 'label': 0})
        splitted = [s for s in splitted if s["text"]]  # remove empty strings

        # Tokenize every piece separately so that each produced token can
        # inherit the label of the piece it came from.
        tokens = []
        labels = []
        for piece in splitted:
            # Use a dedicated name instead of rebinding the `text` parameter.
            piece_tokens = self.tokenize(piece['text'])
            tokens.extend(piece_tokens)
            labels.extend([piece["label"]] * len(piece_tokens))

        input_ids = self.convert_tokens_to_ids(tokens)
        encoding = self.prepare_for_model(
            input_ids,
            max_length=max_length,
            padding='max_length',
            truncation=True
        )
        # Label 0 for the special tokens [CLS] and [SEP] ...
        labels = [0] + labels[:max_length-2] + [0]
        # ... and for the trailing [PAD] tokens.
        labels = labels + [0]*(max_length - len(labels))
        encoding['labels'] = labels
        return encoding

    def encode_plus_untagged(self, text, max_length=None, return_tensors=None):
        """Encode ``text`` and locate every token inside the original string.

        Args:
            text: The sentence to encode.
            max_length: If given, the encoding is padded/truncated to this
                length; otherwise no padding or truncation is requested.
            return_tensors: Pass ``'pt'`` to get the encoding as batched
                ``torch.Tensor`` values (batch size 1).

        Returns:
            A tuple ``(encoding, spans)``. ``spans`` has one ``[start, end]``
            entry per position of ``input_ids``; special and padding
            positions are marked with ``[-1, -1]``.

        Raises:
            ValueError: If a token cannot be found in ``text`` at or after
                the current position, so no span can be assigned to it.
        """
        # Tokenize and keep, for each token, the surface string it stands
        # for in the text.
        tokens = []
        tokens_original = []  # surface string corresponding to each token
        words = self.word_tokenizer.tokenize(text)  # split by MeCab
        for word in words:
            # Split the word into sub-words.
            tokens_word = self.subword_tokenizer.tokenize(word)
            if not tokens_word:
                # Nothing to align for this word; indexing [0] would fail.
                continue
            tokens.extend(tokens_word)
            if tokens_word[0] == "[UNK]":
                # An unknown word is aligned using the word itself.
                tokens_original.append(word)
            else:
                tokens_original.extend([token.replace("##", "") for token in tokens_word])

        # Find the position of each token in the text. Searching forward from
        # the previous match skips characters (such as spaces) that belong to
        # no token.
        position = 0
        spans = []
        for token in tokens_original:
            start = text.find(token, position)
            if start == -1:
                # The previous character-by-character scan never terminated
                # in this situation; fail loudly instead.
                raise ValueError(
                    f"token {token!r} not found in text at or after position "
                    f"{position}; the text may need the same normalization "
                    f"the tokenizer applies (e.g. NFKC)"
                )
            end = start + len(token)
            spans.append([start, end])
            position = end

        # Encode into the format expected by BERT.
        input_ids = self.convert_tokens_to_ids(tokens)
        encoding = self.prepare_for_model(
            input_ids,
            max_length=max_length,
            padding='max_length' if max_length else False,
            truncation=True if max_length else False
        )
        sequence_length = len(encoding['input_ids'])
        # Dummy span for the special token [CLS].
        spans = [[-1, -1]] + spans[:sequence_length-2]
        # Dummy spans for the special tokens [SEP] and [PAD].
        spans = spans + [[-1, -1]] * (sequence_length - len(spans))

        # Convert to torch.Tensor on request (adds a batch dimension of 1).
        if return_tensors == 'pt':
            encoding = {k: torch.tensor([v]) for k, v in encoding.items()}

        return encoding, spans

    def convert_bert_output_to_entities(self, text, labels, spans):
        """Turn predicted per-token labels into a list of entities.

        Args:
            text: The sentence the prediction was made for.
            labels: Predicted label id for every token position.
            spans: Token spans as returned by ``encode_plus_untagged``.

        Returns:
            A list of dicts with ``'name'``, ``'span'`` and ``'type_id'``,
            one per maximal run of consecutive tokens sharing the same
            non-zero label.
        """
        # Drop the positions that belong to special tokens.
        labels = [label for label, span in zip(labels, spans) if span[0] != -1]
        spans = [span for span in spans if span[0] != -1]

        # Merge consecutive tokens with the same label into one entity.
        entities = []
        for label, group in itertools.groupby(enumerate(labels), key=lambda x: x[1]):
            if label == 0:
                # Label 0 marks tokens outside any entity.
                continue
            group = list(group)
            start = spans[group[0][0]][0]
            end = spans[group[-1][0]][1]
            entities.append({
                "name": text[start:end],
                "span": [start, end],
                "type_id": label
            })

        return entities

In [6]:
# Load the custom tokenizer from the pretrained checkpoint. The class-mismatch
# warning printed below appears because the checkpoint declares
# 'BertJapaneseTokenizer' while the call is made on 'NER_tokenizer'.
tokenizer = NER_tokenizer.from_pretrained(MODEL_NAME)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'NER_tokenizer'.


In [7]:
# Tagged-encoding demo: a sentence with a single annotated entity.
text = "昨日のみらい事務所との打ち合わせは順調だった。"
entities = [dict(name="みらい事務所", span=[3, 9], type_id=1)]

# Encode to a fixed length of 20 positions and show the result.
encoding = tokenizer.encode_plus_tagged(text, entities, max_length=20)
print(encoding)

{'input_ids': [2, 10271, 28486, 5, 546, 10780, 2464, 13, 5, 1878, 2682, 9, 10750, 308, 10, 8, 3, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0], 'labels': [0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [8]:
# Untagged-encoding demo: the sentence contains Latin words separated by spaces.
text = "騰訊の英語名はTencent Holdings Ltdである。"
encoding, spans = tokenizer.encode_plus_untagged(text, return_tensors="pt")

# One item per output line: header, encoding, header, spans.
print("# encoding", encoding, "# spans", spans, sep="\n")

# encoding
{'input_ids': tensor([[    2,     1, 26280,     5,  1543,   125,     9,  6749, 28550,  2953,
         28550, 28566, 21202, 28683, 14050, 12475,    12,    31,     8,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
# spans
[[-1, -1], [0, 1], [1, 2], [2, 3], [3, 5], [5, 6], [6, 7], [7, 9], [9, 10], [10, 12], [12, 13], [13, 14], [15, 18], [18, 19], [19, 23], [24, 27], [27, 28], [28, 30], [30, 31], [-1, -1]]


In [9]:
# Hand-written labels, one per entry of `spans` (20 positions in total).
labels_predicted = [0, 1, 1] + [0] * 4 + [1] * 9 + [0] * 4
entities = tokenizer.convert_bert_output_to_entities(text, labels_predicted, spans)
print(entities)

label: 1, group: [(0, 1), (1, 1)]
label: 0, group: [(2, 0), (3, 0), (4, 0), (5, 0)]
label: 1, group: [(6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)]
label: 0, group: [(15, 0), (16, 0), (17, 0)]
[{'name': '騰訊', 'span': [0, 2], 'type_id': 1}, {'name': 'Tencent Holdings Ltd', 'span': [7, 27], 'type_id': 1}]


In [10]:
# Named-entity extraction with BERT: load the tokenizer and a
# token-classification model with 4 labels, then move the model to the GPU.
tokenizer = NER_tokenizer.from_pretrained(MODEL_NAME)
bert_tc = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=4).cuda()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'NER_tokenizer'.
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expecte

In [11]:
text = "AさんはB大学に入学した。"
encoding, spans = tokenizer.encode_plus_untagged(text, return_tensors="pt")
print(encoding, spans, sep="\n")

# Move every input tensor to the GPU before calling the model.
encoding = dict((key, tensor.cuda()) for key, tensor in encoding.items())

# Forward pass without gradient tracking; pick the best label per position.
with torch.no_grad():
    output = bert_tc(**encoding)
scores = output.logits
labels_predicted = scores[0].argmax(-1).cpu().numpy().tolist()

entities = tokenizer.convert_bert_output_to_entities(
    text, labels_predicted, spans
)
print(entities)


{'input_ids': tensor([[   2,  192, 2375,    9,  277,  396,    7, 2663,   15,   10,    8,    3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
[[-1, -1], [0, 1], [1, 3], [3, 4], [4, 5], [5, 7], [7, 8], [8, 10], [10, 11], [11, 12], [12, 13], [-1, -1]]
label: 0, group: [(0, 0), (1, 0)]
label: 3, group: [(2, 3), (3, 3)]
label: 0, group: [(4, 0)]
label: 1, group: [(5, 1)]
label: 0, group: [(6, 0)]
label: 1, group: [(7, 1), (8, 1), (9, 1)]
[{'name': 'はB', 'span': [3, 5], 'type_id': 3}, {'name': 'に', 'span': [7, 8], 'type_id': 1}, {'name': 'した。', 'span': [10, 13], 'type_id': 1}]


In [12]:
# Two hand-annotated sentences used to run a single batch through the model.
data = [
    {
        "text": "AさんはB大学に入学した。",
        "entities": [
            {"name": "A", "span": [0, 1], "type_id": 2},
            {"name": "B大学", "span": [4, 7], "type_id": 1},
        ],
    },
    {
        "text": "CDE株式会社は新製品「E」を販売する。",
        "entities": [
            {"name": "CDE株式会社", "span": [0, 7], "type_id": 1},
            {"name": "E", "span": [12, 13], "type_id": 3},
        ],
    },
]

# Encode every sample to the same length and convert its fields to tensors.
max_length = 32
dataset_for_loader = []
for sample in data:
    text, entities = sample["text"], sample["entities"]
    encoding = tokenizer.encode_plus_tagged(text, entities, max_length=max_length)
    encoding = dict((name, torch.tensor(values)) for name, values in encoding.items())
    dataset_for_loader += [encoding]

# A single batch holding all samples.
dataloader = DataLoader(dataset_for_loader, batch_size=len(data))

# Because the batch contains 'labels', the model output carries a loss.
for batch in dataloader:
    batch = dict((name, tensor.cuda()) for name, tensor in batch.items())
    output = bert_tc(**batch)
    loss = output.loss

In [22]:
# Fine-tuning for NER, using the Wikipedia NER dataset.
# Open the file in a context manager so the handle is closed, and read it as
# UTF-8 explicitly instead of relying on the platform's default encoding.
with open("../data/ner-wikipedia-dataset/ner.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Entity type name -> integer label id (0 is reserved for "no entity").
type_id_dict = {
    "人名": 1,
    "法人名": 2,
    "政治的組織名": 3,
    "その他の組織名": 4,
    "地名": 5,
    "施設名": 6,
    "製品名": 7,
    "イベント名": 8,
}

for sample in dataset:
    # Normalize the text with NFKC before it is used for tokenization and
    # span lookup.
    sample['text'] = unicodedata.normalize("NFKC", sample["text"])
    for e in sample['entities']:
        # Replace the textual type by its numeric id.
        e['type_id'] = type_id_dict[e["type"]]
        del e['type']

# Shuffle with a dedicated, seeded generator so the train/val/test split is
# the same on every run and the global `random` state is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(dataset)

# 60% train / 20% validation / remaining samples for test.
n = len(dataset)
n_train = int(n*0.6)
n_val = int(n*0.2)
dataset_train = dataset[:n_train]
dataset_val = dataset[n_train:n_train+n_val]
dataset_test = dataset[n_train+n_val:]

In [17]:
def create_dataset(tokenizer, dataset, max_length):
    """Encode annotated samples into a list of tensor dicts.

    Args:
        tokenizer: Object providing ``encode_plus_tagged(text, entities, max_length)``.
        dataset: Iterable of dicts with ``"text"`` and ``"entities"`` keys.
        max_length: Forwarded to ``encode_plus_tagged`` for every sample.

    Returns:
        A list with one dict per sample; every value of the encoding is
        converted with ``torch.tensor``.
    """
    def to_tensors(sample):
        # Encode one sample, then turn each field into a tensor.
        encoded = tokenizer.encode_plus_tagged(
            sample["text"], sample["entities"], max_length
        )
        return {name: torch.tensor(values) for name, values in encoded.items()}

    return [to_tensors(sample) for sample in dataset]

tokenizer = NER_tokenizer.from_pretrained(MODEL_NAME)

# Encode the training and validation splits to a common length of 128.
max_length = 128
dataset_train_for_loader, dataset_val_for_loader = (
    create_dataset(tokenizer, split, max_length)
    for split in (dataset_train, dataset_val)
)

# Only the training loader shuffles; validation uses a larger batch size.
dataloader_train = DataLoader(
    dataset_train_for_loader, shuffle=True, batch_size=32
)
dataloader_val = DataLoader(dataset_val_for_loader, batch_size=256)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'NER_tokenizer'.


In [20]:
class BertForTokenClassification_pl(pl.LightningModule):
    """LightningModule that wraps ``BertForTokenClassification``.

    Training and validation both feed the batch to the wrapped model and log
    the loss it returns (as ``train_loss`` / ``val_loss``).
    """

    def __init__(self, model_name, num_labels, lr):
        """Build the wrapped model.

        Args:
            model_name: Checkpoint name handed to ``from_pretrained``.
            num_labels: Number of labels of the classification head.
            lr: Learning rate, read back in ``configure_optimizers``.
        """
        super().__init__()
        # Makes the constructor arguments available as self.hparams.
        self.save_hyperparameters()
        self.bert_tc = BertForTokenClassification.from_pretrained(
            model_name, num_labels=num_labels
        )

    def training_step(self, batch, batch_idx):
        """Return (and log) the loss of one training batch."""
        loss = self.bert_tc(**batch).loss
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the loss of one validation batch."""
        val_loss = self.bert_tc(**batch).loss
        self.log('val_loss', val_loss)

    def configure_optimizers(self):
        """Create an Adam optimizer over all parameters."""
        learning_rate = self.hparams.lr
        return torch.optim.Adam(self.parameters(), lr=learning_rate)

# Keep only the weights of the single checkpoint with the lowest 'val_loss'.
checkpoint = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    save_weights_only=True,
    dirpath = '../model/'
)

# `gpus=1` is deprecated (the run below emits a rank_zero_deprecation
# warning); request one GPU through `accelerator` / `devices` instead.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=5,
    callbacks=[checkpoint]
)

# num_labels=9: the 8 entity types of type_id_dict plus label 0 (no entity).
model = BertForTokenClassification_pl(MODEL_NAME, num_labels=9, lr=1e-5)
trainer.fit(model, dataloader_train, dataloader_val)

# Path of the best checkpoint, used later to reload the model for inference.
best_model_path = checkpoint.best_model_path

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly iden

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 4: 100%|██████████| 106/106 [02:44<00:00,  1.55s/it, loss=0.0154, v_num=5]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 106/106 [02:49<00:00,  1.60s/it, loss=0.0154, v_num=5]


In [26]:
# inference

def predict(text, tokenizer, bert_tc):
    """Extract the named entities of ``text`` with a token-classification model.

    Args:
        text: The sentence to analyse.
        tokenizer: Object providing ``encode_plus_untagged`` and
            ``convert_bert_output_to_entities`` (e.g. ``NER_tokenizer``).
        bert_tc: Model called with the encoding as keyword arguments; its
            output must expose ``logits``.

    Returns:
        The result of ``tokenizer.convert_bert_output_to_entities`` for the
        per-token argmax of the logits.
    """
    encoding, spans = tokenizer.encode_plus_untagged(
        text, return_tensors='pt'
    )

    # Send the inputs to wherever the model lives instead of assuming CUDA,
    # so the function also works with a model kept on the CPU.
    first_param = next(bert_tc.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    encoding = {k: v.to(device) for k, v in encoding.items()}

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        scores = bert_tc(**encoding).logits

    # scores[0]: the only element of the batch; argmax picks the best label
    # for every token position.
    labels_predicted = scores[0].argmax(-1).cpu().tolist()

    return tokenizer.convert_bert_output_to_entities(
        text, labels_predicted, spans
    )

tokenizer = NER_tokenizer.from_pretrained(MODEL_NAME)

# Restore the best checkpoint and take the wrapped BERT model onto the GPU.
model = BertForTokenClassification_pl.load_from_checkpoint(best_model_path)
bert_tc = model.bert_tc.cuda()

# Samples are processed one at a time to keep the code easy to follow;
# processing them in batches would be faster.
entities_list = []
entities_predicted_list = []
for sample in tqdm(dataset_test):
    text = sample["text"]
    entities_predicted = predict(text, tokenizer, bert_tc)
    # Collect gold and predicted entities at matching indices.
    entities_list += [sample["entities"]]
    entities_predicted_list += [entities_predicted]
    


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertJapaneseTokenizer'. 
The class this function is called from is 'NER_tokenizer'.
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expecte

# ground truth
[{'name': '男子B.LEAGUE', 'span': [3, 13], 'type_id': 4}, {'name': 'アルバルク東京', 'span': [18, 25], 'type_id': 4}]
# extract
[{'name': '男子B.LEAGUE', 'span': [3, 13], 'type_id': 4}, {'name': 'アルバルク東京', 'span': [18, 25], 'type_id': 4}]





In [28]:
# Show the annotated (gold) and the predicted entities of the test sample at index 2.
print("# ground truth")
print(entities_list[2])
print("# extract")
print(entities_predicted_list[2])


# ground truth
[{'name': 'AIWAF', 'span': [0, 5], 'type_id': 7}]
# extract
[{'name': 'AIWAF', 'span': [0, 5], 'type_id': 7}]
