<a href="https://colab.research.google.com/github/taishi-i/nagisa-tutorial-pycon2019/blob/master/notebooks/kwdlc_ner_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1.はじめに
京都大学ウェブ文書リードコーパスを利用し、
日本語を対象とした固有表現抽出モデルの学習・評価・予測を行います。

GPU を利用する場合はランタイム→ランタイムのタイプ変更→GPU をオンにしてください。




# 2.事前準備
- Python ライブラリーのインストール
- 作業ディレクトリの作成

In [0]:
!pip install bs4
!pip install nagisa
!pip install seqeval
!pip install flair

In [0]:
!mkdir data

# 3.京都大学ウェブ文書リードコーパスの前処理
- GitHub よりコーパスをダウンロードする
- nagisa と FRAIR 学習用にスペース区切りのデータセットに変換する
- 学習/開発/評価用データセットに分割する

In [0]:
!git clone https://github.com/ku-nlp/KWDLC

In [0]:
import os
import glob
import random

import bs4
import nagisa

In [0]:
def load_kwdlc(dir_path):
    files = glob.glob(dir_path+"/*/*", recursive=True)

    data = []
    words = []
    postgas = []

    position2ne = {}

    for fn in files:
        with open(fn, "r") as f:
            for line in f:
                line = line.strip()
                first_char = line[0]

                if first_char == "+":
                    soup = bs4.BeautifulSoup(line, "html.parser")
                    num_tags = len(soup.contents)
                    for i in range(num_tags):
                        if str(type(soup.contents[i])) == "<class 'bs4.element.Tag'>":
                            ne_tag_tokens = str(soup.contents[i]).split(":")
                            is_ne = ne_tag_tokens[0][1:]

                            if is_ne == "ne":
                                netype = ne_tag_tokens[1]
                                target = ne_tag_tokens[2].split(">")[0]

                                position2ne[len(words)] = [target, netype]

                elif first_char == "#" or first_char == "*":
                    None

                elif line == "EOS":
                    # process
                    if len(position2ne) > 0:
                        positions = position2ne.keys()
                        for position in positions:
                            target = position2ne[position][0]
                            netype = position2ne[position][1]

                    data.append([words, postgas, position2ne])

                    # reset
                    words = []
                    postgas = []
                    position2ne = {}

                else:
                    tokens = line.split()
                    surface = tokens[0]
                    words.append(surface)

                    postag = "_".join(tokens[3:4])
                    postgas.append(postag)

    return data, position2ne

In [0]:
def write_kwdlc_as_single_file(filename, data, position2ne):

    with open(filename, "w") as f:
        for line in data:
            words, postgas, position2ne = line

            nes = [v[0] for k, v in sorted(position2ne.items(), key=lambda x:x[0])]
            nes = list(reversed(nes))

            tags = [v[1] for k, v in sorted(position2ne.items(), key=lambda x:x[0])]
            tags = list(reversed(tags))

            if len(nes) == 0:
                None

            else:
                ne_tags = []

                ne = nes.pop()
                tag = tags.pop()
                ne_target_char = ne[0]

                partical = []
                for word in words:
                    first_char = word[0]
                    if first_char == ne_target_char:

                        if word in ne:
                            partical.append(word)

                            if "".join(partical) == ne:

                                for i, word in enumerate(partical):
                                    if i == 0:
                                        ne_tags.append("B-"+tag)
                                    elif i == len(partical) - 1:
                                        ne_tags.append("E-"+tag)
                                    else:
                                        ne_tags.append("M-"+tag)

                                if len(nes) > 0:
                                    ne = nes.pop()
                                    tag = tags.pop()
                                    ne_target_char = ne[0]

                                partical = []

                            else:
                                ne_target_char = ne[len("".join(partical))]

                        else:
                            partical = []
                            ne_tags.append("O")

                    else:
                        partical = []
                        ne_tags.append("O")

                for word, postag, ne_tag in zip(words, postgas, ne_tags):
                    f.write(" ".join([word, ne_tag])+"\n")
                f.write("\n")


def write_file(filename, X, Y):
    with open(filename, "w") as f:
        for x, y in zip(X, Y):
            for word, tag in zip(x, y):
                f.write(" ".join([word, tag])+"\n")
            f.write("\n")

In [0]:
# load KNP files
dir_path = "KWDLC"
dir_path = os.path.join(dir_path, "knp")
data, position2ne = load_kwdlc(dir_path)

# write a file
fn_out = "data/kwdlc.txt"
write_kwdlc_as_single_file(fn_out, data, position2ne)

# divide kwdlc.txt into trainset, devset, testset
random.seed(1234)
fn_in = "data/kwdlc.txt"
fn_out_train = "data/kwdlc.train"
fn_out_dev = "data/kwdlc.dev"
fn_out_test = "data/kwdlc.test"
train_data = 0.9
dev_data = 0.05
test_data = 0.05

X, Y = nagisa.utils.load_file(fn_in, delimiter=' ', newline='')                                    
indice = [i for i in range(len(X))]                                        
random.shuffle(indice)                                                     
                                                                           
num_train = int(train_data * len(indice))                                  
num_dev = int(dev_data * len(indice))                                      
num_test = int(test_data * len(indice))                                    
                                                                           
train_X = [X[i] for i in indice[:num_train]]                               
train_Y = [Y[i] for i in indice[:num_train]]                               
write_file(fn_out_train, train_X, train_Y)                                 
                                                                           
dev_X = [X[i] for i in indice[num_train:num_train+num_dev]]                
dev_Y = [Y[i] for i in indice[num_train:num_train+num_dev]]                
write_file(fn_out_dev, dev_X, dev_Y)                                       
                                                                           
test_X = [X[i] for i in indice[num_train+num_dev:num_train+num_dev+num_test]]
test_Y = [Y[i] for i in indice[num_train+num_dev:num_train+num_dev+num_test]]
write_file(fn_out_test, test_X, test_Y)                                 

# 4.固有表現抽出モデルの学習 (nagisa)

In [0]:
nagisa.fit(
    train_file="data/kwdlc.train",
    dev_file="data/kwdlc.dev",
    test_file="data/kwdlc.test",
    model_name="data/kwdlc_ner_model",
    delimiter=' ',  # delimiter="\t"
    newline='',  # newline='EOS'
)

[nagisa] LAYERS: 1
[nagisa] THRESHOLD: 3
[nagisa] DECAY: 1
[nagisa] EPOCH: 10
[nagisa] WINDOW_SIZE: 3
[nagisa] DIM_UNI: 32
[nagisa] DIM_BI: 16
[nagisa] DIM_WORD: 16
[nagisa] DIM_CTYPE: 8
[nagisa] DIM_TAGEMB: 16
[nagisa] DIM_HIDDEN: 100
[nagisa] LEARNING_RATE: 0.1
[nagisa] DROPOUT_RATE: 0.3
[nagisa] SEED: 1234
[nagisa] TRAINSET: data/kwdlc.train
[nagisa] TESTSET: data/kwdlc.test
[nagisa] DEVSET: data/kwdlc.dev
[nagisa] DICTIONARY: None
[nagisa] EMBEDDING: None
[nagisa] HYPERPARAMS: data/kwdlc_ner_model.hp
[nagisa] MODEL: data/kwdlc_ner_model.params
[nagisa] VOCAB: data/kwdlc_ner_model.vocabs
[nagisa] EPOCH_MODEL: data/kwdlc_ner_model_epoch.params
[nagisa] NUM_TRAIN: 4642
[nagisa] NUM_TEST: 257
[nagisa] NUM_DEV: 257
[nagisa] VOCAB_SIZE_UNI: 1927
[nagisa] VOCAB_SIZE_BI: 15055
[nagisa] VOCAB_SIZE_WORD: 5638
[nagisa] VOCAB_SIZE_POSTAG: 29
Epoch	LR   	Loss 	Time_m	DevWS_f1	DevPOS_f1	TestWS_f1	TestPOS_f1
1    	0.100	14.21	1.308	92.95   	84.11   	91.96   	83.16   
2    	0.100	8.399	1.326	93.70

# 5.固有表現抽出モデルの評価 (nagisa)

In [0]:
from seqeval.metrics import classification_report

ner_tagger = nagisa.Tagger(
    vocabs='data/kwdlc_ner_model.vocabs',
    params='data/kwdlc_ner_model.params',
    hp='data/kwdlc_ner_model.hp'
)

fn_in_test = "data/kwdlc.test"
test_X, test_Y = nagisa.utils.load_file(fn_in_test, delimiter=' ', newline='')

true_Y = []
pred_Y = []
for x, true_y in zip(test_X, test_Y):
    pred_y = ner_tagger.decode(x)
    true_Y += true_y
    pred_Y += pred_y

report = classification_report(true_Y, pred_Y)
print(report)

              precision    recall  f1-score   support

    artifact       0.35      0.37      0.36        46
        date       0.82      0.91      0.86        86
        time       0.62      0.50      0.56        10
    location       0.70      0.75      0.73       132
organization       0.47      0.46      0.47        54
      person       0.49      0.60      0.54        58
    optional       0.20      0.13      0.16        15
       money       0.38      1.00      0.55         3
     percent       0.67      0.67      0.67         3

   micro avg       0.61      0.65      0.63       407
   macro avg       0.60      0.65      0.63       407



# 6.固有表現抽出モデルの予測 (nagisa)

In [0]:
ner_tagger = nagisa.Tagger(
    vocabs="data/kwdlc_ner_model.vocabs",
    params="data/kwdlc_ner_model.params",
    hp="data/kwdlc_ner_model.hp"
)

text = "FacebookのAIラボ所長でもあるヤン・ルカン博士"
tokens = ner_tagger.tagging(text)
print(tokens)

Facebook/O の/O AI/O ラボ/E-person 所長/O で/O も/O ある/O ヤン/B-person ・/M-person ルカン/E-person 博士/O


# 7.固有表現抽出モデルの学習 (FLAIR)

In [22]:
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from flair.datasets import ColumnCorpus
from flair.embeddings import FlairEmbeddings
from flair.embeddings import StackedEmbeddings

# preprocess 
columns = {0: 'text', 1: 'ner'}
data_folder = '.'
corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file='data/kwdlc.train',
    dev_file="data/kwdlc.dev",
    test_file="data/kwdlc.test"
)

tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# construct a flair model
embedding_types = [
    FlairEmbeddings('ja-forward'),
    FlairEmbeddings('ja-backward'),
]
embeddings = StackedEmbeddings(embeddings=embedding_types)

tagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True
)

# start training
trainer = ModelTrainer(tagger, corpus)
trainer.train(
    'resources/taggers/example-ner',
    learning_rate=0.1,
    mini_batch_size=32,
    max_epochs=10
)

2019-11-07 00:20:04,811 Reading data from .
2019-11-07 00:20:04,813 Train: data/kwdlc.train
2019-11-07 00:20:04,819 Dev: data/kwdlc.dev
2019-11-07 00:20:04,825 Test: data/kwdlc.test
2019-11-07 00:20:09,696 ----------------------------------------------------------------------------------------------------
2019-11-07 00:20:09,698 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.3, inplace=False)
        (encoder): Embedding(15174, 100)
        (rnn): LSTM(100, 2048, num_layers=2, dropout=0.3)
        (decoder): Linear(in_features=2048, out_features=15174, bias=True)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.3, inplace=False)
        (encoder): Embedding(15174, 100)
        (rnn): LSTM(100, 2048, num_layers=2, dropout=0.3)
        (decoder): Linear(in_features=2048, out_features=15174, bias=True)
      )
    )
  )
  

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


2019-11-07 00:23:17,975 ----------------------------------------------------------------------------------------------------
2019-11-07 00:23:18,443 epoch 2 - iter 0/146 - loss 5.52695751 - samples/sec: 963.81
2019-11-07 00:23:22,350 epoch 2 - iter 14/146 - loss 5.65929953 - samples/sec: 115.77
2019-11-07 00:23:26,066 epoch 2 - iter 28/146 - loss 5.68580937 - samples/sec: 122.23
2019-11-07 00:23:29,920 epoch 2 - iter 42/146 - loss 5.66411025 - samples/sec: 117.48
2019-11-07 00:23:33,679 epoch 2 - iter 56/146 - loss 5.58606237 - samples/sec: 120.51
2019-11-07 00:23:37,154 epoch 2 - iter 70/146 - loss 5.41597591 - samples/sec: 130.48
2019-11-07 00:23:40,804 epoch 2 - iter 84/146 - loss 5.43181476 - samples/sec: 124.17
2019-11-07 00:23:44,388 epoch 2 - iter 98/146 - loss 5.31911413 - samples/sec: 126.48
2019-11-07 00:23:48,405 epoch 2 - iter 112/146 - loss 5.16790419 - samples/sec: 112.77
2019-11-07 00:23:51,978 epoch 2 - iter 126/146 - loss 5.10893094 - samples/sec: 126.77
2019-11-07 00:

{'test_score': 0.8277,
 'dev_score_history': [0.6969,
  0.7686,
  0.7586,
  0.7531,
  0.7721,
  0.8007,
  0.8042,
  0.8084,
  0.7947,
  0.7841],
 'train_loss_history': [11.781115695221784,
  5.0517017335107886,
  3.9207628981707847,
  3.3375634175457365,
  2.93023253875236,
  2.6282880371564055,
  2.3907947907709097,
  2.2098855572204066,
  2.0308768520616507,
  1.9204641523426527],
 'dev_loss_history': [tensor(5.6763, device='cuda:0'),
  tensor(3.9821, device='cuda:0'),
  tensor(2.9844, device='cuda:0'),
  tensor(2.7174, device='cuda:0'),
  tensor(2.5273, device='cuda:0'),
  tensor(2.4381, device='cuda:0'),
  tensor(2.3613, device='cuda:0'),
  tensor(1.9299, device='cuda:0'),
  tensor(2.1968, device='cuda:0'),
  tensor(2.3883, device='cuda:0')]}

# 8.固有表現抽出モデルの評価 (FLAIR)

In [23]:
from flair.data import Sentence
from flair.models import SequenceTagger
from seqeval.metrics import classification_report

model = SequenceTagger.load('resources/taggers/example-ner/final-model.pt')


fn_in_test = "data/kwdlc.test"
test_X, test_Y = nagisa.utils.load_file(fn_in_test, delimiter=' ', newline="")

true_Y = []
pred_Y = []
for x, true_y in zip(test_X, test_Y):
    text = " ".join(x)
    sentence = Sentence(text)

    model.predict(sentence)
    tagged_text = sentence.to_tagged_string()

    tokens = tagged_text.split()

    words = []
    tags = []
    for token in tokens:
        first_char = token[0]
        last_char = token[-1]

        if first_char == "<" and last_char == ">":
            tag = token[1:-1]
            tags[-1] = tag
        else:
            words.append(token)
            tags.append("O")

    pred_y = tags

    true_Y += true_y
    pred_Y += pred_y

report = classification_report(true_Y, pred_Y)
print(report)

2019-11-07 00:30:16,051 loading file resources/taggers/example-ner/final-model.pt
              precision    recall  f1-score   support

    artifact       0.64      0.61      0.62        46
        date       0.87      0.94      0.91        86
        time       0.44      0.70      0.54        10
    location       0.89      0.87      0.88       132
organization       0.71      0.59      0.65        54
      person       0.85      0.79      0.82        58
    optional       0.36      0.27      0.31        15
       money       0.75      1.00      0.86         3
     percent       0.25      0.33      0.29         3

   micro avg       0.79      0.78      0.79       407
   macro avg       0.79      0.78      0.78       407



# 9.固有表現抽出モデルの予測 (FLAIR)

In [24]:
model = SequenceTagger.load('resources/taggers/example-ner/final-model.pt')
                                                                                                                                                    
text = "Facebook の AI ラボ 所長 でも ある ヤン ・ ルカン 博士"         
sentence = Sentence(text)                                               
model.predict(sentence)                                                 
print(sentence.to_tagged_string())  

2019-11-07 00:30:56,475 loading file resources/taggers/example-ner/final-model.pt
Facebook の AI ラボ 所長 でも ある ヤン <B-person> ・ <M-person> ルカン <E-person> 博士
