In [1]:
import numpy as np
import pandas as pd
import re
import json
import keras
import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, TimeDistributed, LSTM, Dense, concatenate, Dropout
from keras.optimizers import SGD, Adam
from keras.models import Model
from keras import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras_contrib.layers import CRF
from seqeval.metrics import f1_score, precision_score, recall_score

Using TensorFlow backend.


In [77]:
# Fix ramdom seed.
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(1)

In [78]:
class MyTokenizer(object):
    def __init__(self):
        self.UNK = '<UNK>'
        self.PAD = '<PAD>'
        self.vocab_word = {self.PAD: 0, self.UNK: 1}
        self.vocab_char = {self.PAD: 0, self.UNK: 1}
        self.vocab_tag = {self.PAD: 0}
        
    def fit(self, sentences, tags, row_sentences=None):
        self._fit_word(sentences)
        
        if row_sentences:
            self._fit_char(row_sentences)
        else:
            self._fit_char(sentences)
        
        self._fit_tag(tags)
        
        self.vocab_word_size = len(self.vocab_word)
        self.vocab_char_size = len(self.vocab_char)
        self.vocab_tag_size = len(self.vocab_tag)
    
    def transform(self, sentences, tags, row_sentences=None):
        word_seq = self._transform_word(sentences)
        
        if row_sentences:
            char_seq = self._transform_char(row_sentences)
        else:
            char_seq = self._transform_char(sentences)
        
        tag_seq = self._transform_tag(tags)
        
        return word_seq, char_seq, tag_seq
    
    def inverse_transform_tag(self, tag_id_seq):
        seq = []
        inv_vocab_tag = {v: k for k, v in self.vocab_tag.items()}
        for tag_ids in tag_id_seq:
            tags = [inv_vocab_tag[tag_id] for tag_id in tag_ids]
            seq.append(tags)

        return seq
    
    def padding(self, word_seq, char_seq, tag_seq):
        return self._padding_word(word_seq), self._padding_char(char_seq), self._padding_tag(tag_seq)
        
    def _padding_word(self, word_seq):
        return pad_sequences(word_seq, padding='post')
    
    def _padding_char(self, char_seq):
        char_max = max([len(max(char_seq_in_sent, key=len)) for char_seq_in_sent in char_seq])
        pad_seq = [pad_sequences(char_seq_in_sent, maxlen=char_max, padding='post') for char_seq_in_sent in char_seq]
        
        # 文の長さも揃える
        return pad_sequences(pad_seq, padding='post')
    
    def _padding_tag(self, tag_seq):
        return pad_sequences(tag_seq, padding='post')

    def _fit_word(self, sentences):
        for s in sentences:
            for w in s:
                if w in self.vocab_word:
                    continue
                self.vocab_word[w] = len(self.vocab_word)
                
    def _fit_char(self, sentences):
        for s in sentences:
            for w in s:
                for c in w:
                    if c in self.vocab_char:
                        continue
                    self.vocab_char[c] = len(self.vocab_char)
                    
    def _fit_tag(self, tag_seq):
        for tags in tag_seq:
            for tag in tags:
                if tag in self.vocab_tag:
                    continue
                self.vocab_tag[tag] = len(self.vocab_tag)
                
    def _transform_word(self, sentences):
        seq = []
        for s in sentences:
            word_ids = [self.vocab_word.get(w, self.vocab_word[self.UNK]) for w in s]
            seq.append(word_ids)
            
        return seq
    
    def _transform_char(self, sentences):
        seq = []
        for s in sentences:
            char_seq = []
            for w in s:
                char_ids = [self.vocab_char.get(c, self.vocab_char[self.UNK]) for c in w]
                char_seq.append(char_ids)
            seq.append(char_seq)
            
        return seq
    
    def _transform_tag(self, tag_seq):
        seq = []
        for tags in tag_seq:
            tag_ids = [self.vocab_tag[tag] for tag in tags]
            seq.append(tag_ids)

        return seq

### Normal Batch

In [79]:
def batch_iter(data, labels, batch_size, tokenizer, shuffle=True):
    num_batches_per_epoch = int((len(data[0]) - 1) / batch_size) + 1

    def data_generator():
        data_size = len(data[0])
        while True:
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = [np.array(_input)[shuffle_indices] for _input in data]
                shuffled_labels = labels[shuffle_indices]
            else:
                shuffled_data = data
                shuffled_labels = labels

            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                X = [_input[start_index: end_index] for _input in shuffled_data]
                y = shuffled_labels[start_index: end_index]
                
                X[0], X[1], y = tokenizer.padding(X[0], X[1], y)
                
                yield X, y

    return num_batches_per_epoch, data_generator()

In [96]:
def onehot2id(onehot_seq):
    return np.argmax(onehot_seq, -1)

def remove_pad(tag_seq):
    return [tags[np.where(tags > 0)[0]] for tags in tag_seq]

In [94]:
def evaluate_seq(y_true, y_pred):
    _y_true = onehot2id(y_true)
    _y_true = remove_pad(_y_true)
    _y_true = tokenizer.inverse_transform_tag(_y_true)

    _y_pred = onehot2id(y_pred)
    _y_pred = remove_pad(_y_pred)
    _y_pred = tokenizer.inverse_transform_tag(_y_pred)

    return {'precision': precision_score(_y_true, _y_pred)
            , 'recall': recall_score(_y_true, _y_pred)
            , 'f1': f1_score(_y_true, _y_pred)
           }

In [82]:
target_col_name = {
    'ふりがな': "production_tag_seq"
    , '別称': "another_name_tag_seq"
    , '用途': "use_tag_seq"
    , '種類': "type_tag_seq"
    , '商標名': "trademark_tag_seq"
    , '特性': "property_tag_seq"
    , '原材料': "raw_material_tag_seq"
    , '製造方法': "production_tag_seq"
    , '生成化合物': "formation_tag_seq"
    , 'CAS番号': "cas_tag_seq"
    , '化学式': "chemical_formula_tag_seq"
    , '密度': "density_tag_seq"
    , '融点': "melting_tag_seq"
    , '沸点': "boiling_tag_seq"
    , '示性式': "rational_formula_tag_seq"
}

In [83]:
# load data
train_df = pd.read_pickle("../data/train_IOB_repl_compound.pkl")
test_df = pd.read_pickle("../data/test_IOB_repl_compound.pkl")

train_df.head()

Unnamed: 0,_id,label,sentence,title,words,repl_words,furigana_tag_seq,another_name_tag_seq,use_tag_seq,type_tag_seq,...,property_tag_seq,raw_material_tag_seq,production_tag_seq,formation_tag_seq,cas_tag_seq,chemical_formula_tag_seq,density_tag_seq,melting_tag_seq,boiling_tag_seq,rational_formula_tag_seq
0,10166,False,アンモニア (英: ammonia) は分子式が NH 3 で表される無機化合物。,アンモニア,"[アンモニア, (, 英, :, ammonia, ), は, 分子, 式, が, NH, ...","[[title-compound], (, 英, :, [title-compound], ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, B, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,10166,False,常温常圧では無色の気体で、特有の強い刺激臭を持つ。,アンモニア,"[常温, 常, 圧, で, は, 無色, の, 気体, で, 、, 特有, の, 強い, 刺...","[常温, 常, 圧, で, は, 無色, の, 気体, で, 、, 特有, の, 強い, 刺...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,10166,False,水に良く溶けるため、水溶液（アンモニア水）として使用されることも多く、化学工業では基礎的な窒...,アンモニア,"[水, に, 良く, 溶ける, ため, 、, 水溶液, （, アンモニア水, ）, として,...","[水, に, 良く, 溶ける, ため, 、, 水溶液, （, [compound], ）, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,10166,False,塩基の程度は水酸化ナトリウムより弱い。,アンモニア,"[塩基, の, 程度, は, 水酸化ナトリウム, より, 弱い, 。]","[塩基, の, 程度, は, [compound], より, 弱い, 。]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]",...,"[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]","[O, O, O, O, O, O, O, O]"
4,10166,False,窒素原子上の孤立電子対のはたらきにより、金属錯体の配位子となり、その場合はアンミンと呼ばれる。,アンモニア,"[窒素, 原子, 上, の, 孤立, 電子, 対, の, はたらき, により, 、, 金属,...","[窒素, 原子, 上, の, 孤立, 電子, 対, の, はたらき, により, 、, 金属,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[B, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [84]:
tokenizer = MyTokenizer()
tokenizer.fit(
    sentences=train_df.repl_words.tolist()
    , row_sentences=train_df.words.tolist()
    , tags=['B', 'I', 'O']
)

In [85]:
param = {
    'char_vocab_size': tokenizer.vocab_char_size
    , 'word_vocab_size':tokenizer.vocab_word_size
    , 'tag_size': tokenizer.vocab_tag_size
    , 'char_emb_dim': 25
    , 'word_emb_dim': 100
    , 'char_lstm_units': 25
    , 'word_lstm_units': 100
    , 'dropout_rate': 0.5
    , 'lstm_activation': 'tanh'
    , 'fc_activation': 'tanh'
    , 'fc_units': 100
}

In [86]:
char_input = Input(shape=(None, None))
word_input = Input(shape=(None,))

char_emb = Embedding(input_dim=param['char_vocab_size']
                     , output_dim=param['char_emb_dim']
                     , mask_zero=True)(char_input)
char_emb = TimeDistributed(Bidirectional(LSTM(units=param['char_lstm_units'], activation=param['lstm_activation'])))(char_emb)

word_emb = Embedding(input_dim=param['word_vocab_size']
                     , output_dim=param['word_emb_dim']
                     , mask_zero=True)(word_input)

feats = concatenate([char_emb, word_emb])

feats = Dropout(param['dropout_rate'])(feats)

feats = Bidirectional(LSTM(units=param['word_lstm_units'], return_sequences=True, activation=param['lstm_activation']))(feats)

feats = Dense(param['fc_units'], activation=param['fc_activation'])(feats)
feats = Dense(param['tag_size'])(feats)

crf = CRF(param['tag_size'])
pred = crf(feats)

model = Model(inputs=[word_input, char_input], outputs=[pred])

sgd = SGD(lr=0.01, clipvalue=5.) # original paper
adam = Adam()

model.compile(loss=crf.loss_function, optimizer=adam)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, None)   0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, None, 2 52975       input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, None, 50)     10200       embedding_1[0][0]                
__________________________________________________________________________________________________
embedding_

In [87]:
# 抽出対象の属性を指定
target_attr = "原材料"

In [88]:
x_word_train, x_char_train, y_train = \
tokenizer.transform(
    sentences=train_df.repl_words.tolist()
    , row_sentences=train_df.words.tolist()
    , tags=train_df[target_col_name[target_attr]].tolist()
)
# one-hot encoding
y_train = np.array([np.identity(tokenizer.vocab_tag_size)[tags] for tags in y_train])

x_word_test, x_char_test, y_test = \
tokenizer.transform(
    sentences=test_df.repl_words.tolist()
    , row_sentences=test_df.words.tolist()
    , tags=test_df[target_col_name[target_attr]].tolist()
)
# one-hot encoding
y_test = np.array([np.identity(tokenizer.vocab_tag_size)[tags] for tags in y_test])

In [89]:
count = np.array([[y[:, 0].sum(), y[:, 1].sum(), y[:, 2].sum(), y[:, 3].sum()] for y in y_train])
count[:, 0].sum(), count[:, 1].sum(), count[:, 2].sum(), count[:, 3].sum()

(0.0, 1984.0, 851.0, 197801.0)

In [15]:
batch_size = 32

In [16]:
train_steps, train_batches = batch_iter([x_word_train, x_char_train], y_train, batch_size, tokenizer)
valid_steps, valid_batches = batch_iter([x_word_test, x_char_test], y_test, batch_size, tokenizer)

In [None]:
model.fit_generator(train_batches, train_steps
                    #, validation_data=valid_batches, validation_steps=valid_batches
                    , epochs=100
                   )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100

In [None]:
model.save("../model/raw-material.h5")

In [90]:
model.load_weights("../model/raw-material.h5")

In [91]:
pad_x_word_test, pad_x_char_test, pad_y_test = tokenizer.padding(x_word_test, x_char_test, y_test)
print(pad_x_word_test.shape)
print(pad_x_char_test.shape)
print(pad_y_test.shape)

(1564, 195)
(1564, 195, 31)
(1564, 195, 4)


In [92]:
y_pred = model.predict([pad_x_word_test, pad_x_char_test])

In [97]:
evaluate_seq(pad_y_test, y_pred)

{'precision': 0.327455919395466,
 'recall': 0.35911602209944754,
 'f1': 0.3425559947299078}

# 検証

In [23]:
with open("../data/compound_train.json", 'r') as f:
    raw_train = json.load(f)
    train_dict = {str(entry['WikipediaID']): entry['Attributes'] for entry in raw_train['entry']}

In [123]:
flatten = lambda l: [i for sub_l in l for i in sub_l]

def extract_words(word_seq, tag_seq):
    words_list = []
    words = []
    for word, tag in zip(word_seq, tag_seq):
        '''
        if ((tag == 2) and (len(phrase) == 0)) or ((tag == 3) and (len(phrase) > 0)):
            phrase.append(word)
        elif tag == 2 and len(phrase) > 0:
            phrase_list.append(phrase)
            phrase = [word]
        '''
        if tag == tokenizer.vocab_tag['B'] or tag == tokenizer.vocab_tag['I']:
            words.append(word)
        elif words:
            words_list.append(words)
            words = []

    if words:
        words_list.append(words)
        
    return words_list

def extract_strings(sentence, extracted_words):
    if extracted_words:
        patt = extract_pattern(extracted_words)
        return re.findall(patt, sentence)
    return []

In [120]:
def escape(s):
    _s = s.replace(r'.', r'\.')
    _s = _s.replace(r'+', r'\+')
    _s = _s.replace(r'-', r'\-')
    _s = _s.replace(r'^', r'\^')
    _s = _s.replace(r'?', r'\?')
    _s = _s.replace(r'$', r'\$')
    _s = _s.replace(r'|', r'\|')
    _s = _s.replace(r'(', r'\(').replace(r')', r'\)')
    _s = _s.replace(r'[', r'\[').replace(r']', r'\]')
    _s = _s.replace(r'{', r'\{').replace(r'}', r'\}')
    
    _s = _s.replace(r'*', '\*')
    _s = re.sub(r'\\s\\\*', '\s*', _s)
    
    return _s

In [108]:
def extract_pattern(chunks):
    patt = [''.join(chunk) for chunk in chunks]
    patt = ['\s*'.join(list(p)) for p in patt] # 元の文に空白が入っている場合を考慮
    patt = '|'.join([escape(p) for p in patt])
    
    return patt

In [100]:
pred_tag_seq = remove_pad(onehot2id(y_pred))

In [127]:
result_dict = {}
for i, (_, row) in enumerate(test_df.iterrows()):
    extracted = extract_words(row.words, pred_tag_seq[i])
    extracted = extract_strings(row.sentence, extracted)
    result_dict[row._id] = result_dict.get(row._id, []) + extracted
    
# 重複を除外
for _id, extracted in result_dict.items():
    result_dict[_id] = list(set(extracted))

In [129]:
len(result_dict

{'12437': ['メタノール',
  'ナトリウムエトキシド',
  'エチレングリコール',
  '酢酸',
  '硫酸',
  'ホルムアルデヒド',
  '伯エタノール',
  'アルコール',
  'アセトアルデヒド',
  'エチレン',
  '酵素'],
 '19566': ['シアンメトヘモグロビン',
  'メトヘモグロビン',
  '酸',
  'アクリロニトリル',
  'ジシアン',
  '硫酸',
  'シアン化合物',
  'アンモニア',
  'ウメ',
  '金属イオン',
  'シアン化物',
  'Fe3',
  'ヘモグロビン',
  'アミグダリン',
  'シアン化ナトリウム',
  'メタン'],
 '29891': ['クラウンエーテル',
  '酸',
  'ポリエチレングリコール',
  'エーテル',
  'ハロゲン化合物',
  '酸素',
  'セレニド',
  'オレフィン',
  'アルコール',
  'アルコキシド'],
 '31121': ['フロリダ州', '水', 'カリフォルニア', 'サンタクルーズ'],
 '41685': ['カルボキシル基', 'タンパク質', 'グルタミン酸'],
 '62444': ['糖',
  'アイラ・レムセン',
  'アンモニア',
  '2-スルホ安息香酸',
  '塩素',
  'レムセン',
  'アントラニル酸',
  '亜硝酸',
  '二酸化硫黄'],
 '62464': ['アンモニア', '窒素', 'シアヌル酸', 'シアン酸アンモニウム', '硝酸アンモニウム', '水'],
 '160786': ['メタノール', 'ケイ皮酸', 'トルエン', 'ベンゼン', 'フェニルカルビノール'],
 '189601': ['Cigua', '赤痢菌', 'シガテラ'],
 '217088': [],
 '248891': [],
 '293373': ['モルヒネ'],
 '305316': ['グリセリン', '硫酸', '炭酸カルシウム', '水', '硫酸マグネシウム'],
 '497499': ['メチルアミン'],
 '514541': ['水銀(I)塩', '水溶液'],
 '520951': ['1,2-ジクロロエタン', '

In [26]:
df = test_df.copy()
_pred = np.argmax(y_pred, -1)
_pred = [tags[np.where(tags > 0)[0]] for tags in _pred]

df = df.assign(pred_tag = _pred)

In [31]:
df['extracted'] = \
df.apply(
    lambda x: extract_string(x.words, x.pred_tag)
    , axis=1
)

In [32]:
extracted_dict = {}
for i, row in df.iterrows():
    if not row['extracted']:
        continue
    extracted_patt = [''.join(phrase) for phrase in row['extracted']]
    extracted_patt = ['\s*'.join(flatten(patt)) for patt in extracted_patt] # 元の文に空白が入っている場合を考慮
    extracted_patt = '|'.join([escape(patt) for patt in extracted_patt])
    
    match = re.findall(extracted_patt, row['sentence'])
    extracted_dict[row['_id']] = extracted_dict.get(row['_id'], []) + match

In [33]:
def evaluate_set(true_set, pred_set):
    TP = len(true_set & pred_set)
    FP = len(pred_set - true_set)
    FN = len(true_set - pred_set)
    
    return TP, FP, FN

def precision(TP, FP):
        return TP / (TP + FP)
    
def recall(TP, FN):
    return TP / (TP + FN)

def f1(TP, FP, FN):
    return 2 * precision(TP, FP) * recall(TP, FN) / (precision(TP, FP) + recall(TP, FN))

In [42]:
set(extracted_dict.get('12437'))

{'アセトアルデヒド',
 'アルコール',
 'エチレン',
 'エチレングリコール',
 'ナトリウムエトキシド',
 'ホルムアルデヒド',
 'メタノール',
 '伯エタノール',
 '硫酸',
 '酢酸',
 '酵素'}

In [46]:
TP = 0
FP = 0
FN = 0
for _id in test_df._id.unique():
    train_set = set(train_dict[_id][target_attr])
    extracted_set = set(extracted_dict.get(_id, []))
    extracted_dict[_id] = list(extracted_set)
    
    tp, fp, fn = evaluate_set(train_set, extracted_set)
    TP += tp
    FP += fp
    FN += fn

In [47]:
print("Train size:", len(flatten([v[target_attr] for k, v in train_dict.items() if k in test_df._id.unique()])), \
      "Extracted size:", len(flatten([v for k, v in extracted_dict.items()]))
     )
print("TP:", TP, "\tFP:", FP, "\tFN:", FN)
print("Precision:", precision(TP, FP))
print("Recall:", recall(TP, FN))
print("F1:", f1(TP, FP, FN))

Train size: 160 Extracted size: 299
TP: 89 	FP: 210 	FN: 71
Precision: 0.2976588628762542
Recall: 0.55625
F1: 0.38779956427015255


In [37]:
row_TP = df.apply(lambda x: len(x['extracted']) > 0 and x['label'] == True, axis=1).sum() 
row_FP = df.apply(lambda x: len(x['extracted']) > 0 and x['label'] == False, axis=1).sum() 
row_FN = df.apply(lambda x: len(x['extracted']) == 0 and x['label'] == True, axis=1).sum() 

print("TP:", row_TP, "\tFP:", row_FP, "\tFN:", row_FN)
print("Precision:", precision(row_TP, row_FP))
print("Recall:", recall(row_TP, row_FN))
print("F1:", f1(row_TP, row_FP, row_FN))

TP: 42 	FP: 45 	FN: 44
Precision: 0.4827586206896552
Recall: 0.4883720930232558
F1: 0.48554913294797686


## 抽出結果をjsonファイルに出力

In [48]:
result_filename = "../output/raw-material_with_repl-compounds.json"

In [49]:
result_dict = {}
for _id in test_df._id.unique():
    title = test_df.loc[test_df._id == _id].title.tolist()[0]
    result_dict[_id] = {'title': title, 'true': train_dict[_id][target_attr], 'predict': extracted_dict.get(_id, [])}

with open(result_filename, 'w', encoding='utf-8') as f:
    json.dump(result_dict, f, ensure_ascii=False)