In [1]:
import numpy as np
import pandas as pd
import re
import json
import keras
import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, TimeDistributed, LSTM, Dense, concatenate, Dropout
from keras.optimizers import SGD, Adam
from keras.models import Model
from keras import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras_contrib.layers import CRF
from seqeval.metrics import f1_score

Using TensorFlow backend.


In [2]:
# Fix the random seeds (NumPy first, then TensorFlow).
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(1)

In [3]:
class MyTokenizer(object):
    """Integer-id vocabularies for words, characters, tags and POS labels.

    Every vocabulary reserves id 0 for the padding symbol; the word and
    character vocabularies additionally reserve id 1 for unknown symbols.
    Tags and POS labels have no unknown fallback, so transforming an unseen
    tag/POS label raises ``KeyError``.
    """

    def __init__(self):
        self.UNK = '<UNK>'
        self.PAD = '<PAD>'
        self.vocab_word = {self.PAD: 0, self.UNK: 1}
        self.vocab_char = {self.PAD: 0, self.UNK: 1}
        self.vocab_tag = {self.PAD: 0}
        self.POS = {self.PAD: 0}

    @staticmethod
    def _register(vocab, symbols):
        """Give the next free id to every symbol that ``vocab`` does not hold yet."""
        for symbol in symbols:
            if symbol not in vocab:
                vocab[symbol] = len(vocab)

    def fit(self, sentences, tags, row_sentences=None, pos_seq=None):
        """Grow the vocabularies from training data.

        Args:
            sentences: sequences of words used for the word vocabulary.
            tags: sequences of tag labels.
            row_sentences: optional sequences of words used for the character
                vocabulary; ``sentences`` is used when this is empty/None.
            pos_seq: optional sequences of POS labels; ``POS_size`` is only
                set when this is given.
        """
        self._fit_word(sentences)
        self._fit_char(row_sentences if row_sentences else sentences)
        self._fit_tag(tags)

        self.vocab_word_size = len(self.vocab_word)
        self.vocab_char_size = len(self.vocab_char)
        self.vocab_tag_size = len(self.vocab_tag)

        if pos_seq:
            self._fit_pos(pos_seq)
            self.POS_size = len(self.POS)

    def transform(self, sentences, tags, row_sentences=None):
        """Convert words, characters and tags to id sequences.

        Returns:
            Tuple ``(word_seq, char_seq, tag_seq)`` of nested Python lists.
        """
        char_source = row_sentences if row_sentences else sentences
        return (
            self._transform_word(sentences),
            self._transform_char(char_source),
            self._transform_tag(tags),
        )

    def inverse_transform_tag(self, tag_id_seq):
        """Map sequences of tag ids back to their tag labels."""
        id_to_tag = dict((idx, label) for label, idx in self.vocab_tag.items())
        return [[id_to_tag[tag_id] for tag_id in tag_ids] for tag_ids in tag_id_seq]

    def padding(self, word_seq, char_seq, tag_seq):
        """Pad word, character and tag sequences; returns them in that order."""
        padded_words = self._padding_word(word_seq)
        padded_chars = self._padding_char(char_seq)
        padded_tags = self._padding_tag(tag_seq)
        return padded_words, padded_chars, padded_tags

    def _padding_word(self, word_seq):
        return pad_sequences(word_seq, padding='post')

    def _padding_char(self, char_seq):
        # Longest word (in characters) over all sentences of this batch.
        longest_word = max(max(len(chars) for chars in sent) for sent in char_seq)
        per_sentence = [
            pad_sequences(sent, maxlen=longest_word, padding='post')
            for sent in char_seq
        ]

        # Also align the sentence lengths.
        return pad_sequences(per_sentence, padding='post')

    def _padding_tag(self, tag_seq):
        return pad_sequences(tag_seq, padding='post')

    def _padding_pos(self, pos_seq):
        return pad_sequences(pos_seq, padding='post')

    def _fit_word(self, sentences):
        self._register(self.vocab_word, (w for s in sentences for w in s))

    def _fit_char(self, sentences):
        self._register(self.vocab_char, (c for s in sentences for w in s for c in w))

    def _fit_tag(self, tag_seq):
        self._register(self.vocab_tag, (tag for tags in tag_seq for tag in tags))

    def _fit_pos(self, pos_seq):
        self._register(self.POS, (pos for s_pos in pos_seq for pos in s_pos))

    def _transform_word(self, sentences):
        # Unknown words fall back to the UNK id.
        return [
            [self.vocab_word.get(w, self.vocab_word[self.UNK]) for w in s]
            for s in sentences
        ]

    def _transform_char(self, sentences):
        # Unknown characters fall back to the UNK id.
        return [
            [
                [self.vocab_char.get(c, self.vocab_char[self.UNK]) for c in w]
                for w in s
            ]
            for s in sentences
        ]

    def _transform_tag(self, tag_seq):
        return [[self.vocab_tag[tag] for tag in tags] for tags in tag_seq]

    def _transform_pos(self, pos_seq):
        return [[self.POS[pos] for pos in s_pos] for s_pos in pos_seq]

### Normal Batch

In [4]:
def batch_iter(data, labels, batch_size, tokenizer, shuffle=True):
    """Create an endless mini-batch generator (e.g. for ``fit_generator``).

    Args:
        data: list of parallel input collections. ``data[0]`` and ``data[1]``
            are padded together with the labels via ``tokenizer.padding``;
            if a third input is present it is padded via
            ``tokenizer._padding_pos``. Any further inputs are passed through
            unpadded.
        labels: per-sample labels, indexable by integer position.
        batch_size: maximum number of samples per batch.
        tokenizer: object providing ``padding(word, char, tag)`` and
            ``_padding_pos(pos)``.
        shuffle: when True the sample order is re-drawn at every epoch.

    Returns:
        Tuple ``(num_batches_per_epoch, generator)``; the generator yields
        ``(X, y)`` forever, where ``X`` is a list with one entry per input.
    """
    num_batches_per_epoch = int((len(data[0]) - 1) / batch_size) + 1

    def data_generator():
        data_size = len(data[0])
        while True:
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                # Reorder by explicit indexing instead of np.array(...)[idx]:
                # the inputs are ragged (variable-length sentences) and
                # building an ndarray from ragged lists raises on recent NumPy.
                shuffled_data = [[_input[i] for i in shuffle_indices] for _input in data]
                shuffled_labels = [labels[i] for i in shuffle_indices]
            else:
                shuffled_data = data
                shuffled_labels = labels

            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                X = [_input[start_index: end_index] for _input in shuffled_data]
                y = shuffled_labels[start_index: end_index]

                X[0], X[1], y = tokenizer.padding(X[0], X[1], y)
                # The POS input is optional; only pad it when it was supplied.
                if len(X) > 2:
                    X[2] = tokenizer._padding_pos(X[2])

                yield X, y

    return num_batches_per_epoch, data_generator()

In [5]:
def seq_f1(y_true, y_pred):
    """Entity-level F1 between one-hot gold tags and predicted tag scores.

    Args:
        y_true: array whose last axis holds one-hot gold tags, padded with
            tag id 0.
        y_pred: array of the same leading shape whose last axis holds the
            predicted tag scores.

    Returns:
        The value returned by ``f1_score`` on the decoded tag sequences.

    Padding positions are decided by the gold tags only and the same mask is
    applied to the predictions, so both sequences always keep the same length
    and stay token-aligned. A padding id predicted at a real token is decoded
    through the tokenizer like any other id (i.e. it counts as a wrong tag at
    that position) instead of being dropped and shifting every later token.
    Relies on the module-level ``tokenizer``.
    """
    true_ids = np.argmax(y_true, -1)
    pred_ids = np.argmax(y_pred, -1)

    true_seq = []
    pred_seq = []
    for gold, guess in zip(true_ids, pred_ids):
        keep = np.where(gold > 0)[0]
        true_seq.append(gold[keep])
        pred_seq.append(guess[keep])

    y_true = tokenizer.inverse_transform_tag(true_seq)
    y_pred = tokenizer.inverse_transform_tag(pred_seq)

    return f1_score(y_true, y_pred)

In [6]:
# Load the pickled train/test DataFrames.
# NOTE(review): read_pickle can execute arbitrary code from the file; only load trusted pickles.
train_df = pd.read_pickle("../data/Production_train_repl_compound.pkl")
test_df = pd.read_pickle("../data/Production_test_repl_compound.pkl")

In [35]:
def convert_tag(tag_seq, words):
    """Collect the words covered by each tagged span.

    A span starts at a ``'B'`` tag and is extended by the ``'I'`` tags that
    follow it. It ends at an ``'O'`` tag, at the next ``'B'`` tag, or at the
    end of the sequence.

    Args:
        tag_seq: sequence of tag labels (``'B'``, ``'I'``, ``'O'``).
        words: sequence of words parallel to ``tag_seq``.

    Returns:
        List with one numpy array of words per span, in order of appearance.
        An ``'I'`` without a preceding ``'B'`` is reported on stdout and
        skipped.
    """
    _words = np.array(words)

    ids = []   # finished spans, each a list of word positions
    idx = []   # positions of the span currently being built
    for i, tag in enumerate(tag_seq):
        if tag == 'B':
            # A new span begins; close the previous one if it is still open.
            if idx:
                ids.append(idx)
            idx = [i]
        elif tag == 'I':
            if idx:
                idx.append(i)
            else:
                print("ERROR TAGGING!")
        elif tag == 'O' and idx:
            ids.append(idx)
            idx = []

    # Do not lose a span that runs up to the end of the sentence.
    if idx:
        ids.append(idx)

    return [_words[span] for span in ids]

In [36]:
# Extract the tagged spans of every training row; summing the per-row lists
# concatenates them into one flat list of spans.
train_df.apply(
    lambda x: convert_tag(x.tag, x.words)
    , axis=1
).sum()

[array(['アルケン', 'の', '二', '重', '結合', 'に対して', '水素', 'と', '一酸化', '炭素', 'を',
        '触媒', 'を', '用い', 'て', '付加', 'さ', 'せる'], dtype='<U7'),
 array(['福山', 'インドール', '合成'], dtype='<U15'),
 array(['ネニチェスク', 'の', 'インドール', '合成'], dtype='<U15'),
 array(['アルキルリチウム', 'や', 'グリニャール試薬', 'と', 'の'], dtype='<U12'),
 array(['ドデカヘドラン'], dtype='<U13'),
 array(['（'], dtype='<U13'),
 array(['dodecahedrane'], dtype='<U13'),
 array(['、'], dtype='<U13'),
 array(['化学'], dtype='<U13'),
 array(['式'], dtype='<U13'),
 array([':'], dtype='<U13'),
 array(['C'], dtype='<U13'),
 array(['20'], dtype='<U13'),
 array(['H'], dtype='<U13'),
 array(['20'], dtype='<U13'),
 array(['）'], dtype='<U13'),
 array(['は'], dtype='<U13'),
 array(['、'], dtype='<U13'),
 array(['有機'], dtype='<U13'),
 array(['化合'], dtype='<U13'),
 array(['物'], dtype='<U13'),
 array(['の'], dtype='<U13'),
 array(['1'], dtype='<U13'),
 array(['つ'], dtype='<U13'),
 array(['で'], dtype='<U13'),
 array(['、'], dtype='<U13'),
 array(['1982'], dtype='<U13'),
 array(['

In [39]:
# Build the vocabularies: words from `repl_words`, characters from `words`, tags from `tag`.
# The POS vocabulary is fitted on train + test labels because MyTokenizer._transform_pos
# is a plain dict lookup with no unknown fallback (an unseen POS label raises KeyError).
tokenizer = MyTokenizer()
tokenizer.fit(
    sentences=train_df.repl_words.tolist()
    , row_sentences=train_df.words.tolist()
    , tags=train_df.tag.tolist()
    , pos_seq=train_df.POS.tolist() + test_df.POS.tolist()
)

In [27]:
# Model hyper-parameters. The *_size entries come from the fitted tokenizer.
# NOTE(review): 'optimizer' is not read by the model-building cell, which passes
# an Adam instance to compile() directly.
param = {
    'char_vocab_size': tokenizer.vocab_char_size
    , 'word_vocab_size':tokenizer.vocab_word_size
    , 'tag_size': tokenizer.vocab_tag_size
    , 'pos_size': tokenizer.POS_size
    , 'char_emb_dim': 25
    , 'word_emb_dim': 100
    , 'pos_emb_dim': 10
    , 'char_lstm_units': 25
    , 'word_lstm_units': 100
    , 'dropout_rate': 0.5
    , 'activation': 'tanh'
    , 'optimizer': 'adam'
}

In [24]:
# Three variable-length inputs: character ids per word, word ids, POS ids.
char_input = Input(shape=(None, None))
word_input = Input(shape=(None,))
pos_input = Input(shape=(None,))

# Character embedding followed by a BiLSTM applied to each word (TimeDistributed).
char_emb = Embedding(input_dim=param['char_vocab_size']
                     , output_dim=param['char_emb_dim']
                     , mask_zero=True)(char_input)
char_emb = TimeDistributed(Bidirectional(LSTM(units=param['char_lstm_units'], activation=param['activation'])))(char_emb)

# Word embedding
word_emb = Embedding(input_dim=param['word_vocab_size']
                     , output_dim=param['word_emb_dim']
                     , mask_zero=True)(word_input)

# POS tag embedding
pos_emb = Embedding(input_dim=param['pos_size']
                     , output_dim=param['pos_emb_dim']
                     , mask_zero=True)(pos_input)

# Concatenate the three per-token feature vectors.
feats = concatenate([char_emb, word_emb, pos_emb])

feats = Dropout(param['dropout_rate'])(feats)

# Sentence-level BiLSTM over the token features.
feats = Bidirectional(LSTM(units=param['word_lstm_units'], return_sequences=True, activation=param['activation']))(feats)

# Project to one score per tag, then decode with a CRF layer.
feats = Dense(param['tag_size'])(feats)

crf = CRF(param['tag_size'])
pred = crf(feats)

# Input order expected by the model: word ids, character ids, POS ids.
model = Model(inputs=[word_input, char_input, pos_input], outputs=[pred])

# NOTE(review): `sgd` is created but never used; compile() below receives `adam`.
sgd = SGD(lr=0.01, clipvalue=5.)
adam = Adam()

model.compile(loss=crf.loss_function, optimizer=adam)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, None, None)   0                                            
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, None, None, 2 52975       input_9[0][0]                    
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, None)         0                                            
__________________________________________________________________________________________________
time_distr

In [28]:
# One-hot lookup table: row i is the one-hot vector of tag id i.
# Built once instead of once per sentence.
tag_one_hot = np.identity(tokenizer.vocab_tag_size)

# --- training split: words / characters / tags / POS -> id sequences ---
x_word_train, x_char_train, y_train = \
tokenizer.transform(
    sentences=train_df.repl_words.tolist()
    , row_sentences=train_df.words.tolist()
    , tags=train_df.tag.tolist()
)
x_pos_train = tokenizer._transform_pos(train_df.POS.tolist())
# one-hot encoding; sentences differ in length, so the outer array must be an
# object array (ragged input without dtype=object raises on recent NumPy).
y_train = np.array([tag_one_hot[tags] for tags in y_train], dtype=object)

# --- test split: same pipeline ---
x_word_test, x_char_test, y_test = \
tokenizer.transform(
    sentences=test_df.repl_words.tolist()
    , row_sentences=test_df.words.tolist()
    , tags=test_df.tag.tolist()
)
x_pos_test = tokenizer._transform_pos(test_df.POS.tolist())
# one-hot encoding (see note above)
y_test = np.array([tag_one_hot[tags] for tags in y_test], dtype=object)

In [11]:
# Per-sentence tag counts: one row per sentence, one column per tag id
# (works for any number of tags instead of exactly four).
count = np.array([y.sum(axis=0) for y in y_train])
# Corpus-wide total per tag id.
tuple(count.sum(axis=0))

(0.0, 193169.0, 2188.0, 5279.0)

In [29]:
# Number of sentences per mini-batch, passed to batch_iter.
batch_size = 128

In [41]:
# Build endless batch generators plus their steps-per-epoch; inputs are ordered
# word ids, character ids, POS ids, matching the model's input order.
# NOTE(review): the validation iterator also uses the default shuffle=True.
train_steps, train_batches = batch_iter([x_word_train, x_char_train, x_pos_train], y_train, batch_size, tokenizer)
valid_steps, valid_batches = batch_iter([x_word_test, x_char_test, x_pos_test], y_test, batch_size, tokenizer)

In [42]:
# Train from the generator. Validation is currently disabled; to enable it,
# uncomment the line below (validation_steps takes the step count, valid_steps).
model.fit_generator(train_batches, train_steps
                    #, validation_data=valid_batches, validation_steps=valid_steps
                    , epochs=100
                   )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100

KeyboardInterrupt: 

In [None]:
# Persist the trained model to disk.
model.save("../model/BiLSTM_CRF_NER_with_POS.h5")

In [None]:
# Restore weights into the already-built `model` from the file written by model.save above.
model.load_weights("../model/BiLSTM_CRF_NER_with_POS.h5")

In [14]:
# Pad the whole test split in one go and show the resulting array shapes.
pad_x_word_test, pad_x_char_test, pad_y_test = tokenizer.padding(x_word_test, x_char_test, y_test)
print(pad_x_word_test.shape)
print(pad_x_char_test.shape)
print(pad_y_test.shape)

(1564, 195)
(1564, 195, 31)
(1564, 195, 4)


In [15]:
# The model was built with three inputs (word ids, character ids, POS ids),
# so the padded POS ids must be supplied as well.
pad_x_pos_test = tokenizer._padding_pos(x_pos_test)
y_pred = model.predict([pad_x_word_test, pad_x_char_test, pad_x_pos_test])

In [16]:
# F1 of the predictions on the padded test split, as computed by seq_f1.
seq_f1(pad_y_test, y_pred)

0.19791666666666666

## 検証

In [185]:
# Load the gold annotations and index each entry's attributes by its WikipediaID (as a string).
# The encoding is given explicitly: the file contains Japanese text, and the
# platform default encoding is not guaranteed to be UTF-8.
with open("../data/compound_train.json", 'r', encoding='utf-8') as f:
    raw_train = json.load(f)
    train_dict = {str(entry['WikipediaID']): entry['Attributes'] for entry in raw_train['entry']}

In [165]:
# Inspect the tag -> id mapping; extract_phrase hard-codes ids taken from it.
tokenizer.vocab_tag

{'<PAD>': 0, 'O': 1, 'B': 2, 'I': 3}

In [325]:
def extract_phrase(word_seq, tag_seq):
    """Group words into phrases from consecutive entity tag ids.

    A word belongs to a phrase when its tag id is 2 or 3; every maximal run
    of such words becomes one phrase. Any other tag id ends the current
    phrase. Words and tags are paired with ``zip``, so the shorter of the two
    sequences decides how many positions are inspected.

    Args:
        word_seq: sequence of words.
        tag_seq: sequence of tag ids parallel to ``word_seq``.

    Returns:
        List of phrases, each a list of words.
    """
    # A stricter variant (tag 2 always opens a new phrase, tag 3 only extends
    # an open one) was disabled in the original; runs of 2/3 are merged.
    entity_ids = (2, 3)

    phrases = []
    current = []
    for word, tag in zip(word_seq, tag_seq):
        if tag in entity_ids:
            current.append(word)
            continue
        if current:
            phrases.append(current)
            current = []

    # Flush a phrase that reaches the end of the sequence.
    if current:
        phrases.append(current)

    return phrases

In [326]:
def escape(s):
    """Backslash-escape a fixed set of regex metacharacters in ``s``.

    Escaped characters: ``. + - ^ ? $ | ( ) [ ] { }``.
    ``*`` and the backslash itself are deliberately left untouched (the ``*``
    replacement was disabled in the original), so pre-built regex fragments
    such as ``\\s*`` pass through unchanged.

    Args:
        s: input string.

    Returns:
        The string with each listed character preceded by a backslash.
    """
    specials = '.+-^?$|()[]{}'
    return ''.join('\\' + ch if ch in specials else ch for ch in s)

In [327]:
df = test_df.copy()
# Most likely tag id per (padded) token position.
_pred = np.argmax(y_pred, -1)
# Cut each padded prediction back to its sentence length. Slicing by length
# (rather than dropping every id 0) keeps tag i aligned with word i even when
# the model predicts the padding id inside a sentence.
# Assumes y_pred rows are in test_df row order -- TODO confirm.
_pred = [tags[:len(words)] for tags, words in zip(_pred, df.words)]

df = df.assign(pred_tag = _pred)

In [328]:
# For every test row, group its words into phrases according to the predicted tag ids.
df['extracted'] = \
df.apply(
    lambda x: extract_phrase(x.words, x.pred_tag)
    , axis=1
)

In [329]:
# Map each predicted phrase back onto the original sentence text and collect
# the matched surface strings per document id.
extracted_dict = {}
for i, row in df.iterrows():
    if not row['extracted']:
        continue
    # Surface form of each predicted phrase (its words concatenated).
    extracted_patt = [''.join(phrase) for phrase in row['extracted']]
    # Escape every character on its own, then allow optional whitespace between
    # characters (the original sentence may contain spaces). Escaping before
    # joining means '*' or '\' inside a phrase cannot corrupt the pattern.
    extracted_patt = [r'\s*'.join(re.escape(ch) for ch in patt) for patt in extracted_patt if patt]
    if not extracted_patt:
        # Only empty phrases: an empty pattern would match at every position.
        continue
    extracted_patt = '|'.join(extracted_patt)

    match = re.findall(extracted_patt, row['sentence'])
    extracted_dict[row['_id']] = extracted_dict.get(row['_id'], []) + match

In [331]:
def evaluate_set(true_set, pred_set):
    """Count set-level matches between gold and predicted items.

    Args:
        true_set: set of gold items.
        pred_set: set of predicted items.

    Returns:
        Tuple ``(TP, FP, FN)``: items in both sets, items only in
        ``pred_set``, items only in ``true_set``.
    """
    hits = true_set & pred_set
    spurious = pred_set - true_set
    missed = true_set - pred_set

    return len(hits), len(spurious), len(missed)

def precision(TP, FP):
    """Return TP / (TP + FP), or 0.0 when nothing was predicted."""
    predicted = TP + FP
    if predicted == 0:
        return 0.0
    return TP / predicted

def recall(TP, FN):
    """Return TP / (TP + FN), or 0.0 when there are no gold items."""
    actual = TP + FN
    if actual == 0:
        return 0.0
    return TP / actual

def f1(TP, FP, FN):
    """Return the harmonic mean of precision and recall, or 0.0 when both are zero."""
    p = precision(TP, FP)
    r = recall(TP, FN)
    if p + r == 0:
        return 0.0
    return 2 * p * r / (p + r)

In [332]:
# Accumulate phrase-level TP / FP / FN over every document id in the test split.
TP, FP, FN = 0, 0, 0
for _id in test_df._id.unique():
    # Gold phrases of this document vs. the phrases extracted for it (none -> empty set).
    train_set = set(train_dict[_id]['製造方法'])
    extracted_set = set(extracted_dict.get(_id, []))

    tp, fp, fn = evaluate_set(train_set, extracted_set)
    TP, FP, FN = TP + tp, FP + fp, FN + fn

In [333]:
# Document ids of the test split, computed once and held in a set for fast membership tests.
test_ids = set(test_df._id.unique())
# Number of gold phrases for those documents and number of extracted phrases
# (counted directly; no external flatten helper is needed).
train_size = sum(len(v['製造方法']) for k, v in train_dict.items() if k in test_ids)
extracted_size = sum(len(v) for v in extracted_dict.values())

print("Train size:", train_size, "Extracted size:", extracted_size)
print("TP:", TP, "\tFP:", FP, "\tFN:", FN)
print("Precision:", precision(TP, FP))
print("Recall:", recall(TP, FN))
print("F1:", f1(TP, FP, FN))

Train size: 94 Extracted size: 93
TP: 11 	FP: 82 	FN: 83
Precision: 0.11827956989247312
Recall: 0.11702127659574468
F1: 0.11764705882352942


In [334]:
# Sentence-level detection: a row counts as "predicted positive" when at least
# one phrase was extracted from it; `label` is the gold flag of the row.
has_phrase = df['extracted'].map(len) > 0
no_phrase = df['extracted'].map(len) == 0
gold_true = df['label'] == True
gold_false = df['label'] == False

row_TP = (has_phrase & gold_true).sum()
row_FP = (has_phrase & gold_false).sum()
row_FN = (no_phrase & gold_true).sum()

print("TP:", row_TP, "\tFP:", row_FP, "\tFN:", row_FN)
print("Precision:", precision(row_TP, row_FP))
print("Recall:", recall(row_TP, row_FN))
print("F1:", f1(row_TP, row_FP, row_FN))

TP: 41 	FP: 44 	FN: 45
Precision: 0.4823529411764706
Recall: 0.47674418604651164
F1: 0.4795321637426901


In [362]:
# Per document id: its title (taken from the first matching test row),
# the gold phrases and the extracted phrases (empty list when nothing was extracted).
result_dict = {}
for _id in test_df._id.unique():
    title = test_df.loc[test_df._id == _id].title.tolist()[0]
    result_dict[_id] = {'title': title, 'true': train_dict[_id]['製造方法'], 'predict': extracted_dict.get(_id, [])}

In [366]:
# Write the results as UTF-8 JSON; ensure_ascii=False keeps non-ASCII text unescaped.
with open("../output/type-production_extracted-phrase_using_repl-compounds.json", 'w', encoding='utf-8') as f:
    json.dump(result_dict, f, ensure_ascii=False)