In [1]:
import numpy as np
import pandas as pd
import re
import json
import keras
import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, TimeDistributed, LSTM, Dense, concatenate, Dropout
from keras.optimizers import SGD, Adam
from keras.models import Model
from keras import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras_contrib.layers import CRF
from seqeval.metrics import f1_score, precision_score, recall_score

Using TensorFlow backend.


In [2]:
# Fix ramdom seed.
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(1)

In [3]:
class MyTokenizer(object):
    def __init__(self):
        self.UNK = '<UNK>'
        self.PAD = '<PAD>'
        self.vocab_word = {self.PAD: 0, self.UNK: 1}
        self.vocab_char = {self.PAD: 0, self.UNK: 1}
        self.vocab_tag = {self.PAD: 0}
        
    def fit(self, sentences, tags, row_sentences=None):
        self._fit_word(sentences)
        
        if row_sentences:
            self._fit_char(row_sentences)
        else:
            self._fit_char(sentences)
        
        self._fit_tag(tags)
        
        self.vocab_word_size = len(self.vocab_word)
        self.vocab_char_size = len(self.vocab_char)
        self.vocab_tag_size = len(self.vocab_tag)
    
    def inverse_transform_tag(self, tag_id_seq):
        seq = []
        inv_vocab_tag = {v: k for k, v in self.vocab_tag.items()}
        for tag_ids in tag_id_seq:
            tags = [inv_vocab_tag[tag_id] for tag_id in tag_ids]
            seq.append(tags)

        return seq
    
    def padding_word(self, word_seq):
        return pad_sequences(word_seq, padding='post')
    
    def padding_char(self, char_seq):
        char_max = max([len(max(char_seq_in_sent, key=len)) for char_seq_in_sent in char_seq])
        pad_seq = [pad_sequences(char_seq_in_sent, maxlen=char_max, padding='post') for char_seq_in_sent in char_seq]
        
        # 文の長さも揃える
        return pad_sequences(pad_seq, padding='post')
    
    def padding_tag(self, tag_seq):
        return pad_sequences(tag_seq, padding='post')

    def _fit_word(self, sentences):
        for s in sentences:
            for w in s:
                if w in self.vocab_word:
                    continue
                self.vocab_word[w] = len(self.vocab_word)
                
    def _fit_char(self, sentences):
        for s in sentences:
            for w in s:
                for c in w:
                    if c in self.vocab_char:
                        continue
                    self.vocab_char[c] = len(self.vocab_char)
                    
    def _fit_tag(self, tag_seq):
        for tags in tag_seq:
            for tag in tags:
                if tag in self.vocab_tag:
                    continue
                self.vocab_tag[tag] = len(self.vocab_tag)
                
    def transform_word(self, sentences):
        seq = []
        for s in sentences:
            word_ids = [self.vocab_word.get(w, self.vocab_word[self.UNK]) for w in s]
            seq.append(word_ids)
            
        return seq
    
    def transform_char(self, sentences):
        seq = []
        for s in sentences:
            char_seq = []
            for w in s:
                char_ids = [self.vocab_char.get(c, self.vocab_char[self.UNK]) for c in w]
                char_seq.append(char_ids)
            seq.append(char_seq)
            
        return seq
    
    def transform_tag(self, tag_seq):
        seq = []
        for tags in tag_seq:
            tag_ids = [self.vocab_tag[tag] for tag in tags]
            seq.append(tag_ids)

        return seq

### Normal Batch

In [4]:
def batch_iter(X, y, batch_size, tokenizer, shuffle=True):
    num_batches_per_epoch = int((len(X[0]) - 1) / batch_size) + 1

    def data_generator():
        data_size = len(X[0])
        while True:
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_X = [np.array(_input)[shuffle_indices] for _input in X]
                shuffled_y = [np.array(target)[shuffle_indices] for target in y]
            else:
                shuffled_data = X
                shuffled_labels = y

            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                batch_X = [_input[start_index: end_index] for _input in shuffled_X]
                batch_y = [target[start_index: end_index] for target in shuffled_y]
                
                batch_X[0] = tokenizer.padding_word(batch_X[0])
                batch_X[1] = tokenizer.padding_char(batch_X[1])
                batch_y = [tokenizer.padding_tag(attr_y) for attr_y in batch_y]
                
                yield batch_X, batch_y

    return num_batches_per_epoch, data_generator()

In [5]:
target_col_name = {
    'ふりがな': "production_tag_seq"
    , '別称': "another_name_tag_seq"
    , '用途': "use_tag_seq"
    , '種類': "type_tag_seq"
    , '商標名': "trademark_tag_seq"
    , '特性': "property_tag_seq"
    , '原材料': "raw_material_tag_seq"
    , '製造方法': "production_tag_seq"
    , '生成化合物': "formation_tag_seq"
    , 'CAS番号': "cas_tag_seq"
    , '化学式': "chemical_formula_tag_seq"
    , '密度': "density_tag_seq"
    , '融点': "melting_tag_seq"
    , '沸点': "boiling_tag_seq"
    , '示性式': "rational_formula_tag_seq"
}

In [6]:
# load data
train_df = pd.read_pickle("../data/train_IOB_repl_compound.pkl")
test_df = pd.read_pickle("../data/test_IOB_repl_compound.pkl")

In [7]:
tokenizer = MyTokenizer()
tokenizer.fit(
    sentences=train_df.repl_words.tolist()
    , row_sentences=train_df.words.tolist()
    , tags=['B', 'I', 'O']
)

In [8]:
param = {
    'char_vocab_size': tokenizer.vocab_char_size
    , 'word_vocab_size':tokenizer.vocab_word_size
    , 'tag_size': tokenizer.vocab_tag_size
    , 'char_emb_dim': 25
    , 'word_emb_dim': 100
    , 'char_lstm_units': 25
    , 'word_lstm_units': 100
    , 'dropout_rate': 0.5
    , 'lstm_activation': 'tanh'
    , 'fc_activation': 'tanh'
    , 'fc_units': 100
}

In [9]:
char_input = Input(shape=(None, None))
word_input = Input(shape=(None,))

char_emb = Embedding(input_dim=param['char_vocab_size']
                     , output_dim=param['char_emb_dim']
                     , mask_zero=True)(char_input)
char_emb = TimeDistributed(Bidirectional(LSTM(units=param['char_lstm_units'], activation=param['lstm_activation'])))(char_emb)

word_emb = Embedding(input_dim=param['word_vocab_size']
                     , output_dim=param['word_emb_dim']
                     , mask_zero=True)(word_input)

feats = concatenate([char_emb, word_emb])

feats = Dropout(param['dropout_rate'])(feats)

feats = Bidirectional(LSTM(units=param['word_lstm_units'], return_sequences=True, activation=param['lstm_activation']))(feats)

feats1 = Dense(param['fc_units'], activation=param['fc_activation'])(feats)
feats1 = Dense(param['tag_size'])(feats1)
crf1 = CRF(param['tag_size'])
pred1 = crf1(feats1)

feats2 = Dense(param['fc_units'], activation=param['fc_activation'])(feats)
feats2 = Dense(param['tag_size'])(feats2)
crf2 = CRF(param['tag_size'])
pred2 = crf2(feats2)

model = Model(inputs=[word_input, char_input], outputs=[pred1, pred2])

sgd = SGD(lr=0.01, clipvalue=5.) # original paper
adam = Adam()

model.compile(loss=[crf1.loss_function, crf2.loss_function], optimizer=adam)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, None)   0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, None, 2 52975       input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, None, 50)     10200       embedding_1[0][0]                
__________________________________________________________________________________________________
embedding_

In [33]:
# 抽出対象の属性を指定
target_attr_list = ["原材料", "製造方法"]

In [34]:
def encoding_onehot(tag_seq, tokenizer):
    return np.array([np.identity(tokenizer.vocab_tag_size)[tags] for tags in tag_seq])

In [35]:
x_word_train = tokenizer.transform_word(train_df.repl_words.tolist())
x_char_train = tokenizer.transform_char(train_df.words.tolist())
y_train = [tokenizer.transform_tag(train_df[target_col_name[attr]].tolist()) for attr in target_attr_list]
# one-hot encoding
y_train = np.array([encoding_onehot(tag_seq, tokenizer) for tag_seq in y_train])

x_word_test = tokenizer.transform_word(test_df.repl_words.tolist())
x_char_test = tokenizer.transform_char(test_df.words.tolist())
y_test = [tokenizer.transform_tag(test_df[target_col_name[attr]].tolist()) for attr in target_attr_list]
# one-hot encoding
y_test = np.array([encoding_onehot(tag_seq, tokenizer) for tag_seq in y_test])

In [36]:
batch_size = 32

In [37]:
train_steps, train_batches = batch_iter([x_word_train, x_char_train], y_train, batch_size, tokenizer)
valid_steps, valid_batches = batch_iter([x_word_test, x_char_test], y_test, batch_size, tokenizer)

In [38]:
model.fit_generator(train_batches, train_steps
                    #, validation_data=valid_batches, validation_steps=valid_batches
                    , epochs=1
                   )

Epoch 1/1


ValueError: Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 1 arrays: [array([[[0, 0, 0, 1],
        [0, 0, 0, 1],
        [0, 0, 0, 1],
        ...,
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 1],
        [0, 0, 0, 1],
        ...

In [None]:
model.save("../model/raw-material.h5")

In [90]:
model.load_weights("../model/raw-material.h5")

In [19]:
pad_x_word_test = tokenizer.padding_word(x_word_test)
pad_x_char_test = tokenizer.padding_char(x_char_test)
pad_y_test = [tokenizer.padding_tag(target) for target in y_test]

In [20]:
y_pred = model.predict([pad_x_word_test, pad_x_char_test])

# 検証

In [21]:
with open("../data/compound_train.json", 'r') as f:
    raw_train = json.load(f)
    train_dict = {str(entry['WikipediaID']): {'title': entry['Name'], 'attributes': entry['Attributes']} for entry in raw_train['entry']}

In [30]:
flatten = lambda l: [i for sub_l in l for i in sub_l]

def extract_words(word_seq, tag_seq):
    words_list = []
    words = []
    for word, tag in zip(word_seq, tag_seq):
        '''
        if ((tag == 2) and (len(phrase) == 0)) or ((tag == 3) and (len(phrase) > 0)):
            phrase.append(word)
        elif tag == 2 and len(phrase) > 0:
            phrase_list.append(phrase)
            phrase = [word]
        '''
        if tag == tokenizer.vocab_tag['B'] or tag == tokenizer.vocab_tag['I']:
            words.append(word)
        elif words:
            words_list.append(words)
            words = []

    if words:
        words_list.append(words)
        
    return words_list

def extract_strings(sentence, extracted_words):
    if extracted_words:
        patt = extract_pattern(extracted_words)
        return re.findall(patt, sentence)
    return []

def escape(s):
    _s = s.replace(r'.', r'\.')
    _s = _s.replace(r'+', r'\+')
    _s = _s.replace(r'-', r'\-')
    _s = _s.replace(r'^', r'\^')
    _s = _s.replace(r'?', r'\?')
    _s = _s.replace(r'$', r'\$')
    _s = _s.replace(r'|', r'\|')
    _s = _s.replace(r'(', r'\(').replace(r')', r'\)')
    _s = _s.replace(r'[', r'\[').replace(r']', r'\]')
    _s = _s.replace(r'{', r'\{').replace(r'}', r'\}')
    
    _s = _s.replace(r'*', '\*')
    _s = re.sub(r'\\s\\\*', '\s*', _s)
    
    return _s

def extract_pattern(chunks):
    patt = [''.join(chunk) for chunk in chunks]
    patt = ['\s*'.join(list(p)) for p in patt] # 元の文に空白が入っている場合を考慮
    patt = '|'.join([escape(p) for p in patt])
    
    return patt

def evaluate_exact_match(result_dict):
    annotation_size = 0
    extracted_size = 0
    TP = 0
    FP = 0
    FN = 0
    for _id, val in result_dict.items():
        true_set = set(val['true'])
        pred_set = set(val['predict'])
        
        annotation_size += len(true_set)
        extracted_size += len(pred_set)
        TP += len(true_set & pred_set)
        FP += len(pred_set - true_set)
        FN += len(true_set - pred_set)

    def precision(TP, FP):
            return TP / (TP + FP) if (TP + FP) != 0 else 0.0

    def recall(TP, FN):
        return TP / (TP + FN) if (TP + FN) != 0 else 0.0

    def f1(precision, recall):
        return 2 * precision * recall / (precision + recall) \
            if (precision + recall) != 0 else 0.0
    
    score = {
        'annotation_size': annotation_size
        , 'extracted_size': extracted_size
        , 'TP': TP
        , 'FP': FP
        , 'FN': FN
        , 'precision': precision(TP, FP)
        , 'recall': recall(TP, FN)
        , 'f1': f1(precision(TP, FP), recall(TP, FN))
    }
    
    return score

def onehot2id(onehot_seq):
    return np.argmax(onehot_seq, -1)

def remove_pad(tag_seq):
    return [tags[np.where(tags > 0)[0]] for tags in tag_seq]

def evaluate_seq(y_true, y_pred):
    _y_true = onehot2id(y_true)
    _y_true = remove_pad(_y_true)
    _y_true = tokenizer.inverse_transform_tag(_y_true)

    _y_pred = onehot2id(y_pred)
    _y_pred = remove_pad(_y_pred)
    _y_pred = tokenizer.inverse_transform_tag(_y_pred)

    return {'precision': precision_score(_y_true, _y_pred)
            , 'recall': recall_score(_y_true, _y_pred)
            , 'f1': f1_score(_y_true, _y_pred)
           }

In [23]:
target_attr = "原材料"
attr_y_pred = y_pred[target_attr_list.index(target_attr)]
attr_y_test = y_test[target_attr_list.index(target_attr)]

In [24]:
evaluate_seq(attr_y_test, attr_y_pred)

ValueError: operands could not be broadcast together with shapes (24,4) (37,4) 

In [28]:
pred_tag_seq = remove_pad(onehot2id(attr_y_pred))

extracted_dict = {}
for i, (_, row) in enumerate(test_df.iterrows()):
    extracted = extract_words(row.words, pred_tag_seq[i])
    extracted = extract_strings(row.sentence, extracted)
    extracted_dict[row._id] = extracted_dict.get(row._id, []) + extracted
    
result_dict = {}
for _id in test_df._id.unique():
    result_dict[_id] = \
    {'title': train_dict[_id]['title']
     , 'true': train_dict[_id]['attributes'][target_attr]
     , 'predict': list(set(extracted_dict.get(_id, [])))
    }

In [31]:
evaluate_exact_match(result_dict)

{'annotation_size': 160,
 'extracted_size': 2,
 'TP': 0,
 'FP': 2,
 'FN': 160,
 'precision': 0.0,
 'recall': 0.0,
 'f1': 0.0}

## 抽出結果をjsonファイルに出力

In [48]:
result_filename = "../output/raw-material_with_repl-compounds.json"

In [49]:
with open(result_filename, 'w', encoding='utf-8') as f:
    json.dump(result_dict, f, ensure_ascii=False)