In [1]:
# https://hironsan.hatenablog.com/entry/named-entity-recognition-with-elmo
# を実装してみる

In [2]:
import argparse
import os

import numpy as np

from allennlp.modules.elmo import Elmo, batch_to_ids

In [3]:
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
elmo = Elmo(options_file, weight_file, 2, dropout=0)

# ELMoが何かの確認

In [4]:
sentences = [['First', 'sentence', '.'], ['Another', '.']]
character_ids = batch_to_ids(sentences)
embeddings = elmo(character_ids)

In [5]:
embeddings["elmo_representations"][0]

tensor([[[ 0.1474, -0.1475,  0.1376,  ...,  0.0270, -0.4051, -0.0498],
         [ 0.2394,  0.0769,  0.4126,  ..., -0.1671, -0.1707,  0.3884],
         [-0.7602, -0.4944, -0.5355,  ..., -0.0803,  0.0361,  0.1128]],

        [[ 0.2603, -0.4437,  0.2726,  ..., -0.0830, -0.1522, -0.1361],
         [-0.7772, -0.4294, -0.2651,  ..., -0.0803,  0.0361,  0.1128],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       grad_fn=<DropoutBackward>)

In [6]:
embeddings["elmo_representations"][0].shape

torch.Size([2, 3, 1024])

# ELmoの分散表現を計算してみる
与えた文について単語ごとにELMoの分散表現を得る事を目的とした例

In [47]:
import tensorflow as tf
import tensorflow_hub as hub

In [48]:
with tf.Graph().as_default():
    elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True) # モジュール用意
    embeddings = elmo(
        ["the cat is on the mat", "dogs are in the fog"],
        signature="default",
        as_dict=True)["elmo"] # モジュール適用

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())

        print(sess.run(embeddings)) # 実行

[[[ 3.08154583e-01  2.66303927e-01  2.35613093e-01 ... -3.70857447e-01
    1.64904833e-01 -7.24597126e-02]
  [ 5.14287531e-01 -1.35323644e-01  1.10904068e-01 ...  4.04682383e-02
   -4.78974357e-02  7.36596107e-01]
  [-2.58803926e-02 -7.28363097e-02 -7.93559477e-02 ... -2.90724397e-01
    7.24214137e-01  4.38634515e-01]
  [-3.47980678e-01 -2.91022509e-02 -8.19930434e-01 ... -9.20484006e-01
    2.18879543e-02  1.21059902e-01]
  [-2.18274236e-01 -1.30765587e-01 -2.52096236e-01 ... -2.96935618e-01
   -1.58280328e-01 -4.90075573e-02]
  [ 1.00726105e-01 -2.95349322e-02 -2.44943231e-01 ... -3.72351050e-01
   -1.48757815e-01  2.15922311e-01]]

 [[ 5.45786619e-02 -2.64275432e-01  4.68437791e-01 ... -1.40770972e-01
   -2.65682340e-01  4.52119648e-01]
  [ 8.09428394e-02  1.15838401e-01 -1.56705871e-01 ... -2.68961281e-01
    3.38719338e-01  1.15770502e-02]
  [-7.89646134e-02  9.49275017e-01 -6.18049502e-01 ... -6.30558968e-01
    3.09430093e-01  1.53787762e-01]
  [-6.78093284e-02  9.71869081e-02 

# ELMoを他のモデルに差し込んでみる

In [7]:
def load_data_and_labels(filename):
    sents, labels = [], []
    words, tags = [], []
    with open(filename) as f:
        for line in f:
            line = line.rstrip()
            if line:
                word, tag = line.split('\t')
                words.append(word)
                tags.append(tag)
            else:
                sents.append(words)
                labels.append(tags)
                words, tags = [], []
                
    return sents, labels

In [8]:
train_file = 'data/conll2003/en/ner/train.txt'
valid_file = 'data/conll2003/en/ner/valid.txt'

x_train, y_train = load_data_and_labels(train_file)
x_valid, y_valid = load_data_and_labels(valid_file)

In [9]:
x_train[0]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [10]:
UNK = '<UNK>'
PAD = '<PAD>'

vocab_word = {PAD: 0, UNK: 1}
vocab_char = {PAD: 0, UNK: 1}
vocab_label = {PAD: 0}

In [11]:
for sent in x_train:
    for w in sent:
        # create char dictionary.
        for c in w:
            if c in vocab_char:
                continue
            vocab_char[c] = len(vocab_char)

        # create word dictionary.
        if w in vocab_word:
            continue
        vocab_word[w] = len(vocab_word)

# create label dictionary.
for labels in y_train:
    for tag in labels:
        if tag in vocab_label:
            continue
        vocab_label[tag] = len(vocab_label)

In [12]:
def get_char_sequences(x):
    chars = []
    for sent in x:
        chars.append([list(w) for w in sent])

    return chars

In [13]:
x_train_chars = get_char_sequences(x_train)
x_valid_chars = get_char_sequences(x_valid)

In [14]:
x_train_chars[0]

[['E', 'U'],
 ['r', 'e', 'j', 'e', 'c', 't', 's'],
 ['G', 'e', 'r', 'm', 'a', 'n'],
 ['c', 'a', 'l', 'l'],
 ['t', 'o'],
 ['b', 'o', 'y', 'c', 'o', 't', 't'],
 ['B', 'r', 'i', 't', 'i', 's', 'h'],
 ['l', 'a', 'm', 'b'],
 ['.']]

In [15]:
def transform_word(x):
    seq = []
    for sent in x:
        word_ids = [vocab_word.get(w, vocab_word[UNK]) for w in sent]
        seq.append(word_ids)

    return seq


In [16]:
x_train_words = transform_word(x_train)
x_valid_words = transform_word(x_valid)

In [17]:
x_train_words[0]

[2, 3, 4, 5, 6, 7, 8, 9, 10]

In [18]:
def transform_char(x):
    seq = []
    for sent in x:
        char_seq = []
        for w in sent:
            char_ids = [vocab_char.get(c, vocab_char[UNK]) for c in w]
            char_seq.append(char_ids)
        seq.append(char_seq)
    
    return seq

In [19]:
x_train_chars = transform_char(x_train)
x_valid_chars = transform_char(x_valid)

In [20]:
x_train_chars[0]

[[2, 3],
 [4, 5, 6, 5, 7, 8, 9],
 [10, 5, 4, 11, 12, 13],
 [7, 12, 14, 14],
 [8, 15],
 [16, 15, 17, 7, 15, 8, 8],
 [18, 4, 19, 8, 19, 9, 20],
 [14, 12, 11, 16],
 [21]]

In [21]:
def transform_label(y):
    seq = []
    for labels in y:
        tag_ids = [vocab_label[tag] for tag in labels]
        seq.append(tag_ids)
        
    return seq

In [22]:
y_train = transform_label(y_train)

In [23]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical


def pad_char(sequences):
    maxlen_word = max(len(max(seq, key=len)) for seq in sequences)
    maxlen_seq = len(max(sequences, key=len))
    sequences = [list(seq) + [[] for i in range(max(maxlen_seq - len(seq), 0))] for seq in sequences]

    return np.array([pad_sequences(seq, padding='post', maxlen=maxlen_word) for seq in sequences])

x_train_words = pad_sequences(x_train_words, padding='post')
x_valid_words = pad_sequences(x_valid_words, padding='post')
x_train_chars = pad_char(x_train_chars)
x_valid_chars = pad_char(x_valid_chars)
y_train = pad_sequences(y_train, padding='post')
y_train = to_categorical(y_train, len(vocab_label))

Using TensorFlow backend.


In [24]:
char_vocab_size = len(vocab_char)
char_emb_size = 50
char_lstm_units = 25
word_vocab_size = len(vocab_word)
word_emb_size = 100
word_lstm_units = 100
num_tags = len(vocab_label)

In [25]:
from keras.layers import Input

word_ids = Input(shape=(None,), dtype='int32')
char_ids = Input(shape=(None, None), dtype='int32')
elmo_embeddings = Input(shape=(None, 1024), dtype='float32')

In [26]:
import keras.backend as K
from keras.layers import LSTM, Embedding, Bidirectional, TimeDistributed
from keras.layers.merge import Concatenate

char_embeddings = Embedding(input_dim=char_vocab_size,
                            output_dim=char_emb_size,
                            mask_zero=True
                            )(char_ids)
char_embeddings = TimeDistributed(Bidirectional(LSTM(char_lstm_units)))(char_embeddings)

In [27]:
word_embeddings = Embedding(input_dim=word_vocab_size,
                            output_dim=word_emb_size,
                            mask_zero=True)(word_ids)

In [28]:
x = Concatenate(axis=-1)([word_embeddings, char_embeddings, elmo_embeddings])
x = Bidirectional(LSTM(units=word_lstm_units, return_sequences=True))(x)

In [52]:
from keras.layers import Dense
from anago.layers import CRF

x = Dense(word_lstm_units, activation='tanh')(x)
x = Dense(num_tags)(x)
crf = CRF(3, sparse_target=True)
pred = crf(x)

In [53]:
from keras.models import Model
model = Model(inputs=[word_ids, char_ids, elmo_embeddings], outputs=[pred])
model.compile(loss=crf.loss_function, optimizer='adam')

In [54]:
model.fit([x_train_words, x_train_chars, elmo_embeddings], y_train)

ValueError: When feeding symbolic tensors to a model, we expect thetensors to have a static batch size. Got tensor with shape: (None, None, 1024)