In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Upper limit on the number of words kept per question / answer.
MAX_SEQUENCE_LENGTH = 200
# Upper limit on the number of words in the dictionary (vocabulary).
MAX_NB_WORDS = 20000
# Dimensionality of the word vectors (100-d).
EMBEDDING_DIM = 100

In [3]:
import pickle

# Restore the tokenizer object that was pickled to disk (presumably the one
# fitted at training time -- confirm against the training notebook).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load tokenizer files that come from a trusted source.
token_path = '../main/model/tokenizer.pkl'
# Use a context manager so the file handle is closed as soon as loading ends.
with open(token_path, 'rb') as token_file:
    tokenizer = pickle.load(token_file)

# word -> integer index mapping; iterated below to fill the embedding matrix.
word_index = tokenizer.word_index

In [4]:
# Parse the pre-trained word vectors into a dict mapping word -> float32 vector.
# Each usable line has the form "<word> <v1> ... <vN>" with N == EMBEDDING_DIM.
embeddings_index = {}
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
with open('../word2vec/wiki.vector', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Skip blank lines and rows whose vector width is not EMBEDDING_DIM
        # (e.g. a word2vec-style "<count> <dim>" header). A blank line would
        # otherwise raise IndexError, and a short row would later be broadcast
        # across a whole row of the embedding matrix.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 782241 word vectors.


In [5]:
# Build the (len(word_index) + 1, EMBEDDING_DIM) weight matrix for the
# Embedding layers: row i receives the pre-trained vector of the word whose
# tokenizer index is i. Rows for words missing from the embedding index are
# never assigned and therefore stay all-zeros.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Merge, LSTM, Reshape, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional

# Number of LSTM units per direction; with merge_mode="sum" this is also the
# per-timestep feature size of each encoder output.
QA_EMBED_SIZE = 64
DROPOUT_RATE = 0.3


def _build_encoder():
    """Build one text-encoder branch: frozen embedding -> BiLSTM -> dropout.

    The question and answer branches use the same layer stack, so both are
    created through this helper instead of two copy-pasted blocks.

    Returns:
        A ``Sequential`` model taking integer sequences of length
        MAX_SEQUENCE_LENGTH and emitting one QA_EMBED_SIZE-wide vector per
        timestep (``return_sequences=True``).
    """
    encoder = Sequential()
    # Embedding weights come from the pre-trained matrix and are not trained.
    encoder.add(Embedding(len(word_index) + 1,
                          EMBEDDING_DIM,
                          weights=[embedding_matrix],
                          input_length=MAX_SEQUENCE_LENGTH,
                          trainable=False))
    # Forward and backward LSTM outputs are summed, not concatenated.
    encoder.add(Bidirectional(LSTM(QA_EMBED_SIZE, return_sequences=True), merge_mode="sum"))
    encoder.add(Dropout(DROPOUT_RATE))
    return encoder


# Build the question branch first, then the answer branch, so layers are
# created in the same order as before.
q = _build_encoder()
a = _build_encoder()

# attention model
# NOTE(review): `Merge` is the legacy Keras merge layer; this cell needs a
# Keras version that still ships it. Do not change the layer stack without
# retraining, since saved weights are loaded into this model afterwards.
attention = Sequential()
attention.add(Merge([q, a], mode="dot", dot_axes=[1, 1]))
attention.add(Flatten())
attention.add(Dense((MAX_SEQUENCE_LENGTH * QA_EMBED_SIZE)))
# Reshape back to (timesteps, features) so it can be summed with the question
# encoder output below.
attention.add(Reshape((MAX_SEQUENCE_LENGTH, QA_EMBED_SIZE)))

# Final classifier: question encoding + attention output -> single sigmoid unit.
model = Sequential()
model.add(Merge([q, attention], mode="sum"))
model.add(Flatten())
model.add(Dense(1, activation="sigmoid"))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_3 (Merge)              (None, 200, 64)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 12800)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 12801     
Total params: 96,900,961
Trainable params: 52,623,361
Non-trainable params: 44,277,600
_________________________________________________________________




In [9]:
# Load the saved weights from the .h5 file into the `model` object that the
# notebook has already built (the architecture itself is not read from here).
model_path = '../main/model/model-blstm-word2vec-attention.h5'
model.load_weights(model_path)

In [10]:
import json
import jieba

# Load the test set: a JSON list of items, each with a 'question' string and
# a list of 'passages' that each carry a 'content' string.
test_path = '../newdata/WebQA.json'
# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default locale encoding.
with open(test_path, 'r', encoding='utf-8') as f:
    test = json.load(f)

# Build parallel lists with one (question, passage) pair per passage. Texts
# are segmented with jieba and re-joined with spaces so the tokenizer can
# split them into words.
questions = []
answers = []
for item in test:
    # Segment the question once per item and reuse it for every passage.
    # Named `question_text` / `passage_text` rather than `q` / `a` so this
    # loop does not overwrite the `q` / `a` encoder models defined earlier.
    question_text = ' '.join(jieba.cut(item['question']))
    for passage in item['passages']:
        passage_text = ' '.join(jieba.cut(passage['content']))
        questions.append(question_text)
        answers.append(passage_text)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.789 seconds.
Prefix dict has been built succesfully.


In [11]:
# Load the labels: one comma-separated record per line, with the integer
# label in the second field.
# NOTE(review): `test_path` is reused here and now points at the label file,
# not the JSON test set.
test_path = '../newdata/WebQA_label.txt'

# Use a context manager so the file is closed deterministically, and skip
# blank lines (e.g. a trailing newline at end of file) instead of crashing.
with open(test_path, 'r', encoding='utf-8') as label_file:
    labels = [
        int(line.strip('\n').split(',')[1])
        for line in label_file
        if line.strip()
    ]

In [12]:
# Questions: segmented text -> integer id sequences -> fixed-length array.
sequences_q = tokenizer.texts_to_sequences(questions)
test_q = pad_sequences(sequences_q, maxlen=MAX_SEQUENCE_LENGTH)

# Answers (passages): same conversion.
sequences_a = tokenizer.texts_to_sequences(answers)
test_a = pad_sequences(sequences_a, maxlen=MAX_SEQUENCE_LENGTH)

# Report both shapes, questions first.
for padded in (test_q, test_a):
    print('Shape of data tensor:', padded.shape)

Shape of data tensor: (448444, 200)
Shape of data tensor: (448444, 200)


In [13]:
# Configure loss, optimizer and the accuracy metric before evaluating.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [14]:
# Evaluate on the padded question / answer arrays against the loaded labels.
# The recorded result is a two-element list; the recorded `model.metrics_names`
# output (['loss', 'acc']) gives the order of its entries.
model.evaluate([test_q, test_a], labels)



[0.5821690707570762, 0.7067214635479229]

In [14]:
# NOTE(review): this repeats the evaluation cell above with the same
# execution count but a different recorded result, so the two outputs were
# presumably produced against different kernel state (weights or data).
# Confirm which one is current and remove the stale cell.
model.evaluate([test_q, test_a], labels)



[0.7894134040711889, 0.630787790671745]

In [15]:
# Display the names of the metrics the compiled model reports
# (presumably in the same order as the values returned by evaluate() -- confirm).
model.metrics_names

['loss', 'acc']