In [7]:
import json
import pickle

import jieba
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical

In [10]:
MAX_SEQUENCE_LENGTH = 200 # question/answer length cap: 200 words
MAX_NB_WORDS = 20000 # vocabulary cap: 20000 words
EMBEDDING_DIM = 100 # 100-dimensional word vectors

In [19]:
# Load the pickled tokenizer object from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
token_path = '../main/model/tokenizer-blstm-word2vec.pkl'
with open(token_path, 'rb') as token_file:
    # The context manager closes the handle even if unpickling fails.
    _tokenizer = pickle.load(token_file)

# Mapping of word -> integer index; used below to lay out the embedding matrix.
word_index = _tokenizer.word_index

In [20]:
# Parse the pre-trained vector file into a dict of word -> float32 vector.
# Each usable line is "<word> <v1> <v2> ... <vN>" with N == EMBEDDING_DIM.
embeddings_index = {}
with open('../main/model/wiki.vector') as f:
    for line in f:
        values = line.split()
        # Skip anything that is not a word followed by exactly EMBEDDING_DIM
        # numbers: blank lines (which would raise IndexError on values[0]) and
        # short lines such as a "<count> <dim>" header (which would otherwise
        # be stored as a vector of the wrong length).
        if len(values) != EMBEDDING_DIM + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 782240 word vectors.


In [22]:
# Build the embedding weight matrix: row i receives the pre-trained vector of
# the word whose tokenizer index is i. One extra row is allocated beyond
# len(word_index) so that the largest index is addressable.
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [35]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Merge, LSTM, Input
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional

QA_EMBED_SIZE = 64   # units of each LSTM direction (and of the summed output)
DROPOUT_RATE = 0.3   # dropout applied to each encoder's output


def _build_encoder():
    """Return one text-encoder branch.

    The branch is: frozen embedding initialised from ``embedding_matrix``
    -> bidirectional LSTM whose two directions are summed -> dropout.
    The question and answer branches are built separately, so they do not
    share layer instances.
    """
    encoder = Sequential()
    encoder.add(
        Embedding(
            len(word_index) + 1,
            EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=False,
        )
    )
    encoder.add(
        Bidirectional(
            LSTM(QA_EMBED_SIZE, return_sequences=False),
            merge_mode="sum",
        )
    )
    encoder.add(Dropout(DROPOUT_RATE))
    return encoder


# Question branch first, then answer branch (same creation order as before,
# so auto-generated layer names are unchanged).
q = _build_encoder()
a = _build_encoder()

# Sum the two encodings element-wise and classify into 2 classes.
model = Sequential()
model.add(Merge([q, a], mode="sum"))
model.add(Dense(2, activation="softmax"))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_2 (Merge)              (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 130       
Total params: 44,446,690
Trainable params: 169,090
Non-trainable params: 44,277,600
_________________________________________________________________




In [38]:
# load model
# Fill the architecture built in the previous cell with weights read from the
# .h5 file. `_model` is an alias of `model` (the same object), not a copy.
# NOTE(review): the recorded run of this cell failed with
# "ValueError: You are trying to load a weight file containing 1 layers into a
# model with 5 layers", so the layer layout stored in the file presumably
# differs from the one built here -- confirm against the training script.
model_path = '../main/model/model-blstm-word2vec.h5'
_model = model
_model.load_weights(model_path)

ValueError: You are trying to load a weight file containing 1 layers into a model with 5 layers.

In [None]:
# Load the test set and build parallel lists of (question, passage) texts.
# Each item provides a 'question' string and a list of 'passages', each with
# a 'content' string. Texts are segmented with jieba and re-joined with
# spaces so the tokenizer can split them on whitespace.
test_path = '../newdata/WebQA.json'
with open(test_path, 'r') as f:
    test = json.load(f)

questions = []
answers = []
for item in test:
    # Segment the question once and reuse it for every passage of this item.
    # Named `question_text` / `passage_text` rather than `q` / `a`, which are
    # already bound to the question/answer encoder models.
    question_text = ' '.join(jieba.cut(item['question']))
    for passage in item['passages']:
        passage_text = ' '.join(jieba.cut(passage['content']))
        questions.append(question_text)
        answers.append(passage_text)

In [33]:
# Load the test labels. Despite the .json extension the file is read as plain
# text, one comma-separated record per line; the integer label is the second
# field.
test_path = '../newdata/WebQA_label.json'
labels = []
with open(test_path, 'r') as label_file:
    for line in label_file:
        record = line.strip()
        if not record:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        label = int(record.split(',')[1])
        labels.append(label)

In [None]:
# Convert the segmented texts to sequences of integer word ids.
# The loaded tokenizer is bound to `_tokenizer`; the bare name `tokenizer`
# is never defined in this notebook.
sequences_q = _tokenizer.texts_to_sequences(questions)
sequences_a = _tokenizer.texts_to_sequences(answers)

# Pad or truncate every sequence to MAX_SEQUENCE_LENGTH so both inputs match
# the input_length the embedding layers were built with.
test_q = pad_sequences(sequences_q, maxlen=MAX_SEQUENCE_LENGTH)
test_a = pad_sequences(sequences_a, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', test_q.shape)
print('Shape of data tensor:', test_a.shape)

In [None]:
# One-hot encode the integer labels so they match the 2-unit softmax output.
# NOTE(review): assumes every label is 0 or 1 -- confirm against the label file.
y_true = to_categorical(np.asarray(labels), 2)

# A Keras model must be compiled before evaluate() can be called. The
# optimizer is not used during evaluation; the loss is chosen to match the
# softmax head.
# NOTE(review): confirm the loss/metrics against the training script.
_model.compile(loss='categorical_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])

_model.evaluate([test_q, test_a], y_true)