In [1]:
import numpy as np
import pickle
import json
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
MAX_SEQUENCE_LENGTH = 200 # upper limit of 200 words per question/answer
MAX_NB_WORDS = 20000 # upper limit of 20000 words in the dictionary
EMBEDDING_DIM = 100 # 100-dimensional word vectors

In [3]:
# Load the pickled tokenizer from disk.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only point token_path at a file you trust.
token_path = '../main/model/tokenizer.pkl'
# Use a context manager so the file handle is closed as soon as loading ends.
with open(token_path, 'rb') as token_file:
    tokenizer = pickle.load(token_file)

# Mapping of word -> integer index; used to size and fill the embedding matrix.
word_index = tokenizer.word_index

In [4]:
# Parse the pretrained word vectors into a dict of word -> float32 vector.
# Every usable line has the form "<word> <v1> ... <vN>" with N == EMBEDDING_DIM.
embeddings_index = {}
expected_fields = EMBEDDING_DIM + 1  # the word itself plus one value per dimension

# Read as UTF-8 explicitly so the result does not depend on the platform's
# default encoding (assumes the vector file is UTF-8 encoded -- TODO confirm).
with open('../word2vec/wiki.vector', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Skip blank lines, a "<count> <dim>" style header, and any other row
        # that does not carry exactly EMBEDDING_DIM values: a wrong-length
        # vector would be broadcast or rejected when it is later copied into
        # the embedding matrix.
        if len(values) != expected_fields:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

Found 782241 word vectors.


In [5]:
# Build the lookup table handed to the Embedding layers: row `row` holds the
# pretrained vector of the word whose tokenizer index is `row`.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pretrained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[row] = vector

In [6]:
# Load the test set and flatten it into one (question, passage) pair per passage.
test_path = '../newdata/WebQA.json'
# Decode as UTF-8 explicitly instead of relying on the platform default
# encoding (assumes the JSON file is UTF-8 encoded -- TODO confirm).
with open(test_path, 'r', encoding='utf-8') as f:
    test = json.load(f)

# Three parallel lists: entry k of each list describes the same pair.
questions = []
answers = []
ids = []
for item in test:
    # Segment the question once and reuse it for each of its passages.
    # jieba's tokens are joined with spaces into a single string.
    q = ' '.join(jieba.cut(item['question']))
    for passage in item['passages']:
        questions.append(q)
        answers.append(' '.join(jieba.cut(passage['content'])))
        ids.append(passage['passage_id'])

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.707 seconds.
Prefix dict has been built succesfully.


In [7]:
# Convert the segmented texts to integer sequences with the loaded tokenizer.
sequences_q, sequences_a = (
    tokenizer.texts_to_sequences(texts) for texts in (questions, answers)
)

# Pad/truncate every sequence to MAX_SEQUENCE_LENGTH entries.
test_q, test_a = (
    pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    for seqs in (sequences_q, sequences_a)
)

# Report the question tensor shape first, then the answer tensor shape.
for tensor in (test_q, test_a):
    print('Shape of data tensor:', tensor.shape)

Shape of data tensor: (448444, 200)
Shape of data tensor: (448444, 200)


In [8]:
from keras.models import Sequential
from keras.layers import BatchNormalization, Dense, Dropout, Merge, LSTM, Reshape, Flatten, Convolution1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional

# Number of units given to every LSTM layer created by the get_* builders below.
QA_EMBED_SIZE = 64
# Rate passed to the LSTM dropout/recurrent_dropout arguments and to the
# Dropout layers that follow the merge in each builder.
DROPOUT_RATE = 0.3

def get_lstm():
    """Build the (uncompiled) two-branch LSTM question/answer model.

    The question and the answer each get their own Embedding + LSTM stack
    (no pretrained embedding weights are passed). The two LSTM outputs are
    concatenated and fed through BatchNormalization -> Dropout ->
    Dense(64, relu) -> BatchNormalization -> Dropout -> Dense(1, sigmoid).

    Returns:
        A ``Sequential`` model whose first layer merges the question branch
        and the answer branch, in that order.
    """
    vocab_size = len(word_index) + 1

    def build_encoder():
        # One independent Embedding + LSTM stack; only the last LSTM output
        # is returned (return_sequences=False).
        encoder = Sequential()
        encoder.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
        encoder.add(LSTM(QA_EMBED_SIZE,
                         return_sequences=False,
                         dropout=DROPOUT_RATE,
                         recurrent_dropout=DROPOUT_RATE))
        return encoder

    # Question branch is created first, answer branch second.
    question_encoder = build_encoder()
    answer_encoder = build_encoder()

    classifier = Sequential()
    head_layers = (
        Merge([question_encoder, answer_encoder], mode="concat"),
        BatchNormalization(),
        Dropout(DROPOUT_RATE),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(DROPOUT_RATE),
        Dense(1, activation="sigmoid"),
    )
    for layer in head_layers:
        classifier.add(layer)
    return classifier

def get_blstm():
    """Build the (uncompiled) two-branch bidirectional-LSTM model.

    Same layout as a plain Embedding + LSTM pair of branches, except that
    each LSTM is wrapped in ``Bidirectional`` with ``merge_mode="sum"``.
    No pretrained embedding weights are passed to the Embedding layers.
    The branch outputs are concatenated and fed through
    BatchNormalization -> Dropout -> Dense(64, relu) ->
    BatchNormalization -> Dropout -> Dense(1, sigmoid).

    Returns:
        A ``Sequential`` model whose first layer merges the question branch
        and the answer branch, in that order.
    """
    vocab_size = len(word_index) + 1

    def build_encoder():
        # One independent Embedding + Bidirectional(LSTM) stack.
        encoder = Sequential()
        encoder.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
        recurrent = LSTM(QA_EMBED_SIZE,
                         return_sequences=False,
                         dropout=DROPOUT_RATE,
                         recurrent_dropout=DROPOUT_RATE)
        encoder.add(Bidirectional(recurrent, merge_mode="sum"))
        return encoder

    # Question branch is created first, answer branch second.
    question_encoder = build_encoder()
    answer_encoder = build_encoder()

    classifier = Sequential()
    head_layers = (
        Merge([question_encoder, answer_encoder], mode="concat"),
        BatchNormalization(),
        Dropout(DROPOUT_RATE),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(DROPOUT_RATE),
        Dense(1, activation="sigmoid"),
    )
    for layer in head_layers:
        classifier.add(layer)
    return classifier

def get_blstm_word2vec():
    """Build the (uncompiled) bidirectional-LSTM model with fixed embeddings.

    Both branches use an Embedding layer initialised from the module-level
    ``embedding_matrix`` and marked ``trainable=False``, followed by a
    ``Bidirectional`` LSTM with ``merge_mode="sum"``. The branch outputs are
    concatenated and fed through BatchNormalization -> Dropout ->
    Dense(64, relu) -> BatchNormalization -> Dropout -> Dense(1, sigmoid).

    Returns:
        A ``Sequential`` model whose first layer merges the question branch
        and the answer branch, in that order.
    """
    vocab_size = len(word_index) + 1

    def build_encoder():
        # Frozen pretrained embedding followed by a bidirectional LSTM that
        # returns only its final output.
        encoder = Sequential()
        frozen_embedding = Embedding(vocab_size,
                                     EMBEDDING_DIM,
                                     weights=[embedding_matrix],
                                     input_length=MAX_SEQUENCE_LENGTH,
                                     trainable=False)
        encoder.add(frozen_embedding)
        recurrent = LSTM(QA_EMBED_SIZE,
                         return_sequences=False,
                         dropout=DROPOUT_RATE,
                         recurrent_dropout=DROPOUT_RATE)
        encoder.add(Bidirectional(recurrent, merge_mode="sum"))
        return encoder

    # Question branch is created first, answer branch second.
    question_encoder = build_encoder()
    answer_encoder = build_encoder()

    classifier = Sequential()
    head_layers = (
        Merge([question_encoder, answer_encoder], mode="concat"),
        BatchNormalization(),
        Dropout(DROPOUT_RATE),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(DROPOUT_RATE),
        Dense(1, activation="sigmoid"),
    )
    for layer in head_layers:
        classifier.add(layer)
    return classifier

def get_blstm_word2vec_cnn():
    """Build the (uncompiled) BLSTM + 1-D convolution model with fixed embeddings.

    Each branch is: Embedding initialised from ``embedding_matrix`` with
    ``trainable=False`` -> ``Bidirectional`` LSTM returning the full
    sequence (``merge_mode="sum"``) -> Convolution1D(128 filters, kernel 3,
    'valid', relu) -> MaxPooling1D(4) -> Dropout(0.2) -> Flatten ->
    Dense(300). The two branch outputs are concatenated and fed through
    BatchNormalization -> Dropout -> Dense(64, relu) ->
    BatchNormalization -> Dropout -> Dense(1, sigmoid).

    Returns:
        A ``Sequential`` model whose first layer merges the question branch
        and the answer branch, in that order.
    """
    vocab_size = len(word_index) + 1

    def build_encoder():
        # Layers are instantiated in the same order in which they are stacked.
        encoder = Sequential()
        encoder_layers = (
            Embedding(vocab_size,
                      EMBEDDING_DIM,
                      weights=[embedding_matrix],
                      input_length=MAX_SEQUENCE_LENGTH,
                      trainable=False),
            Bidirectional(LSTM(QA_EMBED_SIZE,
                               return_sequences=True,
                               dropout=DROPOUT_RATE,
                               recurrent_dropout=DROPOUT_RATE),
                          merge_mode="sum"),
            Convolution1D(filters=128, kernel_size=3, padding='valid', activation='relu'),
            MaxPooling1D(4),
            Dropout(0.2),
            Flatten(),
            Dense(300),
        )
        for layer in encoder_layers:
            encoder.add(layer)
        return encoder

    # Question branch is created first, answer branch second.
    question_encoder = build_encoder()
    answer_encoder = build_encoder()

    classifier = Sequential()
    head_layers = (
        Merge([question_encoder, answer_encoder], mode="concat"),
        BatchNormalization(),
        Dropout(DROPOUT_RATE),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(DROPOUT_RATE),
        Dense(1, activation="sigmoid"),
    )
    for layer in head_layers:
        classifier.add(layer)
    return classifier

def get_blstm_word2vec_att():
    """Build the (uncompiled) BLSTM model with an attention-style sub-network.

    Both branches are: Embedding initialised from ``embedding_matrix`` with
    ``trainable=False`` -> ``Bidirectional`` LSTM returning the full
    sequence (``merge_mode="sum"``).

    The attention sub-network merges the two branches with ``mode="dot"``
    over ``dot_axes=[1, 1]``, flattens the result, applies a Dense layer of
    ``MAX_SEQUENCE_LENGTH * QA_EMBED_SIZE`` units and reshapes it to
    ``(MAX_SEQUENCE_LENGTH, QA_EMBED_SIZE)``.

    The final model concatenates the question branch with the attention
    sub-network, flattens, and applies BatchNormalization -> Dropout ->
    Dense(64, relu) -> BatchNormalization -> Dropout -> Dense(1, sigmoid).

    Returns:
        A ``Sequential`` model built on the question and answer branches.
    """
    vocab_size = len(word_index) + 1

    def build_sequence_encoder():
        # Frozen pretrained embedding + bidirectional LSTM that keeps the
        # output of every timestep (return_sequences=True).
        encoder = Sequential()
        encoder.add(Embedding(vocab_size,
                              EMBEDDING_DIM,
                              weights=[embedding_matrix],
                              input_length=MAX_SEQUENCE_LENGTH,
                              trainable=False))
        recurrent = LSTM(QA_EMBED_SIZE,
                         return_sequences=True,
                         dropout=DROPOUT_RATE,
                         recurrent_dropout=DROPOUT_RATE)
        encoder.add(Bidirectional(recurrent, merge_mode="sum"))
        return encoder

    # Question branch is created first, answer branch second.
    question_seq = build_sequence_encoder()
    answer_seq = build_sequence_encoder()

    # attention model
    attention = Sequential()
    attention_layers = (
        Merge([question_seq, answer_seq], mode="dot", dot_axes=[1, 1]),
        Flatten(),
        Dense((MAX_SEQUENCE_LENGTH * QA_EMBED_SIZE)),
        Reshape((MAX_SEQUENCE_LENGTH, QA_EMBED_SIZE)),
    )
    for layer in attention_layers:
        attention.add(layer)

    # The question branch is reused here, next to the attention output.
    classifier = Sequential()
    head_layers = (
        Merge([question_seq, attention], mode="concat"),
        Flatten(),  # flatten
        BatchNormalization(),
        Dropout(DROPOUT_RATE),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(DROPOUT_RATE),
        Dense(1, activation="sigmoid"),
    )
    for layer in head_layers:
        classifier.add(layer)
    return classifier

In [9]:
# Rebuild the plain LSTM network and restore its saved weights.
model_lstm = get_lstm()
model_lstm.load_weights('../main/model/model-lstm.h5')
model_lstm.compile(optimizer='adam',
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

# Score every (question, passage) pair of the test set.
predicts = model_lstm.predict([test_q, test_a], batch_size=64, verbose=1)

# Write one "<passage id>,<score>" line per pair, in input order.
with open('predict/predict-lstm.txt', 'w') as f:
    f.writelines('%d,%s\n' % (passage_id, scores[0])
                 for passage_id, scores in zip(ids, predicts))





In [10]:
# load model
# 'model-blstm.h5' is the checkpoint named after the plain bidirectional LSTM,
# so build the network with get_blstm() rather than get_blstm_word2vec().
# Both builders stack the same layer types with the same sizes; get_blstm()
# simply does not need embedding_matrix, and the weights come from the file.
model_blstm = get_blstm()
model_blstm.load_weights('../main/model/model-blstm.h5')
model_blstm.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

# Score every (question, passage) pair of the test set.
predicts = model_blstm.predict([test_q, test_a], batch_size=64, verbose=1)

# Write one "<passage id>,<score>" line per pair, in input order.
with open('predict/predict-blstm.txt', 'w') as f:
    for passage_id, scores in zip(ids, predicts):
        f.write('%d,%s\n' % (passage_id, scores[0]))





In [11]:
# Rebuild the BLSTM network with fixed word2vec embeddings and restore its
# saved weights.
model_blstm_word2vec = get_blstm_word2vec()
model_blstm_word2vec.load_weights('../main/model/model-blstm-word2vec.h5')
model_blstm_word2vec.compile(optimizer='adam',
                             loss='binary_crossentropy',
                             metrics=['accuracy'])

# Score every (question, passage) pair of the test set.
predicts = model_blstm_word2vec.predict([test_q, test_a], batch_size=64, verbose=1)

# Write one "<passage id>,<score>" line per pair, in input order.
with open('predict/predict-blstm-word2vec.txt', 'w') as f:
    f.writelines('%d,%s\n' % (passage_id, scores[0])
                 for passage_id, scores in zip(ids, predicts))





In [12]:
# Rebuild the BLSTM + convolution network and restore its saved weights.
model_blstm_word2vec_cnn = get_blstm_word2vec_cnn()
model_blstm_word2vec_cnn.load_weights('../main/model/model-blstm-word2vec-cnn.h5')
model_blstm_word2vec_cnn.compile(optimizer='adam',
                                 loss='binary_crossentropy',
                                 metrics=['accuracy'])

# Score every (question, passage) pair of the test set.
predicts = model_blstm_word2vec_cnn.predict([test_q, test_a], batch_size=64, verbose=1)

# Write one "<passage id>,<score>" line per pair, in input order.
with open('predict/predict-blstm-word2vec-cnn.txt', 'w') as f:
    f.writelines('%d,%s\n' % (passage_id, scores[0])
                 for passage_id, scores in zip(ids, predicts))





In [13]:
# Rebuild the BLSTM + attention network and restore its saved weights.
model_blstm_word2vec_att = get_blstm_word2vec_att()
model_blstm_word2vec_att.load_weights('../main/model/model-blstm-word2vec-att.h5')
model_blstm_word2vec_att.compile(optimizer='adam',
                                 loss='binary_crossentropy',
                                 metrics=['accuracy'])

# Score every (question, passage) pair of the test set.
predicts = model_blstm_word2vec_att.predict([test_q, test_a], batch_size=64, verbose=1)

# Write one "<passage id>,<score>" line per pair, in input order.
with open('predict/predict-blstm-word2vec-att.txt', 'w') as f:
    f.writelines('%d,%s\n' % (passage_id, scores[0])
                 for passage_id, scores in zip(ids, predicts))



