In [1]:
# Upper bound on the number of words kept per question / answer.
MAX_SEQUENCE_LENGTH = 200
# Dimensionality of the word vectors (100-d).
EMBEDDING_DIM = 100

### 数据预处理

In [2]:
import json
import jieba

In [3]:
# Load the training set and flatten it into three parallel lists:
# one (question, passage, label) triple per passage.
train_path = 'data/train_data_sample.json'
# Decode explicitly as UTF-8: without it, open() falls back to the platform's
# locale encoding, which can fail on (or silently corrupt) the Chinese text.
with open(train_path, 'r', encoding='utf-8') as f:
    train = json.load(f)

train_q = []
train_a = []
train_y = []
for item in train:
    # Segment with jieba and re-join with spaces so each word becomes one
    # whitespace-delimited token.
    q = ' '.join(jieba.cut(item['question']))
    for passage in item['passages']:
        a = ' '.join(jieba.cut(passage['content']))
        # The question is repeated once for every passage it is paired with.
        train_q.append(q)
        train_a.append(a)
        train_y.append(passage['label'])

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.889 seconds.
Prefix dict has been built succesfully.


In [4]:
# Build a passage-id -> label lookup from the sample submission file, where
# each line is read as "<id>,<label>".
id2label = {}
sample_path = 'data/submit_sample.txt'
with open(sample_path, encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        pro = line.split(',')
        # Strip the label so it no longer carries the line's trailing '\n'.
        # It is kept as a string, as before.
        id2label[int(pro[0])] = pro[1].strip()

# load test data
test_path = 'data/test_data_sample.json'
# Decode explicitly as UTF-8 rather than relying on the platform's locale.
with open(test_path, 'r', encoding='utf-8') as f:
    test = json.load(f)

# Flatten the test set into parallel lists, one entry per passage.
test_q = []
test_a = []
test_y = []
test_id = []
for item in test:
    # Segment with jieba and re-join with spaces so each word becomes one
    # whitespace-delimited token.
    q = ' '.join(jieba.cut(item['question']))
    for passage in item['passages']:
        a = ' '.join(jieba.cut(passage['content']))
        test_q.append(q)
        test_a.append(a)
        # Named passage_id (not `id`) to avoid shadowing the builtin.
        passage_id = passage['passage_id']
        test_id.append(passage_id)
        # Raises KeyError if the submission file has no entry for this passage.
        test_y.append(id2label[passage_id])

In [5]:
# https://keras-cn-docs.readthedocs.io/zh_CN/latest/blog/word_embedding/
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit a single tokenizer on every question and passage (train and test
# together) so all four corpora share one word index.
texts = train_q + train_a + test_q + test_a
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Convert each corpus into lists of integer word ids.
sequences_train_q, sequences_train_a, sequences_test_q, sequences_test_a = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (train_q, train_a, test_q, test_a)
)

# Pad / truncate every sequence to exactly MAX_SEQUENCE_LENGTH ids.
data_train_q, data_train_a, data_test_q, data_test_a = (
    pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    for seqs in (sequences_train_q, sequences_train_a,
                 sequences_test_q, sequences_test_a)
)

# Report shapes in the order: train_q, train_a, test_q, test_a.
for padded in (data_train_q, data_train_a, data_test_q, data_test_a):
    print('Shape of data tensor:', padded.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Found 220464 unique tokens.
Shape of data tensor: (80084, 200)
Shape of data tensor: (80084, 200)
Shape of data tensor: (1586, 200)
Shape of data tensor: (1586, 200)


### 处理词向量

In [6]:
# Parse the pre-trained word vectors into a {word: float32 vector} lookup.
# Each line is read as a token followed by its whitespace-separated coefficients.
embeddings_index = {}
# Decode explicitly as UTF-8 so the (Chinese) tokens do not depend on the
# platform's locale encoding.
with open('../word2vec/wiki.vector', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Keep only well-formed rows: one token plus exactly EMBEDDING_DIM
        # coefficients. This drops blank lines and rows of the wrong width,
        # which would otherwise be stored as short vectors and could later be
        # broadcast across a whole embedding row.
        # NOTE(review): word2vec text files usually start with a
        # "<vocab_size> <dim>" header - confirm that is the case here.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 782241 word vectors.


In [7]:
# Row r of the matrix holds the pre-trained vector for the word whose
# tokenizer index is r. One extra row is allocated beyond len(word_index);
# row 0 is never written by the loop below and stays all-zeros.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        # No pre-trained vector for this word: leave its row all-zeros.
        continue
    embedding_matrix[row] = pretrained

### 保存

In [8]:
# Persist the fitted tokenizer so the same word index can be reloaded later.
import pickle
token_path = 'model/tokenizer.pkl'
# Use a context manager so the file is flushed and closed deterministically;
# passing a bare open(...) to pickle.dump leaves the handle open.
with open(token_path, 'wb') as f:
    pickle.dump(tokenizer, f)

# Persist the padded inputs, labels, passage ids and the embedding matrix as
# .npy files. train_y / test_y / test_id are plain Python lists, which np.save
# converts to arrays on write.
import numpy as np
np.save('data/train_q.npy', data_train_q)
np.save('data/train_a.npy', data_train_a)
np.save('data/train_y.npy', train_y)
np.save('data/test_q.npy', data_test_q)
np.save('data/test_a.npy', data_test_a)
np.save('data/test_y.npy', test_y)
np.save('data/test_id.npy', test_id)
np.save('data/embedding_matrix.npy', embedding_matrix)