In [1]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential,Model
from keras.layers import Dense, Embedding,Input
from keras.layers import LSTM
import numpy as np

from preprocess_util import load_data

Using TensorFlow backend.


## 数据读取与处理

In [2]:
path = 'datasets/imdb.npz'
num_words = 20000   #选取前20000个高频的词
max_len = 80    #一句句子统一80词
batch_size = 32  

print('loading dataset...')
(x_train, y_train), (x_test, y_test) = load_data(path=path, maxlen=None, seed=37, num_words=num_words)
print('x_train.shape = ',x_train.shape)
print('x_test.shape = ',x_test.shape)
print('x_train[0]: ',x_train[0])
print('y_train[0]: ',y_train[0])

loading dataset...
x_train.shape =  (25000,)
x_test.shape =  (25000,)
x_train[0]:  [1, 132, 23, 968, 716, 6, 65, 7, 35, 1233, 318, 1929, 232, 19, 6, 2779, 439, 37, 5446, 6, 292, 17, 6, 965, 8244, 7, 6, 117, 250, 11, 2713, 315, 4, 2869, 7, 2, 18, 10443, 33, 86, 240, 24, 8, 2575, 21, 95, 36, 15329, 19, 257, 85, 29, 1068, 8, 570, 2779, 525, 525, 95, 31, 251, 59, 214, 3531, 5, 556, 10, 10, 5, 29, 528, 570, 33, 233, 8, 79, 4, 1060, 10, 10, 198, 691, 4, 65, 7, 132, 23, 968, 21, 535, 49, 194, 1299, 33, 222, 6, 171, 211, 587, 4, 277, 63, 9, 307, 5, 80, 242, 97, 25, 1415, 10, 10, 198, 82, 88, 7, 4, 87, 228, 1335, 2, 1701, 19, 2751, 2, 8639, 3946, 10, 10, 21, 4, 5655, 173, 7, 4, 20, 858, 4, 155, 9, 285, 133, 9, 404, 10, 10, 86, 116, 3387, 2079, 9, 33, 27, 118, 4111, 8281, 5, 1368, 3590, 52, 17, 210, 87, 2, 3717, 5, 480, 185, 10255, 8819, 5, 198, 24, 4, 130, 7, 4, 1029, 10, 10, 95, 216, 4, 627, 63, 9, 6203, 5, 367, 19, 897, 802, 144, 28, 1199, 35, 735, 18, 252, 10, 10, 4, 65, 9, 24, 43, 6, 1060, 

In [3]:
print('Pad sequences (samples x time)')
# 一句句子多于80词剪裁，少于80就补0
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)


## 建立模型

In [5]:
x = Input(shape=(max_len,))
b = Embedding(num_words,128)(x)
lstm = LSTM(32,dropout=0.2,recurrent_dropout=0.2)(b)
output = Dense(1,activation='sigmoid')(lstm)

model = Model(inputs=x,outputs=output)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

## 喂入数据进行训练

In [6]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 80)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 80, 128)           2560000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                20608     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 2,580,641
Trainable params: 2,580,641
Non-trainable params: 0
_________________________________________________________________


In [7]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=15,
          validation_split=0.1)
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train on 22500 samples, validate on 2500 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 1.1735811022949219
Test accuracy: 0.80728
