# IMDB 电影评论分类

In [11]:
import numpy as np

from keras import models
from keras import layers
from keras import optimizers
from keras import losses
from keras import metrics
from keras.datasets import imdb

In [19]:
# Vocabulary cap: passed as `num_words` to imdb.load_data and used as the
# width of the vectors built by vectorization_comments.
MAX_WORD_NUMS = 10000

In [2]:
# Load the IMDB train/test splits, passing the vocabulary cap as `num_words`.
(train_datas, train_labels), (test_datas, test_labels) = imdb.load_data(
    num_words=MAX_WORD_NUMS
)

In [10]:
# Peek at the training split: shape and first element of the reviews,
# followed by shape and first element of the labels (one item per line).
print(train_datas.shape, train_datas[0], train_labels.shape, train_labels[0], sep='\n')

(25000,)
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
(25000,)
1


In [16]:
# Largest word index that appears anywhere in the training reviews
max(map(max, train_datas))

9999

In [17]:
# Vectorize reviews (multi-hot encoding)
def vectorization_comments(comments, dimension=None):
    """Multi-hot encode a collection of word-index sequences.

    Each sequence becomes one row of zeros in which every column whose
    index occurs in the sequence is set to 1 (repeated indices still
    yield 1).

    Args:
        comments: sized iterable (list, tuple, or array) of integer index
            sequences, one per review.
        dimension: width of each output row. Defaults to the notebook-level
            MAX_WORD_NUMS when omitted, which preserves the original
            behaviour.

    Returns:
        A numpy array of shape (len(comments), dimension) holding 0s and 1s
        in numpy's default float dtype.

    Raises:
        IndexError: if a sequence contains an index >= dimension.
    """
    if dimension is None:
        dimension = MAX_WORD_NUMS

    # len() instead of .shape[0] so plain Python lists are accepted too.
    x = np.zeros((len(comments), dimension))
    for i, comment in enumerate(comments):
        # Fancy indexing: set every column listed in `comment` at once.
        x[i, comment] = 1

    return x

In [25]:
# Labels: convert to float32 arrays for training and evaluation.
y_train = np.asarray(train_labels, dtype='float32')
y_test = np.asarray(test_labels, dtype='float32')

# Features: encode every review with vectorization_comments.
x_train = vectorization_comments(train_datas)
x_test = vectorization_comments(test_datas)

In [31]:
# Baseline: no validation set is held out of the training data;
# the model is fit on all of x_train and then scored on the test set.
model = models.Sequential()
# Input width follows the vocabulary cap used when vectorizing the reviews.
model.add(layers.Dense(16, activation='relu', input_shape=(MAX_WORD_NUMS,)))
model.add(layers.Dense(16, activation='relu'))
# Single sigmoid unit for the binary label.
model.add(layers.Dense(1, activation='sigmoid'))

# NOTE(review): `lr` is the older argument spelling; newer Keras releases call
# it `learning_rate` -- confirm against the installed version before changing.
model.compile(optimizer=optimizers.RMSprop(lr=0.001),
              loss=losses.binary_crossentropy,
              metrics=[metrics.binary_accuracy])

history = model.fit(x_train, y_train, epochs=5, batch_size=512)
print('fit> ', history.history)

eval_result = model.evaluate(x_test, y_test)
print('evaluate> ', eval_result)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fit>  {'loss': [0.46256480371475217, 0.2688534903907776, 0.20628301260948181, 0.17230818962574004, 0.14920373081207275], 'binary_accuracy': [0.8240000002861023, 0.9090400002288819, 0.9278400001716613, 0.9393599998283386, 0.9485600004577637]}
evaluate>  [0.30629473545074465, 0.881]


In [33]:
# Same architecture, now with a validation set carved out of the training data
VALIDATION_SET_NUM = 10000
# First VALIDATION_SET_NUM samples are used for validation ...
x_val = x_train[:VALIDATION_SET_NUM]
y_val = y_train[:VALIDATION_SET_NUM]
# ... and the remainder is what the model is actually fit on.
x_train_ = x_train[VALIDATION_SET_NUM:]
y_train_ = y_train[VALIDATION_SET_NUM:]

model = models.Sequential()
# Input width follows the vocabulary cap used when vectorizing the reviews.
model.add(layers.Dense(16, activation='relu', input_shape=(MAX_WORD_NUMS,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# validation_data adds per-epoch validation loss/accuracy to history.history.
history = model.fit(x_train_, y_train_,
                    epochs=5, batch_size=512,
                    validation_data=(x_val, y_val))
print('fit> ', history.history)

eval_result = model.evaluate(x_test, y_test)
print('evaluate> ', eval_result)

Train on 15000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fit>  {'val_loss': [0.420804176568985, 0.3057797256946564, 0.3313155623912811, 0.2773297059059143, 0.3002026237487793], 'val_acc': [0.836499999332428, 0.8911000007629395, 0.8648000001907349, 0.8882000002861022, 0.8821000005722046], 'loss': [0.5114460351785024, 0.30675576764742535, 0.22086736146608987, 0.17301243742307026, 0.1429766508658727], 'acc': [0.7782000001271566, 0.901933333269755, 0.9309999998410543, 0.9457333330472311, 0.9557333335558573]}
evaluate>  [0.31774706316947937, 0.8716]


In [36]:
# Experiment: widen both hidden layers to 32 units
model = models.Sequential()
# Input width follows the vocabulary cap used when vectorizing the reviews.
model.add(layers.Dense(32, activation='relu', input_shape=(MAX_WORD_NUMS,)))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train_, y_train_, epochs=5, batch_size=512, validation_data=(x_val, y_val))
# Last expression: the test-set evaluation result is shown as the cell output.
model.evaluate(x_test, y_test)

Train on 15000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.3759273798465729, 0.85772]

In [37]:
# Experiment: a single hidden layer
model = models.Sequential()
# Input width follows the vocabulary cap used when vectorizing the reviews.
model.add(layers.Dense(16, activation='relu', input_shape=(MAX_WORD_NUMS,)))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train_, y_train_, epochs=5, batch_size=512, validation_data=(x_val, y_val))
# Last expression: the test-set evaluation result is shown as the cell output.
model.evaluate(x_test, y_test)

Train on 15000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.2917386550998688, 0.88328]

In [38]:
# Experiment: three hidden layers
model = models.Sequential()
# Input width follows the vocabulary cap used when vectorizing the reviews.
model.add(layers.Dense(16, activation='relu', input_shape=(MAX_WORD_NUMS,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train_, y_train_, epochs=5, batch_size=512, validation_data=(x_val, y_val))
# Last expression: the test-set evaluation result is shown as the cell output.
model.evaluate(x_test, y_test)

Train on 15000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.34428353609085083, 0.86976]

In [40]:
# Experiment: switch the loss function to mse
model = models.Sequential()
# Input width follows the vocabulary cap used when vectorizing the reviews.
model.add(layers.Dense(16, activation='relu', input_shape=(MAX_WORD_NUMS,)))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
# Only the loss differs from the validation-set baseline; the reported loss is
# therefore not on the same scale as the binary_crossentropy runs.
model.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])
model.fit(x_train_, y_train_, epochs=5, batch_size=512, validation_data=(x_val, y_val))
# Last expression: the test-set evaluation result is shown as the cell output.
model.evaluate(x_test, y_test)

Train on 15000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.09327613470554352, 0.87304]

In [41]:
# Experiment: switch the hidden-layer activation to tanh
model = models.Sequential()
# Input width follows the vocabulary cap used when vectorizing the reviews.
model.add(layers.Dense(16, activation='tanh', input_shape=(MAX_WORD_NUMS,)))
model.add(layers.Dense(16, activation='tanh'))
# The output layer keeps its sigmoid activation.
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train_, y_train_, epochs=5, batch_size=512, validation_data=(x_val, y_val))
# Last expression: the test-set evaluation result is shown as the cell output.
model.evaluate(x_test, y_test)

Train on 15000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.34445818514347076, 0.86972]

In [43]:
# Experiment: one hidden layer containing a single unit
model = models.Sequential()
# Input width follows the vocabulary cap used when vectorizing the reviews.
model.add(layers.Dense(1, activation='relu', input_shape=(MAX_WORD_NUMS,)))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x_train_, y_train_, epochs=5, batch_size=512, validation_data=(x_val, y_val))
# Last expression: the test-set evaluation result is shown as the cell output.
model.evaluate(x_test, y_test)

Train on 15000 samples, validate on 10000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.4939744903182983, 0.80756]

## 尝试了这么多模型的结构，怎么都差不多？

## 参考资料
* [Getting started with the Keras Sequential model](https://keras.io/getting-started/sequential-model-guide/)