## DNN_for_textClassification 

In [9]:
# Binary sentiment classification: predict from the text of an IMDB movie
# review whether the reviewer liked the movie.

import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

# Record the library versions this notebook was run with. The recorded cell
# output lists both the TensorFlow version and the bundled Keras version, so
# print both to keep a fresh run consistent with it.
print(tf.__version__)
print(keras.__version__)

1.12.0
2.1.6-tf


In [2]:
# Load the IMDB review dataset that ships with Keras, limiting the vocabulary
# to 10000 word indices (num_words). load_data returns a (data, labels) pair
# for the training split and another one for the test split.
imdb = keras.datasets.imdb
train_split, test_split = imdb.load_data(num_words=10000)
train_data, train_label = train_split
test_data, test_label = test_split

In [3]:
# Each review (e.g. train_data[0]) is stored as a sequence of integers.
# Build the word -> index table and its inverse so that a review can be
# converted back into readable text.
raw_word_index = imdb.get_word_index()

# Shift every word index up by 3 to free indices 0-3 for the special tokens.
word_index = {word: idx + 3 for word, idx in raw_word_index.items()}
word_index.update({
    "<PAD>": 0,
    "<START>": 1,
    "<UNK>": 2,  # unknown
    "<UNUSED>": 3,
})

# Inverse lookup: integer index -> word.
reverse_word_index = {idx: word for word, idx in word_index.items()}


def decode_text(text):
    """Convert a sequence of word indices back into a space-separated string.

    Each index is looked up in the module-level ``reverse_word_index``
    mapping; indices that are not present are rendered as ``'?'``.
    """
    words = (reverse_word_index.get(token, '?') for token in text)
    return ' '.join(words)

### 准备数据 

In [4]:
# Bring every review to a fixed length of 256 tokens so the reviews can be
# stacked into a single 2-D array. Short reviews are filled with the <PAD>
# index at the end ('post'); pad_sequences also cuts reviews longer than
# maxlen down to that length.
pad_options = dict(value=word_index["<PAD>"], padding='post', maxlen=256)

train_data = keras.preprocessing.sequence.pad_sequences(train_data, **pad_options)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **pad_options)


### 构建模型 

In [10]:
vocab_size = 10000

# Layer stack, in order:
#   Embedding              - one 16-dimensional vector per word index
#   GlobalAveragePooling1D - averages over the sequence axis, giving a
#                            fixed-size 16-element vector per review
#   Dense(16, relu)        - hidden layer
#   Dense(1, sigmoid)      - single output unit for the binary label
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Training objective: Adam optimizer with its default settings, binary
# cross-entropy loss (the model has a single sigmoid output), and accuracy
# reported as the metric.
adam = tf.train.AdamOptimizer()
model.compile(loss="binary_crossentropy", optimizer=adam, metrics=["accuracy"])

### 创建验证集

In [14]:
# The test set should ideally be used only once, for the final evaluation,
# so hold out the first 10000 training samples as a validation set instead.
#
# NOTE: despite the "_val" suffix, par_val / par_label are the remaining
# ("partial") training samples that get passed to model.fit as training data;
# x_val / x_label are the validation inputs and labels.
x_val, par_val = train_data[:10000], train_data[10000:]
x_label, par_label = train_label[:10000], train_label[10000:]

### 训练模型 

In [15]:
# Train for 40 epochs in mini-batches of 512 samples, using the held-out
# validation split to monitor loss and accuracy on data the model is not
# trained on.
print(x_val.shape)
history = model.fit(
    x=par_val,
    y=par_label,
    batch_size=512,
    epochs=40,
    verbose=1,
    validation_data=(x_val, x_label),
)

(10000, 256)
Train on 15000 samples, validate on 10000 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


### 评估模型 

In [17]:
# Final evaluation on the test set; the result unpacks into the loss followed
# by the accuracy metric configured at compile time.
test_metrics = model.evaluate(test_data, test_label)
loss, acc = test_metrics
print(loss, acc)

0.334717641658783 0.87252


### 创建准确率和损失随时间变化的图 

In [18]:
# model.fit() returns a History object; its .history attribute is a dict with
# one list of per-epoch values for each tracked quantity. Display the keys to
# see which quantities were recorded.
history_dict = history.history
history_dict.keys()

dict_keys(['val_loss', 'val_acc', 'loss', 'acc'])

In [None]:
# Plot training vs. validation loss (left) and accuracy (right) per epoch.
#
# TF 1.x Keras records accuracy under 'acc' / 'val_acc'; newer releases use
# 'accuracy' / 'val_accuracy', so accept either spelling.
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'

# NOTE: these names rebind the scalar `loss` / `acc` returned by
# model.evaluate in the evaluation cell; here they are per-epoch lists.
loss = history_dict['loss']
acc = history_dict[acc_key]
val_loss = history_dict['val_loss']
val_acc = history_dict['val_' + acc_key]

# Epochs are numbered from 1 on the x axis.
epochs = range(1, len(loss) + 1)

fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Dots = training curve, solid line = validation curve.
loss_ax.plot(epochs, loss, 'bo', label="Training loss")
loss_ax.plot(epochs, val_loss, 'b-', label="Validation loss")
loss_ax.set_title("Training and validation loss")
loss_ax.set_xlabel("epochs")
loss_ax.set_ylabel("loss")
loss_ax.legend()

acc_ax.plot(epochs, acc, 'bo', label="Training accuracy")
acc_ax.plot(epochs, val_acc, 'b-', label="Validation accuracy")
acc_ax.set_title("Training and validation accuracy")
acc_ax.set_xlabel("epochs")
acc_ax.set_ylabel("accuracy")
acc_ax.legend()

# Keep the two panels' labels from overlapping, then render the figure.
fig.tight_layout()
plt.show()