In [1]:
import numpy 
# Record the NumPy version this notebook was executed with
print(numpy.__version__)

1.14.6


In [2]:
import tensorflow
# Record the TensorFlow version this notebook was executed with
print(tensorflow.__version__)

1.11.0


In [0]:
import numpy as np

# 定义加载数据的函数
def loadData(reviews_path='reviews.txt', labels_path='labels.txt'):
    """Read the raw review corpus and its sentiment labels from disk.

    Args:
        reviews_path: Path of the text file holding the reviews. Defaults to
            'reviews.txt' in the current working directory.
        labels_path: Path of the text file holding the labels. Defaults to
            'labels.txt' in the current working directory.

    Returns:
        A ``(reviews, labels)`` tuple holding the complete, unmodified
        contents of the two files as strings.

    Raises:
        OSError: If either file cannot be opened or read.
    """
    def read_text(path):
        # The context manager closes the file even if read() fails
        with open(path, 'r') as f:
            return f.read()

    return read_text(reviews_path), read_text(labels_path)

# Load the raw review text and the raw label text into memory
reviews, labels = loadData()

In [4]:
# Peek at the first 150 characters of the raw review text
reviews[:150]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching '

In [5]:
# Peek at the first 150 characters of the raw label text
labels[:150]

'positive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositive\nnegative\npositi'

In [0]:
from string import punctuation 

# 定义数据预处理函数
def dataPreprocess(reviews_str):
    """Strip punctuation from the raw corpus and tokenize it.

    Args:
        reviews_str: The whole corpus as one string, one review per line.

    Returns:
        A ``(review_list, all_text, words)`` tuple:
        review_list -- the punctuation-free text split on newlines, one
            string per input line;
        all_text -- those strings joined back together with single spaces;
        words -- every whitespace-separated token of ``all_text``.
    """
    # Delete every character listed in string.punctuation in a single pass
    drop_punctuation = str.maketrans('', '', punctuation)
    cleaned = reviews_str.translate(drop_punctuation)

    # Each line of the corpus is one review
    review_list = cleaned.split('\n')

    # Flatten the reviews into one space-separated string
    all_text = ' '.join(review_list)

    # split() without arguments splits on runs of whitespace
    return review_list, all_text, all_text.split()
  
# Run the preprocessing.
# NOTE: this rebinds `reviews` from the raw corpus string to a list of review
# strings, so re-running this cell without re-running the loading cell passes
# a list (not the raw text) into dataPreprocess.
reviews, all_text, words = dataPreprocess(reviews)

In [7]:
# Show the first two punctuation-free reviews
reviews[:2]

['bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   ',
 'story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  viol

In [8]:
# Show the first 20 tokens (words)
words[:20]

['bromwell',
 'high',
 'is',
 'a',
 'cartoon',
 'comedy',
 'it',
 'ran',
 'at',
 'the',
 'same',
 'time',
 'as',
 'some',
 'other',
 'programs',
 'about',
 'school',
 'life',
 'such']

In [9]:
# Show the first 150 characters of the flattened corpus
all_text[:150]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching pr'

In [0]:
# 单词编码
from collections import Counter

# Tally how often every word occurs in the corpus
word_counter = Counter(words)

# Vocabulary ordered from the most to the least frequent word.
# most_common() sorts by count in descending order; words with equal counts
# keep their first-seen order.
sorted_vocab = [word for word, _ in word_counter.most_common()]

In [0]:
# 定义显示前10个单词以及它的重复个数的函数
def showTop10Item(dict_obj):
    """Print the first ten entries of a mapping as ``key:value`` lines.

    Entries are taken in the mapping's own iteration order; nothing is
    sorted here. Mappings with fewer than ten entries are printed in full.

    Args:
        dict_obj: A mapping exposing ``items()``.
    """
    # zip() stops as soon as range(10) is exhausted, capping output at 10 lines
    for _, (key, value) in zip(range(10), dict_obj.items()):
        print("{}:{}".format(key, value))

In [12]:
# Show the first ten words of word_counter (in insertion order) with their counts
showTop10Item(word_counter)

bromwell:8
high:2161
is:107328
a:163009
cartoon:545
comedy:3246
it:96352
ran:238
at:23513
the:336713


In [13]:
# The 15 most frequent words with their occurrence counts, most frequent first
word_counter.most_common(15)

[('the', 336713),
 ('and', 164107),
 ('a', 163009),
 ('of', 145864),
 ('to', 135720),
 ('is', 107328),
 ('br', 101872),
 ('it', 96352),
 ('in', 93968),
 ('i', 87623),
 ('this', 76000),
 ('that', 73245),
 ('s', 65361),
 ('was', 48208),
 ('as', 46933)]

In [14]:
# First 15 words of the sorted vocabulary; same order as most_common(15) above
sorted_vocab[:15]

['the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'br',
 'it',
 'in',
 'i',
 'this',
 'that',
 's',
 'was',
 'as']

In [15]:
# Map every word to an integer id, most frequent word first. Ids start at 1,
# which leaves 0 unused by any word (the feature matrix is zero-padded).
vocab_to_int = dict(zip(sorted_vocab, range(1, len(sorted_vocab) + 1)))

# Show the first 10 words together with their ids.
# NOTE(review): the output recorded below starts at 0, which does not match
# the 1-based ids produced here -- it looks stale; re-run to refresh it.
showTop10Item(vocab_to_int)

the:0
and:1
a:2
of:3
to:4
is:5
br:6
it:7
in:8
i:9


In [0]:
# Encode every review as the list of integer ids of its words, so each review
# is now a sequence of numbers instead of a string
reviews_ints = [
    [vocab_to_int[word] for word in review.split()]
    for review in reviews
]

In [17]:
# Show the first encoded review
print(reviews_ints[:1])

[[21024, 307, 5, 2, 1049, 206, 7, 2137, 31, 0, 170, 56, 14, 48, 80, 5784, 43, 381, 109, 139, 14, 5193, 59, 153, 8, 0, 4974, 5851, 474, 70, 4, 259, 11, 21024, 307, 12, 1977, 5, 73, 2394, 4, 612, 72, 5, 5193, 0, 24102, 4, 1982, 10165, 0, 5785, 1498, 35, 50, 65, 203, 144, 66, 1198, 5193, 19868, 0, 37441, 3, 0, 220, 882, 30, 2987, 70, 3, 0, 5786, 9, 685, 1, 66, 1498, 53, 9, 215, 0, 382, 8, 61, 2, 1405, 3685, 782, 4, 3482, 179, 0, 381, 9, 1211, 13582, 31, 307, 2, 348, 340, 2912, 9, 142, 126, 4, 7689, 29, 3, 128, 5193, 1405, 2325, 4, 21024, 307, 9, 527, 11, 108, 1447, 3, 59, 542, 101, 11, 21024, 307, 5, 226, 4145, 47, 2, 2210, 11, 7, 214, 22]]


In [18]:
# Number of encoded reviews (empty ones have not been filtered out yet)
len(reviews_ints)

25001

In [19]:
# Label encoding: 'positive' -> 1, every other line (including 'negative'
# and blank lines) -> 0.
# NOTE: `labels` is rebound from the raw string to an integer array, so this
# cell cannot be re-run without re-running the loading cell first.
labels = np.array([int(label == 'positive') for label in labels.split('\n')])

# First 10 encoded labels
labels[:10]

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0])

In [20]:
from collections import Counter

# Histogram of review lengths: {number of words in a review: how many reviews}
review_lens = Counter(len(x) for x in reviews_ints)
# review_lens[0] is the NUMBER of zero-length reviews, not the shortest
# length, so it is labelled as such (the old label called it the minimum).
print("长度为0的评论个数是: {}".format(review_lens[0]))
# The largest key of the histogram is the longest review length
print("评论的最大长度是: {}".format(max(review_lens)))

评论的最小长度是: 1
评论的最大长度是: 2514


In [21]:
# Positions of the reviews that contain at least one word
non_zero_idx = [idx for idx, encoded in enumerate(reviews_ints) if encoded]

# How many reviews remain once the empty ones are dropped
print(len(non_zero_idx))

25000


In [0]:
# Keep only the non-empty reviews selected by non_zero_idx ...
reviews_ints = [reviews_ints[idx] for idx in non_zero_idx]

# ... and the labels at the same positions, so reviews and labels stay aligned
labels = np.array([labels[idx] for idx in non_zero_idx])

In [0]:
# Build the feature matrix that is fed to the network from reviews_ints:
# one row per review, exactly seq_len word ids per row. Reviews shorter than
# seq_len are left-padded with 0; longer reviews keep only their first
# seq_len ids.

# Fixed number of word ids per review
seq_len = 200

# Start from all zeros so that untouched leading cells act as padding
features = np.zeros((len(reviews_ints), seq_len), dtype=int)

for i, row in enumerate(reviews_ints):
    # Keep at most the first seq_len ids of the review
    clipped = row[:seq_len]
    # Right-align the ids in the row. Computing the start column explicitly
    # avoids relying on slice clamping for long reviews, and an empty review
    # simply leaves the row all zeros instead of raising ValueError.
    features[i, seq_len - len(clipped):] = clipped

In [24]:
# Show the first feature row
features[0:1]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 21024,   307,     5,
            2,  1049,   206,     7,  2137,    31,     0,   170,    56,
           14,    48,    80,  5784,    43,   381,   109,   139,    14,
         5193,    59,   153,     8,     0,  4974,  5851,   474,    70,
            4,   259,    11, 21024,   307,    12,  1977,     5,    73,
         2394,     4,   612,    72,     5,  5193,     0, 24102,     4,
         1982, 10165,     0,  5785,  1498,    35,    50,    65,   203,
          144,    66,  1198,  5193, 19868,     0, 37441,     3,     0,
      

In [25]:
# (number of reviews, seq_len)
features.shape

(25000, 200)

In [26]:
# Split the data into training, validation and test sets

# Fraction of the samples used for training
split_train_ratio = 0.8

# Number of samples overall and in the training portion
features_len = len(features)
train_len = int(features_len * split_train_ratio)

# The first train_len samples train the model ...
train_x, train_y = features[:train_len], labels[:train_len]
# ... and the remainder is held out
val_x, val_y = features[train_len:], labels[train_len:]

# Half of the held-out samples
val_x_half_len = len(val_x) // 2

# The second half of the held-out samples becomes the test set,
# the first half stays as the validation set
test_x, test_y = val_x[val_x_half_len:], val_y[val_x_half_len:]
val_x, val_y = val_x[:val_x_half_len], val_y[:val_x_half_len]

# Report the resulting shapes
print("\t\t\tFeature Shapes:")
shape_report = (
    "Train set: \t\t{}".format(train_x.shape),
    "\nValidation set: \t{}".format(val_x.shape),
    "\nTest set: \t\t{}".format(test_x.shape),
)
print(*shape_report)

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


In [0]:
# Hyperparameters

# Number of units passed to each BasicLSTMCell
lstm_size = 256
# Number of LSTM cells stacked in the MultiRNNCell
lstm_layers = 2
# Samples per batch; also the batch size the RNN zero state is built with
batch_size = 512
# Learning rate handed to the Adam optimizer
learning_rate = 0.01

In [34]:
import tensorflow as tf

# Number of rows in the embedding table: one per vocabulary word plus one
# extra row for id 0 (word ids start at 1)
n_words = len(vocab_to_int) + 1

# Reset the default graph so this cell builds into a fresh graph
tf.reset_default_graph()

# Group the input placeholders under the "inputs" name scope
with tf.name_scope('inputs'):
    # Placeholder for the word-id features
    inputs_ = tf.placeholder(tf.int32, [None, None], name="inputs")
    # Placeholder for the target labels
    labels_ = tf.placeholder(tf.int32, [None, None], name="labels")
    # Placeholder for the dropout keep probability
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
    
    
# Length of each word's embedding vector
embed_size = 300 

# Group the embedding variable and the lookup under the "Embeddings" name scope
with tf.name_scope("Embeddings"):
    # Embedding table, initialised from a uniform distribution between -1 and 1
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    # Look up the embedding vector of every word id in inputs_
    embed = tf.nn.embedding_lookup(embedding, inputs_)
    
    
def lstm_cell():
    """Build one LSTM cell of ``lstm_size`` units wrapped in output dropout.

    Reads the module-level ``lstm_size`` and the ``keep_prob`` placeholder.

    Returns:
        A ``DropoutWrapper`` around a ``BasicLSTMCell``; dropout is applied to
        the cell's output with keep probability ``keep_prob``.
    """
    # Basic LSTM cell; inherits the reuse flag of the current variable scope
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=tf.get_variable_scope().reuse)
    # Wrap the cell so that dropout is applied to its output
    return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

# Group the stacked RNN cells under the "RNN_layers" name scope
with tf.name_scope("RNN_layers"):
    # Stack lstm_layers independent LSTM cells (a fresh cell per layer)
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    
    # All-zero starting state. It is built for exactly batch_size rows, so
    # every batch run with this state must contain batch_size samples.
    initial_state = cell.zero_state(batch_size, tf.float32)

Instructions for updating:
This class is deprecated, please use tf.nn.rnn_cell.LSTMCell, which supports all the feature this cell currently has. Please replace the existing code with tf.nn.rnn_cell.LSTMCell(name='basic_lstm_cell').


In [0]:
with tf.name_scope("RNN_forward"):
    # dynamic_rnn returns the output of every time step and the final state
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
    
with tf.name_scope('predictions'):
    # Output layer: one sigmoid unit fed with the output of the last time
    # step only (outputs[:, -1]), since the target is 0 or 1
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    
with tf.name_scope('cost'):
    # Mean squared error between labels and predictions.
    # NOTE(review): a cross-entropy loss is the more common choice for a
    # sigmoid binary classifier -- consider switching.
    cost = tf.losses.mean_squared_error(labels_, predictions)

with tf.name_scope('train'):
    # Training op: Adam minimising the cost
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
with tf.name_scope('validation'):
    # Accuracy: round each prediction to 0/1, compare with the label and
    # average the matches
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [0]:
# 定义获取数据批次的生成器函数
def get_batches(x, y, batch_size=100):
    """Yield consecutive ``(x_batch, y_batch)`` slices of ``batch_size`` items.

    Only full batches are produced: trailing samples that do not fill a
    complete batch are dropped.

    Args:
        x: Sliceable sequence of features; its length fixes the batch count.
        y: Sliceable sequence of labels, sliced at the same positions as x.
        batch_size: Number of samples per batch.

    Yields:
        Tuples ``(x[start:stop], y[start:stop])`` covering the data in order.
    """
    # Integer division: the incomplete remainder never gets a batch number
    n_batches = len(x) // batch_size
    for batch_no in range(n_batches):
        start = batch_no * batch_size
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

In [0]:
# Number of passes over the training set
epochs = 8

# Saver used to write model checkpoints
saver = tf.train.Saver()

with tf.Session() as sess:
    # Initialise all global variables
    sess.run(tf.global_variables_initializer())

    iteration = 1
    for e in range(epochs):
        # Start every epoch from the all-zero RNN state
        state = sess.run(initial_state)

        # One optimisation step per full training batch; the final state of
        # a batch is fed in as the initial state of the next one
        for x, y in get_batches(train_x, train_y, batch_size):
            feed = {inputs_: x,
                    labels_: y[:, None],  # column vector to match predictions
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            # Log the training loss every 5 iterations
            if iteration % 5 == 0:
                # e is 0-based; show epochs as 1..epochs rather than 0..epochs-1
                print("Epoch: {}/{}".format(e + 1, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            # Evaluate on the whole validation set every 25 iterations
            if iteration % 25 == 0:
                val_acc = []
                # Reuse the zero-state tensors created with the graph; calling
                # cell.zero_state() here would add new ops to the graph on
                # every validation pass.
                val_state = sess.run(initial_state)
                for val_batch_x, val_batch_y in get_batches(val_x, val_y, batch_size):
                    val_feed = {inputs_: val_batch_x,
                                labels_: val_batch_y[:, None],
                                keep_prob: 1,  # no dropout while evaluating
                                initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state],
                                                    feed_dict=val_feed)
                    val_acc.append(batch_acc)
                # Mean accuracy over all validation batches
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration += 1

            # Checkpoint after every batch.
            # NOTE(review): this writes the full model on each step; saving
            # once per epoch would be much cheaper -- confirm the intent.
            saver.save(sess, "checkpoints/sentiment.ckpt")
    # Final checkpoint once all epochs are done
    saver.save(sess, "checkpoints/sentiment.ckpt")

In [0]:
# Per-batch accuracies on the test set
test_acc = []
with tf.Session() as sess:
    # Restore the trained weights from the checkpoint
    saver.restore(sess, "checkpoints/sentiment.ckpt")
    # Begin the evaluation from an all-zero RNN state
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for x, y in get_batches(test_x, test_y, batch_size):
        # keep_prob 1 disables dropout; the state is carried between batches
        batch_acc, test_state = sess.run(
            [accuracy, final_state],
            feed_dict={inputs_: x,
                       labels_: y[:, None],
                       keep_prob: 1,
                       initial_state: test_state})
        test_acc.append(batch_acc)
    # Mean accuracy over all test batches
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

In [33]:
import numpy
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

# Seed NumPy's global random generator for reproducibility.
# NOTE(review): this seeds NumPy only; the Keras backend presumably has its
# own random state, so runs may still differ -- confirm.
numpy.random.seed(7)

# Keep only the top_words most frequent words of the dataset.
# NOTE(review): per the Keras docs rarer words are replaced by the loader's
# out-of-vocabulary id (oov_char, 2 by default) rather than 0 -- confirm.
top_words = 5000

# Load the IMDB dataset, already split into training and test parts
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)

# Every review is brought to exactly this many word ids
review_max_length = 500

# Shorter reviews are padded, longer ones are truncated
X_train = sequence.pad_sequences(X_train, maxlen=review_max_length)
X_test = sequence.pad_sequences(X_test, maxlen=review_max_length)

# Length of each word's embedding vector
embedding_vecor_length = 32

# Embedding -> LSTM(100) -> single sigmoid unit for binary classification
model = Sequential([
    Embedding(top_words, embedding_vecor_length, input_length=review_max_length),
    LSTM(100),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss with the Adam optimizer, tracking accuracy
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model architecture
model.summary()

# Train for 3 passes over the training set in batches of 64 samples
model.fit(X_train, y_train, batch_size=64, epochs=3)

# Evaluate on the test set; scores[1] is the accuracy metric
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: {}".format((scores[1]*100)))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 85.63199999999999
