In [3]:
import numpy as np
import tensorflow as tf
import jieba
import re

In [4]:
import itertools
from random import shuffle

In [5]:
reviews_pos = []
r = "[\s+\.\!\/_,$%^*)(+\"\']+|[+——！，。？、~@#￥%……&*（）]+"
with open('pos.txt', 'r', encoding = "utf-8") as f:
    for lines in f:
        if re.sub(r,'',lines) != "没有描述":
#        if not "没有描述" or "na" in re.sub(r,'',lines):
            reviews_pos.append([re.sub(r,'',x) for x in jieba.cut(lines) if len(re.sub(r,'',x))])

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\yuzhe\AppData\Local\Temp\jieba.cache
Loading model cost 1.268 seconds.
Prefix dict has been built succesfully.


In [6]:
def input_doc(filename):
    reviews = []
    r = "[\s+\.\!\/_,$%^*)(+\"\']+|[+——！，。？、~@#￥%……&*（）]+"
    with open(filename, 'r', encoding = "utf-8") as f:
        for lines in f:
            if re.sub(r,'',lines) != "na":
                reviews.append([re.sub(r,'',x) for x in jieba.cut(lines) if len(re.sub(r,'',x))])
    return reviews

In [7]:
reviews_neg = input_doc('neg.txt')

In [8]:
reviews_neg[:10]

[['暂时', '还', '没有', '足够', '用', '了'],
 ['暂时', '还', '没有', '发现', '缺点', '哦'],
 ['比', '电脑城', '的', '要', '贵'],
 ['给',
  '的',
  '两',
  '节电池',
  '是',
  '山寨',
  '货',
  '怎么',
  '也',
  '送个',
  '南孚',
  '啊',
  '自己',
  '直接',
  '上',
  '了',
  '金霸王',
  '的'],
 ['暂时', '没有', '发现', '不足'],
 ['暂时', '没有', '发现'],
 ['原装', '的', '就是', '贵', '只好', '买', '了'],
 ['价格', '还是', '有些', '贵', '再', '便宜', '些', '就', '更好'],
 ['一对', '20', '音箱', '卖', '一千块', '价格不菲'],
 ['电池', '还', '不错']]

In [9]:
reviews_pos[:10]

[['东西', '很', '好', '哦'],
 ['可穿', '在', '钥匙扣', '随身携带'],
 ['文曲星', 'E638', '过级', '王', '内置', '剑桥', '高阶', '双解', '词典', '白色', 'good'],
 ['看上去', '非常', '坚固'],
 ['便宜', '好用', '1'],
 ['大', '音量', '操作', '简单'],
 ['送货', '快', '是', '正品', '用', '得', '挺', '好', '的'],
 ['刚', '收到', '还', '没', '打开'],
 ['可以', '装', '很多', '哦', '喜欢'],
 ['主要', '是', '价格便宜']]

In [10]:
reviews = reviews_pos + reviews_neg

In [11]:
labels = np.array([1]*len(reviews_pos)+[0]*len(reviews_neg))

In [12]:
words = list(itertools.chain.from_iterable(reviews))

In [13]:
words[:10]

['东西', '很', '好', '哦', '可穿', '在', '钥匙扣', '随身携带', '文曲星', 'E638']

# Encoding the words

In [14]:
from collections import Counter
counts = Counter(words)

In [15]:
vocab = sorted(counts, key=counts.get, reverse=True)

In [16]:
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

In [17]:
reviews_ints = []
for each in reviews:
    reviews_ints.append([vocab_to_int[word] for word in each])

In [18]:
reviews_ints[:10]

[[22, 6, 7, 10],
 [21780, 41, 4919, 2402],
 [3545, 11238, 7232, 7233, 2140, 15934, 15935, 12948, 2203, 648, 1334],
 [567, 56, 3222],
 [28, 67, 241],
 [21, 186, 200, 111],
 [55, 47, 19, 50, 13, 130, 53, 7, 1],
 [281, 243, 2, 14, 372],
 [20, 319, 202, 10, 102],
 [700, 19, 116]]

#### remove the review with zero length from the reviews_ints list.

In [19]:
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 650
Maximum review length: 80


In [20]:
len(reviews_ints)

300000

In [21]:
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]
len(non_zero_idx)

299350

In [22]:
non_zero_idx

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [23]:
shuffle(non_zero_idx)

In [24]:
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
labels = np.array([labels[ii] for ii in non_zero_idx])

In [25]:
reviews_ints[:10]

[[21927, 943, 877, 36, 11, 165, 1403, 891, 1, 49, 38, 102, 1, 118],
 [4, 48],
 [8, 2, 4, 5, 9, 10],
 [62, 148, 19, 1465, 1, 321, 321],
 [77, 293, 823, 499],
 [24404, 4819, 101],
 [8, 2, 4, 5, 9, 10],
 [196, 12630, 12, 311, 1493, 18855, 265, 196],
 [8, 2, 4, 5, 9, 10],
 [201, 504, 82, 148, 1327]]

In [26]:
seq_len = 80 #use Maximum review length
features = np.zeros((len(reviews_ints), seq_len), dtype=int)
for i, row in enumerate(reviews_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]

In [27]:
features[:10]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 21927,   943,   877,    36,    11,   165,
         1403,   891,     1,    49,    38,   102,     1,   118],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            

#### Training, Validation, Test

In [28]:
split_frac = 0.8

In [29]:
split_idx = int(len(features)*0.8)

train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(239480, 80) 
Validation set: 	(29935, 80) 
Test set: 		(29935, 80)


# Build the graph

In [30]:
#lstm_size = 256
num_size = 256
#lstm_layers = 1
num_layers = 1
batch_size = 500
learning_rate = 0.001

In [42]:
hidden_layer_size = 32
hidden_size = 32
hidden_layer = 1

In [40]:
def my_attention(inputs, hidden_layer_size):
    
    inputs = tf.reshape(inputs, [-1, hidden_size])
    hidden_layer = tf.layers.dense(inputs, hidden_layer_size, activation=tf.nn.softmax)
    logits = tf.layers.dense(hidden_layer, 1, activation = None)
    
    logits = tf.reshape(logits, [-1, seq_len, 1])
    alphas = tf.nn.softmax(logits, dim=1)
    encodings = tf.reduce_sum(inputs*alphas, 1)
    
    return encodings, alphas

In [31]:
n_words = len(vocab_to_int) + 1 # Adding 1 because we use 0's for padding, dictionary started at 1

# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

In [32]:
embed_size = 300 

with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)

In [34]:

with graph.as_default():
    # Your basic LSTM cell
    # lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    cell = tf.contrib.rnn.GRUCell(num_size)
    
    # Add dropout to the cell
    # drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    drop = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)
    
    # Stack up multiple LSTM layers, for deep learning
    # cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
    cell = tf.contrib.rnn.MultiRNNCell([drop] * num_layers)
    
    # Getting an initial state of all zeros
    initial_state = cell.zero_state(batch_size, tf.float32)

In [51]:
with graph.as_default():
    outputs, final_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell, cell_bw=cell, inputs=embed,
                                    initial_state_fw=initial_state, initial_state_bw=initial_state)
    #outputs, final_state = tf.nn.dynamic_rnn(cell, embed,
    #                                         initial_state=initial_state)

In [36]:
outputs = tf.concat(outputs, axis=2)

outputs.get_shape()

final_state = tf.concat(final_state,2)

encoding, alphas = my_attention(outputs, num_size)

logits = tf.layers.dense(encoding, 2, activation=None)


with graph.as_default():
    #predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)
    #predictions = tf.argmax(logits, 1)
    cost = tf.losses.mean_squared_error(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [45]:

with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [46]:
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

In [47]:
epochs = 1

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            #loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)
            loss, _ = sess.run([cost, optimizer], feed_dict=feed)
            
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    #batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    batch_acc = sess.run([accuracy], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/1 Iteration: 5 Train loss: 0.190
Epoch: 0/1 Iteration: 10 Train loss: 0.176
Epoch: 0/1 Iteration: 15 Train loss: 0.158
Epoch: 0/1 Iteration: 20 Train loss: 0.149
Epoch: 0/1 Iteration: 25 Train loss: 0.138


KeyboardInterrupt: 

In [None]:

test_acc = []
with tf.Session(graph=graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        feed = {inputs_: x,
                labels_: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))