# Neural sentiment classification

## without attention

In [1]:
import tensorflow as tf
import numpy as np
import copy
import random
import cPickle

In [2]:
from __future__ import division

In [3]:
class DataReader(object):
    def __init__(self, filename):
        self.filename = filename
        #self.read_wordlist()
        #self.read_datasets()
        #self.read_embeds()
        
    def read_datasets(self):
        self.dataset = []
        file_strs = ["/train.txt", "/dev.txt", "/test.txt"]
        for file_str in file_strs:
            lines = map(lambda x: x.split('\t\t'), open("data/"+self.filename + file_str).readlines())           
            label = np.asarray(
                map(lambda x: int(x[2])-1, lines),
                dtype = np.int32
            )
            docs = map(lambda x: x[3][0:len(x[3])-1], lines) 
            docs = map(lambda x: x.split('<sssss>'), docs) 
            docs = map(lambda doc: map(lambda sentence: sentence.split(' '),doc),docs)
            docs = map(lambda doc: map(lambda sentence: 
                                       filter(lambda wordid: wordid !=-1,
                                              map(lambda word: self.getID(word),sentence)),doc),docs)
            dataset = [docs, label];
            self.dataset.append(dataset)
            
    def read_embeds(self):
        f = file('data/'+self.filename+'/embinit.save', 'rb')
        self.embeds = cPickle.load(f)
        f.close()
        
    def read_wordlist(self):
        lines = map(lambda x: x.split(), open("data/" + self.filename+"/wordlist.txt").readlines())
        self.size = len(lines)
        self.voc = [(item[0][0], item[1]) for item in zip(lines, xrange(self.size))]
        self.inv_voc = [(item[1], item[0][0]) for item in zip(lines, xrange(self.size))]
        self.voc = dict(self.voc)
        self.inv_voc = dict(self.inv_voc)

    def getID(self, word):
        try:
            return self.voc[word]
        except:
            return -1
    def getWord(self, id):
        try:
            return self.inv_voc[id]
        except:
            return -1


In [4]:
data_reader = DataReader("IMDB")

In [5]:
data_reader.read_wordlist()
data_reader.voc['expected']

761

In [6]:
## the last one is "UNK"
data_reader.read_embeds()

In [7]:
data_reader.read_datasets()

In [8]:
trainset =  data_reader.dataset[0]

In [9]:
## 每句话的长度包括句子两端的<sess>，不过第一句话只有一个sess
##trainset[0]:docs , trainset[1]:labels
##trainset[0][0]:doc trainset[0][0][0]:sentence
print(len(trainset[0][4]))#doc
print(len(trainset[0][0][4]))#sentence
print(trainset[0][0][0][0])#word
print(trainset[1][0])#label

5
9
11
9


In [10]:
print(data_reader.getID('from'))

41


In [11]:
print(data_reader.getWord(26957))

excepted


In [12]:
docs_maxlen = 20##
sentence_maxlen = 25##
batch_size = 30

In [13]:
## padding the sentences and docs to the same size
## to construct 3d tensor size [batch_size, sentence_num_max,word_num_max]
## At the same time, remember the length of the docs and sentence for the tensorflow dynamic rnn 
## dynamic rnn can really save your time
def pad_doc(doc, docs_maxlen):
    if len(doc) < docs_maxlen:
        doc = doc + (docs_maxlen-len(doc))*[[0]]
    else:
        doc = doc[0:docs_maxlen]
    return doc
    
def pad_sentence(sentence, sentence_maxlen):
    if len(sentence) < sentence_maxlen:
        sentence = sentence + (sentence_maxlen-len(sentence))*[0]
    else:
        sentence = sentence[0:sentence_maxlen]
    return sentence
    
def pad_sentence_len(sentence_len, docs_maxlen):
    new_sentence_len = []
    for id in range(batch_size):
        tmp = sentence_len[id]
        if len(tmp) < docs_maxlen:
            tmp = tmp + (docs_maxlen-len(tmp))*[0]
        else:
            tmp = tmp[0:docs_maxlen]
        new_sentence_len.append(tmp)
    return new_sentence_len
    
def genBatch(dataset, label, batch_size, docs_maxlen, sentence_maxlen,batch_id):
    # get the sentence_num_max
    docs = dataset[batch_size*batch_id:(batch_id+1)*batch_size]
    label_batch = label[batch_size*batch_id:(batch_id+1)*batch_size]
    #print(len(docs))#the first element shold be 8
    #print(docs)
    docs_len = np.array(map(lambda x: len(x), docs))
    docs_len[docs_len>docs_maxlen] = docs_maxlen
    #print(docs_len)#
    #docs_maxlen = np.max(docs_len)
    #print('docs maxlen is %d' %docs_maxlen)
    # get the word_num_max
    sentence_len = np.array(map(lambda x :map(lambda y : len(y), x), docs))
    sentence_len = pad_sentence_len(sentence_len, docs_maxlen)
    for id in range(batch_size):
        tmp = np.array(sentence_len[id])
        tmp[tmp>sentence_maxlen] = sentence_maxlen
        sentence_len[id] = tmp
    
    #sentence_len[sentence_len>sentence_maxlen] = sentence_maxlen
    #print(sentence_len)# the first element should be 13
    #print('the sentence maxlen is %d' % sentence_maxlen)
    ## padding the data
    doc_batch = map(lambda x: pad_doc(x, docs_maxlen), docs)
    #print(doc_batch[0:2])
    doc_batch = map(lambda x:map(lambda y: pad_sentence(y, sentence_maxlen), x), doc_batch)
    batch = [doc_batch,docs_len, sentence_len]
    return batch, label_batch
    

In [14]:
batch, label_batch = genBatch(trainset[0],trainset[1], batch_size, docs_maxlen, sentence_maxlen,2)

In [15]:
def get_sentence_array(sentence_len,batch_size,docs_maxlen):
    sentence_len_array = np.zeros([batch_size, docs_maxlen])
    for i in range(batch_size):
        sentence_len_array[i,:] = sentence_len[i]
    return sentence_len_array
a = get_sentence_array(batch[2], 30, 20)
print(a.shape)

(30, 20)


In [16]:
print(len(batch[2][0]))

20


In [17]:
batch_num = len(trainset[0])//(batch_size)
print(batch_num)

2247


In [18]:
print(batch_size)

30


### Read Data

In [29]:
batch_size = 30
max_features = data_reader.embeds.shape[0]
hidden_size = 200
print(max_features)

105374


## with attention

In [20]:
# build the graph functions
# add the placeholders
def add_placeholders():
    input_placeholder = tf.placeholder(tf.int32, shape=[batch_size, docs_maxlen,sentence_maxlen])
    label_placeholder = tf.placeholder(tf.int32, shape=[batch_size])
    #seq_len_word_placeholder = tf.placeholder(tf.int32) # dynamic rnn for word lstm layer
    #seq_len_sentence_placeholder = tf.placeholder(tf.int32) #dynamic rnn for sentence word lstm layer
    return input_placeholder, label_placeholder

In [21]:
def add_embed_layer(vocab_size,  input_placeholder):
    embed_size = 200
    with tf.device('/cpu:0'), tf.variable_scope('embed'):
        embed = tf.get_variable(name="Embedding", shape=[vocab_size, embed_size])
        inputs = tf.nn.embedding_lookup(embed, input_placeholder)
        #inputs = tf.transpose(inputs, perm=[0,2,1])  
    return inputs

In [22]:
## add lstm layer
def add_rnn_model(hidden_size):
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size, forget_bias=0.0)
    return lstm_cell

In [23]:
## add training op
def add_train_op(loss):
    train_op = tf.train.AdamOptimizer(0.000001).minimize(loss)
    return train_op

In [24]:
help(tf.nn.dynamic_rnn)

Help on function dynamic_rnn in module tensorflow.python.ops.rnn:

dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None, dtype=None, parallel_iterations=None, swap_memory=False, time_major=False, scope=None)
    Creates a recurrent neural network specified by RNNCell `cell`.
    
    This function is functionally identical to the function `rnn` above, but
    performs fully dynamic unrolling of `inputs`.
    
    Unlike `rnn`, the input `inputs` is not a Python list of `Tensors`, one for
    each frame.  Instead, `inputs` may be a single `Tensor` where
    the maximum time is either the first or second dimension (see the parameter
    `time_major`).  Alternatively, it may be a (possibly nested) tuple of
    Tensors, each of them having matching batch and time dimensions.
    The corresponding output is either a single `Tensor` having the same number
    of time steps and batch size, or a (possibly nested) tuple of such tensors,
    matching the nested structure of `cell.o

In [25]:
def evaluation(logits, labels):
  correct = tf.nn.in_top_k(logits, labels, 1)
  return tf.reduce_sum(tf.cast(correct, tf.int32))

In [26]:
help(tf.reduce_mean)

Help on function reduce_mean in module tensorflow.python.ops.math_ops:

reduce_mean(input_tensor, reduction_indices=None, keep_dims=False, name=None)
    Computes the mean of elements across dimensions of a tensor.
    
    Reduces `input_tensor` along the dimensions given in `reduction_indices`.
    Unless `keep_dims` is true, the rank of the tensor is reduced by 1 for each
    entry in `reduction_indices`. If `keep_dims` is true, the reduced dimensions
    are retained with length 1.
    
    If `reduction_indices` has no entries, all dimensions are reduced, and a
    tensor with a single element is returned.
    
    For example:
    
    ```python
    # 'x' is [[1., 1.]
    #         [2., 2.]]
    tf.reduce_mean(x) ==> 1.5
    tf.reduce_mean(x, 0) ==> [1.5, 1.5]
    tf.reduce_mean(x, 1) ==> [1.,  2.]
    ```
    
    Args:
      input_tensor: The tensor to reduce. Should have numeric type.
      reduction_indices: The dimensions to reduce. If `None` (the default),
        reduces a

In [27]:
class_num = 10

In [30]:
with tf.Graph().as_default(), tf.Session() as sess:
    input_placeholder, label_placeholder = add_placeholders()
    word_sequence_length_placeholder = tf.placeholder(tf.int32)
    sentence_sequence_length_placeholder = tf.placeholder(tf.int32)
    word_initial_state_placeholder = tf.placeholder(tf.float32)
    vocab_size = max_features
    print(vocab_size)
    word_inputs = add_embed_layer(vocab_size, input_placeholder)
    ##word lstm
    cell = add_rnn_model(hidden_size)
    word_initial_state = cell.zero_state(batch_size, tf.float32)
    # state is the final state
    
    # batch_size major
    with tf.variable_scope('word_rnn'):
        sentence_inputs = []
        for sentence_id in range(docs_maxlen):
            if sentence_id > 0:
                tf.get_variable_scope().reuse_variables()
            word_output, word_state = tf.nn.dynamic_rnn(cell, tf.squeeze(word_inputs[:,sentence_id,:,:]), 
                                                        initial_state = word_initial_state,
                                                        sequence_length=word_sequence_length_placeholder[:, sentence_id])
            
            sentence_inputs.append(tf.reduce_mean(word_output, 1))
        
    
    ## sentence lstm
    ## construct the sentence inputs
    
    sentence_inputs_tensor = tf.pack(sentence_inputs, axis=0)
    
    cell = add_rnn_model(hidden_size)
    sentence_initial_state = cell.zero_state(batch_size, tf.float32)
    sentence_outputs, sentence_state = tf.nn.dynamic_rnn(cell, sentence_inputs_tensor,
                                                         initial_state=sentence_initial_state,
                                                         time_major=True, 
                                                         sequence_length=sentence_sequence_length_placeholder)
    ##time major
    
    doc_output = tf.reduce_mean(sentence_outputs, 0)
    tf.expand_dims(doc_output,0)
    ##Fully Connected Layer
    W_fc = tf.get_variable('Weights_fc', shape=[hidden_size, hidden_size])
    B_fc = tf.get_variable('bias_fc', shape=[hidden_size])
    h_fc = tf.nn.relu(tf.matmul(doc_output,W_fc)+B_fc)
    #add projection layer
    W = tf.get_variable('Weights', shape=[hidden_size, class_num])
    b = tf.get_variable('Bias', shape = [class_num])
    
    y_pred = tf.matmul(h_fc, W) + b
    
    #y_pred_sigmoid = tf.sigmoid(y_pred)
    
    correct_num = evaluation(y_pred, label_placeholder)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(y_pred, label_placeholder)
    loss = tf.reduce_mean(cross_entropy)

    train_op = add_train_op(loss)
    
    #state_step = initial_state
    sess.run(tf.initialize_all_variables())
    max_epochs = 2
    
    for epoch in range(max_epochs):
        w_init_state = sess.run([word_initial_state])
        s_init_state = sess.run([sentence_initial_state])
        print('%d Epoch starts, Training....' %(epoch))
        mean_loss = []
        total_correct_num = 0
        for step in range(batch_num):
            # generate the data feed dict
            
            #batch = [doc_batch,docs_len, sentence_len]
            batch, label_batch = genBatch(trainset[0],trainset[1], batch_size, docs_maxlen, sentence_maxlen,step)
            input_batch = np.array(batch[0])
            word_sq_len_batch = get_sentence_array(batch[2],batch_size,docs_maxlen)
            #print(word_sq_len_batch.shape)
             
            sentence_len_batch = np.array(batch[1])
            #print(sentece_len_batch.shape)
            #print(word_sq_len_batch)
            feed = {input_placeholder:input_batch, word_initial_state:w_init_state,
                   label_placeholder:label_batch, sentence_initial_state:s_init_state,
                   word_sequence_length_placeholder:word_sq_len_batch,
                   sentence_sequence_length_placeholder:sentence_len_batch}
            #input_step = sess.run([word_inputs], feed)
            loss_step,correct_num_step,w_init_state,s_init_state = sess.run([loss, correct_num,
                                                                            word_state,sentence_state], feed)
            #print(loss_step)
        
            mean_loss.append(loss_step)
            total_correct_num += correct_num_step
            
            if step % 100 == 0:
                print('step %d : loss : %f' %(step, np.mean(mean_loss)))
                mean_loss = []
            #do_evaluation(sess, X_test, y_test)
            
        
        print('precision: %f' %(total_correct_num/(batch_size*batch_num)))
        #print('Testing....')
        #do_evaluation(sess, X_test, y_test)

105374
0 Epoch starts, Training....
step 0 : loss : 3.159263
step 100 : loss : 3.029322
step 200 : loss : 2.989373
step 300 : loss : 2.975912
step 400 : loss : 2.962045
step 500 : loss : 2.862700
step 600 : loss : 2.901631
step 700 : loss : 2.952078
step 800 : loss : 2.902164
step 900 : loss : 2.923768
step 1000 : loss : 2.974546
step 1100 : loss : 2.962385
step 1200 : loss : 2.986768
step 1300 : loss : 2.933807
step 1400 : loss : 2.939800
step 1500 : loss : 3.009450
step 1600 : loss : 2.971812
step 1700 : loss : 2.858559
step 1800 : loss : 2.908821
step 1900 : loss : 2.939851
step 2000 : loss : 2.952921
step 2100 : loss : 2.948024
step 2200 : loss : 2.845860
precision: 0.190580
1 Epoch starts, Training....
step 0 : loss : 3.159263
step 100 : loss : 3.029322
step 200 : loss : 2.989373
step 300 : loss : 2.975912
step 400 : loss : 2.962045
step 500 : loss : 2.862700
step 600 : loss : 2.901631
step 700 : loss : 2.952078


KeyboardInterrupt: 

In [None]:
a = np.asarray([[1,2],[3,4]])
print(a)
a.dimshuffle(0,'x') 

In [None]:
import theano

In [None]:
import random

In [None]:
print(len(doc_output_step))
print(doc_output_step[0].shape)

In [None]:
print(y_pred_sigmoid_step.shape)

In [None]:
print(loss_step)