In [2]:
import numpy as np
import sys
import string
import json

## Prepare data

In [38]:
def load_data(data_file=None):
    """Load and return the parsed contents of a JSON file.

    :param data_file: Path of the JSON file to read. When ``None``, nothing is read.
    :return: The deserialized JSON object, or ``None`` when no path was given.
    """
    # Identity comparison is the reliable way to test for None; `==` can be
    # overridden by the operand's __eq__.
    if data_file is None:
        return None
    # Explicit encoding so the result does not depend on the platform default.
    with open(data_file, encoding="utf-8") as f:
        return json.load(f)

In [39]:
# Directory holding the pre-parsed sentiment JSON files.
folder = './data/large_files/stanford_sentiment/parsed_data/'

# Load the five files in a fixed order and unpack each into its own name.
(
    word2idx,
    sentiment_binary_train,
    sentiment_train,
    sentiment_binary_test,
    sentiment_test,
) = [
    load_data(folder + file_name)
    for file_name in (
        "sentiment_word2idx.json",
        "sentiment_binary_train.json",
        "sentiment_train.json",
        "sentiment_binary_test.json",
        "sentiment_test.json",
    )
]

In [40]:
# The loaded samples carry one of three labels: -1, 0 or 1, where -1 indicates neutral sentiment.
# Samples with neutral sentiment are excluded.
def exclude_neutral_sample(samples:dict):
    """Return a new dict containing only the non-neutral entries of ``samples``.

    An entry counts as neutral when the last element of its fourth field
    (``value[3][-1]``) equals -1. The input dict is not modified and the
    insertion order of the surviving keys is preserved.
    """
    return {key: value for key, value in samples.items() if value[3][-1] != -1}
        
# Drop the neutral samples from both binary splits.
train_b, test_b = (
    exclude_neutral_sample(split)
    for split in (sentiment_binary_train, sentiment_binary_test)
)

# Report how many samples remain in each split.
print("After filtering: # of training samples and # of test samples")
print("# of traing samples: ", len(train_b))
print("# of test samples: ", len(test_b))

After filtering: # of training samples and # of test samples
# of traing samples:  6920
# of test samples:  1821


In [41]:
def get_comment(wordidx, idx2word:dict):
    """Translate a sequence of word indices into a list of word tokens.

    Indices equal to -1 are skipped. A token is also skipped when
    ``token in string.punctuation`` holds; note that this is a substring test
    against the punctuation string, so a multi-character token such as
    ``"..."`` is kept.

    :param wordidx: Iterable of integer word indices.
    :param idx2word: Mapping from index to token.
    :return: List of the remaining tokens, in input order.
    """
    tokens = (idx2word[idx] for idx in wordidx if idx != -1)
    return [token for token in tokens if token not in string.punctuation]

In [42]:
def get_comments_samples(samples:dict, idx2word:dict):
    """Build parallel lists of comment strings and labels from ``samples``.

    Entries whose label (``value[3][-1]``) equals -1 are skipped. For every
    other entry the word indices in ``value[0]`` are turned into tokens with
    ``get_comment`` and joined with single spaces.

    :param samples: Mapping whose values hold word indices at position 0 and
        labels at position 3.
    :param idx2word: Mapping from index to token.
    :return: Tuple ``(comments, targets)`` of equal-length lists.
    """
    kept = [value for value in samples.values() if value[3][-1] != -1]
    comments = [" ".join(get_comment(value[0], idx2word)) for value in kept]
    targets = [value[3][-1] for value in kept]
    return comments, targets

In [43]:
# Invert the vocabulary so indices can be mapped back to words.
idx2word = dict(zip(word2idx.values(), word2idx.keys()))
train_comments_0, train_targets = get_comments_samples(train_b, idx2word)
test_comments_0, test_targets = get_comments_samples(test_b, idx2word)

# Tally the training labels: 0, 1, and everything else (reported under "-1").
count0 = train_targets.count(0)
count1 = train_targets.count(1)
count2 = len(train_targets) - count0 - count1

print("0", count0)
print("1", count1)
print("-1", count2)

0 3310
1 3610
-1 0


In [15]:
# Peek at the container type and at one comment/label pair.
print(type(train_comments_0), train_comments_0[1], train_targets[1], sep="\n")

<class 'list'>
an undeniably gorgeous terminally smitten document of a troubadour his acolytes and the triumph of his band
1


In [44]:
# Number of entries in the index-to-word table built from the loaded word2idx file.
vocabulary_size = len(idx2word)
print('vocabulary_size', vocabulary_size)

vocabulary_size 18647


In [45]:
# Flatten the train and test comments into one list of word tokens.
all_text1, all_text2 = ' '.join(train_comments_0), ' '.join(test_comments_0)
words1, words2 = all_text1.split(), all_text2.split()
words = [*words1, *words2]

In [46]:
# Build the word -> integer mapping from word frequencies.
from collections import Counter

counts = Counter(words)
print(type(counts))

# Most frequent word first; the sort is stable, so ties keep first-seen order.
vocab = sorted(counts.keys(), key=counts.__getitem__, reverse=True)

# Indices start at 1, so index 0 is not assigned to any word.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

<class 'collections.Counter'>


In [47]:
# Sanity-check the vocabulary: container type, size, and both ends of the ranking.
print("type of vocab", type(vocab))
print('total # of words: ', len(vocab_to_int))
first_word, last_word = vocab[0], vocab[-1]
print("first word:", first_word)
print("last word:", last_word)
print("first word index:", vocab_to_int[first_word])
print("last word index", vocab_to_int[last_word])

type of vocab <class 'list'>
total # of words:  16750
first word: the
last word: miracles
first word index: 1
last word index 16750


In [48]:
# Reverse lookup for the frequency-ranked vocabulary: integer index -> word.
index2word = dict(zip(vocab_to_int.values(), vocab_to_int.keys()))

In [49]:
def convert_to_int(reviews, vocab_to_int):
    """Encode each review string as a list of integer word indices.

    Every review is split on whitespace and each token is looked up in
    ``vocab_to_int``; a token missing from the mapping raises ``KeyError``.
    The number of reviews is printed before and after the conversion.

    :param reviews: Sequence of review strings.
    :param vocab_to_int: Mapping from word to integer index.
    :return: List with one list of integers per review.
    """
    print('# of reviews before index: ', len(reviews))
    reviews_ints = [
        [vocab_to_int[token] for token in text.split()]
        for text in reviews
    ]
    print('# of reviews after index: ', len(reviews_ints))
    return reviews_ints

In [50]:
# Integer-encode the training comments.
x_train = convert_to_int(reviews=train_comments_0, vocab_to_int=vocab_to_int)

# of reviews before index:  6920
# of reviews after index:  6920


In [51]:
# Round-trip check: decode one encoded review and compare it with the original text.
print(x_train[7])
print(train_comments_0[7])
text = list(map(index2word.__getitem__, x_train[7]))
print(text)

[37, 4, 8, 217, 5664, 320, 1, 246, 4, 746, 48, 55, 2, 888, 5958, 158, 4, 2, 78, 3283, 488, 2824]
all of it works smoothly under the direction of spielberg who does a convincing impersonation here of a director enjoying himself immensely
['all', 'of', 'it', 'works', 'smoothly', 'under', 'the', 'direction', 'of', 'spielberg', 'who', 'does', 'a', 'convincing', 'impersonation', 'here', 'of', 'a', 'director', 'enjoying', 'himself', 'immensely']


In [52]:
# Integer-encode the test comments.
x_test = convert_to_int(reviews=test_comments_0, vocab_to_int=vocab_to_int)

# of reviews before index:  1821
# of reviews after index:  1821


In [53]:
from collections import Counter

# Map each training-review length to the number of reviews with that length.
x_train_lens = Counter(map(len, x_train))
print("Zero-length reviews: {}".format(x_train_lens[0]))
print("Maximum train example length: {}".format(max(x_train_lens.keys())))

Zero-length reviews: 0
Maximum train example length: 49


In [54]:
# Map each test-review length to the number of reviews with that length.
x_test_lens = Counter(map(len, x_test))
print("Zero-length reviews: {}".format(x_test_lens[0]))
print("Maximum test example length: {}".format(max(x_test_lens.keys())))

Zero-length reviews: 0
Maximum test example length: 53


In [55]:
def padding(reviews_ints, seq_len):
    """Pad or truncate every review to exactly ``seq_len`` integers.

    Reviews shorter than ``seq_len`` keep their tokens at the start of the row
    and are filled with zeros on the right; longer reviews are cut after
    ``seq_len`` tokens. The shape of the feature matrix is printed.

    :param reviews_ints: Sequence of reviews, each a list of integers.
    :param seq_len: Number of columns of the resulting matrix.
    :return: Tuple ``(features, lengths)`` where ``features`` is an integer
        array of shape ``(len(reviews_ints), seq_len)`` and ``lengths`` lists
        the number of real (non-padding) tokens kept per review.
    """
    n_reviews = len(reviews_ints)

    # Start from an all-zero matrix; zero therefore acts as the padding value.
    features = np.zeros((n_reviews, seq_len), dtype=int)
    print(features.shape)

    lengths = []
    for row_idx, review in enumerate(reviews_ints):
        # Number of tokens that survive truncation. May be handy as the
        # sequence_length argument of tf.nn.dynamic_rnn(...).
        kept = min(len(review), seq_len)

        # Tokens go to the front of the row; the tail stays zero (right padding).
        features[row_idx, :kept] = np.array(review)[:seq_len]
        lengths.append(kept)

    return features, lengths


In [56]:
## test

# Four toy reviews: shorter than, equal to, longer than, and much shorter than 10 tokens.
sample1 = [4, 6, 7, 2, 3, 5, 7, 8, 1]
sample2 = [1, 2, 3, 4, 6, 7, 2, 3, 5, 7]
sample3 = [1, 2, 3, 4, 6, 7, 2, 3, 26, 1, 11, 12]
sample4 = [8, 7, 3]
samples = [sample1, sample2, sample3, sample4]

features_, lengths_ = padding(samples, 10)

print(features_)
print(lengths_)

(4, 10)
[[ 4  6  7  2  3  5  7  8  1  0]
 [ 1  2  3  4  6  7  2  3  5  7]
 [ 1  2  3  4  6  7  2  3 26  1]
 [ 8  7  3  0  0  0  0  0  0  0]]
[9, 10, 10, 3]


In [33]:
# Scratch check: appending a zero-filled list to a short list with np.append.
seq_len = 50
kk = [1, 2, 3]
pad = [0 for _ in range(7)]
np.append(kk, pad)

array([1, 2, 3, 0, 0, 0, 0, 0, 0, 0])

In [59]:
# Pad/truncate both splits to 50 tokens per review and keep the true lengths.
x_train_p, x_train_len = padding(x_train, 50)
x_test_p, x_test_len = padding(x_test, 50)
# The original bound the test lengths to the misspelled name `x_text_len`;
# the alias keeps any code written against that name working.
x_text_len = x_test_len

(6920, 50)
(1821, 50)


In [60]:
# Show the first ten padded training rows (every row has only 50 columns).
x_train_p[:10]

array([[    2,  1501,    13,   493,    10,   903,    14,   504,    12,
          205,   563,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0],
       [    6,    22,     8,     2,   265,  1485,    10,    38,   725,
           16,    78,  1602, 11045,     5,    81,     2,  1607,     5,
          488,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0],
       [   33,    19,  1182,  2685,     8, 11230,   238,  2230,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,    

In [63]:
# Turn the Python label lists into NumPy arrays and confirm their sizes.
train_label, test_label = np.array(train_targets), np.array(test_targets)
print(train_label.shape)
print(test_label.shape)

(6920,)
(1821,)


## Model

In [None]:
def get_batches(features, labels, seq_len, batch_size):
    """
    Split features and labels into consecutive batches.

    The last batch is shorter when the number of samples is not a multiple of
    ``batch_size``. ``seq_len`` is accepted but not used.

    :param features: Sequence of features
    :param labels: Sequence of labels, same length as ``features``
    :param seq_len: Unused
    :param batch_size: The batch size
    :return: List of ``[features_batch, labels_batch]`` pairs
    """
    assert len(features) == len(labels)
    return [
        [features[lo:lo + batch_size], labels[lo:lo + batch_size]]
        for lo in range(0, len(features), batch_size)
    ]

In [None]:
def get_batches(x, y, seq_len, batch_size=100):
    """Yield ``(x_batch, y_batch)`` pairs of exactly ``batch_size`` samples.

    Trailing samples that do not fill a complete batch are dropped.
    ``seq_len`` is accepted but not used.
    """
    # Largest multiple of batch_size that fits into the data.
    usable = (len(x) // batch_size) * batch_size
    x, y = x[:usable], y[:usable]

    for start in range(0, len(x), batch_size):
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

In [64]:
def build_cell(lstm_units, keep_prob):
    """Create one LSTM cell whose outputs are passed through dropout.

    :param lstm_units: Number of units passed to ``BasicLSTMCell``.
    :param keep_prob: Value passed as ``output_keep_prob`` of the dropout wrapper.
    :return: The dropout-wrapped cell.
    """
    cell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
    # The class is `DropoutWrapper`; the original spelling `DropoutWapper`
    # raises AttributeError as soon as the function is called.
    return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)

In [None]:
# NOTE(review): this cell assumes `tf` (TensorFlow 1.x, which still ships
# `tf.contrib`) was imported in a cell that is not visible here — confirm.

# hyperparameters
batch_size = 500

embedding_dim = 300
# +1 because word indices start at 1; index 0 is the value used for padding.
vocab_size = len(vocab_to_int) + 1

num_lstm_layer = 1
# The original left this assignment empty (a SyntaxError). 128 is only a
# starting value — TODO tune.
lstm_size = 128
# The original used `learning_rate` without defining it; 0.001 is the Adam default.
learning_rate = 0.001

# input placeholders
# `name` has to be passed by keyword: a positional argument after `shape=` is a SyntaxError.
inputs = tf.placeholder(tf.int32, shape=[None, None], name='inputs')
labels = tf.placeholder(tf.int32, shape=[None, None], name='labels')
seq_len = tf.placeholder(tf.int32, shape=[None], name='seq_len')
keep_prob = tf.placeholder(tf.float32, name="keep_prob")


# embedding layer
# random_uniform takes (shape, minval, maxval). The original passed (-1, 1) to
# random_normal, where those positions mean (mean, stddev).
embedding = tf.Variable(tf.random_uniform((vocab_size, embedding_dim), -1, 1), name='embedding')
embed = tf.nn.embedding_lookup(embedding, inputs)


# LSTM layer
cell = tf.contrib.rnn.MultiRNNCell([build_cell(lstm_size, keep_prob) for _ in range(num_lstm_layer)])
# A fixed-size initial state means every fed batch must hold exactly batch_size rows.
initial_state = cell.zero_state(batch_size, tf.float32)

# Reviews are zero-padded on the right, so the true lengths are passed in to
# stop the recurrence at each review's last real token.
outputs, final_state = tf.nn.dynamic_rnn(
    cell, embed, sequence_length=seq_len, initial_state=initial_state
)

# With sequence_length set, outputs[:, -1] is zero for every review shorter
# than the padded width. The hidden state of the last layer holds the value at
# the last real token instead; dropout is applied to it explicitly because the
# cell wrapper only drops the per-step outputs.
last_hidden = tf.nn.dropout(final_state[-1].h, keep_prob)

predictions = tf.contrib.layers.fully_connected(last_hidden, 1, activation_fn=tf.sigmoid)
# The labels placeholder is named `labels`; the original referenced an undefined `labels_`.
cost = tf.losses.mean_squared_error(labels, predictions)

optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    