In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import spacy
import gensim

import os
import csv
import time
import random
from datetime import datetime, timedelta

  from ._conv import register_converters as _register_converters


In [2]:
sess = None

def reset_vars():
    sess.run(tf.global_variables_initializer())

def reset_tf():
    global sess
    if sess:
        sess.close()
    tf.reset_default_graph()
    sess = tf.Session()

In [3]:
nlp = spacy.load('en_vectors_web_lg')

## Import data

Data is sentences from reviews on Yelp, IMDB, and Amazon. All sentences are labelled positive or negative--there's meant to be no neutral sentences in the data.

In [4]:
data_dir = 'data/sentiment_sentences/'
file_names = ['amazon_cells_labelled.txt','imdb_labelled.txt','yelp_labelled.txt']

def read_data(file_name):
    return pd.read_csv(os.path.join(data_dir, file_name), sep='\t', header=None, quoting=csv.QUOTE_NONE)\
        .rename(columns={
            0: 'sentence',
            1: 'score',
        })
df = pd.concat([read_data(f) for f in file_names]).reset_index(drop=True)

In [5]:
df.head()

Unnamed: 0,sentence,score
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [6]:
df.head(1)['sentence'][0]

'So there is no way for me to plug it in here in the US unless I go by a converter.'

Clean, remove punctuation

In [7]:
from gensim.parsing.preprocessing import strip_numeric, strip_punctuation, strip_multiple_whitespaces
from gensim.parsing.preprocessing import preprocess_string

# prep_filters = [strip_punctuation, strip_numeric, strip_multiple_whitespaces]
prep_filters = [strip_punctuation, strip_multiple_whitespaces]

df['clean'] = df['sentence'].map(lambda s: ' '.join(preprocess_string(s.lower(), prep_filters)))
# ' '.join(preprocess_string(df.head(1)['sentence'][0], prep_filters))
df.head()

Unnamed: 0,sentence,score,clean
0,So there is no way for me to plug it in here i...,0,so there is no way for me to plug it in here i...
1,"Good case, Excellent value.",1,good case excellent value
2,Great for the jawbone.,1,great for the jawbone
3,Tied to charger for conversations lasting more...,0,tied to charger for conversations lasting more...
4,The mic is great.,1,the mic is great


# Baseline: bag of words model with LR, NB

# LSTM with learned word embedding

Create word to index mapping

In [8]:
from gensim.corpora import Dictionary
corpus = [sent.split() for sent in df['clean']]
dct = Dictionary(corpus)

dct.filter_extremes(no_below=5)
dct.compactify()

vocab_size = len(dct)

In [9]:
# exclude words that don't exist so we don't have to think about how to embed them :)
def sent2seq(sent): 
    return [idx for idx in dct.doc2idx(sent.split()) if idx != -1]

In [10]:
df['seqlen'] = [len(sent2seq(s)) for s in df['clean']]
df = df[df['seqlen'] > 0]

In [11]:
for i in range(10):
    sample_sentence = df['clean'][i]
    seq = sent2seq(sample_sentence)
    print(seq)

[12, 14, 7, 10, 18, 2, 9, 15, 11, 8, 6, 4, 6, 13, 17, 16, 5, 3, 1, 0]
[21, 19, 20, 22]
[23, 2, 13]
[15, 24, 2, 26, 28, 25, 27]
[13, 7, 23]
[5, 31, 15, 13, 11, 15, 30, 8, 15, 32, 34, 33, 15, 30, 29, 35]
[38, 45, 31, 42, 41, 42, 44, 13, 37, 39, 36, 39, 43, 40, 1, 40]
[38, 45, 46, 49, 48, 45, 47, 31, 50]
[15, 53, 5, 54, 52, 51]
[58, 0, 57, 39, 51, 55, 56]


In [12]:
INPUT_SIZE = 300
# OUTPUT_SIZE = 2
PAD_SIZE = 40 # 99th percentile (longer ones truncated)
BATCH_SIZE = 10
LSTM_SIZE = 24

In [13]:
def pad(xs, pad_size=PAD_SIZE): # pass in as list, since next dim is not fixed size
    padded = np.zeros([len(xs), pad_size])
    lens = np.zeros(len(xs), dtype=np.int32)
    for i,vec in enumerate(xs): # by row
        if len(vec) > pad_size:
            vec = vec[:pad_size]
        
        padded[i,:len(vec)] = vec
        lens[i] = len(vec)
        
    return padded, lens

data, lengths = pad([sent2seq(s) for s in df['clean']])
data.shape, lengths.shape

((2993, 40), (2993,))

In [14]:
y_labels = np.array(df['score'])
# y_labels = np.array([np.array([1-s,s]) for s in df['score']])

# do test train split
split_idxs = np.random.random(len(df)) < 0.8

x_train = data[split_idxs]
y_train = y_labels[split_idxs]
lengths_train = lengths[split_idxs]

x_test = data[~split_idxs]
y_test = y_labels[~split_idxs]
lengths_test = lengths[~split_idxs]

Build the graph

In [50]:
def build_lstm(batch_size=BATCH_SIZE, 
               lstm_size=LSTM_SIZE, 
               embedding_size=64):

    reset_tf()

    x = tf.placeholder(tf.int32, shape=(batch_size, PAD_SIZE), name='x') # as indices of embedding
    seqlens = tf.placeholder(tf.int32, shape=[batch_size], name='seqlens')
    y_true = tf.placeholder(tf.float32, shape=[batch_size], name='y_true')

    # Start with embedding layer
    embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
    # embeddings = tf.get_variable('embeddings', [vocab_size, embedding_size])
    embedding_input = tf.nn.embedding_lookup(embeddings, x)

    # RNN - try also BasicRNNCell, GRUCell, BasicLSTMCell
    lstm = tf.contrib.rnn.GRUCell(lstm_size)
    lstm_init_state = lstm.zero_state(batch_size, tf.float32)

    # Iteratively compute output of recurrent network
    rnn_out, lstm_state = tf.nn.dynamic_rnn(lstm, embedding_input, initial_state=lstm_init_state,
                                            sequence_length=seqlens, dtype=tf.float32)

    rnn_out = tf.nn.dropout(rnn_out, 0.9)

    # Get single output accoring to each sequence length
    out = tf.gather_nd(rnn_out, tf.stack([tf.range(batch_size), seqlens-1], axis=1))

    # Linear activation (FC layer on top of the LSTM net)
    y = tf.layers.dense(out, 1, activation=None)
    y = tf.reshape(y,(batch_size,))
    
    return x, seqlens, y, y_true

In [51]:
batch_size = BATCH_SIZE
x, seqlens, y, y_true = build_lstm(lstm_size=48)

preds = tf.nn.softmax(y)
label_predictions = preds > 0.5
correct = tf.equal(tf.cast(label_predictions, tf.int32), tf.cast(y_true, tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

ETA = 0.01

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_true))
optimizer = tf.train.AdamOptimizer(ETA).minimize(loss) 
# optimizer = tf.train.RMSPropOptimizer(ETA).minimize(loss)

In [52]:
reset_vars()

num_epochs = 20

metrics = {
    'train_loss': [],
    'train_acc': [],
    'test_loss': [],
    'test_acc' : [],
}

for i in range(num_epochs):
    
    shuffle_idxs = np.arange(len(x_train))
    np.random.shuffle(shuffle_idxs)

    x_train = x_train[shuffle_idxs]
    y_train = y_train[shuffle_idxs]
    lengths_train = lengths_train[shuffle_idxs]
    
    metrics['train_loss'].append(0)
    metrics['train_acc'].append(0)
    
    num_steps = len(x_train) // batch_size
#     print(i, metrics['train_loss'][i])

    # loop through train data in batches
    for j in range(num_steps):

        start, end = j*batch_size, (j+1)*batch_size
#         print(x_train[start:end])
#         print(y_train[start:end])
#         print(lengths_train[start:end])
#         continue
        train_feed = {
            x: x_train[start:end],
            y_true: y_train[start:end],
            seqlens: lengths_train[start:end],
        }

        sess.run(optimizer, feed_dict=train_feed)
        l, a = sess.run([loss, accuracy], feed_dict=train_feed)
        metrics['train_loss'][i] += l
        metrics['train_acc'][i] += a

    # calculate train metrics
    metrics['train_loss'][i] /= num_steps
    metrics['train_acc'][i] /= num_steps
    
    # prep test loop
    num_test_steps = len(x_test) // batch_size     # TODO this leaves out the last few data points..
    metrics['test_loss'].append(0)
    metrics['test_acc'].append(0)
    
    for k in range(num_test_steps):
        start, end = k*batch_size, (k+1)*batch_size
        tl, ta = sess.run([loss, accuracy], feed_dict={
            x: x_test[start:end],
            y_true: y_test[start:end],
            seqlens: lengths_test[start:end]
        })
        metrics['test_loss'][i] += tl
        metrics['test_acc'][i] += ta
        
    metrics['test_loss'][i] /= num_test_steps
    metrics['test_acc'][i] /= num_test_steps
    
    # print
    print("(epoch %i)\t Train: %0.5f, %0.5f \tTest: %0.5f, %0.5f" % (i, metrics['train_loss'][i], metrics['train_acc'][i], metrics['test_loss'][i], metrics['test_acc'][i]))
    


(epoch 0)	 Train: 0.49298, 0.51271 	Test: 0.42506, 0.55556
(epoch 1)	 Train: 0.25656, 0.56441 	Test: 0.38735, 0.58730
(epoch 2)	 Train: 0.13795, 0.56949 	Test: 0.42265, 0.60000
(epoch 3)	 Train: 0.07666, 0.57119 	Test: 0.54989, 0.58730
(epoch 4)	 Train: 0.05818, 0.57797 	Test: 0.55915, 0.59048
(epoch 5)	 Train: 0.03893, 0.58093 	Test: 0.68873, 0.58571
(epoch 6)	 Train: 0.02980, 0.58390 	Test: 0.75989, 0.59841
(epoch 7)	 Train: 0.02909, 0.58771 	Test: 0.65924, 0.58889
(epoch 8)	 Train: 0.03809, 0.58390 	Test: 0.68514, 0.58413
(epoch 9)	 Train: 0.04102, 0.58220 	Test: 0.69925, 0.58571
(epoch 10)	 Train: 0.03802, 0.58602 	Test: 0.80177, 0.58730
(epoch 11)	 Train: 0.02526, 0.58729 	Test: 0.78708, 0.58095
(epoch 12)	 Train: 0.04061, 0.58686 	Test: 0.81123, 0.58254
(epoch 13)	 Train: 0.05178, 0.58559 	Test: 0.73437, 0.58889
(epoch 14)	 Train: 0.03231, 0.58771 	Test: 0.82247, 0.58889
(epoch 15)	 Train: 0.02942, 0.58347 	Test: 0.87045, 0.58889
(epoch 16)	 Train: 0.02062, 0.58559 	Test: 0.93426

# Bidirectional LSTM

# LSTM with Spacy's GloVe word embeddings

In [None]:
# Load all embeddings into memory? Sure why not, dataset is small
glove = np.zeros([len(dct), 300])
for i, word in dct.items():
    glove[i,:] = nlp.vocab['word'].vector

In [None]:
# padding

Graph is nearly the same except for the embedding layer. It is no longer learned, but constant.