In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import spacy
import gensim
import matplotlib.pyplot as plt

import os
import csv
import time
import random
from datetime import datetime, timedelta

  from ._conv import register_converters as _register_converters


In [2]:
sess = None

def reset_vars():
    sess.run(tf.global_variables_initializer())

def reset_tf():
    global sess
    if sess:
        sess.close()
    tf.reset_default_graph()
    sess = tf.Session()

In [3]:
nlp = spacy.load('en_vectors_web_lg')

## Import data

Proof of concept data: Data is sentences from reviews on Yelp, IMDB, and Amazon. All sentences are labelled positive or negative--there's meant to be no neutral sentences in the data. Sentiment is binary.

In [4]:
data_dir = 'data/sentiment_sentences/'
file_names = ['amazon_cells_labelled.txt','imdb_labelled.txt','yelp_labelled.txt']

def read_data(file_name):
    return pd.read_csv(os.path.join(data_dir, file_name), sep='\t', header=None, quoting=csv.QUOTE_NONE)\
        .rename(columns={
            0: 'sentence',
            1: 'score',
        })
df = pd.concat([read_data(f) for f in file_names]).reset_index(drop=True)

In [5]:
df.head()

Unnamed: 0,sentence,score
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [6]:
df.head(1)['sentence'][0]

'So there is no way for me to plug it in here in the US unless I go by a converter.'

In [7]:
len(df)

3000

> Data set is small and we have to split to train. May not be enough to get good results. Lets find out how many words per sentence on avg...

Clean, remove punctuation

In [8]:
from gensim.corpora import Dictionary
from gensim.parsing.preprocessing import strip_numeric, strip_punctuation, strip_multiple_whitespaces
from gensim.parsing.preprocessing import preprocess_string

# prep_filters = [strip_punctuation, strip_numeric, strip_multiple_whitespaces]
prep_filters = [strip_punctuation, strip_multiple_whitespaces]

df['clean'] = df['sentence'].map(lambda s: ' '.join(preprocess_string(s.lower(), prep_filters)))
# ' '.join(preprocess_string(df.head(1)['sentence'][0], prep_filters))
df.head()

Unnamed: 0,sentence,score,clean
0,So there is no way for me to plug it in here i...,0,so there is no way for me to plug it in here i...
1,"Good case, Excellent value.",1,good case excellent value
2,Great for the jawbone.,1,great for the jawbone
3,Tied to charger for conversations lasting more...,0,tied to charger for conversations lasting more...
4,The mic is great.,1,the mic is great


# TODO: Baseline: bag of words model with LR or NB classifier

# LSTM with learned word embedding

Create word to index mapping

In [9]:
corpus = [sent.split() for sent in df['clean']]
dct = Dictionary(corpus)

dct.filter_extremes(no_below=5)
dct.compactify()

vocab_size = len(dct)

In [10]:
# exclude words that don't exist so we don't have to think about how to embed them :)
def sent2seq(sent, dct): 
    return [idx for idx in dct.doc2idx(sent.split()) if idx != -1]

In [11]:
for i in range(10):
    sample_sentence = df['clean'][i]
    seq = sent2seq(sample_sentence, dct)
    print(seq)

[12, 14, 7, 10, 18, 2, 9, 15, 11, 8, 6, 4, 6, 13, 17, 16, 5, 3, 1, 0]
[21, 19, 20, 22]
[23, 2, 13]
[15, 24, 2, 26, 28, 25, 27]
[13, 7, 23]
[5, 31, 15, 13, 11, 15, 30, 8, 15, 32, 34, 33, 15, 30, 29, 35]
[38, 45, 31, 42, 41, 42, 44, 13, 37, 39, 36, 39, 43, 40, 1, 40]
[38, 45, 46, 49, 48, 45, 47, 31, 50]
[15, 53, 5, 54, 52, 51]
[58, 0, 57, 39, 51, 55, 56]


In [12]:
df['seqlen'] = [len(sent2seq(s, dct)) for s in df['clean']]
df['is_valid_seq_gensim'] = df['seqlen'] > 0

In [13]:
df['seqlen'].mean(), df['seqlen'].median(), (~df['is_valid_seq_gensim']).sum()

(9.934, 9.0, 7)

> A typical sequence is about 9 words. Is this large? It seems like it may be too small given the limited number of sentences.

In [14]:
# OUTPUT_SIZE = 2
PAD_SIZE = 40 # 99th percentile (longer ones truncated)
BATCH_SIZE = 10

In [15]:
def pad(xs, pad_size=PAD_SIZE): # pass in as list, since next dim is not fixed size
    padded = np.zeros([len(xs), pad_size])
    lens = np.zeros(len(xs), dtype=np.int32)
    for i,vec in enumerate(xs): # by row
        if len(vec) > pad_size:
            vec = vec[:pad_size]
        
        padded[i,:len(vec)] = vec
        lens[i] = len(vec)
        
    return padded, lens

In [64]:
data_name = 'gensim indexes'

data, lengths = pad([sent2seq(s,dct) for s in df[df['is_valid_seq_gensim']]['clean']])
y_labels = np.array(df[df['is_valid_seq_gensim']]['score'])

print(data.shape, lengths.shape, y_labels.shape)

# do test train split
split_idxs = np.random.random(len(data)) < 0.8

x_train = data[split_idxs]
y_train = y_labels[split_idxs]
lengths_train = lengths[split_idxs]

x_test = data[~split_idxs]
y_test = y_labels[~split_idxs]
lengths_test = lengths[~split_idxs]

(2993, 40) (2993,) (2993,)


### Graph runner

In [43]:
def run_graph(x, seqlens, y_true, optimizer, loss, accuracy, num_epochs=20, feed_extra={}):

    # TODO do not use global vars!
    global data_name
    global x_train, y_train
    global x_test, y_test
    global lengths_train, lengths_test
    
    print('Using global data! - ' + data_name)
    
    reset_vars()

    metrics = {
        'train_loss': [],
        'train_acc': [],
        'test_loss': [],
        'test_acc' : [],
    }
    print_every = num_epochs // 10

    for i in range(num_epochs):

        shuffle_idxs = np.arange(len(x_train))
        np.random.shuffle(shuffle_idxs)

        x_train = x_train[shuffle_idxs]
        y_train = y_train[shuffle_idxs]
        lengths_train = lengths_train[shuffle_idxs]

        metrics['train_loss'].append(0)
        metrics['train_acc'].append(0)

        num_steps = len(x_train) // batch_size

        # loop through train data in batches
        for j in range(num_steps):

            start, end = j*batch_size, (j+1)*batch_size

            train_feed = {
                x: x_train[start:end],
                y_true: y_train[start:end],
                seqlens: lengths_train[start:end],
            }
            train_feed.update(feed_extra)
            
            sess.run(optimizer, feed_dict=train_feed)
            l, a = sess.run([loss, accuracy], feed_dict=train_feed)
            metrics['train_loss'][i] += l
            metrics['train_acc'][i] += a

        # calculate train metrics
        metrics['train_loss'][i] /= num_steps
        metrics['train_acc'][i] /= num_steps

        # prep test loop
        num_test_steps = len(x_test) // batch_size     # TODO this leaves out the last few data points..
        metrics['test_loss'].append(0)
        metrics['test_acc'].append(0)

        for k in range(num_test_steps):
            start, end = k*batch_size, (k+1)*batch_size
            
            test_feed = {
                x: x_test[start:end],
                y_true: y_test[start:end],
                seqlens: lengths_test[start:end]
            }
            test_feed.update(feed_extra)
            
            tl, ta = sess.run([loss, accuracy], feed_dict=test_feed)
            metrics['test_loss'][i] += tl
            metrics['test_acc'][i] += ta

        metrics['test_loss'][i] /= num_test_steps
        metrics['test_acc'][i] /= num_test_steps

#         print(i, i % print_every, print_every, num_epochs)
        if i % print_every == 0 or i == (num_epochs - 1):
            print("(epoch %i)\t Train: %0.5f, %0.5f \tTest: %0.5f, %0.5f" % (i, metrics['train_loss'][i], metrics['train_acc'][i], metrics['test_loss'][i], metrics['test_acc'][i]))
        
    return metrics
    

### Graph

In [18]:
def build_graph(batch_size=10, 
                rnn_size=25, 
                embedding_size=64,
                dropout_keepprob=0.8):

    reset_tf()

    x = tf.placeholder(tf.int32, shape=(batch_size, PAD_SIZE), name='x') # as indices of embedding
    seqlens = tf.placeholder(tf.int32, shape=[batch_size], name='seqlens')
    y_true = tf.placeholder(tf.float32, shape=[batch_size], name='y_true')

    # Start with embedding layer
    embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
    embedding_input = tf.nn.embedding_lookup(embeddings, x)

    # RNN - try also BasicRNNCell, GRUCell, BasicLSTMCell
    rnn_cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)

    # Iteratively compute output of recurrent network
    rnn_out, lstm_state = tf.nn.dynamic_rnn(rnn_cell, embedding_input,
                                            sequence_length=seqlens, dtype=tf.float32)

    rnn_out = tf.nn.dropout(rnn_out, dropout_keepprob)

    # Get single output accoring to each sequence length
    out = tf.gather_nd(rnn_out, tf.stack([tf.range(batch_size), seqlens-1], axis=1))

    # Linear activation (FC layer on top of the LSTM net)
    y = tf.layers.dense(out, 1, activation=None)
    y = tf.reshape(y,(batch_size,))
    
    return x, seqlens, y, y_true

### Build it, create metrics, run it

In [37]:
batch_size = BATCH_SIZE
x, seqlens, y, y_true = build_graph(rnn_size=12, embedding_size=10, dropout_keepprob=0.7)

preds = tf.nn.softmax(y)
label_predictions = preds > 0.5
correct = tf.equal(tf.cast(label_predictions, tf.int32), tf.cast(y_true, tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

ETA = 0.01

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_true))
optimizer = tf.train.AdamOptimizer(ETA).minimize(loss) 
# optimizer = tf.train.RMSPropOptimizer(ETA).minimize(loss)

In [20]:
run_metrics = run_graph(x, seqlens, y_true, optimizer, loss, accuracy, num_epochs=20)

Using global data! - gensim indexes
(epoch 0)	 Train: 0.59145, 0.50000 	Test: 0.53654, 0.52031
(epoch 2)	 Train: 0.27829, 0.56766 	Test: 0.59447, 0.55000
(epoch 4)	 Train: 0.17064, 0.58085 	Test: 0.56564, 0.57500
(epoch 6)	 Train: 0.11808, 0.58340 	Test: 0.66359, 0.58594
(epoch 8)	 Train: 0.08761, 0.58936 	Test: 0.83225, 0.57813
(epoch 10)	 Train: 0.07510, 0.58596 	Test: 0.86325, 0.58594
(epoch 12)	 Train: 0.06223, 0.58979 	Test: 1.07629, 0.56406
(epoch 14)	 Train: 0.05084, 0.58681 	Test: 1.09434, 0.56875
(epoch 16)	 Train: 0.04171, 0.58936 	Test: 1.10326, 0.59063
(epoch 18)	 Train: 0.02981, 0.58936 	Test: 1.26463, 0.58125
(epoch 19)	 Train: 0.04814, 0.59106 	Test: 1.16361, 0.57969


> Better than chance, but likely the model is hindered by the small amount of training data and the sparsity of words--after all the embedding has no way to use context and thus won't build very meaningful relationships

# LSTM with Spacy's GloVe word embeddings

In [21]:
# size of embedding matrix, Gb
nlp.vocab.vectors.data.nbytes / 1024**3

1.1969033628702164

In [22]:
# exclude words that don't have a vector for now
def sent2seq_glove(sent,nlp): 
    return [nlp.vocab[w].rank for w in sent.split() if nlp.vocab[w].has_vector]

Rebuild data using spacy's indexes

In [23]:
df['seqlen_glove'] = [len(sent2seq_glove(s,nlp)) for s in df['clean']]
df['is_valid_seq_glove'] = df['seqlen_glove'] > 0

In [24]:
data_name = 'spacy indexes'

data, lengths = pad([sent2seq_glove(s,nlp) for s in df[df['is_valid_seq_glove']]['clean']])
y_labels = np.array(df[df['is_valid_seq_glove']]['score'])

print(data.shape, lengths.shape, y_labels.shape)

# do test train split
split_idxs = np.random.random(len(df)) < 0.8

x_train = data[split_idxs]
y_train = y_labels[split_idxs]
lengths_train = lengths[split_idxs]

x_test = data[~split_idxs]
y_test = y_labels[~split_idxs]
lengths_test = lengths[~split_idxs]

(3000, 40) (3000,) (3000,)


In [25]:
df['seqlen_glove'].mean(), df['seqlen_glove'].median(), (~df['is_valid_seq_glove']).sum()

(12.088333333333333, 10.0, 0)

> Fortunately this embedding is able to capture more words. 

Graph is nearly the exact same except for the embedding layer. It is no longer learned, but constant.

In [56]:
# INPUT_SIZE = 300

def build_graph_glove(batch_size=10, 
                     rnn_size=25,
                     dropout_keepprob=0.8):

    reset_tf()

    x = tf.placeholder(tf.int32, shape=(batch_size, PAD_SIZE), name='x') # as indices of embedding
    seqlens = tf.placeholder(tf.int32, shape=[batch_size], name='seqlens')
    y_true = tf.placeholder(tf.float32, shape=[batch_size], name='y_true')

    # Start with embedding layer
    embedding_matrix = tf.placeholder(shape=nlp.vocab.vectors.data.shape, 
                                      dtype=tf.float32, name='embedding_matrix')
    rnn_input = tf.nn.embedding_lookup(embedding_matrix, x)
    
    # optional dense layer...
#     rnn_input = tf.layers.dense(embedding_input, 64, activation=tf.nn.elu)

    # RNN - try also BasicRNNCell, GRUCell, BasicLSTMCell
    rnn_cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)

    # Iteratively compute output of recurrent network
    rnn_out, lstm_state = tf.nn.dynamic_rnn(rnn_cell, rnn_input, 
                                            sequence_length=seqlens, dtype=tf.float32)

    rnn_out = tf.nn.dropout(rnn_out, dropout_keepprob)

    # Get single output accoring to each sequence length
    out = tf.gather_nd(rnn_out, tf.stack([tf.range(batch_size), seqlens-1], axis=1))

    # Linear activation (FC layer on top of the LSTM net)
    y = tf.layers.dense(out, 1, activation=None)
    y = tf.reshape(y,(batch_size,))
    
    return x, seqlens, y, y_true, embedding_matrix

In [58]:
x, seqlens, y, y_true, embedding_matrix = build_graph_glove(rnn_size=12, dropout_keepprob=0.7) #batch_size=

preds = tf.nn.softmax(y)
label_predictions = preds > 0.5
correct = tf.equal(tf.cast(label_predictions, tf.int32), tf.cast(y_true, tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

ETA = 0.01

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_true))
optimizer = tf.train.AdamOptimizer(ETA).minimize(loss) 
# optimizer = tf.train.RMSPropOptimizer(ETA).minimize(loss)

In [60]:
run_metrics = run_graph(x, seqlens, y_true, optimizer, loss, accuracy, num_epochs=20, 
                        feed_extra={embedding_matrix:nlp.vocab.vectors.data})

Using global data! - spacy indexes
(epoch 0)	 Train: 0.40510, 0.53792 	Test: 0.35100, 0.50847
(epoch 2)	 Train: 0.18295, 0.58125 	Test: 0.32369, 0.54576
(epoch 4)	 Train: 0.11481, 0.59000 	Test: 0.41696, 0.55593
(epoch 6)	 Train: 0.08695, 0.59292 	Test: 0.41609, 0.54915
(epoch 8)	 Train: 0.06195, 0.59500 	Test: 0.52518, 0.55424
(epoch 10)	 Train: 0.04288, 0.59792 	Test: 0.59999, 0.54237
(epoch 12)	 Train: 0.04089, 0.59625 	Test: 0.47918, 0.55593
(epoch 14)	 Train: 0.08053, 0.59500 	Test: 0.54101, 0.54915
(epoch 16)	 Train: 0.04207, 0.59833 	Test: 0.53253, 0.55932
(epoch 18)	 Train: 0.03377, 0.59667 	Test: 0.54245, 0.55593
(epoch 19)	 Train: 0.04224, 0.59875 	Test: 0.64583, 0.55254


> No great improvement (but at least it's not eating massive amount of memory now!)

# Bidirectional LSTM

In [61]:
# INPUT_SIZE = 300

def build_graph_bidir(batch_size=10, 
                      rnn_size=25, 
                      embedding_size=64,
                      dropout_keepprob=0.8):

    reset_tf()

    x = tf.placeholder(tf.int32, shape=(batch_size, PAD_SIZE), name='x') # as indices of embedding
    seqlens = tf.placeholder(tf.int32, shape=[batch_size], name='seqlens')
    y_true = tf.placeholder(tf.float32, shape=[batch_size], name='y_true')

#     # Start with embedding layer
#     embeddings = tf.Variable(nlp.vocab.vectors.data, trainable=False)
#     embedding_input = tf.nn.embedding_lookup(embeddings, x)
    # Start with embedding layer
    embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
    embedding_input = tf.nn.embedding_lookup(embeddings, x)

    # RNN - try also BasicRNNCell, GRUCell, BasicLSTMCell
    rnn_fw = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    rnn_bw = tf.contrib.rnn.BasicLSTMCell(rnn_size)

    rnn_outs, rnn_states  = tf.nn.bidirectional_dynamic_rnn(
                                cell_fw=rnn_fw,
                                cell_bw=rnn_bw,
                                inputs=embedding_input,
                                sequence_length=seqlens, dtype=tf.float32)
 
    out_fw, out_bw = rnn_outs
    state_fw, state_bw = rnn_states
    
    out_fw = tf.gather_nd(out_fw, tf.stack([tf.range(batch_size), seqlens-1], axis=1))
    out_bw = tf.gather_nd(out_bw, tf.stack([tf.range(batch_size), seqlens-1], axis=1))
    
#     print(out_fw)
    rnn_out = tf.concat([out_fw, out_bw], axis=1)
#     print(rnn_out)
    
    out = tf.nn.dropout(rnn_out, dropout_keepprob)    

    # Linear activation (FC layer on top of the LSTM net)
    y = tf.layers.dense(out, 1, activation=None)
    y = tf.reshape(y,(batch_size,))
    
    return x, seqlens, y, y_true

In [62]:
x, seqlens, y, y_true = build_graph_bidir(rnn_size=8, embedding_size=10, dropout_keepprob=0.7) #batch_size=

preds = tf.nn.softmax(y)
label_predictions = preds > 0.5
correct = tf.equal(tf.cast(label_predictions, tf.int32), tf.cast(y_true, tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

ETA = 0.01

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_true))
optimizer = tf.train.AdamOptimizer(ETA).minimize(loss) 
# optimizer = tf.train.RMSPropOptimizer(ETA).minimize(loss)

Instructions for updating:
seq_dim is deprecated, use seq_axis instead
Instructions for updating:
batch_dim is deprecated, use batch_axis instead


In [65]:
run_metrics = run_graph(x, seqlens, y_true, optimizer, loss, accuracy, num_epochs=20)

Using global data! - gensim indexes
(epoch 0)	 Train: 0.63287, 0.49325 	Test: 0.56135, 0.53115
(epoch 2)	 Train: 0.29969, 0.56034 	Test: 0.47250, 0.59672
(epoch 4)	 Train: 0.20060, 0.56920 	Test: 0.47774, 0.60000
(epoch 6)	 Train: 0.14366, 0.57637 	Test: 0.58261, 0.60000
(epoch 8)	 Train: 0.10701, 0.57511 	Test: 0.65026, 0.60656
(epoch 10)	 Train: 0.08950, 0.58397 	Test: 0.58054, 0.60492
(epoch 12)	 Train: 0.08199, 0.58228 	Test: 0.85413, 0.59836
(epoch 14)	 Train: 0.07426, 0.58650 	Test: 0.86435, 0.60656
(epoch 16)	 Train: 0.05752, 0.58650 	Test: 0.97854, 0.60164
(epoch 18)	 Train: 0.06786, 0.58481 	Test: 1.04995, 0.58197
(epoch 19)	 Train: 0.08466, 0.58608 	Test: 0.85015, 0.59836


> A little improvment over a single LSTM here, but once I move on to longer text it may begin to make a difference.