In [None]:
import csv
import os
import random
import time
from datetime import datetime, timedelta
from types import SimpleNamespace

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf

In [None]:
# Notebook-wide TensorFlow session handle. It stays None until reset_tf() creates a
# session; reset_vars() and the training loop read it as a global.
sess = None

def reset_vars():
    """Re-initialise every global variable of the current graph in the shared session.

    Uses the notebook-level ``sess``; call ``reset_tf()`` first so a session exists.
    """
    session = sess
    session.run(tf.global_variables_initializer())

def reset_tf():
    """Start from a clean TensorFlow state.

    Closes the current shared session (if there is one), clears the default
    graph, and stores a brand-new session in the global ``sess``.
    """
    global sess
    previous = sess
    if previous:
        previous.close()
    tf.reset_default_graph()
    sess = tf.Session()

In [None]:
# Load spaCy's 'en_vectors_web_lg' model once; later cells read its vocabulary
# (nlp.vocab) and vector table (nlp.vocab.vectors) for the GloVe experiments.
nlp = spacy.load('en_vectors_web_lg')

# All below in progress....

### Graph runner

In [None]:
def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when no batch was processed (``count == 0``)."""
    return total / count if count else float('nan')


def run_graph(data_dto, x, seqlens, y_true, optimizer, loss, accuracy, num_epochs=20, feed_extra=None):
    """Train the current graph for ``num_epochs`` epochs and evaluate it after each one.

    Reads the notebook globals ``sess`` (session to run in) and ``batch_size``
    (rows fed per step), and calls ``reset_vars()`` so every run starts from
    freshly initialised variables.

    Args:
        data_dto: object exposing ``name``, ``x_train``, ``x_test``, ``y_train``,
            ``y_test``, ``l_train`` and ``l_test``. The arrays are indexed with an
            integer index array and sliced, so they are expected to be numpy arrays.
        x, seqlens, y_true: feed keys for inputs, sequence lengths and labels.
        optimizer: training op run once per batch.
        loss, accuracy: tensors evaluated per batch and averaged per epoch.
        num_epochs: number of passes over the training data.
        feed_extra: optional extra feed entries added to every train and test feed.

    Returns:
        dict with keys ``'train_loss'``, ``'train_acc'``, ``'test_loss'`` and
        ``'test_acc'``, each a list holding one per-epoch mean. A mean is NaN
        when the corresponding split holds less than one full batch.
    """
    # TODO do not use global vars! (sess, batch_size and reset_vars are still globals)
    data_name = data_dto.name
    x_train, x_test = data_dto.x_train, data_dto.x_test
    y_train, y_test = data_dto.y_train, data_dto.y_test
    lengths_train, lengths_test = data_dto.l_train, data_dto.l_test

    # A None default avoids sharing one mutable dict between calls.
    feed_extra = {} if feed_extra is None else feed_extra

    print('Using data: ' + data_name)

    reset_vars()

    metrics = {
        'train_loss': [],
        'train_acc': [],
        'test_loss': [],
        'test_acc' : [],
    }
    # Report about ten times per run. Clamped to 1 because num_epochs < 10 would
    # otherwise give 0 and make the modulo below raise ZeroDivisionError.
    print_every = max(1, num_epochs // 10)

    # Only full batches are fed; the step counts do not change between epochs.
    num_steps = len(x_train) // batch_size
    num_test_steps = len(x_test) // batch_size     # TODO this leaves out the last few data points..

    for i in range(num_epochs):

        # Reshuffle the (local) training arrays each epoch, keeping them aligned.
        shuffle_idxs = np.random.permutation(len(x_train))
        x_train = x_train[shuffle_idxs]
        y_train = y_train[shuffle_idxs]
        lengths_train = lengths_train[shuffle_idxs]

        train_loss, train_acc = 0, 0

        # loop through train data in batches
        for j in range(num_steps):

            start, end = j*batch_size, (j+1)*batch_size

            train_feed = {
                x: x_train[start:end],
                y_true: y_train[start:end],
                seqlens: lengths_train[start:end],
            }
            train_feed.update(feed_extra)

            sess.run(optimizer, feed_dict=train_feed)
            # Metrics are measured in a second pass, i.e. after this batch's update.
            l, a = sess.run([loss, accuracy], feed_dict=train_feed)
            train_loss += l
            train_acc += a

        # calculate train metrics
        metrics['train_loss'].append(_mean_or_nan(train_loss, num_steps))
        metrics['train_acc'].append(_mean_or_nan(train_acc, num_steps))

        test_loss, test_acc = 0, 0

        for k in range(num_test_steps):
            start, end = k*batch_size, (k+1)*batch_size

            test_feed = {
                x: x_test[start:end],
                y_true: y_test[start:end],
                seqlens: lengths_test[start:end]
            }
            test_feed.update(feed_extra)

            tl, ta = sess.run([loss, accuracy], feed_dict=test_feed)
            test_loss += tl
            test_acc += ta

        metrics['test_loss'].append(_mean_or_nan(test_loss, num_test_steps))
        metrics['test_acc'].append(_mean_or_nan(test_acc, num_test_steps))

        if i % print_every == 0 or i == (num_epochs - 1):
            print("(epoch %i)\t Train: %0.5f, %0.5f \tTest: %0.5f, %0.5f" % (i, metrics['train_loss'][i], metrics['train_acc'][i], metrics['test_loss'][i], metrics['test_acc'][i]))

    return metrics
    

### Graph

In [None]:
def build_graph(batch_size=10,
                rnn_size=25,
                embedding_size=64,
                dropout_keepprob=0.8):
    """Build an LSTM classifier on top of a learned embedding layer.

    Calls ``reset_tf()`` first, so any previously built graph and session are
    discarded. Reads the notebook globals ``PAD_SIZE`` (padded sequence length)
    and ``vocab_size`` (rows of the embedding table).

    Args:
        batch_size: fixed number of rows every fed batch must have.
        rnn_size: number of units in the LSTM cell.
        embedding_size: width of each learned embedding vector.
        dropout_keepprob: keep probability passed to ``tf.nn.dropout``. It is a
            plain constant in the graph, so dropout is applied on every run of
            the graph, including evaluation.

    Returns:
        ``(x, seqlens, y, y_true)``: the index, length and label placeholders,
        plus ``y``, one logit per example reshaped to ``(batch_size,)``.
    """
    reset_tf()

    # Inputs: padded token indices, true lengths, and labels.
    x = tf.placeholder(tf.int32, shape=(batch_size, PAD_SIZE), name='x')
    seqlens = tf.placeholder(tf.int32, shape=[batch_size], name='seqlens')
    y_true = tf.placeholder(tf.float32, shape=[batch_size], name='y_true')

    # Embedding table learned from scratch, initialised uniformly in [-1, 1).
    initial_table = tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0)
    embedding_table = tf.Variable(initial_table)
    embedded_tokens = tf.nn.embedding_lookup(embedding_table, x)

    # Recurrent layer (BasicRNNCell / GRUCell are drop-in alternatives to try).
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    step_outputs, _final_state = tf.nn.dynamic_rnn(
        lstm_cell,
        embedded_tokens,
        sequence_length=seqlens,
        dtype=tf.float32,
    )
    step_outputs = tf.nn.dropout(step_outputs, dropout_keepprob)

    # Pick, for every row, the output at its last real (unpadded) time step.
    row_ids = tf.range(batch_size)
    last_steps = seqlens - 1
    last_output = tf.gather_nd(step_outputs, tf.stack([row_ids, last_steps], axis=1))

    # Linear read-out: one logit per example.
    logits = tf.layers.dense(last_output, 1, activation=None)
    y = tf.reshape(logits, (batch_size,))

    return x, seqlens, y, y_true

### Build it, create metrics, run it

In [None]:
batch_size = BATCH_SIZE
# Build the placeholders with the same batch size run_graph slices the data with;
# relying on build_graph's default (10) breaks as soon as BATCH_SIZE differs.
x, seqlens, y, y_true = build_graph(batch_size=batch_size, rnn_size=12, embedding_size=10, dropout_keepprob=0.7)

# y holds one logit per example, so the per-example probability is the element-wise
# sigmoid (matching the loss below). A softmax here would normalise across the batch.
preds = tf.nn.sigmoid(y)
label_predictions = preds > 0.5
# NOTE(review): the int cast assumes y_true holds 0/1 labels - confirm.
correct = tf.equal(tf.cast(label_predictions, tf.int32), tf.cast(y_true, tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

ETA = 0.01  # learning rate

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_true))
optimizer = tf.train.AdamOptimizer(ETA).minimize(loss) 
# optimizer = tf.train.RMSPropOptimizer(ETA).minimize(loss)

In [None]:
# Train and evaluate the learned-embedding LSTM on the gensim-indexed data set.
run_metrics = run_graph(
    data_dto=data_gensim_dct,
    x=x,
    seqlens=seqlens,
    y_true=y_true,
    optimizer=optimizer,
    loss=loss,
    accuracy=accuracy,
    num_epochs=20,
)

> Better than chance, but the model is likely hindered by the small amount of training data and the sparsity of words. After all, the embedding has no way to use context and thus won't build very meaningful relationships.

# LSTM with Spacy's GloVe word embeddings

In [None]:
# Memory footprint of spaCy's vector table, in GiB (bytes / 2**30)
nlp.vocab.vectors.data.nbytes / 2 ** 30

In [None]:
def sent2seq_glove(sent, nlp):
    """Convert a whitespace-separated sentence into a list of vocabulary ranks.

    Each token is looked up in ``nlp.vocab``; tokens whose entry reports
    ``has_vector`` as false are skipped for now. The result holds the entry's
    ``rank`` (the index of the vector), not the vector itself.
    """
    ranks = []
    for token in sent.split():
        lexeme = nlp.vocab[token]
        if lexeme.has_vector:
            ranks.append(lexeme.rank)
    return ranks

Rebuild data using spacy's indexes

In [None]:
# Number of tokens per cleaned sentence that have a vector in spaCy's vocabulary
df['seqlen_glove'] = df['clean'].map(lambda sentence: len(sent2seq_glove(sentence, nlp)))
# Flag the rows that keep at least one token
df['is_valid_seq_glove'] = df['seqlen_glove'].gt(0)

In [None]:
data_name = 'spacy indexes'

# Only rows with at least one in-vocabulary token are used; filtering once keeps
# the sequences and the labels aligned.
valid_rows = df[df['is_valid_seq_glove']]

data, lengths = pad([sent2seq_glove(s,nlp) for s in valid_rows['clean']])
y_labels = np.array(valid_rows['score'])

print(data.shape, lengths.shape, y_labels.shape)

# do test train split
# The mask has one entry per *kept* row: sizing it with len(df) breaks the boolean
# indexing below as soon as any row was filtered out.
split_idxs = np.random.random(len(data)) < 0.8

x_train = data[split_idxs]
y_train = y_labels[split_idxs]
lengths_train = lengths[split_idxs]

x_test = data[~split_idxs]
y_test = y_labels[~split_idxs]
lengths_test = lengths[~split_idxs]

In [None]:
# (mean length, median length, number of rows with no usable token)
(
    df['seqlen_glove'].mean(),
    df['seqlen_glove'].median(),
    (~df['is_valid_seq_glove']).sum(),
)

> Fortunately this embedding is able to capture more words. 

The graph is nearly identical to the previous one, except for the embedding layer: it is no longer learned, but constant.

In [None]:
# INPUT_SIZE = 300

def build_graph_glove(batch_size=10,
                     rnn_size=25,
                     dropout_keepprob=0.8):
    """Build an LSTM classifier whose embedding table is fed in rather than learned.

    Calls ``reset_tf()`` first, so any previously built graph and session are
    discarded. Reads the notebook globals ``PAD_SIZE`` and ``nlp``; the embedding
    placeholder takes its shape from ``nlp.vocab.vectors.data``.

    Args:
        batch_size: fixed number of rows every fed batch must have.
        rnn_size: number of units in the LSTM cell.
        dropout_keepprob: keep probability passed to ``tf.nn.dropout``. It is a
            plain constant in the graph, so dropout is applied on every run of
            the graph, including evaluation.

    Returns:
        ``(x, seqlens, y, y_true, embedding_matrix)``: the index, length and label
        placeholders, the per-example logits ``y`` of shape ``(batch_size,)``, and
        the embedding placeholder that has to be fed on every run.
    """
    reset_tf()

    # Inputs: padded embedding-row indices, true lengths, and labels.
    x = tf.placeholder(tf.int32, shape=(batch_size, PAD_SIZE), name='x')
    seqlens = tf.placeholder(tf.int32, shape=[batch_size], name='seqlens')
    y_true = tf.placeholder(tf.float32, shape=[batch_size], name='y_true')

    # Constant embedding table: supplied through the feed instead of a variable.
    table_shape = nlp.vocab.vectors.data.shape
    embedding_matrix = tf.placeholder(shape=table_shape, dtype=tf.float32,
                                      name='embedding_matrix')
    embedded_tokens = tf.nn.embedding_lookup(embedding_matrix, x)

    # An optional dense (e.g. 64-unit ELU) projection of the embedded tokens could
    # be inserted here before the recurrent layer.

    # Recurrent layer (BasicRNNCell / GRUCell are drop-in alternatives to try).
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    step_outputs, _final_state = tf.nn.dynamic_rnn(
        lstm_cell,
        embedded_tokens,
        sequence_length=seqlens,
        dtype=tf.float32,
    )
    step_outputs = tf.nn.dropout(step_outputs, dropout_keepprob)

    # Pick, for every row, the output at its last real (unpadded) time step.
    row_ids = tf.range(batch_size)
    last_steps = seqlens - 1
    last_output = tf.gather_nd(step_outputs, tf.stack([row_ids, last_steps], axis=1))

    # Linear read-out: one logit per example.
    logits = tf.layers.dense(last_output, 1, activation=None)
    y = tf.reshape(logits, (batch_size,))

    return x, seqlens, y, y_true, embedding_matrix

In [None]:
# Build the placeholders with the same batch size run_graph slices the data with;
# relying on build_graph_glove's default (10) breaks as soon as batch_size differs.
x, seqlens, y, y_true, embedding_matrix = build_graph_glove(batch_size=batch_size, rnn_size=12, dropout_keepprob=0.7)

# y holds one logit per example, so the per-example probability is the element-wise
# sigmoid (matching the loss below). A softmax here would normalise across the batch.
preds = tf.nn.sigmoid(y)
label_predictions = preds > 0.5
# NOTE(review): the int cast assumes y_true holds 0/1 labels - confirm.
correct = tf.equal(tf.cast(label_predictions, tf.int32), tf.cast(y_true, tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

ETA = 0.01  # learning rate

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_true))
optimizer = tf.train.AdamOptimizer(ETA).minimize(loss) 
# optimizer = tf.train.RMSPropOptimizer(ETA).minimize(loss)

In [None]:
# run_graph expects the data set as its first argument (an object exposing name,
# x_/y_/l_train and x_/y_/l_test); bundle the spaCy-indexed split built above.
data_glove_dto = SimpleNamespace(
    name=data_name,
    x_train=x_train, x_test=x_test,
    y_train=y_train, y_test=y_test,
    l_train=lengths_train, l_test=lengths_test,
)

run_metrics = run_graph(data_glove_dto, x, seqlens, y_true, optimizer, loss, accuracy, num_epochs=20, 
                        feed_extra={embedding_matrix:nlp.vocab.vectors.data})

> No great improvement (but at least it's not eating massive amount of memory now!)

# Bidirectional LSTM

In [None]:
# INPUT_SIZE = 300

def build_graph_bidir(batch_size=10, 
                      rnn_size=25, 
                      embedding_size=64,
                      dropout_keepprob=0.8):
    """Build a bidirectional LSTM classifier on top of a learned embedding layer.

    Calls ``reset_tf()`` first, so any previously built graph and session are
    discarded. Reads the notebook globals ``PAD_SIZE`` and ``vocab_size``.

    Args:
        batch_size: fixed number of rows every fed batch must have.
        rnn_size: number of units in each of the two LSTM cells.
        embedding_size: width of each learned embedding vector.
        dropout_keepprob: keep probability passed to ``tf.nn.dropout``. It is a
            plain constant in the graph, so dropout is applied on every run of
            the graph, including evaluation.

    Returns:
        ``(x, seqlens, y, y_true)``: the index, length and label placeholders,
        plus ``y``, one logit per example reshaped to ``(batch_size,)``.
    """
    reset_tf()

    x = tf.placeholder(tf.int32, shape=(batch_size, PAD_SIZE), name='x') # as indices of embedding
    seqlens = tf.placeholder(tf.int32, shape=[batch_size], name='seqlens')
    y_true = tf.placeholder(tf.float32, shape=[batch_size], name='y_true')

    # Start with embedding layer (learned here; a frozen alternative would be
    # tf.Variable(nlp.vocab.vectors.data, trainable=False) with spaCy indices as input)
    embeddings = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
    embedding_input = tf.nn.embedding_lookup(embeddings, x)

    # RNN - try also BasicRNNCell, GRUCell, BasicLSTMCell
    rnn_fw = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    rnn_bw = tf.contrib.rnn.BasicLSTMCell(rnn_size)

    rnn_outs, _rnn_states = tf.nn.bidirectional_dynamic_rnn(
                                cell_fw=rnn_fw,
                                cell_bw=rnn_bw,
                                inputs=embedding_input,
                                sequence_length=seqlens, dtype=tf.float32)

    out_fw, out_bw = rnn_outs

    # Forward direction: the summary of a sequence is the output at its last real step.
    last_step_idx = tf.stack([tf.range(batch_size), seqlens-1], axis=1)
    out_fw = tf.gather_nd(out_fw, last_step_idx)

    # Backward direction: outputs are returned in input time order, so the step
    # that has consumed the whole sequence is t = 0. At t = seqlen-1 the backward
    # cell has only seen the final token.
    out_bw = out_bw[:, 0, :]

    rnn_out = tf.concat([out_fw, out_bw], axis=1)

    out = tf.nn.dropout(rnn_out, dropout_keepprob)    

    # Linear activation (FC layer on top of the LSTM net)
    y = tf.layers.dense(out, 1, activation=None)
    y = tf.reshape(y,(batch_size,))
    
    return x, seqlens, y, y_true

In [None]:
# Build the placeholders with the same batch size run_graph slices the data with;
# relying on build_graph_bidir's default (10) breaks as soon as batch_size differs.
x, seqlens, y, y_true = build_graph_bidir(batch_size=batch_size, rnn_size=8, embedding_size=10, dropout_keepprob=0.7)

# y holds one logit per example, so the per-example probability is the element-wise
# sigmoid (matching the loss below). A softmax here would normalise across the batch.
preds = tf.nn.sigmoid(y)
label_predictions = preds > 0.5
# NOTE(review): the int cast assumes y_true holds 0/1 labels - confirm.
correct = tf.equal(tf.cast(label_predictions, tf.int32), tf.cast(y_true, tf.int32))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

ETA = 0.01  # learning rate

loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=y_true))
optimizer = tf.train.AdamOptimizer(ETA).minimize(loss) 
# optimizer = tf.train.RMSPropOptimizer(ETA).minimize(loss)

In [None]:
# run_graph requires the data set as its first argument.
# NOTE(review): data_gensim_dct is presumably the right one here, because this graph
# uses the learned `vocab_size` embedding like the first model - confirm.
run_metrics = run_graph(data_gensim_dct, x, seqlens, y_true, optimizer, loss, accuracy, num_epochs=20)

> Only a little improvement over a single LSTM here, but once I move on to longer text it may begin to make a difference.