# Plumbing
1. Download and unpack `sentence polarity dataset v1.0` from http://www.cs.cornell.edu/people/pabo/movie-review-data/
2. Download BNC (TODO)
3. Download the EasyCCG parser from http://homepages.inf.ed.ac.uk/s1049478/easyccg.html, unpack the package (you should get a catalog like `easyccg-0.2`). From the same page, download the regular pretrained model (`model.tar.gz`). Unpack the model to the parser's catalog.

# Getting the British National Corpus & the word list

We will parse BNC XML files with lxml. NLTK technically has a dedicated parser for BNC, which is extremely slow in the lazy mode, and in the non-lazy mode it is very slow and also consumes >8GB of memory.

In [1]:
bnc_path = 'BNC/Texts/'
from os.path import exists

def bnc_files_iter():
    top_level = ['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'K']
    symbols = top_level + ['L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'W', 'V', 'X', 'Y', 'Z',
                           '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    for top in top_level:
        top_path = bnc_path + '/' + top
        if not exists(top_path):
            continue
        for symbol2 in symbols:
            path2 = top_path + '/' + top + symbol2
            if not exists(path2):
                continue
            for symbol3 in symbols:
                current_path = path2 + '/' + top + symbol2 + symbol3 + '.xml'
                if not exists(current_path):
                    continue
                yield open(current_path)

In [2]:
from lxml import etree

In [3]:
unique_words = set()

for bnc_file in bnc_files_iter():
    file_tree = etree.parse(bnc_file)
    for element in file_tree.iter():
        if (element.tag == 'w' or element.tag == 'c') and element.text:
            unique_words.add(element.text.strip())
    bnc_file.close()
    
unique_words = list(unique_words)
print(unique_words[:10])

['', 'Ndadaye', 'Jarl', 'Raidan', 'speechlessly', '7°', 'shipowner', 'INTENSITY', 'Visio', '112½°']


In [4]:
unique_count = len(unique_words)
print(unique_count)

705241


# Getting CCG parse trees for BNC

In [5]:
# we will run the underlying parser with pexpect, and intercept its outputs from within Python
import pexpect
parser = pexpect.spawn('java -jar easyccg-0.2/easyccg.jar --model easyccg-0.2/model')
parser.expect('Model loaded, ready to parse.')
parser.send('The cat chases a ball of yarn.\n')
parser.expect('ID')
parser.expect('\n\(.*\n')
parser_output = parser.after.decode().strip() # encode from bytes into str, strip whitespace
print(parser_output)
parser.terminate()

(<T S[dcl] 1 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS The NP[nb]/N>) (<L N POS POS cat N>) ) (<T S[dcl]\NP 0 2> (<L (S[dcl]\NP)/NP POS POS chases (S[dcl]\NP)/NP>) (<T NP[nb] 0 2> (<T NP[nb] 0 2> (<L NP[nb]/N POS POS a NP[nb]/N>) (<L N POS POS ball N>) ) (<T NP\NP 0 2> (<L (NP\NP)/NP POS POS of (NP\NP)/NP>) (<T NP 0 1> (<L N POS POS yarn. N>) ) ) ) ) )


False

Let's see how NLTK can handle parse trees.

In [6]:
import re
only_word = re.compile(r'<L\s\S+\sPOS\sPOS\s(\S+)\s\S+>')
concat_label = re.compile(r'<(\S+)\s(\S+)\s(\S+)\s(\S+)>')

# some string cleanup
def clean_parser_output(parse_output):
    return concat_label.sub(lambda match: '<'+match.group(1)+'_'+match.group(2).replace('(', '[').replace(')', ']')
                            +'_'+match.group(3)+'_'+match.group(4)+'>',
                            only_word.sub(lambda match: match.group(1), parse_output))

from nltk.tree import ParentedTree
tree = ParentedTree.fromstring(clean_parser_output(parser_output))
print(tree)

(<T_S[dcl]_1_2>
  (<T_NP[nb]_0_2> (The ) (cat ))
  (<T_S[dcl]\NP_0_2>
    (chases )
    (<T_NP[nb]_0_2>
      (<T_NP[nb]_0_2> (a ) (ball ))
      (<T_NP\NP_0_2> (of ) (<T_NP_0_1> (yarn. ))))))


In each `(parenthesized expression)`, the first item `(head)` is the category of node, and two next items are its child nodes.

## Learning word embeddings

Our embedding procedure will be based on this Tensorflow [word2vec tutorial](https://www.tensorflow.org/tutorials/word2vec).

In [7]:
# Consistently map each unique word to a integer.
word_map = {word: index for index, word in enumerate(unique_words) }

In [8]:
# Collect all sentences from the corpus, with words as their indices in the word map.
corpus_sents = []

for bnc_file in bnc_files_iter():
    file_tree = etree.parse(bnc_file)
    for element in file_tree.iter():
        if element.tag == 's':
            corpus_sents.append([])
        if (element.tag == 'w' or element.tag == 'c') and element.text:
            corpus_sents[-1].append(word_map[element.text.strip()])
    bnc_file.close()

Generate batches of pairs (context word, target word). For simplicity, we hardcode the window size (2) and number of examples in window.

In [9]:
import numpy as np

In [10]:
from random import randint

num_samples = 4

def skipgrams_batch(batch_size):
    assert batch_size % num_samples == 0
    windows_n = batch_size // num_samples
    
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    for i in range(windows_n):
        target_sent = randint(0, len(corpus_sents)-1)
        while len(corpus_sents[target_sent]) < 5:
            target_sent = randint(0, len(corpus_sents)-1)
        target = randint(2, len(corpus_sents[target_sent])-3)
        for j in range(num_samples):
            labels[i*num_samples+j][0] = corpus_sents[target_sent][target]
        batch[i*num_samples] = corpus_sents[target_sent][target-2]
        batch[i*num_samples+1] = corpus_sents[target_sent][target-1]
        batch[i*num_samples+2] = corpus_sents[target_sent][target+1]
        batch[i*num_samples+3] = corpus_sents[target_sent][target+2]
        
    return batch, labels

print(skipgrams_batch(12))

(array([319948, 328721, 246274, 522645, 175160, 545942, 629992,  64627,
       182598, 672074,  89054,  18919], dtype=int32), array([[160660],
       [160660],
       [160660],
       [160660],
       [545265],
       [545265],
       [545265],
       [545265],
       [272765],
       [272765],
       [272765],
       [272765]], dtype=int32))


In [11]:
import tensorflow as tf
import math

In [12]:
vocabulary_size = len(unique_words)
embedding_size = 70

# Model parameters.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                              stddev=1.0 / math.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

In [13]:
batch_size = 128

# The computation graph.
inputs = tf.placeholder(tf.int32, shape=[batch_size])
labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
embedding_layer = tf.nn.embedding_lookup(embeddings, inputs)

# Note that word2vec has no "real" hidden layers apart from the embedding.

# Number of random words to sample apart from the true target; the model should learn to
# assign low probability to them given the context.
negative_samples_n = 64

loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                     biases=nce_biases,
                                     labels=labels,
                                     inputs=embedding_layer,
                                     num_sampled=negative_samples_n,
                                     num_classes=vocabulary_size))
optimizer = tf.train.AdagradOptimizer(0.1).minimize(loss)

In [14]:
steps_n = len(unique_words) * 3
trained_embeddings = [] # we want to use them later
with tf.Session() as sess:
    tf.global_variables_initializer().run()
    for i in range(steps_n):
        batch_inputs, batch_labels = skipgrams_batch(batch_size)
        if i+1 == steps_n:
            _, loss_val, trained_embeddings = sess.run([optimizer, loss, embeddings], feed_dict={inputs: batch_inputs,
                                                             labels: batch_labels})
            print('Final loss:', loss_val)
        else:
            _, loss_val = sess.run([optimizer, loss], feed_dict={inputs: batch_inputs,
                                                             labels: batch_labels})
            # TODO meaningful completion info
            if (i % 100000 == 0):
                print(loss_val)
print(trained_embeddings.shape)

328.992
63.6749
10.3768
3.23855
7.22826
8.20637
7.24628
7.81358
2.36533
5.33443
3.36494
4.38888
8.69388
6.57857
1.57129
2.89697
3.04295
2.89717
3.25142
2.1147
4.2987
3.13078
Final loss: 3.7977
(705241, 70)


In [15]:
trained_embeddings[word_map['honey'], ]

array([-1.76175618, -0.48622197, -0.34541205, -0.99966902, -1.71851861,
        1.11292863, -0.51043302,  1.39485371, -1.43934906, -2.27567148,
        1.80909979, -1.66944766, -0.97985524, -1.38823986, -1.1190412 ,
        2.17810202,  2.24670577, -0.43959919,  2.17650867, -0.39423934,
        0.80785453,  1.88030303,  2.01537085, -1.87546027,  1.59910655,
       -0.66886234, -1.32832623,  1.83181989,  0.70867652, -2.16132832,
        1.50185454, -0.696419  , -0.92023361, -1.05840933, -0.50320876,
        0.78315854,  0.67317289, -1.36801219,  1.37414455,  2.03041959,
        1.46217203, -0.63557875,  1.68831849,  0.72084016,  1.09909523,
       -0.79727054, -0.43951368,  0.78295988,  1.68929565,  1.90566957,
        1.32585812,  2.26107812,  1.23102951, -1.14303839, -1.15728951,
       -1.03613913, -1.71762002, -0.44431561,  1.55029309,  2.29587865,
        1.72170174, -2.05274391, -1.26773238,  0.91030282,  0.40452766,
        1.6505971 , -1.67897463,  0.41661876, -1.50299275, -2.19

## Learning the transformation matrix

In [16]:
import numpy as np
from scipy.optimize import fmin_l_bfgs_b as l_bfgs

In [17]:
#encoding_matrix = np.random.randn(embedding_size*2, embedding_size)
#encoding_bias = np.zeros((1, embedding_size))
#decoding_matrix = np.random.randn(embedding_size, embedding_size*2)
#decoding_bias = np.zeros((1, embedding_size*2))
init_enc_theta = np.random.randn(4, embedding_size, embedding_size*2)
enc_theta_shape = (4, embedding_size, embedding_size*2)

# Here the parameters need to passed as function arguments, because they will be optimized
# by the LBFGS implementation.
def encode_node(child1, child2, enc_theta):
    #"""Both child1 and child2 are numpy arrays of shape (1, embedding_size). Return the encoding
    #(1, embedding_size) and partial derivatives for enc_mat and enc_bias."""
    conc_embeds = np.concatenate((child1, child2))
    linear = np.dot(conc_embeds, np.transpose(enc_theta[0, :, :])) + enc_theta[1, :, 0] # enc_mat, enc_bias
    d_tanh = 1 - np.tanh(linear) ** 2
    d_mat = np.dot(np.reshape(conc_embeds, (conc_embeds.shape[0], 1)), np.reshape(d_tanh, (1, d_tanh.shape[0])))
    return np.tanh(linear), d_mat, d_tanh

def decode_node(node, enc_theta):
    # node is (1, embedding_size), output is (1, 2*embedding_size)
    linear = np.dot(node, enc_theta[2, :, :]) + enc_theta[3, 0, :] # dec_mat, dec_bias
    d_tanh = 1 - np.tanh(linear) ** 2
    d_mat = np.dot(np.reshape(node, (node.shape[0], 1)), np.reshape(d_tanh, (1, d_tanh.shape[0])))
    return np.tanh(linear), d_mat, d_tanh

In [18]:
from functools import reduce
from random import choice, randint
encoding_train_batch_size = 20 # number of sentences

# Handle special treatment of parens by our parser.
def nd_lbl(node):
    if node.label() == '-LRB-':
        return '('
    elif node.label() == '-RRB-':
        return ')'
    else:
        return node.label()

# (just an implementation detail - we need to declare those in the global scope to
# access them from node encoding function)
batch_d_enc_mat = np.zeros((embedding_size*2, embedding_size))
batch_d_enc_bias = np.zeros((1, embedding_size))
batch_d_dec_mat = np.zeros((embedding_size, embedding_size*2))
batch_d_dec_bias = np.zeros((1, embedding_size*2))

# Note that node_encodings are passed by value, so we always modify the dictionary given to
# the topmost function call.
def encode_tree(node, enc_theta, node_encodings):
    "Encode_tree returns a pair of lists of partial derivatives for encoding matrix and bias"
    subtrees = [subtr for subtr in node]
    if len(subtrees) == 0: # a leaf
        if nd_lbl(node) in word_map:
            node_encodings[nd_lbl(node)] = trained_embeddings[word_map[nd_lbl(node)], ]
        else: # replace unknowns with a random word
            node_encodings[nd_lbl(node)] = trained_embeddings[randint(0, trained_embeddings.shape[0]), ]
        return ([], [])
    elif len(subtrees) == 1:
        encode_tree(subtrees[0], enc_theta, node_encodings)
        node_encodings[nd_lbl(node)] = node_encodings[nd_lbl(subtrees[0])]
        return ([], [])
    else:
        if len(subtrees) != 2: # dbg
            print(subtrees)
        derivs = encode_tree(subtrees[0], enc_theta, node_encodings)
        derivs2 = encode_tree(subtrees[1], enc_theta, node_encodings)
        node_encodings[nd_lbl(node)], d_enc_mat, d_enc_bias = encode_node(
            node_encodings[nd_lbl(subtrees[0])],
            node_encodings[nd_lbl(subtrees[1])],
            enc_theta)
        return ( derivs[0] + derivs2[0] + [ d_enc_mat ],
                 derivs[1] + derivs2[1] + [ d_enc_bias ])

def make_parser():
    parser = pexpect.spawn('java -jar easyccg-0.2/easyccg.jar --model easyccg-0.2/model')
    parser.expect('Model loaded, ready to parse.')
    return parser

def kill_parser(parser):
    parser.terminate()
    
def sentence_tree(sentence_form, parser):
    parser.send(sentence_form+'\n')
    # (this secures us from finding one of the patterns below in the sentence itself:)
    response = parser.expect([pexpect.TIMEOUT, 'ID'])
    if response == 1: # can't happen if timed out
        response = parser.expect(['Skipping sentence of length', '\n\(.*\n', pexpect.TIMEOUT])
    if response in [0, 2]:
        return False
    parser_output = parser.after.decode().strip() # encode from bytes into str, strip whitespace
    return ParentedTree.fromstring(clean_parser_output(parser_output))
    
def encoding_train_batch(enc_theta):
    used_sents = [] # at least don't repeat them in one batch
    batch_error = np.zeros((1, embedding_size))
    # Partial derivatives -- to be stacked into a gradient at the end.
    global batch_d_enc_mat, batch_d_enc_bias, batch_d_dec_mat, batch_d_dec_bias
    batch_d_enc_mat = np.zeros((embedding_size*2, embedding_size))
    batch_d_enc_bias = np.zeros((1, embedding_size))
    batch_d_dec_mat = np.zeros((embedding_size, embedding_size*2))
    batch_d_dec_bias = np.zeros((1, embedding_size*2))
    # this is used to scale down the derivatives, ie. averaging
    nodes_n = 0
    enc_theta = np.reshape(enc_theta, enc_theta_shape)
    
    parser = make_parser()
    
    for i in range(encoding_train_batch_size):
        tree = False
        # It's possible that sentence_tree() returns False, if the sentence was too long and
        # rejected by the parser, or it timeouts.
        while not tree:
            sentence_n = randint(0, len(corpus_sents))
            while sentence_n in used_sents:
                sentence_n = randint(0, len(corpus_sents))
            sentence = corpus_sents[sentence_n]
            used_sents.append(sentence_n)
            
            sentence_form = ' '.join([unique_words[word_id] for word_id in sentence])
            #print(sentence_n, sentence_form)
            tree = sentence_tree(sentence_form, parser)

        # Encode the tree.
        node_encodings = dict()
        #print(clean_parse_output(parse_output))
        (derivs_enc_mat, derivs_enc_bias) = encode_tree(tree, enc_theta, node_encodings)
        #global batch_d_enc_mat, batch_d_enc_bias
        batch_d_enc_mat = reduce(lambda d_sum, d_part: d_sum + d_part,
                                 derivs_enc_mat, batch_d_enc_mat)
        batch_d_enc_bias = reduce(lambda d_sum, d_part: d_sum + d_part,
                                  derivs_enc_bias, batch_d_enc_bias)
        
        # Decode the tree back again.
        # this dictionary in fact maps nodes to their *partial* decodings from which their children are to be
        # recreated; thus for the root it's just its encoding, from which we will retrieve immediate children
        node_decodings = dict()
        node_decodings[nd_lbl(tree.root())] = node_encodings[nd_lbl(tree.root())]
        encoding_errors = dict()
        nodes_to_visit = [ tree.root() ]
        while nodes_to_visit:
            current_node = nodes_to_visit.pop()
            children = [child for child in current_node]
            if len(children) == 0:
                continue
            elif len(children) == 2: # not a leaf
                decoded_node, d_dec_mat, d_dec_bias = decode_node(node_decodings[nd_lbl(current_node)],
                                                                  enc_theta)
                node_decodings[nd_lbl(children[0])] = decoded_node[:embedding_size]
                node_decodings[nd_lbl(children[1])] = decoded_node[embedding_size:]
                # Get the error and partial derivatives (both (1, 2*embedding_size)).
                encoding_errors[nd_lbl(current_node)] = (node_encodings[nd_lbl(current_node)]
                                                         - node_decodings[nd_lbl(current_node)])
                batch_d_dec_mat = batch_d_dec_mat - d_dec_mat # dec is the minuend, so its part dev is -1
                batch_d_dec_bias = batch_d_dec_bias - d_dec_bias
                # np.abs()
                #d_encoding = ((node_encodings[current_node] - node_decodings[current_node])
                #              / encoding_errors[current_node])
                #d_decoding = ((node_decodings[current_node] - node_encodings[current_node])
                #              / encoding_errors[current_node])
                #batch_d_enc_mat = batch_d_enc_mat + np.dot(d_encoding, sent_d_enc_mat[current_node])
                #batch_d_enc_bias = batch_d_enc_bias + np.dot(d_encoding, sent_d_enc_bias[current_node])
                nodes_n += 1
            else:
                print('unexpected number of node children in decode:', children)
                return
        
        # Compute the error value.
        sent_error = reduce(lambda err_sum, node_err: err_sum + node_err, encoding_errors.values(), np.zeros((1, embedding_size)))
        sent_error = sent_error / len(encoding_errors)
        # Update batch error.
        batch_error = batch_error + sent_error / encoding_train_batch_size
        
    kill_parser(parser)
    # TODO regularization    
    batch_gradient = np.zeros((4, embedding_size, embedding_size*2))
    batch_gradient[0, :, :] = np.transpose(batch_d_enc_mat) / nodes_n
    batch_gradient[1, :, 0] = batch_d_enc_bias / nodes_n
    batch_gradient[2, :, :] = batch_d_dec_mat / nodes_n
    batch_gradient[3, 0, :] = batch_d_dec_bias / nodes_n
    #[x / nodes_n for x in [batch_d_enc_mat, batch_d_enc_bias,
    #                                        batch_d_dec_mat, batch_d_dec_bias]]
    return batch_error, np.reshape(batch_gradient, (batch_gradient.size,)) # we need to return a 1d array

In [19]:
learned_enc_theta = l_bfgs(encoding_train_batch, init_enc_theta)
print(learned_enc_theta)
learned_enc_theta = np.reshape(learned_enc_theta[0], enc_theta_shape)

(array([-0.53868982,  0.00788834, -0.16616896, ..., -0.3556517 ,
        0.18173019, -1.52697216]), array([[  2.15764155e-01,   4.50472088e-01,  -3.37242172e-01,
          5.11381850e-01,  -3.34684135e-01,   3.32768988e-01,
          9.71768843e-02,  -4.75634239e-01,   3.00089633e-02,
          1.48959275e-07,   1.96650162e-01,   2.80371911e-01,
          3.04179163e-01,   8.41766410e-02,   7.12384013e-02,
          3.74978811e-01,   5.91840257e-01,   3.85963767e-01,
          2.06987566e-01,  -7.55973052e-02,  -1.79390376e-01,
          3.32294649e-02,   1.01937192e-01,   4.55952456e-01,
         -1.01372252e-04,  -5.99674830e-01,   6.02569230e-01,
          2.54261084e-01,   8.28629114e-04,  -9.27813045e-02,
          2.79320505e-01,   2.02760039e-01,  -5.39431739e-01,
          2.44421104e-01,   4.75181986e-02,   9.97902087e-02,
         -1.99658934e-01,  -2.00739331e-01,   5.26042902e-01,
         -3.03092527e-01,  -2.79803218e-01,  -1.47332131e-01,
         -3.21314711e-01,  -3.71

# Training a sentiment analysis model

In [20]:
import nltk
nltk.download('sentence_polarity')

[nltk_data] Downloading package sentence_polarity to
[nltk_data]     /home/szymon/nltk_data...
[nltk_data]   Package sentence_polarity is already up-to-date!


True

In [21]:
from nltk.corpus import sentence_polarity
print(sentence_polarity.sents(categories='pos')[:3])

[['the', 'rock', 'is', 'destined', 'to', 'be', 'the', '21st', "century's", 'new', '"', 'conan', '"', 'and', 'that', "he's", 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', ',', 'jean-claud', 'van', 'damme', 'or', 'steven', 'segal', '.'], ['the', 'gorgeously', 'elaborate', 'continuation', 'of', '"', 'the', 'lord', 'of', 'the', 'rings', '"', 'trilogy', 'is', 'so', 'huge', 'that', 'a', 'column', 'of', 'words', 'cannot', 'adequately', 'describe', 'co-writer/director', 'peter', "jackson's", 'expanded', 'vision', 'of', 'j', '.', 'r', '.', 'r', '.', "tolkien's", 'middle-earth', '.'], ['effective', 'but', 'too-tepid', 'biopic']]


Now we will split the sentence polarity corpus into test and training slices in proportion 10/90, just as in the paper.

In [22]:
from random import sample
assert len(sentence_polarity.sents(categories='pos')) == len(sentence_polarity.sents(categories='neg'))

In [23]:
train_pnt, test_pnt = 0, 0
test_corp_len = (len(sentence_polarity.sents(categories='pos')) // 10
                 + len(sentence_polarity.sents(categories='neg')) // 10)
sent_pol_len = len(sentence_polarity.sents())

train_sent_vecs = np.zeros((sent_pol_len - test_corp_len, embedding_size))
test_sent_vecs = np.zeros((test_corp_len, embedding_size))
train_sent_labels = np.zeros((sent_pol_len - test_corp_len, 1))
test_sent_labels = np.zeros((test_corp_len, 1))

parser = make_parser()

for (label, sents) in [(1.0, sentence_polarity.sents(categories='pos')),
                      (0.0, sentence_polarity.sents(categories='neg'))]:
    sents = list(sents)
    test_ids = sample(range(len(sents)), len(sents) // 10)
    for sent_i in range(len(sents)):
        tree = sentence_tree(' '.join(sents[sent_i]), parser)
        if not tree: # sentence too long, or times out the parser
            continue
        node_encodings = dict()
        encode_tree(tree, learned_enc_theta, node_encodings)
        if sent_i in test_ids:
            test_sent_vecs[test_pnt, :] = node_encodings[nd_lbl(tree.root())]
            test_sent_labels[test_pnt, 0] = label
            test_pnt += 1
        else:
            train_sent_vecs[train_pnt, :] = node_encodings[nd_lbl(tree.root())]
            train_sent_labels[train_pnt, 0] = label
            train_pnt += 1
kill_parser(parser)

## Logistic regression

The original paper seems to use just a binary classifier of sentence vectors, without any neural net hidden layers. It is the approach we will try first.

In [24]:
regr_weights = tf.Variable(tf.random_normal([embedding_size, 1]))
regr_bias = tf.Variable(tf.random_normal([1, 1]))

X = tf.placeholder("float", [None, embedding_size], name='samples')
Y = tf.placeholder("float", [None, 1], name='labels')

prediction = tf.sigmoid(tf.matmul(X, regr_weights) + regr_bias)
loss = tf.losses.sigmoid_cross_entropy(prediction, Y)
optimizer = tf.train.AdagradOptimizer(learning_rate=0.1).minimize(loss)

acc = tf.metrics.accuracy(tf.round(prediction), tf.round(Y))

INFO:tensorflow:logits.dtype=<dtype: 'float32'>.
INFO:tensorflow:multi_class_labels.dtype=<dtype: 'float32'>.
INFO:tensorflow:losses.dtype=<dtype: 'float32'>.


In [25]:
epochs = 20001
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer()) # needed for accuracy metric
    for epoch_i in range(epochs):
        _, curr_loss = sess.run([optimizer, loss],
                                feed_dict={X: train_sent_vecs,
                                           Y: train_sent_labels})
        if epoch_i % 1000 == 0:
            print(epoch_i, curr_loss)
    
    _, test_acc = sess.run([loss, acc], feed_dict={X: test_sent_vecs,
                                          Y: test_sent_labels})
    print('Test accuracy', test_acc)

0 0.627485
1000 0.509037
2000 0.507463
3000 0.507022
4000 0.506808
5000 0.506669
6000 0.506573
7000 0.506515
8000 0.50648
9000 0.506456
10000 0.506436
11000 0.506416
12000 0.506395
13000 0.506375
14000 0.506358
15000 0.506344
16000 0.506334
17000 0.506326
18000 0.506319
19000 0.506311
20000 0.506302
Test accuracy (0.0, 0.49437147)
