# CNN for Sentence Similarity

* Let $x_1$, $x_2$ be two sentences (encoded in integer indices) of lengths $l_1$, $l_2$ respectively, then we first pad them to be of length $l$, and then embed their consisting words in $d$ dimensional space to produce two matrices $X_1^{l\times d}, X_2^{l\times d}$.
* We then convolve on the matrices with filters of shapes $(3\times d), (4\times d)$ and $(5\times d)$. This results in vectors of the shape $(l-3+1, 1), (l-4+1, 1)$ and $(l-5+1, 1)$ (with 'VALID', i.e. narrow convolution), for each filter size (i.e. $3, 4$ and $5$). With $k$ filters per filter size, we end up with vectors $((l-3+1)\times k, 1), ((l-4+1)\times k, 1)$ and $((l-5+1)\times k, 1)$. For $x_1$, call them $h_1^3, h_1^4, h_1^5$, on which we do ReLU nonlinearity: $$h = \texttt{ReLU}(h)$$
* Performing max-pooling on the above and concatenate results, we get a single vector representation for each of the two sentences, of shape $(3k,)$, for the two sentences we get $h_{pool1}, h_{pool2}$, we then put them through dropout regularization with probability $p$: $$h_{pool} = \texttt{Dropout}(h_{pool}, p)$$
* Finally we do a bilinear transformation on these vectors to make logits: $$\texttt{Logit}(x_{pool1}, x_{pool2}) = \texttt{Round}(\sigma(x_{pool1}^T W^{3k\times 3k} x_{pool2}))$$
from which we get the loss $\mathcal{L}$: $$\mathcal{L} = \texttt{Cross-Ent}(\texttt{Logit}(x_{pool1}, x_{pool2}), y)$$
where $y$ is the true label.
where 
* Finally, the prediction is made by rounding the logit: $$\hat{y} = \texttt{Round}(\texttt{Logit}(x_{pool1}, x_{pool2}))$$
where $\texttt{Round}$ returns $1$ if the sigmoid probability is greater than $0.5$ and $0$ otherwise.

### Prepare data

In [9]:
import sys
sys.path.append('/Users/jacob.su.wang/Desktop/CODER/TENSORFLOW/SCRIPTS')

In [10]:
import random
from helpers import Indexer
from mock_sentence_similarity import *

In [11]:
indexer = Indexer()
glove_embeddings = get_dict_and_embeddings(VOCAB, indexer)

In [12]:
print(generate_pos_datum(19))
print(generate_neg_datum(14))

(array(['motorcycle', 'train', 'train', 'car', 'bus', 'truck', 'train',
       'train', 'bus', 'car', 'train', 'train', 'truck', 'train', 'truck',
       'car', 'motorcycle', 'car', 'train'],
      dtype='<U10'), array(['motorcycle', 'train', 'motorcycle', 'car', 'car', 'car', 'train',
       'motorcycle', 'truck', 'train', 'train', 'truck', 'car', 'truck',
       'truck', 'truck', 'motorcycle', 'motorcycle', 'car'],
      dtype='<U10'))
(array(['cat', 'cat', 'deer', 'deer', 'pig', 'dog', 'dog', 'deer', 'dog',
       'pig', 'pig', 'pig', 'cat', 'pig'],
      dtype='<U5'), array(['train', 'train', 'bus', 'train', 'bus', 'bus', 'motorcycle', 'car',
       'motorcycle', 'motorcycle', 'bus', 'motorcycle', 'train',
       'motorcycle'],
      dtype='<U10'))


In [13]:
x1, x2, y = generate_batch(indexer, len_from=2, len_to=5, batch_size=2)
print(x1)
print(x2)
print(y)

[[ 3  2 10 10 10]
 [ 9  4  4 10 10]]
[[ 2  2 10 10 10]
 [ 7  2  1 10 10]]
[1 0]


### Model

In [62]:
tf.reset_default_graph()

LEN_FROM, LEN_TO = 5, 15
MAX_LEN = 15
NUM_CLASSES = 2
VOCAB_SIZE = len(VOCAB) + 1
EMBED_SIZE = 20
FILTER_SIZES = [3,4,5]   # size types.
NUM_FILTERS = 10         # #filters per size type.
NUM_CHANNELS = 1

sess = tf.InteractiveSession()

input_x1 = tf.placeholder(tf.int32, [None, MAX_LEN], name='input_x1')
input_x2 = tf.placeholder(tf.int32, [None, MAX_LEN], name='input_x2')
input_y  = tf.placeholder(tf.int32, [None], name='input_y')

keep_prob = tf.placeholder(tf.float32, name="keep_prob")

with tf.device('/cpu:0'), tf.variable_scope('embeddings'): 
        # name_scope works only with tf.Variable
        # variable_scope works with tf.get_variable
    E = tf.get_variable('E', [VOCAB_SIZE, EMBED_SIZE], initializer=tf.contrib.layers.xavier_initializer())
    embed_x1 = tf.expand_dims(tf.nn.embedding_lookup(E, input_x1), -1)
    embed_x2 = tf.expand_dims(tf.nn.embedding_lookup(E, input_x2), -1)
        # embed_x*: [batch_size, height=MAX_LEN, width=EMBED_SIZE, num_channels=1]

pool1_outputs, pool2_outputs = [], []
for i, filter_size in enumerate(FILTER_SIZES):
    with tf.variable_scope('conv-max-pool-%s' % filter_size): 
        filter_shape = [filter_size, EMBED_SIZE, NUM_CHANNELS, NUM_FILTERS]
            # Filter dims: [filter_size, emb_size, num_channels, num_filters]
        W1 = tf.get_variable('W1', filter_shape, initializer=tf.contrib.layers.xavier_initializer())
        W2 = tf.get_variable('W2', filter_shape, initializer=tf.contrib.layers.xavier_initializer())
        b1 = tf.get_variable('b1', [NUM_FILTERS], initializer=tf.contrib.layers.xavier_initializer())
        b2 = tf.get_variable('b2', [NUM_FILTERS], initializer=tf.contrib.layers.xavier_initializer())
        conv1 = tf.nn.conv2d(embed_x1, W1, strides=[1,1,1,1], padding='VALID', name='conv1')
        conv2 = tf.nn.conv2d(embed_x2, W2, strides=[1,1,1,1], padding='VALID', name='conv2')
            # Conv dims: [batch_size, height, width, num_channels]
        h1 = tf.nn.relu(tf.nn.bias_add(conv1, b1), name='relu1')
        h2 = tf.nn.relu(tf.nn.bias_add(conv2, b2), name='relu2')
        pool1 = tf.nn.max_pool(h1, ksize=[1,MAX_LEN-filter_size+1,1,1], strides=[1,1,1,1], padding='VALID', name='pool1')
        pool2 = tf.nn.max_pool(h2, ksize=[1,MAX_LEN-filter_size+1,1,1], strides=[1,1,1,1], padding='VALID', name='pool2')
            # kernel size (ksize): [batch_size, height, width, num_channels]
        pool1_outputs.append(pool1)
        pool2_outputs.append(pool2)

num_filters_total = NUM_FILTERS * len(FILTER_SIZES)
h_pool1_flat = tf.nn.dropout(tf.reshape(tf.concat(pool1_outputs, 3), [-1, num_filters_total]), keep_prob)
h_pool2_flat = tf.nn.dropout(tf.reshape(tf.concat(pool2_outputs, 3), [-1, num_filters_total]), keep_prob)
    # flat shape: [batch_size, num_filters_total].
W_bi = tf.get_variable('W_bi', [num_filters_total, num_filters_total],
                       initializer=tf.contrib.layers.xavier_initializer())
scores = tf.nn.sigmoid(tf.diag_part(tf.matmul(tf.matmul(h_pool1_flat, W_bi), tf.transpose(h_pool2_flat))))

predictions = tf.cast(tf.round(scores), tf.int32)

with tf.name_scope('loss'):
    losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(input_y, tf.float32), logits=scores)
    loss = tf.reduce_mean(losses)

with tf.name_scope('accuracy'):
    correct_predictions = tf.equal(predictions, input_y)
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy')
    
global_step = tf.Variable(0, name='global_step', trainable=False)
optimizer = tf.train.AdamOptimizer(1e-4)
grads_and_vars = optimizer.compute_gradients(loss)
train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

In [89]:
NUM_EPOCHS = 3
NUM_BATCHES = 1000

sess.run(tf.global_variables_initializer())

for e in range(NUM_EPOCHS):
    print('Epoch ', e+1)
    print('\n')
    loss_track, accuracy_track = [], []
    for b in range(NUM_BATCHES):
        batch_x1, batch_x2, batch_y = generate_batch(indexer, LEN_FROM, LEN_TO, batch_size=64)
        fd = {input_x1:batch_x1, input_x2:batch_x2, input_y:batch_y, keep_prob:0.7}
        _, step, loss_, accuracy_ = sess.run([train_op, global_step, loss, accuracy], feed_dict=fd)
        loss_track.append(loss_)
        accuracy_track.append(accuracy_)
        if step%100==0:
            print('M.Loss = {} | M.Accuracy = {}'.format(np.mean(loss_track), np.mean(accuracy_track)))
    print('\n')

Epoch  1


M.Loss = 0.7023053765296936 | M.Accuracy = 0.4996874928474426
M.Loss = 0.7000969648361206 | M.Accuracy = 0.49976563453674316
M.Loss = 0.6982930898666382 | M.Accuracy = 0.499895840883255
M.Loss = 0.6973963379859924 | M.Accuracy = 0.49996092915534973
M.Loss = 0.6965771317481995 | M.Accuracy = 0.499937504529953
M.Loss = 0.6957278251647949 | M.Accuracy = 0.5000260472297668
M.Loss = 0.6942504644393921 | M.Accuracy = 0.5012500286102295
M.Loss = 0.6894691586494446 | M.Accuracy = 0.5171484351158142
M.Loss = 0.6815314292907715 | M.Accuracy = 0.5413888692855835
M.Loss = 0.6685624718666077 | M.Accuracy = 0.5791562795639038


Epoch  2


M.Loss = 0.5177417993545532 | M.Accuracy = 0.9943749904632568
M.Loss = 0.5142209529876709 | M.Accuracy = 0.9961718916893005
M.Loss = 0.5122621655464172 | M.Accuracy = 0.9970312714576721
M.Loss = 0.5108184814453125 | M.Accuracy = 0.9976953268051147
M.Loss = 0.5099108219146729 | M.Accuracy = 0.9980000257492065
M.Loss = 0.5091516375541687 | M.Accuracy = 0.9

In [87]:
# test x1 = x2; test x3 in another group.
test_x1 = [['motorcycle', 'train', 'motorcycle', 'car', 'car', 'car', 'train']]
test_x2 = [['motorcycle', 'train', 'train', 'car', 'bus']]
test_x3 = [['cat', 'cat', 'deer', 'deer', 'pig', 'dog', 'dog', 'deer', 'dog']]
test_y12 = [1]
test_y13 = [0]
test_y23 = [0]

def to_code(x_):
    return [[indexer.get_index(elem) for elem in pad_sentence(MAX_LEN, x_[0])]]

def predict(x1_, x2_, y_, sess_):
    fd = {input_x1:to_code(x1_), input_x2:to_code(x2_), keep_prob:1.0}
    pred = sess_.run(predictions, feed_dict=fd)
    print('True = {} | Pred = {}'.format(y_, pred))

In [90]:
predict(test_x1, test_x2, test_y12, sess)
predict(test_x1, test_x2, test_y12, sess)
predict(test_x1, test_x2, test_y12, sess)

True = [1] | Pred = [1]
True = [1] | Pred = [1]
True = [1] | Pred = [1]
