The following code cell builds the sentence autoencoder using Keras.

In [None]:
'''
variational autoencoder
Reference: "Auto-Encoding Variational Bayes" https://arxiv.org/abs/1312.6114
'''
from keras.layers import Input, Dense, Lambda
from keras.models import Model
from keras import backend as K
from keras import objectives
from keras.datasets import mnist

# Hyperparameters for the variational autoencoder defined in this cell.
# batch_size is also baked into the Input batch_shape and the noise shape
# used by sampling(), so every batch fed to the model must have this size.
batch_size = 16
original_dim = 784  # sentence length
# Dimensionality of the latent code produced by the encoder heads.
latent_dim = 2
intermediate_dim = 128  # encode size
# Standard deviation of the Gaussian noise drawn inside sampling().
epsilon_std = 0.01
# Number of training epochs handed to vae.fit.
nb_epoch = 40

# Encoder: fixed-size input -> ReLU hidden layer -> two linear heads that
# output the mean and the log standard deviation of the latent code.
x = Input(batch_shape=(batch_size, original_dim))
h = Dense(intermediate_dim, activation='relu')(x)
z_mean = Dense(latent_dim)(h)
z_log_std = Dense(latent_dim)(h)

def sampling(args):
    '''
    Draw a latent sample from the encoder outputs.

    params args: pair (z_mean, z_log_std) of encoder output tensors
    returns: z_mean + exp(z_log_std) * noise, where the noise is Gaussian
             with mean 0 and standard deviation epsilon_std and has the
             fixed shape (batch_size, latent_dim)
    '''
    mu, log_sigma = args
    noise = K.random_normal(shape=(batch_size, latent_dim),
                            mean=0., std=epsilon_std)
    sigma = K.exp(log_sigma)
    return mu + sigma * noise

# Latent sample: wrap sampling() in a Lambda layer fed by both encoder heads.
# note that "output_shape" isn't necessary with the TensorFlow backend
# so you could write `Lambda(sampling)([z_mean, z_log_std])`
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_std])

# Decoder: latent sample -> ReLU hidden layer -> sigmoid reconstruction of
# size original_dim.
# we instantiate these layers separately so as to reuse them later
decoder_h = Dense(intermediate_dim, activation='relu')
decoder_mean = Dense(original_dim, activation='sigmoid')
h_decoded = decoder_h(z)
x_decoded_mean = decoder_mean(h_decoded)

def vae_loss(x, x_decoded_mean):
    '''
    Loss for the variational autoencoder: reconstruction term plus KL term.

    params x: the input batch (training target)
    params x_decoded_mean: the decoder's sigmoid reconstruction of x
    returns: binary cross-entropy between x and x_decoded_mean plus the KL
             divergence of the encoder distribution from a standard normal,
             averaged over the latent dimensions
    '''
    xent_loss = objectives.binary_crossentropy(x, x_decoded_mean)
    # sampling() uses exp(z_log_std) as a standard deviation, so the
    # log-variance that the closed-form KL expression needs is 2 * z_log_std.
    # The previous code plugged the log-std in directly, which mixed the two
    # parameterizations and penalised the wrong quantity.
    z_log_var = 2. * z_log_std
    # NOTE(review): the noise in sampling() is additionally scaled by
    # epsilon_std, which this KL term does not account for -- confirm whether
    # epsilon_std is meant to be 1.0.
    kl_loss = - 0.5 * K.mean(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return xent_loss + kl_loss

# End-to-end model mapping the input x to its reconstruction x_decoded_mean,
# optimised with RMSprop against the custom vae_loss defined above.
vae = Model(x, x_decoded_mean) 
vae.compile(optimizer='rmsprop', loss=vae_loss)

In [None]:
# TensorBoard is used as a callback below but was never imported anywhere in
# the notebook, so the fit call raised NameError before training started.
from keras.callbacks import TensorBoard

# NOTE(review): the `_` arguments are placeholders for the training and
# validation arrays (the autoencoder is fit with the same array as input and
# target) -- replace them with real data before running this cell.
vae.fit(_, _,
        shuffle=True,
        nb_epoch=nb_epoch,
        batch_size=batch_size,
        validation_data=(_, _),
        callbacks=[TensorBoard(log_dir='/tmp/autoencoder')])

# build a model to project inputs on the latent space
encoder = Model(x, z_mean)

The following code cell builds the sentence siamese network using Keras.

In [None]:
from keras.engine.topology import Layer
from keras.layers import initializations
from keras import backend as K
import tensorflow as tf

# multi-sense vectors
class MultiSense(Layer):
    '''
    Keras layer holding a trainable (num_sense x input_dim) weight matrix that
    is multiplied element-wise with the layer input.

    params num_sense: number of rows ("senses") in the weight matrix
    params kwargs: forwarded unchanged to the Keras Layer constructor
    '''

    def __init__(self, num_sense, **kwargs):
        self.init = initializations.get('glorot_uniform')
        self.num_sense = num_sense

        super(MultiSense, self).__init__(**kwargs)

    def build(self, input_shape):
        '''
        Create the weight matrix once the input shape is known.

        params input_shape: 2-tuple; its second entry is the feature size
        '''
        # input_shape = R^{sentence_length x word_embedding}
        # for each word, we have (1,num_sense) weight
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        # TODO: to determine the input data size, we need to use batches
        # TODO: currently is the R^{num_sense x input_dim}, but desired R^{batch x num_sense x input_dim}
        # The weight name previously interpolated an undefined variable `i`
        # (NameError); there is a single matrix, so no index is needed.
        self.W = self.init((self.num_sense, input_dim),
                           name='%s_W' % self.name)
        # `self.Ws` was never defined (AttributeError); register the one
        # weight that actually exists.
        self.trainable_weights = [self.W]

    def call(self, x, mask=None):
        '''
        Multiply the input element-wise by the weight matrix.

        params x: input tensor
        params mask: accepted for the Keras layer API; not used
        returns: x * W
        '''
        # for each word, we elementwisely product with its weight
        # for each word, we keep the row that has the highest softmax score
        # The overloaded `*` is the same element-wise product as `tf.mul` but
        # does not depend on that pre-1.0 TensorFlow function name.
        # NOTE(review): x * W only broadcasts when the leading dimensions of
        # x and W are compatible -- see the TODOs in build().
        output = x * self.W
        return output

    def get_output_shape_for(self, input_shape):
        '''Report the output shape, which equals the input shape.'''
        return input_shape
    
    
from keras.models import Sequential
from keras.layers import LSTM

def create_shared_layer(output_dim, batch_input_dim, num_sense=60):
    '''
    Build the sub-network shared by both branches of the siamese model:
    a MultiSense layer followed by an LSTM.

    params output_dim: output size of the LSTM
    params batch_input_dim: shape tuple forwarded to MultiSense as input_shape
    params num_sense: number of senses for the MultiSense layer; defaults to
                      60, the value that used to be hard-coded
    returns: the Sequential model
    '''
    model = Sequential()
    # NOTE(review): callers pass a (batch, features) tuple here, while Keras'
    # `input_shape` normally excludes the batch axis -- confirm whether
    # `batch_input_shape` was intended.
    model.add(MultiSense(num_sense, input_shape=batch_input_dim))
    model.add(LSTM(output_dim))
    return model

shared_layer = create_shared_layer(128, (16, 784))

In [None]:
from keras.models import Graph
from keras.layers import Lambda, Activation

# Sizes for the siamese network: batch size, sentence length, encode size.
batch_size, original_dim, intermediate_dim = 16, 784, 128

# Two inputs with the same batch shape, one per branch.
g = Graph()
g.add_input(name='left',
            batch_input_shape=(batch_size, original_dim))
g.add_input(name='right',
            batch_input_shape=(batch_size, original_dim))

# Both inputs go through the same sub-network; the two outputs are merged
# with cosine mode and are not exposed as a graph output directly.
shared_layer = create_shared_layer(intermediate_dim,
                                   (batch_size, original_dim))
g.add_shared_node(shared_layer, name='shared', inputs=['left', 'right'],
                  merge_mode='cos', create_output=False)

# Squash the merged value with a sigmoid and expose it as the graph output.
g.add_node(Activation('sigmoid'), name='d', input='shared')
g.add_output(name='output', input='d')
g.compile(optimizer='RMSprop',
          loss={'output': 'mse'})

In [None]:
# NOTE(review): predict() is called here without any input array; Keras'
# predict expects the input data as its first argument -- TODO pass a batch.
shared_layer.predict()

The following code cell is the draft.

In [None]:
import numpy as np
import theano.tensor as T
import theano.tensor.nnet as F
import theano


class HiddenLayer(object):
    '''
    One fully connected Theano layer computing f(ins . w - b).

    params rng: random source exposing uniform(low, high, size)
    params ins: input expression or array fed to the layer
    params n_in: number of input features
    params n_out: number of output features
    params w: optional numpy array of initial weights, shape (n_in, n_out);
              anything else triggers random initialisation
    params b: optional numpy array of initial biases, shape (n_out,);
              anything else triggers zero initialisation
    params f: activation applied to the affine output; None keeps it linear

    Attributes set on the instance:
      outs   -- symbolic output of the layer
      params -- [w, b] as Theano shared variables
    '''

    def __init__(self, rng, ins, n_in, n_out, w=None, b=None, f=T.tanh):
        # initial weight and bias
        # The old test `w is not isinstance(w, np.ndarray)` compared w with a
        # bool by identity, which is true for every array, so weights passed
        # in by the caller were always overwritten.
        if not isinstance(w, np.ndarray):
            bound = np.sqrt(6. / (n_in + n_out))
            w = np.asarray(
                    rng.uniform(low=-bound, high=bound, size=(n_in, n_out))
                )
            # randomly initialised weights are scaled by 4 for sigmoid units
            if f == theano.tensor.nnet.sigmoid:
                w *= 4
        if not isinstance(b, np.ndarray):
            b = np.zeros((n_out,))

        w = theano.shared(w, borrow=True)
        b = theano.shared(b, borrow=True)

        # the bias is subtracted here, not added
        self.outs = T.dot(ins, w) - b
        if f is not None:
            # not a linear layer
            self.outs = f(self.outs)
        self.params = [w, b]
        

class NNet(object):
    '''
    Minimal feed-forward network built from HiddenLayer objects and trained
    with plain gradient descent.

    params data: 2-D array of shape (n_sample, n_in); it is the input of the
                 first layer and is also what train() passes to the compiled
                 function
    params gstep: gradient step (learning rate)
    params epochs: number of training iterations run by train()
    params rng: random source for weight initialisation; defaults to np.random
    '''

    def __init__(self, data, gstep=0.01, epochs=1000, rng=None):
        self.data = data
        self.ins = data
        self.n_sample, self.n_in = data.shape
        self.gstep = gstep   # gradient step
        self.layers = []
        self.params = []
        self.epochs = epochs
        # Initialised here so train() can report a missing cost clearly
        # instead of failing with AttributeError.
        self.cost = None
        self.rng = np.random if rng is None else rng

    def add_layer(self, n_out, w=None, b=None, f=T.tanh):
        '''
        Append a HiddenLayer with n_out units fed by the current last output.

        params n_out: number of units in the new layer
        params w, b, f: forwarded to HiddenLayer
        '''
        layer = HiddenLayer(rng=self.rng,
                            ins=self.ins,
                            n_in=self.n_in,
                            n_out=n_out,
                            w=w, b=b, f=f)
        self.layers.append(layer)
        # the output of this layer is the input of next layer
        self.ins = layer.outs
        self.n_in = n_out
        self.params += layer.params

    def train(self):
        '''
        Run self.epochs gradient-descent steps on the cost given to set_cost,
        printing the cost after each step.

        raises ValueError: if set_cost() has not been called
        '''
        if self.cost is None:
            raise ValueError("set_cost() must be called before train()")
        # The layers are wired to the data passed to __init__, so this
        # symbolic input is not part of the cost graph; on_unused_input below
        # lets Theano accept it anyway.
        x = T.dmatrix("x")
        # calculate the gradients
        # This used to differentiate a module-level `cost`, which only worked
        # when the caller happened to define a global with that name.
        gparams = T.grad(self.cost, self.params)
        updates = [(param, param - self.gstep * gparam)
                   for param, gparam in zip(self.params, gparams)]
        _train = theano.function(
                  inputs=[x],
                  outputs=self.cost,
                  updates=updates,
                  on_unused_input='ignore')
        for i in range(self.epochs):
            err = _train(self.data)
            print("Epoch: %d; Distance: %f" %(i+1, err))

    def set_cost(self, cost):
        '''Register the symbolic cost expression that train() minimises.'''
        self.cost = cost

    def get_final_outs(self):
        '''Return the symbolic output of the most recently added layer.'''
        return self.layers[-1].outs
    

N = 5 # training sample size
IN_FEATS = 50 # input feature space
EPOCHS = 1000 # train iteration

rng = np.random
D = (rng.randn(N, IN_FEATS))      # inputs

# EPOCHS and rng were declared above but never handed to the network, so
# editing them had no effect; pass them explicitly (values are unchanged).
nnet = NNet(D, epochs=EPOCHS, rng=rng)
# encoder
nnet.add_layer(100)
# decoder
nnet.add_layer(IN_FEATS)
# reconstruction error: sum of squared differences between output and input
cost = ((nnet.get_final_outs() - D) ** 2).sum()
nnet.set_cost(cost)
nnet.train()

In [None]:
# Autoencoder
import numpy as np
import theano.tensor as T
import theano.tensor.nnet as F
import theano

# Problem sizes for the hand-written autoencoder.
N = 5            # training sample size
IN_FEATS = 50    # input feature space, for encoder
OUT_FEATS = 100  # output feature space, for decoder
EPOCHS = 1000

rng = np.random
# Random training inputs, one row per sample.
D = rng.randn(N, IN_FEATS)

x = T.dmatrix("x")

# Encoder parameters (weights drawn first, so the random stream is consumed
# in the order D, encoder weights, decoder weights).
e_w1 = theano.shared(rng.randn(IN_FEATS, OUT_FEATS), name="encode_w1")
e_b1 = theano.shared(np.zeros(OUT_FEATS), name="encode_b1")
# Decoder parameters.
d_w1 = theano.shared(rng.randn(OUT_FEATS, IN_FEATS), name="decode_w1")
d_b1 = theano.shared(np.zeros(IN_FEATS), name="decode_b1")

# Two sigmoid layers; the bias is subtracted in both.
encoder = F.sigmoid(T.dot(x, e_w1) - e_b1)
decoder = F.sigmoid(T.dot(encoder, d_w1) - d_b1)
# Sum of squared reconstruction errors.
cost = ((decoder - x) ** 2).sum()
e_gw1, e_gb1, d_gw1, d_gb1 = T.grad(cost, [e_w1, e_b1, d_w1, d_b1])

# One call performs a gradient-descent step with step size 0.01 on every
# parameter and returns the cost.
train = theano.function(
    inputs=[x],
    outputs=cost,
    updates=[(param, param - 0.01*grad)
             for param, grad in ((e_w1, e_gw1), (e_b1, e_gb1),
                                 (d_w1, d_gw1), (d_b1, d_gb1))])
# Maps inputs to their encoded representation.
predict = theano.function(inputs=[x], outputs=encoder)

for i in range(EPOCHS):
    err = train(D)
    print("Epoch: %d; Distance: %f" %(i, err))

TODO: Pseudocode of the CCA

```
func train_CCA(question_set, answer_set):
    matrix = build_cross_covariance_matrix(question_set, answer_set)
    U, s, V = svd(matrix)
    return U, V
```

```
func build_cross_covariance_matrix(question_set, answer_set):
    pair_num, M = question_set.shape
    pair_num, N = answer_set.shape
    indx = 0
    forEach question_set:
        avg = average(question_set[indx])
        question_set[indx] -= avg
        avg = average(answer_set[indx])
        answer_set[indx] -= avg
        indx++
        
    initial matrix \in R^M x N fills by 0
    for i in range(M):
        for j in range(N):
            matrix[i, j] = sum(question_set[:, i] \dot answer_set[:, j]) / (pair_num - 1)
    return matrix
```

```
func find_best_answer(question, answer_set, U, V):
    question_project = question \dot U
    initial best_answer as NULL
    initial best_similarity as INFINITY
    indx = 0
    forEach answer in answer_set:
        answer_project = answer \dot V
        similarity = cosine_distance(question_project, answer_project)
        if similarity <= best_similarity:
            best_similarity = similarity
            best_answer = indx
        indx++
    return best_answer
```

In [38]:
import numpy as np
from scipy.sparse.linalg import svds
from scipy.spatial.distance import cosine
import logging
from multiprocessing import Pool
from functools import partial

# Synthetic demo data drawn from a standard normal: 80 rows each, with
# 120 columns for the questions and 150 columns for the answers.
# train() treats the row count as the number of samples.
Qs = np.random.randn(80, 120)
As = np.random.randn(80, 150)


def train(Qs, As, is_sparse=True, k=110):
    '''
    Compute CCA-style projection matrices for paired question/answer
    embeddings, using only the diagonals of the two auto-covariance terms.

    params Qs: question embeddings, one row per sample (array or list)
    params As: answer embeddings, one row per sample (array or list)
    params is_sparse: if True use the truncated solver scipy svds with k
                      components; if False use the dense np.linalg.svd and
                      keep every component
    params k: number of singular triplets requested from svds; values above
              what svds supports (min(result.shape) - 1) are clamped
    returns: (Q_k, A_k), the projection matrices for questions and answers;
             multiply row-wise embeddings by them to project. Component order
             follows the solver used.
    '''
    if isinstance(Qs, list):
        Qs = np.asarray(Qs, dtype="float32")
    if isinstance(As, list):
        As = np.asarray(As, dtype="float32")

    logging.info("computing CCA")
    sample_num = Qs.shape[0]
    # Only the diagonals of (Qs^T Qs)^-0.5 and (As^T As)^-0.5 are used, and a
    # diagonal entry is just a column's sum of squares. Computing them
    # directly avoids taking a fractional power of the negative off-diagonal
    # entries (NaN plus runtime warnings) only to throw those entries away.
    q_scale = np.power(np.sum(np.square(Qs), axis=0), -0.5) / sample_num
    a_scale = np.power(np.sum(np.square(As), axis=0), -0.5) / sample_num
    c_qa = Qs.T.dot(As) / sample_num
    # Same product as diag(q_scale) . c_qa . diag(a_scale).
    result = q_scale[:, np.newaxis] * c_qa * a_scale[np.newaxis, :]

    # svds only supports k < min(result.shape); fall back to the dense solver
    # when not even one component can be requested.
    max_k = min(result.shape) - 1
    if is_sparse and max_k >= 1:
        U, s, V = svds(result, k=min(k, max_k))
    else:
        U, s, V = np.linalg.svd(result, full_matrices=False)
    Q_k = q_scale[:, np.newaxis] * U
    A_k = a_scale[:, np.newaxis] * V.T
    return Q_k, A_k


# get distance between the question and answer, return with the answer index
# def distance(indx_a, proj_q, A_k):
#     indx, a = indx_a
#     proj_a = A_k.T.dot(a)
#     dist = cosine(proj_q, proj_a)
#     return indx, dist
# 
# 
# def find_answer(q, As, Q_k, A_k):
#     proj_q = Q_k.T.dot(q)
#     assert (proj_q == q.dot(Q_k)).all()
#     with Pool(processes=8) as pool:
#         result = pool.map(partial(distance, proj_q=proj_q, A_k=A_k), enumerate(As))
#     best_indx, _ = min(result, key=lambda x: x[1])
# 
#     return best_indx


# get distance between the question and answer, return with the answer index
def distance(indx_a, proj_q):
    '''
    Pair an answer's index with its cosine distance to a projected question.

    params indx_a: (index, projected answer vector) tuple
    params proj_q: projected question vector
    returns: (index, cosine distance between proj_q and the answer vector)
    '''
    answer_pos, answer_vec = indx_a
    return answer_pos, cosine(proj_q, answer_vec)


def find_answer(proj_q, proj_As):
    '''
    Return the index of the projected answer closest to a projected question
    under cosine distance.

    params proj_q: projected question vector, shape (d,)
    params proj_As: projected answers, one row per answer, shape (n, d)
    returns: int index of the row of proj_As with the smallest cosine
             distance to proj_q (the first one on ties)
    raises ValueError: if proj_As is empty
    '''
    answers = np.asarray(proj_As, dtype=float)
    question = np.asarray(proj_q, dtype=float)
    if answers.size == 0:
        raise ValueError("proj_As is empty")

    # All distances come from one matrix-vector product. The previous version
    # started a new 8-process Pool for every query and pickled each vector to
    # a worker, which cost far more than the arithmetic it distributed.
    norms = np.linalg.norm(answers, axis=1) * np.linalg.norm(question)
    with np.errstate(divide='ignore', invalid='ignore'):
        dists = 1.0 - answers.dot(question) / norms
    # A zero-length vector has no defined cosine distance; rank it last.
    dists = np.where(np.isnan(dists), np.inf, dists)
    return int(np.argmin(dists))

# Fit the projection matrices once on the full question/answer sets.
Q_k, A_k = train(Qs, As)

# Project every question and every answer up front so each lookup below only
# compares already-projected vectors.
proj_Qs = Qs.dot(Q_k)
proj_As = As.dot(A_k)
# For each question, in order, print the index of its closest answer.
for i, proj_q in enumerate(proj_Qs):
    print(find_answer(proj_q, proj_As))



0


1


2


3


4


5


6


7


8


9


10


11


12


13


14


15


16


17


18


19


20


21


22


23


24


25


26


27


28


29


30


31


32


33


34


35


36


37


38


39


40


41


42


43


44


45


46


47


48


49


50


51


52


53


54


55


56


57


58


59


60


61


62


63


64


65


66


67


68


69


70


71


72


73


74


75


76


77


78


79
