In [3]:
import numpy as np
import tensorflow as tf
from scipy.linalg.interpolative import svd

In [4]:
def my_svd(A, eps_or_k=0.01):
    """Truncated SVD of a matrix through SciPy's interpolative ``svd``.

    Parameters
    ----------
    A : numpy.ndarray
        Matrix to factorise. It is converted to ``float64`` whenever its
        dtype is anything else.
    eps_or_k : float or int, optional
        Passed unchanged as the second argument of
        ``scipy.linalg.interpolative.svd`` (relative precision or rank).

    Returns
    -------
    tuple
        ``(U, S, Vt)``: the first two factors as returned by SciPy and the
        transpose of the third one.
    """
    matrix = A if A.dtype == np.float64 else A.astype(np.float64)
    left, sigma, right = svd(matrix, eps_or_k, rand=False)
    return left, sigma, right.T

In [5]:
def t_unfold(A, k):
    """Unfold (matricise) the array ``A`` along axis ``k``.

    Axis ``k`` is moved to the front, the remaining axes keep their relative
    order, and everything but the leading axis is flattened.

    Parameters
    ----------
    A : numpy.ndarray
        Array to unfold.
    k : int
        Axis that becomes the row index (negative values count from the end).

    Returns
    -------
    numpy.ndarray
        2-D array of shape ``(A.shape[k], product of the other dimensions)``.
    """
    axis_order = np.hstack([k, np.delete(np.arange(A.ndim), k)])
    A = np.transpose(A, axis_order)
    # np.prod of an empty shape tuple is the float 1.0, which np.reshape
    # rejects; the int cast lets a 1-D input unfold to shape (n, 1).
    n_cols = int(np.prod(A.shape[1:]))

    return np.reshape(A, [A.shape[0], n_cols])

In [6]:
def t_dot(A, B, axes=(-1, 0)):
    """Contract ``A`` with ``B`` over ``axes`` using ``np.tensordot``.

    With the default ``axes`` the last axis of ``A`` is contracted with the
    first axis of ``B``.
    """
    contracted = np.tensordot(A, B, axes)
    return contracted

In [7]:
def tt_dcmp(A, eps_or_k=0.01):
    """Decompose ``A`` into tensor-train (TT) cores by successive truncated SVDs.

    Parameters
    ----------
    A : numpy.ndarray
        Tensor with ``d = A.ndim`` axes of sizes ``n = A.shape``.
    eps_or_k : scalar or list, optional
        Value handed to ``my_svd`` at each of the ``d - 1`` splits. A scalar is
        reused for every split; a list supplies one value per split.

    Returns
    -------
    list of numpy.ndarray
        ``d`` cores; core ``k`` has shape ``(r[k], n[k], r[k + 1])`` with
        ``r[0] == r[d] == 1``.

    Raises
    ------
    ValueError
        If any requested value exceeds the largest possible rank of the
        corresponding split.
    """
    d = A.ndim
    n = A.shape

    # Largest rank attainable when splitting the axes after position i.
    max_rank = [min(np.prod(n[:i + 1]), np.prod(n[i + 1:])) for i in range(d - 1)]

    if np.any(np.array(eps_or_k) > np.array(max_rank)):
        raise ValueError('the rank is up to %s' % str(max_rank))

    if not isinstance(eps_or_k, list):
        eps_or_k = [eps_or_k] * (d - 1)

    r = [1] * (d + 1)

    TT = []
    C = A.copy()

    for k in range(d - 1):
        rows = r[k] * n[k]
        # Floor division: under Python 3 "/" yields a float, which reshape rejects.
        C = C.reshape((rows, C.size // rows))
        U, S, V = my_svd(C, eps_or_k[k])
        r[k + 1] = U.shape[1]
        TT.append(U[:, :r[k + 1]].reshape((r[k], n[k], r[k + 1])))
        # Carry the remainder diag(S) . V forward to the next split.
        C = np.dot(np.diag(S[:r[k + 1]]), V[:r[k + 1], :])
    # Index by d rather than the loop variable so a 1-D input (no loop
    # iterations) still yields its single core instead of failing on an unbound k.
    TT.append(C.reshape(r[d - 1], n[d - 1], 1))

    return TT

In [8]:
def tucker_dcmp(A, eps_or_k=0.01):
    """Tucker decomposition of ``A`` from truncated SVDs of its unfoldings.

    Parameters
    ----------
    A : numpy.ndarray
        Tensor to decompose.
    eps_or_k : scalar or list, optional
        Value handed to ``my_svd`` for each axis. A scalar is reused for all
        axes; a list supplies one value per axis.

    Returns
    -------
    tuple
        ``(U, S)``: the list of per-axis factor matrices and the core tensor
        obtained by contracting ``A`` with every factor.

    Raises
    ------
    ValueError
        If any requested value exceeds the size of the corresponding axis.
    """
    order = A.ndim
    mode_limits = list(A.shape)

    if np.any(np.array(eps_or_k) > np.array(mode_limits)):
        raise ValueError('the rank is up to %s' % str(mode_limits))

    if not isinstance(eps_or_k, list):
        eps_or_k = [eps_or_k] * order

    # One factor per axis: the left factor of the SVD of that axis' unfolding.
    factors = []
    for mode in range(order):
        left_factor = my_svd(t_unfold(A, mode), eps_or_k[mode])[0]
        factors.append(left_factor)

    # Each contraction consumes the leading axis and appends the reduced one
    # at the end, so after all axes the original order is restored.
    core = A
    for factor in factors:
        core = t_dot(core, factor, (0, 0))

    return factors, core

In [9]:
def tt_cnst(A):
    """Rebuild the full tensor from a list of tensor-train cores.

    The cores are chained with ``t_dot`` (last axis against first axis) and
    the first and last axes of the result are squeezed away.
    """
    full = A[0]
    for core in A[1:]:
        full = t_dot(full, core)

    return np.squeeze(full, axis=(0, -1))

In [10]:
def tucker_cnst(U, S):
    """Rebuild the full tensor from Tucker factors ``U`` and core ``S``.

    Every factor is contracted, in order, over its axis 1 with the leading
    axis of the running tensor.
    """
    tensor = S
    for factor in U:
        tensor = t_dot(tensor, factor, (0, 1))

    return tensor

In [11]:
def TensorUnfold(A, k):
    """TensorFlow version of unfolding: move axis ``k`` first and flatten the rest.

    Parameters
    ----------
    A : tf.Tensor
        Tensor whose static shape is read through ``get_shape()``.
    k : int
        Axis that becomes the row index of the result.

    Returns
    -------
    tf.Tensor
        2-D tensor of shape ``(size of axis k, product of the other sizes)``.
    """
    tmp_arr = np.arange(A.get_shape().ndims)
    A = tf.transpose(A, [tmp_arr[k]] + np.delete(tmp_arr, k).tolist())
    shapeA = A.get_shape().as_list()
    # np.prod([]) is the float 1.0, so a rank-1 tensor would otherwise hand a
    # float dimension to tf.reshape; the int cast keeps the target shape integral.
    A = tf.reshape(A, [shapeA[0], int(np.prod(shapeA[1:]))])

    return A

In [12]:
def TensorProduct(A, B, axes=(-1, 0)):
    """Contract tensor ``A`` (axis ``axes[0]``) with ``B`` (axis ``axes[1]``).

    Both operands are unfolded along their contracted axis, multiplied as
    matrices, and the product is reshaped to the remaining axes of ``A``
    followed by the remaining axes of ``B``.
    """
    axis_a = axes[0]
    axis_b = axes[1]

    kept_a = np.delete(A.get_shape().as_list(), axis_a).tolist()
    kept_b = np.delete(B.get_shape().as_list(), axis_b).tolist()

    lhs = tf.transpose(TensorUnfold(A, axis_a))
    rhs = TensorUnfold(B, axis_b)
    product = tf.matmul(lhs, rhs)

    return tf.reshape(product, kept_a + kept_b)

In [13]:
def TTTensorProducer(A):
    """Build the full tensor from a list of tensor-train cores (TensorFlow).

    The cores are chained with ``TensorProduct`` (last axis against first
    axis); the first and last axes of the result are then squeezed away.
    """
    S = A[0]
    for core in A[1:]:
        S = TensorProduct(S, core)

    # `axis` replaces the deprecated `squeeze_dims` keyword of tf.squeeze.
    return tf.squeeze(S, axis=[0, -1])

In [14]:
def TuckerTensorProducer(U, S):
    """Build the full tensor from Tucker factors ``U`` and core ``S`` (TensorFlow).

    Every factor is contracted, in order, over its axis 1 with the leading
    axis of the running tensor.
    """
    tensor = S
    for factor in U:
        tensor = TensorProduct(tensor, factor, (0, 1))

    return tensor

In [15]:
def TensorProducer(X, method, eps_or_k=0.01, datatype=np.float32, return_true_var=False):
    """Factorise ``X`` and return a tensor rebuilt from trainable factors.

    Parameters
    ----------
    X : numpy.ndarray
        Stacked parameters used to initialise the factors.
    method : {'Tucker', 'TT', 'LAF'}
        Factorisation to use. ``'Tucker'`` and ``'TT'`` call ``tucker_dcmp``
        and ``tt_dcmp``; ``'LAF'`` applies ``my_svd`` to the transposed
        unfolding of ``X`` along its last axis.
    eps_or_k : scalar or list, optional
        Forwarded to the chosen decomposition.
    datatype : numpy dtype, optional
        Dtype the factors are cast to before being wrapped in ``tf.Variable``.
    return_true_var : bool, optional
        When true, also return the dictionary of factor variables.

    Returns
    -------
    W or (W, param_dict)
        ``W`` is the tensor rebuilt from the factor variables. ``param_dict``
        holds those variables under the keys ``'U'``/``'S'`` (Tucker),
        ``'U'`` (TT) or ``'U'``/``'V'`` (LAF).

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Fail early with a clear message instead of reaching `return W` with W unbound.
    if method not in ('Tucker', 'TT', 'LAF'):
        raise ValueError("unknown method %r; expected 'Tucker', 'TT' or 'LAF'" % (method,))

    if method == 'Tucker':
        U, S = tucker_dcmp(X, eps_or_k)
        U = [tf.Variable(i.astype(datatype)) for i in U]
        S = tf.Variable(S.astype(datatype))
        W = TuckerTensorProducer(U, S)
        param_dict = {'U': U, 'S': S}
    elif method == 'TT':
        A = tt_dcmp(X, eps_or_k)
        A = [tf.Variable(i.astype(datatype)) for i in A]
        W = TTTensorProducer(A)
        param_dict = {'U': A}
    else:  # 'LAF'
        # Rows index every axis but the last; columns index the last axis.
        U, S, V = my_svd(np.transpose(t_unfold(X, -1)), eps_or_k)
        U = tf.Variable(U.astype(datatype))
        # The singular values are folded into the right factor.
        V = tf.Variable(np.dot(np.diag(S), V).astype(datatype))
        W = tf.reshape(tf.matmul(U, V), X.shape)
        param_dict = {'U': U, 'V': V}

    if return_true_var:
        return W, param_dict
    return W

In [18]:
# Problem dimensions of the synthetic multi-task regression example.
O = [3, 2, 5]  # output dimensionality of each task
T = len(O)     # number of tasks
N = 100        # number of training samples
D = 20         # feature dimensionality, assumed identical for every task

In [19]:
# Synthetic data: for every task, Gaussian features of shape (N, D) and
# noise-free linear targets obtained from a random (D, o) weight matrix.
# A dedicated seeded generator makes the data identical on every
# Restart-and-Run-All instead of depending on the global NumPy RNG state.
SEED = 0
rng = np.random.RandomState(SEED)
X = [rng.randn(N, D) for _ in range(T)]
Y = [x.dot(rng.randn(D, o)) for x, o in zip(X, O)]

**Training a regression model for each task independently**

In [20]:
# Interactive session; the later `train.run(...)` and `loss.eval(...)` calls
# pass no session explicitly and therefore rely on it being the default one.
sess = tf.InteractiveSession()

In [21]:
# Three-layer network (input -> hidden -> output), one independent copy per task.
# The hidden layer has H = 10 units.
H = 10
# Input-to-hidden parameters: a (D, H) weight matrix and an (H,) bias per task.
W_input_to_hidden = [tf.Variable(tf.truncated_normal(shape=[D, H])) for task in range(T)]
b_input_to_hidden = [tf.Variable(tf.zeros(shape=[H])) for task in range(T)]
# Hidden-to-output parameters: sized by each task's number of outputs.
W_hidden_to_output = [tf.Variable(tf.truncated_normal(shape=[H, n_out])) for n_out in O]
b_hidden_to_output = [tf.Variable(tf.zeros(shape=[n_out])) for n_out in O]

In [22]:
# One feature placeholder and one target placeholder per task; the batch
# dimension is left unspecified.
X_placeholder = [tf.placeholder(tf.float32, shape=[None, D]) for task in range(T)]
Y_placeholder = [tf.placeholder(tf.float32, shape=[None, n_out]) for n_out in O]

In [23]:
# Per-task forward pass: sigmoid(x . W0 + b0) . W1 + b1.
Y_hat = [
    tf.nn.xw_plus_b(tf.nn.sigmoid(tf.nn.xw_plus_b(inputs, w_in, b_in)), w_out, b_out)
    for inputs, w_in, b_in, w_out, b_out in zip(
        X_placeholder, W_input_to_hidden, b_input_to_hidden,
        W_hidden_to_output, b_hidden_to_output)
]

In [25]:
# Mean squared error of every task, kept as a list with one scalar per task.
MSE = [tf.reduce_mean(tf.squared_difference(target, prediction))
       for target, prediction in zip(Y_placeholder, Y_hat)]

In [27]:
# Overall objective: the unweighted mean of the per-task MSE values.
loss = tf.reduce_mean(MSE)

In [28]:
# Adam optimiser with a fixed learning rate of 0.01.
opt = tf.train.AdamOptimizer(learning_rate=0.01)

In [29]:
# Training op: each run applies one optimiser update that minimises `loss`.
train = opt.minimize(loss)

In [30]:
# Initialise every variable created so far before the first training step.
sess.run(tf.global_variables_initializer())

In [31]:
# Map every placeholder to its NumPy array: features first, then targets.
feed_dict = {}
feed_dict.update(zip(X_placeholder, X))
feed_dict.update(zip(Y_placeholder, Y))

In [32]:
# Run 1000 full-batch training steps and print the loss every 100 steps.
# The counter is called `step` because `_` is the name IPython uses for the
# last cell output, and a top-level loop variable would overwrite it.
for step in range(1000):
    train.run(feed_dict=feed_dict)
    if step % 100 == 0:
        print(loss.eval(feed_dict=feed_dict))

23.779251
7.4169793
2.2905195
1.0746982
0.6570417
0.45569625
0.32902658
0.24736142
0.19520433
0.15932886


**The 'shareable' layer is the input-to-hidden layer.**

In [33]:
# We can reuse the parameters learned above for initialisation, though it is optional.
# Stacking the T per-task (D, H) matrices gives an array of shape (T, D, H).
W_init = np.stack(sess.run(W_input_to_hidden))

In [34]:
# We put the task axis in the last position: (T, D, H) -> (D, H, T).
# NOTE(review): this cell rebinds W_init from itself, so running it a second
# time permutes the axes again; re-run the stacking cell first.
W_init = np.transpose(W_init, axes=[1,2,0])

In [35]:
# The only change compared with the independent model: the trainable weights
# are generated by "TensorProducer" instead of being created directly.
# Its arguments are:
#   1. the stacked parameters (task axis last);
#   2. the factorisation method: 'LAF', 'Tucker', or 'TT';
#   3. 'eps_or_k': the hand-picked rank(s) or the relative error, see
#      https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.interpolative.svd.html
#   4. 'return_true_var': set to True to also get the factors (the real
#      parameters being learned); the default is False.
W_input_to_hidden, W_factors = TensorProducer(
    W_init, 'LAF', eps_or_k=0.1, return_true_var=True)
# Slice the generated tensor along its last (task) axis: one weight matrix per task.
W_input_to_hidden = [W_input_to_hidden[:, :, task] for task in range(T)]

In [36]:
# Bias terms and the unshared hidden-to-output parameters are created exactly
# as in the independent model.
b_input_to_hidden = [tf.Variable(tf.zeros(shape=[H])) for task in range(T)]
W_hidden_to_output = [tf.Variable(tf.truncated_normal(shape=[H, n_out])) for n_out in O]
b_hidden_to_output = [tf.Variable(tf.zeros(shape=[n_out])) for n_out in O]

In [37]:

# Build the network as usual: fresh placeholders, forward pass, loss, optimiser.
X_placeholder = [tf.placeholder(tf.float32, shape=[None, D]) for task in range(T)]
Y_placeholder = [tf.placeholder(tf.float32, shape=[None, n_out]) for n_out in O]

# Per-task forward pass: sigmoid(x . W0 + b0) . W1 + b1.
Y_hat = [
    tf.nn.xw_plus_b(tf.nn.sigmoid(tf.nn.xw_plus_b(inputs, w_in, b_in)), w_out, b_out)
    for inputs, w_in, b_in, w_out, b_out in zip(
        X_placeholder, W_input_to_hidden, b_input_to_hidden,
        W_hidden_to_output, b_hidden_to_output)
]

# One mean-squared-error scalar per task, averaged into a single objective.
MSE = [tf.reduce_mean(tf.squared_difference(target, prediction))
       for target, prediction in zip(Y_placeholder, Y_hat)]

loss = tf.reduce_mean(MSE)

opt = tf.train.AdamOptimizer(learning_rate=0.01)

train = opt.minimize(loss)

In [41]:
# Train the model as usual
# Initialise all graph variables, which now include the factor variables
# created by TensorProducer and the new bias/output-layer variables.
sess.run(tf.global_variables_initializer())

In [39]:
# Map every new placeholder to its NumPy array: features first, then targets.
feed_dict = {}
feed_dict.update(zip(X_placeholder, X))
feed_dict.update(zip(Y_placeholder, Y))

In [40]:
# Run 1000 full-batch training steps and print the loss every 100 steps.
# The counter is called `step` because `_` is the name IPython uses for the
# last cell output, and a top-level loop variable would overwrite it.
for step in range(1000):
    train.run(feed_dict=feed_dict)
    if step % 100 == 0:
        print(loss.eval(feed_dict=feed_dict))

19.385046
2.932336
1.3348275
0.7552206
0.50948447
0.39846826
0.32564265
0.27077052
0.2194042
0.17648698
