In [1]:
import numpy as np
import tensorflow as tf
from scipy.linalg.interpolative import svd

from tensorflow.nn import xw_plus_b, sigmoid

## Set up tensor manipulation functions

In [2]:
def my_svd(A, eps_or_k=0.01):
    """Truncated SVD of a 2-D array via ``scipy.linalg.interpolative.svd``.

    SVD generalizes spectral decomposition to n by m matrices: U and V are
    orthogonal matrices and S holds the diagonal of the (rectangular)
    singular-value matrix.

    Parameters
    ----------
    A : ndarray
        Matrix to factorise; cast to float64 when it has another dtype.
    eps_or_k : float or int
        Forwarded unchanged to scipy's ``svd`` (relative precision when
        below 1, target rank otherwise).

    Returns
    -------
    (U, S, Vt)
        Left singular vectors, singular values, and the transpose of the
        right singular vectors returned by scipy, so that
        ``A ~= U @ diag(S) @ Vt``.
    """
    matrix = A if A.dtype == np.float64 else A.astype(np.float64)
    left, singular_values, right = svd(matrix, eps_or_k, rand=False)
    return left, singular_values, right.T

In [3]:
def t_unfold(A, k):
    """Mode-``k`` unfolding (matricisation) of an ndarray.

    Axis ``k`` becomes the rows; the remaining axes keep their original
    relative order and are flattened into the columns.

    Parameters
    ----------
    A : ndarray
        Tensor to unfold (1-D inputs yield a single column).
    k : int
        Axis that becomes the row index; negative values count from the end.

    Returns
    -------
    ndarray of shape ``(A.shape[k], prod(other dims))``.
    """
    moved = np.moveaxis(A, k, 0)
    # np.prod of an empty shape is the float 1.0, which reshape rejects;
    # force an integer product so 1-D inputs unfold to a column.
    n_cols = int(np.prod(moved.shape[1:], dtype=np.int64))
    return np.reshape(moved, (moved.shape[0], n_cols))

In [4]:
def t_dot(A, B, axes=(-1, 0)):
    """Contract ``A`` and ``B`` with ``np.tensordot``.

    With the default ``axes`` the last axis of ``A`` is summed against the
    first axis of ``B``.
    """
    contracted = np.tensordot(A, B, axes=axes)
    return contracted

In [5]:
def tt_dcmp(A, eps_or_k=0.01):
    """Tensor-Train decomposition of ``A`` by successive truncated SVDs.

    Parameters
    ----------
    A : ndarray
        Tensor with ``d = A.ndim`` modes.
    eps_or_k : float, int, list or tuple
        Precision or rank forwarded to ``my_svd``. A scalar is used for all
        ``d - 1`` splits; a list/tuple supplies one value per split.

    Returns
    -------
    list of ndarray
        ``d`` cores; core ``k`` has shape ``(r[k], n[k], r[k + 1])`` with
        ``r[0] == r[d] == 1``.

    Raises
    ------
    ValueError
        If any requested value exceeds the maximal rank of its split.
    """
    d = A.ndim
    n = A.shape
    max_rank = [min(np.prod(n[:i + 1]), np.prod(n[i + 1:])) for i in range(d - 1)]

    if np.any(np.array(eps_or_k) > np.array(max_rank)):
        raise ValueError('the rank is up to %s' % str(max_rank))
    if isinstance(eps_or_k, (list, tuple)):
        eps_or_k = list(eps_or_k)
    else:
        eps_or_k = [eps_or_k] * (d - 1)

    r = [1] * (d + 1)
    TT = []
    C = A.copy()

    for k in range(d - 1):
        rows = r[k] * n[k]
        # Integer division: a float column count makes reshape raise in Python 3.
        C = C.reshape((rows, C.size // rows))
        U, S, V = my_svd(C, eps_or_k[k])
        r[k + 1] = U.shape[1]
        TT.append(U.reshape((r[k], n[k], r[k + 1])))
        # Carry the remainder (singular values times right factor) to the next split.
        C = np.dot(np.diag(S), V)
    # Index the last mode explicitly rather than through the loop variable,
    # which is unbound when A has a single mode.
    TT.append(C.reshape(r[d - 1], n[d - 1], 1))
    return TT

In [6]:
def tucker_dcmp(A, eps_or_k=0.01):
    """Tucker decomposition of ``A`` from the SVD of each mode unfolding.

    Parameters
    ----------
    A : ndarray
        Tensor to decompose.
    eps_or_k : float, int or list
        Precision or rank forwarded to ``my_svd``; a non-list value is
        reused for every mode.

    Returns
    -------
    (U, S)
        ``U`` is the list of per-mode factor matrices and ``S`` the core
        obtained by contracting ``A`` with every factor in turn.

    Raises
    ------
    ValueError
        If any requested value exceeds the size of its mode.
    """
    order = A.ndim
    max_rank = list(A.shape)

    if np.any(np.array(eps_or_k) > np.array(max_rank)):
        raise ValueError('the rank is up to %s' % str(max_rank))
    per_mode = eps_or_k if isinstance(eps_or_k, list) else [eps_or_k] * order

    U = []
    for mode in range(order):
        left, _, _ = my_svd(t_unfold(A, mode), per_mode[mode])
        U.append(left)

    # Each contraction consumes the leading axis and appends the reduced
    # one at the end, so after `order` steps the modes are back in order.
    S = A
    for factor in U:
        S = t_dot(S, factor, (0, 0))
    return U, S

In [7]:
def tt_cnst(A):
    """Rebuild the full tensor from a list of Tensor-Train cores.

    The cores are chained with ``t_dot`` (last axis against first axis) and
    the first and last axes of the result are squeezed away.
    """
    full = A[0]
    for core in A[1:]:
        full = t_dot(full, core)
    return np.squeeze(full, axis=(0, -1))

In [8]:
def tucker_cnst(U, S):
    """Rebuild the full tensor from Tucker factors ``U`` and core ``S``.

    Each factor is contracted (its axis 1) against the current leading axis
    of the running tensor.
    """
    full = S
    for factor in U:
        full = t_dot(full, factor, (0, 1))
    return full

In [9]:
def TensorUnfold(A, k):
    """TensorFlow counterpart of ``t_unfold``: mode-``k`` unfolding of ``A``.

    Axis ``k`` is transposed to the front and the remaining axes are
    flattened using the statically known shape of the tensor.
    """
    axis_ids = np.arange(A.get_shape().ndims)
    permutation = [axis_ids[k]] + np.delete(axis_ids, k).tolist()
    moved = tf.transpose(A, permutation)
    dims = moved.get_shape().as_list()
    return tf.reshape(moved, [dims[0], np.prod(dims[1:])])

In [10]:
def TensorProduct(A, B, axes=(-1, 0)):
    """TensorFlow counterpart of ``t_dot`` for a single contracted axis pair.

    Both operands are unfolded along their contracted axis, multiplied as
    matrices, and the product is reshaped to the remaining axes of ``A``
    followed by the remaining axes of ``B``.
    """
    axis_a = axes[0]
    axis_b = axes[1]
    kept_a = np.delete(A.get_shape().as_list(), axis_a).tolist()
    kept_b = np.delete(B.get_shape().as_list(), axis_b).tolist()
    left = tf.transpose(TensorUnfold(A, axis_a))
    right = TensorUnfold(B, axis_b)
    return tf.reshape(tf.matmul(left, right), kept_a + kept_b)

In [11]:
def TTTensorProducer(A):
    """TensorFlow counterpart of ``tt_cnst``: chain the Tensor-Train cores.

    Parameters
    ----------
    A : list of tensors
        Cores in order; consecutive cores are contracted last axis against
        first axis.

    Returns
    -------
    Tensor with the first and last axes of the chained product squeezed.
    """
    S = A[0]
    for core in A[1:]:
        S = TensorProduct(S, core)
    # `squeeze_dims` is the deprecated spelling of `axis`.
    return tf.squeeze(S, axis=[0, -1])

In [12]:
def TuckerTensorProducer(U, S):
    """TensorFlow counterpart of ``tucker_cnst``.

    Contracts the core ``S`` with each factor in ``U`` (factor axis 1
    against the current leading axis) and returns the result.
    """
    full = S
    for factor in U:
        full = TensorProduct(full, factor, (0, 1))
    return full

In [13]:
def TensorProducer(X, method, eps_or_k=0.01, datatype=np.float32, return_true_var=False):
    """Build a TensorFlow tensor shaped like ``X`` from trainable factors.

    ``X`` is factorised with NumPy, every factor becomes a ``tf.Variable``,
    and the returned tensor is the product of those variables.

    Parameters
    ----------
    X : ndarray
        Initial values of the stacked parameters.
    method : {'Tucker', 'TT', 'LAF'}
        Factorisation to use.
    eps_or_k : float, int or list
        Precision or rank(s) forwarded to the decomposition.
    datatype : numpy dtype
        dtype the factors are cast to before becoming variables.
    return_true_var : bool
        When True, also return the dictionary of factor variables.

    Returns
    -------
    W, or ``(W, param_dict)`` when ``return_true_var`` is True.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Fail early: without this check an unknown method would fall through
    # every branch and hit an unbound `W` at the return statement.
    if method not in ('Tucker', 'TT', 'LAF'):
        raise ValueError("unknown method %r; expected 'Tucker', 'TT' or 'LAF'" % (method,))

    if method == 'Tucker':
        U, S = tucker_dcmp(X, eps_or_k)
        U = [tf.Variable(i.astype(datatype)) for i in U]
        S = tf.Variable(S.astype(datatype))
        W = TuckerTensorProducer(U, S)
        param_dict = {'U': U, 'S': S}
    elif method == 'TT':
        A = tt_dcmp(X, eps_or_k)
        A = [tf.Variable(i.astype(datatype)) for i in A]
        W = TTTensorProducer(A)
        param_dict = {'U': A}
    elif method == 'LAF':
        # Matrix with the last axis of X as columns and all other axes
        # flattened into rows; the singular values are folded into V.
        U, S, V = my_svd(np.transpose(t_unfold(X, -1)), eps_or_k)
        U = tf.Variable(U.astype(datatype))
        V = tf.Variable(np.dot(np.diag(S), V).astype(datatype))
        W = tf.reshape(tf.matmul(U, V), X.shape)
        param_dict = {'U': U, 'V': V}

    if return_true_var:
        return W, param_dict
    return W

In [14]:
O = [3, 2, 5]  # number of outputs for each task
T = len(O)     # number of tasks
N = 100        # number of training samples
D = 20         # dimension of the feature vector, assumed to be the same for all tasks

In [15]:
# Synthetic data: one Gaussian design matrix per task, and targets produced
# by pushing each design matrix through a random linear map.
X = [np.random.randn(N, D) for _task in range(T)]
Y = [features.dot(np.random.randn(D, n_outputs)) for features, n_outputs in zip(X, O)]

In [46]:
# Sanity check: shape of the first task's targets.
print(Y[0].shape)

(100, 3)


## Training a regression network for each task independently

In [16]:
# Interactive session: installs itself as the default, so the later
# `train.run(...)` / `loss.eval(...)` calls need no explicit session.
sess = tf.InteractiveSession()

In [17]:
# 3 layer network (input -> hidden -> output), one independent copy per task
# hidden layer has 10 neurons
H = 10
# Input-to-hidden weights/biases: same shape for every task.
W_input_to_hidden = [tf.Variable(tf.truncated_normal(shape=[D, H])) for _ in range(T)]
b_input_to_hidden = [tf.Variable(tf.zeros(shape=[H])) for _ in range(T)]
# Hidden-to-output weights/biases: width follows each task's output count in O.
W_hidden_to_output = [tf.Variable(tf.truncated_normal(shape=[H, o])) for o in O]
b_hidden_to_output = [tf.Variable(tf.zeros(shape=[o])) for o in O]

In [18]:
# One input and one target placeholder per task; the batch dimension is left open.
X_placeholder = [tf.placeholder(tf.float32, shape=[None, D]) for _ in range(T)]
Y_placeholder = [tf.placeholder(tf.float32, shape=[None, o]) for o in O]

In [19]:
# Per-task forward pass: sigmoid hidden layer followed by a linear output layer.
Y_hat = [
    xw_plus_b(sigmoid(xw_plus_b(inputs, w_in, b_in)), w_out, b_out)
    for inputs, w_in, b_in, w_out, b_out in zip(
        X_placeholder,
        W_input_to_hidden,
        b_input_to_hidden,
        W_hidden_to_output,
        b_hidden_to_output,
    )
]

In [20]:
# Mean squared error per task, averaged over tasks to give a single training loss.
MSE = [tf.reduce_mean(tf.squared_difference(y,y_hat)) for y,y_hat in zip(Y_placeholder,Y_hat)]
loss = tf.reduce_mean(MSE)
opt = tf.train.AdamOptimizer(learning_rate=0.01)
train = opt.minimize(loss)
# Initialise after `minimize`, which is where Adam's own variables are created.
sess.run(tf.global_variables_initializer())

In [21]:
# Map every task's input/target placeholder to its data array.
feed_dict = dict(zip(X_placeholder + Y_placeholder, X + Y))

In [22]:
# 1000 full-batch Adam steps, reporting the loss every 100 steps.
# The counter has a real name because it is read inside the loop; `_` is a
# throwaway by convention and is also IPython's "last output" variable.
for step in range(1000):
    train.run(feed_dict=feed_dict)
    if step % 100 == 0:
        print(loss.eval(feed_dict=feed_dict))

20.087698
5.5514207
1.6607981
0.84037286
0.5585898
0.4100879
0.31203082
0.24233873
0.19026375
0.14955209


**The 'shareable' layer is the input-to-hidden layer.**

In [23]:
# We can reuse the parameters learned above for initialisation, though it is optional.
# Stacking the per-task matrices gives an array of shape (T, D, H).
W_init = np.stack(sess.run(W_input_to_hidden))

In [24]:
# Tasks are stacked along the first axis at this point.
W_init.shape

(3, 20, 10)

In [25]:
# The per-task weights are still separate tf.Variable objects here.
print(W_input_to_hidden)

[<tf.Variable 'Variable:0' shape=(20, 10) dtype=float32_ref>, <tf.Variable 'Variable_1:0' shape=(20, 10) dtype=float32_ref>, <tf.Variable 'Variable_2:0' shape=(20, 10) dtype=float32_ref>]


In [26]:
# We put the task-axis in the last position: (T, D, H) -> (D, H, T)
W_init = np.transpose(W_init, axes=[1,2,0])

In [27]:
# Confirm the task axis is now last.
W_init.shape

(20, 10, 3)

In [28]:
# The only thing that changes: "TensorProducer" generates the trainable variables.
# TensorProducer takes:
#   - the stacked parameters (task axis last),
#   - the factorisation method: 'LAF', 'Tucker', or 'TT',
#   - 'eps_or_k': the hand-picked rank(s) or the relative error, see
#     https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.interpolative.svd.html
#   - 'return_true_var': set to True to also get the factors (the real
#     parameters being learned); the default is False.
W_input_to_hidden, W_factors = TensorProducer(W_init, 'LAF', eps_or_k=0.1, return_true_var=True)
# Slice the shared tensor back into one (D, H) weight matrix per task.
W_input_to_hidden = [W_input_to_hidden[:,:,i] for i in range(T)]

In [29]:
# Nothing changes for bias terms or unshared parameters:
# fresh variables with the same shapes as in the independent model.
b_input_to_hidden = [tf.Variable(tf.zeros(shape=[H])) for _ in range(T)]
W_hidden_to_output = [tf.Variable(tf.truncated_normal(shape=[H, o])) for o in O]
b_hidden_to_output = [tf.Variable(tf.zeros(shape=[o])) for o in O]

In [30]:

# Build the network as usual: the graph mirrors the independent model,
# except that the input-to-hidden weights are slices of the shared tensor.
X_placeholder = [tf.placeholder(tf.float32, shape=[None, D]) for _task in range(T)]
Y_placeholder = [tf.placeholder(tf.float32, shape=[None, n_outputs]) for n_outputs in O]

Y_hat = [
    tf.nn.xw_plus_b(tf.nn.sigmoid(tf.nn.xw_plus_b(inputs, w_in, b_in)), w_out, b_out)
    for inputs, w_in, b_in, w_out, b_out in zip(
        X_placeholder,
        W_input_to_hidden,
        b_input_to_hidden,
        W_hidden_to_output,
        b_hidden_to_output,
    )
]

# Per-task mean squared error, averaged over tasks.
MSE = [
    tf.reduce_mean(tf.squared_difference(target, prediction))
    for target, prediction in zip(Y_placeholder, Y_hat)
]
loss = tf.reduce_mean(MSE)

opt = tf.train.AdamOptimizer(learning_rate=0.01)
train = opt.minimize(loss)

In [31]:
# Train the model as usual: initialise every variable in the graph first.
sess.run(tf.global_variables_initializer())

In [32]:
# Map every task's (new) input/target placeholder to its data array.
feed_dict = dict(zip(X_placeholder + Y_placeholder, X + Y))

In [33]:
# 1000 full-batch Adam steps, reporting the loss every 100 steps.
# The counter has a real name because it is read inside the loop; `_` is a
# throwaway by convention and is also IPython's "last output" variable.
for step in range(1000):
    train.run(feed_dict=feed_dict)
    if step % 100 == 0:
        print(loss.eval(feed_dict=feed_dict))

20.986982
3.7213852
1.6735786
1.0002509
0.7184858
0.5557379
0.43352595
0.349537
0.29541144
0.2536019


In [43]:
# Release the session used for the synthetic-data experiment.
sess.close()

## We are using MNIST to try to replicate the results of the original paper before we apply it to our own research.

In [23]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras

import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

1.14.0


In [24]:
# Close any session left over from the previous section before opening a new one.
# The lookup must use the *name* as a string: `sess in globals()` evaluates
# `sess` first, which raises NameError on a fresh kernel and otherwise tests
# the session object itself as a dictionary key (never true).
if 'sess' in globals():
    sess.close()
sess = tf.InteractiveSession()

In [25]:
# Load MNIST through Keras.
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Scale pixel values by 1/255 (presumably from [0, 255] into [0, 1] — confirm).
x_train, x_test = x_train/255.0, x_test/255.0

In [45]:
# Labels are a flat vector with one entry per training image.
print(y_train.shape)

(60000,)


In [27]:
# Single-task Keras baseline: flatten -> 128 ReLU units -> 10-way softmax.
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    # Dropout is disabled for this baseline:
#     tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10, activation='softmax')
])

# Sparse categorical cross-entropy: the labels are integer class ids, not one-hot.
model.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [28]:
# Layer-by-layer overview and parameter counts of the baseline.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 784)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               100480    
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1290      
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
_________________________________________________________________


In [29]:
# Train the baseline for 5 epochs, then report [loss, accuracy] on the test split.
model.fit(x_train, y_train, epochs=5)
model.evaluate(x_test, y_test, verbose=2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
10000/10000 - 0s - loss: 0.0777 - acc: 0.9767


[0.07771949775977992, 0.9767]

In [32]:
# Multi-task version of the MNIST model: one binary task per digit class,
# with the input-to-hidden weights shared through a 'LAF' factorisation.
h = 128               # hidden-layer width (same as the Keras baseline)
n_features = 28 * 28  # flattened image size
n_tasks = 10          # one task per digit

# Randomly initialised per-task weights, evaluated to NumPy so they can be
# stacked (task axis last) and factorised by TensorProducer.
w_input_to_hidden = [tf.Variable(tf.truncated_normal(shape=[n_features, h])) for _ in range(n_tasks)]
sess.run(tf.global_variables_initializer())
w_init = np.stack(sess.run(w_input_to_hidden))
w_init = np.transpose(w_init, axes=[1, 2, 0])
w_input_to_hidden, w_factors = TensorProducer(w_init, 'LAF', eps_or_k=0.1, return_true_var=True)
w_input_to_hidden = [w_input_to_hidden[:, :, i] for i in range(n_tasks)]

# Every task has a single output; biases and output weights are not shared.
O = [1] * n_tasks
b_input_to_hidden = [tf.Variable(tf.zeros(shape=[h])) for _ in range(n_tasks)]
w_hidden_to_output = [tf.Variable(tf.truncated_normal(shape=[h, o])) for o in O]
b_hidden_to_output = [tf.Variable(tf.zeros(shape=[o])) for o in O]

In [34]:
# One input and one target placeholder per task; batch dimension left open.
x_placeholder = [tf.placeholder(tf.float32, shape=[None, 28*28]) for _task in range(10)]
y_placeholder = [tf.placeholder(tf.float32, shape=[None, n_outputs]) for n_outputs in O]

## THIS MUST BE CHANGED!!!
# NOTE(review): the original note does not say what must change. Presumably
# the sigmoid hidden layer / MSE loss should be brought in line with the
# ReLU + softmax cross-entropy Keras baseline — confirm with the author.
y_hat = [
    tf.nn.xw_plus_b(tf.nn.sigmoid(tf.nn.xw_plus_b(inputs, w_in, b_in)), w_out, b_out)
    for inputs, w_in, b_in, w_out, b_out in zip(
        x_placeholder,
        w_input_to_hidden,
        b_input_to_hidden,
        w_hidden_to_output,
        b_hidden_to_output,
    )
]

# Per-task mean squared error, averaged over tasks.
mse = [
    tf.reduce_mean(tf.squared_difference(target, prediction))
    for target, prediction in zip(y_placeholder, y_hat)
]
loss = tf.reduce_mean(mse)
opt = tf.train.AdamOptimizer(learning_rate=0.01)
train = opt.minimize(loss)

In [60]:
# Same layout as the synthetic example earlier in the notebook:
#   x[t] has shape (num_samples, feature_size), y[t] has shape (num_samples, O[t]),
#   with one entry per task t.
# Flatten each image into a feature vector without hard-coding the sample count.
x_train = x_train.reshape(x_train.shape[0], -1)
# Every task sees the same inputs.
x = [x_train for _ in range(10)]
# Task `digit` is the binary problem "is this image `digit`?": a float64
# column of 1.0 / 0.0, built with one vectorised comparison per task
# instead of a Python loop over every label.
y = [(y_train == digit).astype(np.float64).reshape(-1, 1) for digit in range(10)]

In [61]:
# Inspect the first flattened training image of the first task.
print(x[0][0])

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         

In [None]:
# Initialise every variable, then run 1000 full-batch Adam steps, reporting
# the loss every 100 steps.
sess.run(tf.global_variables_initializer())
feed_dict = dict(list(zip(x_placeholder, x)) + list(zip(y_placeholder, y)))
# The counter has a real name because it is read inside the loop; `_` is a
# throwaway by convention and is also IPython's "last output" variable.
for step in range(1000):
    train.run(feed_dict=feed_dict)
    if step % 100 == 0:
        print(loss.eval(feed_dict=feed_dict))

913.4896
0.22368667
0.12075188


## Integrating with graph convolutional networks

For sandboxing purposes, I'll be copy-pasting code from QM9GNN2_Multitask.ipynb

In [None]:
import numpy as np

from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.backend import mean, square

from spektral.datasets import qm9
from spektral.layers import EdgeConditionedConv, GlobalAttentionPool
from spektral.utils import label_to_one_hot

In [None]:
# Load QM9 via spektral. Judging by the names, the four results are the
# adjacency matrices, node features, edge features and targets — confirm
# against the spektral version in use.
A_complete, X_complete, E_complete, y_complete = qm9.load_data(
    return_type='numpy',
    nf_keys='atomic_num',
    ef_keys='type',
    self_loops=True,
    amount=None   # Set to None to train on whole dataset
)
# One-hot encode the node features over the distinct values present in the data.
uniq_X = np.unique(X_complete)
X_complete = label_to_one_hot(X_complete, uniq_X)

In [None]:
# Draw 10000 random rows from the target table and pick the matching graphs.
# The row labels of `y` are used directly as indices into the *_complete
# arrays.
y = y_complete.sample(10000)
sampled_rows = list(y.index)
A = np.stack([A_complete[row_id] for row_id in sampled_rows], axis=0)
X = np.stack([X_complete[row_id] for row_id in sampled_rows], axis=0)
E = np.stack([E_complete[row_id] for row_id in sampled_rows], axis=0)

In [None]:
# Every column after the first is a regression task
# (the first column is presumably a molecule identifier — confirm).
tasks = list(y.columns)[1:]
num_tasks = len(tasks)
# One (num_samples, 1) target array per task.
y_list = [y[[task]].values for task in tasks]

key_to_index = dict(zip(tasks, range(len(tasks))))
# Raw mean/std per task, kept so that standardised values can be mapped back.
key_to_mean = dict()
key_to_std = dict()
for task in tasks:
    key_to_mean[task] = np.mean(y[[task]].values)
    key_to_std[task] = np.std(y[[task]].values)

# Transforms the output values to have mean 0 and variance 1.
# Each array is reshaped using its own width, not the width of the last task.
for i in range(len(y_list)):
    y_list[i] = StandardScaler().fit_transform(y_list[i]).reshape(-1, y_list[i].shape[-1])

In [None]:
# Hand-specified grouping of target names into five clusters.
# NOTE(review): presumably these are the column names in `tasks`, grouped for
# multitask parameter sharing — the code that consumes them is not shown here.
clusters = [['A', 'B', 'alpha'],
            ['C', 'r2', 'u0'],
            ['zpve', 'g298', 'cv'],
            ['lumo', 'u298', 'h298'],
            ['mu', 'homo']]

In [None]:
# Graph dimensions are read from the data; the remaining values are training settings.
N = X.shape[-2]           # Number of nodes in the graphs
F = X.shape[-1]           # Node features dimensionality
S = E.shape[-1]           # Edge features dimensionality
n_out = y_list[0].shape[-1]    # Dimensionality of the target
learning_rate = 1e-3      # Learning rate for the optimizer
epochs = 25               # Number of training epochs
batch_size = 64           # Batch size
es_patience = 5           # Patience for early stopping
# NOTE(review): `soft` / `soft_weight` are not used in the visible cells;
# presumably they toggle and weight soft parameter sharing — confirm.
soft = False
soft_weight = 0.1

In [None]:
# Split the graphs and all task targets in one call so rows stay aligned;
# 10% of the samples are held out for testing.
A_train, A_test, \
X_train, X_test, \
E_train, E_test, \
*y_train_test_list = train_test_split(A, X, E, *y_list, test_size = 0.1)

# train_test_split returns (train, test) pairs in input order, so the
# per-task targets alternate: even positions are train, odd are test.
y_train_list = y_train_test_list[::2]
y_test_list = y_train_test_list[1::2]

In [None]:
X_in = Input(shape=(N, F))
A_in = Input(shape=(N, N))
E_in = Input(shape=(N, N, S))