In [0]:
# Pin TensorFlow 1.14: the cells below rely on TF 1.x-only APIs
# (tf.contrib, tf.enable_eager_execution, tf.train.AdamOptimizer).
!pip install -q tensorflow-gpu==1.14

In [0]:
# Where the saved initial network weights live (one .h5 file per repetition)
model_dir = './models/'
# Prefix of the CSV file collecting the experiments' results
results_dir = './'

In [0]:
import os
import shutil

import numpy as np
np.random.seed(seed=0)

import tensorflow as tf
import tensorflow.contrib.eager as tfe

tf.enable_eager_execution()
tf.set_random_seed(0)

import pandas as pd
import sklearn
import time

from IPython.display import clear_output

# `tf.train.create_global_step` raises ValueError once the global step exists,
# so get-or-create is used instead: the cell can be re-run at any time and
# never depends on a `step_counter` name left over from a previous execution.
step_counter = tf.train.get_or_create_global_step()
step_counter.assign(0)

# Synthetic data generation

In [0]:
# Inputs: uniform samples in [0, 1) with 250 features per example. Every draw
# below consumes the global NumPy RNG seeded in the import cell, so the order of
# these statements determines the generated data; do not reorder them.
x_train1 = tf.constant(np.random.rand(1000, 250), dtype=tf.float32)
x_train2 = tf.constant(np.random.rand(1000, 250), dtype=tf.float32)
x_val = tf.constant(np.random.rand(1000, 250), dtype=tf.float32)
x_test = tf.constant(np.random.rand(10000, 250), dtype=tf.float32)

# Ground-truth weight matrices mapping 250 inputs to 100 outputs:
#   b        generates the primary-task targets (train1 / val / test)
#   other_b  is an independent draw, used for the 'Unrelated task' targets
#   noisy_b  is b plus Gaussian noise, used for the 'Similar task' targets
b = tf.constant(10*np.random.randn(250, 100), dtype=tf.float32)
other_b = tf.constant(10*np.random.randn(250, 100), dtype=tf.float32)
noisy_b = tf.constant(b + 3.5*np.random.randn(250, 100), dtype=tf.float32)

def synthetic_function(x, matrix):
  """Generates synthetic targets from inputs

  Args:
    x: The input tensor
    matrix: The weight tensor contracted with the last axis of `x`

  Returns:
    The element-wise tanh of the product of `x` and `matrix`
  """
  pre_activation = tf.tensordot(x, matrix, axes=1)
  return tf.tanh(pre_activation)

# Primary-task targets: every split is generated with the reference matrix `b`
y_test = synthetic_function(x_test, b)
y_val = synthetic_function(x_val, b)
y_train1 = synthetic_function(x_train1, b)

# Auxiliary-task targets: same inputs, three different generating matrices
y_train2_noisy = synthetic_function(x_train2, noisy_b)
y_train2_other = synthetic_function(x_train2, other_b)
y_train2_same = synthetic_function(x_train2, b)

# Training pipelines: full shuffle (buffer of 1000) then batches of 100.
# The shuffle calls are kept in the order train1, same, other, noisy.
ds_train1 = (tf.data.Dataset
             .from_tensor_slices((x_train1, y_train1))
             .shuffle(1000)
             .batch(100))
ds_train2_same = (tf.data.Dataset
                  .from_tensor_slices((x_train2, y_train2_same))
                  .shuffle(1000)
                  .batch(100))
ds_train2_other = (tf.data.Dataset
                   .from_tensor_slices((x_train2, y_train2_other))
                   .shuffle(1000)
                   .batch(100))
ds_train2_noisy = (tf.data.Dataset
                   .from_tensor_slices((x_train2, y_train2_noisy))
                   .shuffle(1000)
                   .batch(100))

# Model architecture

In [0]:
class SyntheticModel(tf.keras.Model):
  """Fully connected network with a shared trunk and two output heads

  The trunk is made of four ReLU layers of 100 units. Two linear layers of
  100 units read the trunk's output: one for the primary task and one for the
  auxiliary task.
  """

  def __init__(self):
    super().__init__()
    # Shared trunk
    self.dense1 = tf.keras.layers.Dense(100, activation='relu')
    self.dense2 = tf.keras.layers.Dense(100, activation='relu')
    self.dense3 = tf.keras.layers.Dense(100, activation='relu')
    self.dense4 = tf.keras.layers.Dense(100, activation='relu')
    # Task-specific linear heads
    self.dense5_primary = tf.keras.layers.Dense(100)
    self.dense5_auxiliary = tf.keras.layers.Dense(100)

  def call(self, x):
    """Returns the pair (primary head output, auxiliary head output)"""
    hidden = x
    for layer in (self.dense1, self.dense2, self.dense3, self.dense4):
      hidden = layer(hidden)
    primary_output = self.dense5_primary(hidden)
    auxiliary_output = self.dense5_auxiliary(hidden)
    return primary_output, auxiliary_output

In [0]:
# Initializes a series of networks to be used in the experiments, so that each
# combination of hyperparameters is tested with the same initial weights.
# Existence is checked per file rather than per directory, so that an
# interrupted run leaving `model_dir` partially filled is completed instead of
# failing later when the missing weights are loaded. Existing files are never
# overwritten.
os.makedirs(model_dir, exist_ok=True)
for i in range(20):
  weights_path = model_dir + 'model_%i.h5' % i
  if os.path.exists(weights_path):
    continue
  model = SyntheticModel()
  # One forward pass builds the layers so that there are weights to save
  _ = model(x_val[:1, :])
  model.save_weights(weights_path)
  del model

# Experiments with *Projection*, *Unweighted cosine*, *Weighted cosine* and *Orthogonal*

In [0]:
def censored_vector(u, v, mode):
  """Adjusts the auxiliary loss gradient

  Adjusts the auxiliary loss gradient before adding it to the primary loss
  gradient and using a gradient descent-based method

  Args:
    u: A tensor holding the auxiliary loss gradient (or None)
    v: A tensor holding the primary loss gradient (or None)
    mode: The method used for the adjustment:
      - Single task: the auxiliary loss gradient is ignored
      - Multitask: the auxiliary loss gradient is kept as it is
      - Unweighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Weighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Orthogonal: https://arxiv.org/abs/1801.07593
      - Projection: cf. ICML submission

  Returns:
    The adjusted auxiliary loss gradient, with the same shape as `u`. The
    scalar 0 is returned when `u` is None or `mode` is 'Single task'; `u` is
    returned unchanged when `v` is None, when `mode` is 'Multitask' or when
    either gradient has zero norm.

  Raises:
    ValueError: If `mode` is not one of the methods listed above and none of
      the early exits applies.
  """
  if mode == 'Single task' or u is None:
    return 0
  if mode == 'Multitask' or v is None:
    return u

  # Flattening gives the same dot product and norms whatever the rank is
  flat_u, flat_v = tf.reshape(u, [-1]), tf.reshape(v, [-1])
  u_dot_v = tf.reduce_sum(flat_u*flat_v)
  l_u, l_v = tf.norm(flat_u), tf.norm(flat_v)

  # The directions below are undefined for a null vector: keep u untouched
  if l_u.numpy() == 0 or l_v.numpy() == 0:
    return u

  if mode == 'Unweighted cosine':
    # Keep u only when it does not point away from v
    return u if u_dot_v > 0 else tf.zeros_like(u)
  if mode == 'Weighted cosine':
    # Scale u by the cosine similarity, clipped at zero
    return tf.maximum(u_dot_v, 0)*u/l_u/l_v
  if mode == 'Orthogonal':
    # Remove the whole component of u along v
    return u - u_dot_v*v/l_v/l_v
  if mode == 'Projection':
    # Remove the component of u along v only when it opposes v
    return u - tf.minimum(u_dot_v, 0)*v/l_v/l_v
  raise ValueError('Unknown mode: %r' % (mode,))

def combined_grads(primary_grad,
                   average_primary_grad,
                   auxiliary_grad,
                   mode,
                   overall=False,
                   lam=1):
  """Combines auxiliary loss gradients and primary loss gradients

  Combines a sequence of auxiliary loss gradients and a sequence of primary
  loss gradients before performing a gradient descent step

  Args:
    primary_grad: A list of tensors (or None entries) corresponding to the
    primary loss gradient for the network's Keras variables
    average_primary_grad: A list of tensors corresponding to exponential
    moving averages of the elements above, or None to use `primary_grad`
    itself as the reference direction
    auxiliary_grad: A list of tensors (or None entries) corresponding to the
    auxiliary loss gradient for the network's Keras variables
    mode: The method used for the adjustment:
      - Single task: the auxiliary loss gradient is ignored
      - Multitask: the auxiliary loss gradient is kept as it is
      - Unweighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Weighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Orthogonal: https://arxiv.org/abs/1801.07593
      - Projection: cf. ICML submission
    overall: True if the transformation takes place at the level of the whole
    parameter vector, i.e. the concatenation of all the Keras variables of the
    network
    lam: Float balancing the primary loss and the auxiliary loss

  Returns:
    A list with one entry per variable, combining the primary loss gradients
    and the auxiliary loss gradients, that can directly be used for the next
    gradient descent step
  """
  # Direction the auxiliary gradient is compared against
  if average_primary_grad is None:
    reference_grad = primary_grad
  else:
    reference_grad = average_primary_grad

  result = [0]*len(primary_grad)
  # Flattened pieces of the variables handled jointly in `overall` mode; they
  # are concatenated once after the loop rather than once per variable
  flat_primary, flat_reference, flat_auxiliary = [], [], []
  layout = []
  for i in range(len(primary_grad)):
    if auxiliary_grad[i] is None or mode == 'Single task':
      result[i] = primary_grad[i]
    elif primary_grad[i] is None:
      result[i] = lam*auxiliary_grad[i]
    elif mode == 'Multitask':
      result[i] = primary_grad[i] + lam*auxiliary_grad[i]
    elif not overall:
      result[i] = (primary_grad[i]
                   + lam*censored_vector(auxiliary_grad[i],
                                         reference_grad[i],
                                         mode))
    else:
      flat_primary.append(tf.reshape(primary_grad[i], [-1]))
      if average_primary_grad is not None:
        flat_reference.append(tf.reshape(average_primary_grad[i], [-1]))
      flat_auxiliary.append(tf.reshape(auxiliary_grad[i], [-1]))
      # int() keeps the slice bounds integral, even for a scalar variable
      # whose empty shape has a floating-point product
      layout.append((primary_grad[i].shape,
                     int(np.prod(primary_grad[i].shape.as_list())),
                     i))

  if layout:
    primary_vector = tf.concat(flat_primary, axis=0)
    auxiliary_vector = tf.concat(flat_auxiliary, axis=0)
    if average_primary_grad is None:
      reference_vector = primary_vector
    else:
      reference_vector = tf.concat(flat_reference, axis=0)
    combined = primary_vector + lam*censored_vector(auxiliary_vector,
                                                    reference_vector,
                                                    mode)
    # Cut the joint vector back into per-variable tensors
    start = 0
    for shape, length, index in layout:
      result[index] = tf.reshape(combined[start:start+length], shape)
      start += length
  return result

def train_iteration(model,
                    average_primary_grad,
                    alpha,
                    optimizer,
                    ds_train2,
                    writer,
                    step_counter,
                    mode,
                    overall=False,
                    lam=1,
                    ds_primary=None):

  """Trains the model for one epoch

  One optimizer step is taken per batch of the primary dataset. Unless `mode`
  is 'Single task', one batch of `ds_train2` is drawn for each primary batch,
  so `ds_train2` must provide at least as many batches as the primary dataset.

  Args:
    model: The Keras model being trained
    average_primary_grad: An exponential moving average of the main loss gradient for each variable
    alpha: The factor for the exponential moving average (1 disables the average)
    optimizer: The optimizer being used
    ds_train2: The dataset used for the auxiliary task
    writer: The writer collecting summaries
    step_counter: The global counter used by the optimizer
    mode: The method used for adjusting the auxiliary loss gradient:
      - Single task: the auxiliary loss gradient is ignored
      - Multitask: the auxiliary loss gradient is kept as it is
      - Unweighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Weighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Orthogonal: https://arxiv.org/abs/1801.07593
      - Projection: cf. ICML submission
    overall: A boolean indicating whether the previous method should be applied to the whole parameter vector
    lam: The weight of the auxiliary task
    ds_primary: The dataset used for the primary task; defaults to the
      notebook-level `ds_train1`

  Returns:
    The updated value of the exponential moving average of the main loss gradient for each variable
  """

  if ds_primary is None:
    ds_primary = ds_train1
  use_auxiliary = mode != 'Single task'
  if use_auxiliary:
    train_iterator2 = ds_train2.make_one_shot_iterator()

  with writer.as_default(), tf.contrib.summary.always_record_summaries():
    for x1, y1 in ds_primary.make_one_shot_iterator():
      if use_auxiliary:
        x2, y2 = train_iterator2.get_next()

      # The tape is persistent because up to two gradients are taken from it
      with tf.GradientTape(persistent=True) as tape:
        y1_hat = model(x1)[0]
        primary_loss = tf.reduce_mean((y1_hat-y1)**2)
        if use_auxiliary:
          y2_hat = model(x2)[1]
          auxiliary_loss = tf.reduce_mean((y2_hat-y2)**2)

      tf.contrib.summary.scalar('primary_loss', primary_loss)
      primary_grad = tape.gradient(primary_loss, model.variables)
      if not use_auxiliary:
        # A persistent tape holds its resources until it is dropped
        del tape
        optimizer.apply_gradients(zip(primary_grad, model.variables),
                                  global_step=step_counter)
        continue

      tf.contrib.summary.scalar('auxiliary_loss', auxiliary_loss)
      auxiliary_grad = tape.gradient(auxiliary_loss, model.variables)
      del tape

      if alpha != 1:
        if average_primary_grad is None:
          # Copy, so that the average never aliases the current gradient list
          average_primary_grad = list(primary_grad)
        else:
          # Build a new list instead of mutating the caller's one in place;
          # entries without a primary gradient keep their previous value
          average_primary_grad = [
              average if current is None
              else (1 - alpha)*average + alpha*current
              for average, current in zip(average_primary_grad, primary_grad)]

      grad = combined_grads(primary_grad,
                            average_primary_grad,
                            auxiliary_grad,
                            mode,
                            overall=overall,
                            lam=lam)
      optimizer.apply_gradients(zip(grad, model.variables),
                                global_step=step_counter)
  return average_primary_grad

def get_metrics(dataset,
                model,
                writer,
                step_counter):
  """Computes the primary loss on the validation set or on the test set

  Args:
    dataset: 'val' for the validation set; any other value selects the test set
    model: The Keras model being evaluated
    writer: The writer collecting summaries
    step_counter: The global counter (not read inside this function)

  Returns:
    The mean squared error of the primary output, as a NumPy scalar
  """
  if dataset == 'val':
    inputs, targets = x_val, y_val
  else:
    inputs, targets = x_test, y_test

  with writer.as_default(), tf.contrib.summary.always_record_summaries():
    predictions = model(inputs)[0]
    primary_loss = tf.reduce_mean((predictions-targets)**2)
    tf.contrib.summary.scalar('primary_loss', primary_loss)
  return primary_loss.numpy()

In [0]:
def run_experiment(name,
                   model,
                   alpha,
                   ds_train2,
                   mode,
                   overall,
                   lam,
                   output,
                   patience=10):
  """Trains the model until early stopping

  The model is trained epoch by epoch and checkpointed each time the
  validation loss improves. Training stops after `patience` consecutive epochs
  without improvement, and the best checkpoint is restored before the final
  evaluation on the test set. The notebook-level global step is reset to 0 and
  the 'model_synthetic' checkpoint directory is wiped at the start.

  Args:
    name: The name to be used for the Tensorboard log files
    model: The Keras model being trained
    alpha: The factor for the exponential moving average
    ds_train2: The dataset used for the auxiliary task
    mode: The method used for adjusting the auxiliary loss gradient:
      - Single task: the auxiliary loss gradient is ignored
      - Multitask: the auxiliary loss gradient is kept as it is
      - Unweighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Weighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Orthogonal: https://arxiv.org/abs/1801.07593
      - Projection: cf. ICML submission
    overall: A boolean indicating whether the previous method should be applied to the whole parameter vector
    lam: The weight of the auxiliary task
    output: The current output printed for the user during training
    patience: The number of consecutive epochs without improvement of the
      validation loss after which training stops

  Returns:
    A pair made of the model restored from its best checkpoint on the
    validation set and of its primary loss on the test set
  """
  train_writer = tf.contrib.summary.create_file_writer('./log/train/' + name,
                                                       flush_millis=10000)
  val_writer = tf.contrib.summary.create_file_writer('./log/val/' + name,
                                                     flush_millis=10000)
  test_writer = tf.contrib.summary.create_file_writer('./log/test/' + name,
                                                      flush_millis=10000)
  step_counter.assign(0)
  optimizer = tf.train.AdamOptimizer()
  # Start from an empty checkpoint directory so that the checkpoint restored
  # at the end can only come from this experiment
  checkpoint_dir = 'model_synthetic'
  shutil.rmtree(checkpoint_dir, ignore_errors=True)
  checkpoint_prefix = os.path.join(checkpoint_dir, 'model.ckpt')
  root = tf.contrib.eager.Checkpoint(optimizer=optimizer,
                                     model=model,
                                     optimizer_step=step_counter)

  average_primary_grad = None
  iteration, not_better, best_loss = 1, 0, np.inf
  while not_better < patience:
    average_primary_grad = train_iteration(model,
                                           average_primary_grad,
                                           alpha,
                                           optimizer,
                                           ds_train2,
                                           train_writer,
                                           step_counter,
                                           mode,
                                           overall=overall,
                                           lam=lam)
    val_loss = get_metrics('val', model, val_writer, step_counter)
    clear_output()
    print(output)
    print(iteration, val_loss)
    if val_loss < best_loss:
      not_better, best_loss = 0, val_loss
      root.save(file_prefix=checkpoint_prefix)
    else:
      not_better += 1
    iteration += 1

  root.restore(tf.train.latest_checkpoint(checkpoint_dir))
  return model, get_metrics('test', model, test_writer, step_counter)

In [0]:
def run_experiments(configs, filename):
  """Trains models for a series of configurations

  The results file doubles as a progress record: it starts with a header
  without trailing newline and every finished experiment appends a newline
  followed by one CSV row. The number of newlines in the file is therefore the
  number of configurations already processed, and those are skipped, which
  lets an interrupted run resume where it stopped.

  Args:
    configs: The combinations of hyper-parameters to use in the experiments,
      as (iteration, mode, overall, lam, case, alpha) tuples
    filename: The name of the file used for recording the experiments' results

  Returns:
    Nothing

  Raises:
    ValueError: If the case of a configuration is not 'Same task',
      'Similar task' or 'Unrelated task'
  """
  # Counted in pure Python: no shell is needed and any filename is supported
  if os.path.isfile(filename):
    with open(filename, 'rb') as results_file:
      already_done = results_file.read().count(b'\n')
  else:
    already_done = 0

  output = ''

  for iteration, mode, overall, lam, case, alpha in configs[already_done:]:

    start = time.time()

    if case == 'Same task':
      ds_train2 = ds_train2_same
    elif case == 'Similar task':
      ds_train2 = ds_train2_noisy
    elif case == 'Unrelated task':
      ds_train2 = ds_train2_other
    else:
      # Fail loudly rather than reuse the dataset of a previous configuration
      raise ValueError('Unknown case: %r' % (case,))

    # Every configuration of a given repetition starts from the same weights
    model = SyntheticModel()
    _ = model(x_val[:1, :])
    model.load_weights(model_dir + 'model_%i.h5' % iteration)

    output += 'Iteration #%d: %s, %s (overall: %s, %f)\n' % (iteration,
                                                             mode,
                                                             case,
                                                             overall,
                                                             lam)
    name = '%s-%s-%s-%s-%f-%f' % (iteration, mode, case, overall, lam, alpha)
    test_loss = run_experiment(name,
                               model,
                               alpha,
                               ds_train2,
                               mode,
                               overall,
                               lam,
                               output)[1]
    template = 'Loss: %.3f (%d seconds)\n\n'
    output += template % (test_loss, time.time()-start)
    line = '%s,%s,%s,%f,%f,%f' % (case,
                                  mode,
                                  overall,
                                  alpha,
                                  lam,
                                  test_loss)
    # The leading newline keeps the newline count equal to the number of rows
    with open(filename, 'a') as file:
      file.write(f'\n{line}')

In [0]:
iterations = 20

filename = results_dir + 'experiment-2.csv'
header = 'situation,mode,overall,alpha,lam,test_loss'

# The header is written without a trailing newline: each result row is later
# appended with its own leading newline
if not os.path.isfile(filename):
  with open(filename, 'w') as file:
    file.write(header)

# One entry per family of experiments, listing the values explored for
# (lam, overall, mode, case, alpha). The order of the families and of the
# values is significant, since resuming skips the first rows of `configs`.
grids = [
    # Baseline: primary task only
    ([0],
     [True],
     ['Single task'],
     ['Same task'],
     [1]),
    # Plain multitask learning
    ([.1, .3, 1, 3, 10],
     [True],
     ['Multitask'],
     ['Same task', 'Similar task', 'Unrelated task'],
     [1]),
    # Projection
    ([.1, .3, 1, 3, 10],
     [True, False],
     ['Projection'],
     ['Same task', 'Similar task', 'Unrelated task'],
     [0.01, 0.1, 1]),
    # Other gradient adjustment methods
    ([.1, .3, 1, 3, 10],
     [True, False],
     ['Weighted cosine', 'Unweighted cosine', 'Orthogonal'],
     ['Same task', 'Similar task', 'Unrelated task'],
     [0.01, 0.1, 1]),
]

configs = []
for lams, overalls, modes, cases, alphas in grids:
  configs.extend((iteration, mode, overall, lam, case, alpha)
                 for iteration in range(iterations)
                 for lam in lams
                 for overall in overalls
                 for mode in modes
                 for case in cases
                 for alpha in alphas)

run_experiments(configs, filename)