Code derived from a [notebook](https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/4.4-overfitting-and-underfitting.ipynb) by [François Chollet](https://github.com/fchollet)

In [0]:
!pip install -q tensorflow-gpu==2.0
%load_ext tensorboard

In [0]:
import os
import shutil

import tensorflow as tf
from tensorflow import keras

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Directory where the initial weights of every run are stored as
# `model_<run>.h5` files.
models_dir = './models'
# Prefix concatenated directly with the results CSV filename, so it has to end
# with a path separator.
results_dir = "./"

# Dataset loading and pre-processing

In [0]:
# Passed to `load_data` as `num_words`, and reused further down as the width of
# the multi-hot vectors built from each review.
NUM_WORDS = 10000

# IMDB reviews bundled with Keras, already split into train and test sets.
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=NUM_WORDS)

def multi_hot_sequences(sequences, dimension, dtype=np.float64):
  """Encodes sequences of integer indices as multi-hot row vectors.

  Args:
    sequences: A sized iterable whose elements are collections of integer
      column indices (e.g. the word indices of one review).
    dimension: The length of every output row; indices must be valid column
      positions for a row of that length.
    dtype: The dtype of the returned matrix. Defaults to float64, which is
      what `np.zeros` produces when no dtype is given; pass `np.float32` to
      avoid allocating a double-precision matrix that is cast right after.

  Returns:
    A numpy array of shape (len(sequences), dimension) where entry (i, j) is
    1 if index j occurs in sequences[i] and 0 otherwise.
  """
  results = np.zeros((len(sequences), dimension), dtype=dtype)
  for i, word_indices in enumerate(sequences):
    # Fancy indexing sets every listed column at once; repeated indices are
    # harmless because the value written is always the same.
    results[i, word_indices] = 1.0
  return results

# Labels become float32 column vectors of shape (n, 1).
y_train = np.expand_dims(y_train.astype('float32'), 1)
y_test = np.expand_dims(y_test.astype('float32'), 1)

# Hold out 20% of the training reviews for validation; the fixed random state
# keeps the split identical from one execution to the next.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=0)

# Multi-hot encode every split and store it as float32.
x_train = multi_hot_sequences(x_train, dimension=NUM_WORDS).astype('float32')
x_val = multi_hot_sequences(x_val, dimension=NUM_WORDS).astype('float32')
x_test = multi_hot_sequences(x_test, dimension=NUM_WORDS).astype('float32')

# Training pipeline: shuffle with a buffer of 1000 examples, then form
# mini-batches of 512.
ds_train = (tf.data.Dataset
            .from_tensor_slices((x_train, y_train))
            .shuffle(1000)
            .batch(512))

# Quick visual check of the first encoded training example.
plt.plot(x_train[0])

# Model architecture

In [0]:
def bigger_model():
  """Builds the fully connected classifier used in all experiments.

  Returns:
    A Keras Sequential model with two 512-unit ReLU layers followed by a
    single linear output unit (the output is a logit, no sigmoid is applied).
  """
  relu = tf.nn.relu
  layers = [
      # `input_shape` is only required here so that `.summary` works.
      keras.layers.Dense(512, activation=relu, input_shape=(NUM_WORDS,)),
      keras.layers.Dense(512, activation=relu),
      keras.layers.Dense(1),
  ]
  return keras.Sequential(layers)

# Creates and stores one set of initial weights per run, so that every
# combination of hyperparameters evaluated for a given run starts from exactly
# the same network. Nothing is regenerated when the directory already exists.
tf.random.set_seed(0)
if not os.path.exists(models_dir):
  os.mkdir(models_dir)
  for run_index in range(50):
    model = bigger_model()
    # One forward pass on a single validation example before saving.
    _ = model(x_val[:1, :])
    model.save_weights(f'{models_dir}/model_{run_index}.h5')
  del model

# Experiments with *Projection*, *Unweighted cosine*, *Weighted cosine* and *Orthogonal*

In [0]:
def censored_vector(u, v, mode):
  """Adjusts the auxiliary loss gradient

  Adjusts the auxiliary loss gradient before adding it to the primary loss
  gradient and using a gradient descent-based method

  Args:
    u: A tensorflow variable representing the auxiliary loss gradient
    v: A tensorflow variable representing the primary loss gradient
    mode: The method used for the adjustment:
      - Single task: the auxiliary loss gradient is ignored
      - Multitask: the auxiliary loss gradient is kept as it is
      - Unweighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Weighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Orthogonal: https://arxiv.org/abs/1801.07593
      - Projection: cf. ICML submission

  Returns:
    The adjusted auxiliary loss gradient. This is the Python scalar 0 when the
    auxiliary gradient is ignored or missing, `u` itself when it is kept
    unchanged (Multitask, missing primary gradient, or either gradient having
    zero norm), and a tensor with the shape of `u` otherwise.

  Raises:
    ValueError: If `mode` is not one of the methods listed above.
  """
  if mode == 'Single task' or u is None:
    return 0
  if mode == 'Multitask' or v is None:
    return u
  # Fail on a misspelled mode right away, rather than silently returning None
  # (or `u`, depending on the data) and breaking later in the caller.
  if mode not in ('Unweighted cosine',
                  'Weighted cosine',
                  'Projection',
                  'Orthogonal'):
    raise ValueError(f'Unknown mode: {mode!r}')

  # Dot product and norms are computed on the flattened gradients, so the same
  # code handles any rank (flattening a rank-1 tensor leaves it unchanged).
  flat_u, flat_v = tf.reshape(u, [-1]), tf.reshape(v, [-1])
  u_dot_v = tf.reduce_sum(flat_u*flat_v)
  l_u, l_v = tf.norm(flat_u), tf.norm(flat_v)
  # With a zero-norm gradient the divisions below are undefined: keep u as is.
  if l_u.numpy() == 0 or l_v.numpy() == 0:
    return u
  if mode == 'Unweighted cosine':
    # Keep u only when it does not point away from v.
    return u if u_dot_v > 0 else tf.zeros_like(u)
  if mode == 'Weighted cosine':
    # Scale u by the cosine of its angle with v, clipped at zero.
    return tf.maximum(u_dot_v, 0)*u/l_u/l_v
  if mode == 'Projection':
    # Remove the component of u along v only when that component is negative.
    return u - tf.minimum(u_dot_v, 0)*v/l_v/l_v
  # mode == 'Orthogonal': always remove the component of u along v.
  return u - u_dot_v*v/l_v/l_v

def combined_grads(primary_grad,
                   average_primary_grad,
                   auxiliary_grad,
                   mode,
                   overall=False,
                   lam=1):
  """Combines auxiliary loss gradients and primary loss gradients

  Adjusts a sequence of auxiliary loss gradients with respect to a sequence of
  primary loss gradients before performing a gradient descent step

  Args:
    primary_grad: A list of tensorflow variables corresponding to the primary
    loss gradient for the network's Keras variables
    average_primary_grad: A list of tensorflow variables corresponding to
    exponential moving averages of the elements above, or None. When given, it
    replaces `primary_grad` as the reference for the adjustment
    auxiliary_grad: A list of tensorflow variables corresponding to the
    auxiliary loss gradient for the network's Keras variables
    mode: The method used for the adjustment:
      - Single task: the auxiliary loss gradient is ignored
      - Multitask: the auxiliary loss gradient is kept as it is
      - Unweighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Weighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Orthogonal: https://arxiv.org/abs/1801.07593
      - Projection: cf. ICML submission
    overall: True if the transformation takes place at the level of the whole
    parameter vector, i.e. the concatenation of all the Keras variables of the
    network
    lam: Float balancing the primary loss and the auxiliary loss

  Returns:
    A list with one entry per element of `primary_grad`, holding the adjusted
    auxiliary loss gradient already multiplied by `lam`
  """
  if average_primary_grad is None:
    reference_grad = primary_grad
  else:
    reference_grad = average_primary_grad

  result = [0]*len(primary_grad)
  # Pieces of the whole-parameter-vector view, gathered only when `overall` is
  # set; `layout` remembers how to scatter the adjusted vector back.
  flat_reference, flat_auxiliary, layout = [], [], []
  for i, primary in enumerate(primary_grad):
    auxiliary = auxiliary_grad[i]
    if auxiliary is None or mode == 'Single task':
      result[i] = tf.zeros_like(primary)
    elif primary is None or mode == 'Multitask':
      result[i] = lam*auxiliary
    elif not overall:
      result[i] = lam*censored_vector(auxiliary, reference_grad[i], mode)
    else:
      flat_reference.append(tf.reshape(reference_grad[i], [-1]))
      flat_auxiliary.append(tf.reshape(auxiliary, [-1]))
      # `np.prod` returns a float for an empty shape list, hence the int().
      layout.append((i,
                     primary.shape,
                     int(np.prod(primary.shape.as_list()))))

  if layout:
    # Concatenate once rather than growing a tensor inside the loop.
    adjusted = lam*censored_vector(tf.concat(flat_auxiliary, axis=0),
                                   tf.concat(flat_reference, axis=0),
                                   mode)
    start = 0
    for index, shape, length in layout:
      result[index] = tf.reshape(adjusted[start:start+length], shape)
      start += length
  return result

def train_iteration(model,
                    average_primary_grad,
                    alpha,
                    optimizer,
                    writer,
                    mode,
                    step,
                    overall=False,
                    lam=1):
  """Trains the model for one epoch

  Iterates once over the module-level `ds_train` dataset. For each mini-batch
  the primary loss gradient is applied through `optimizer`; unless `mode` is
  'Single task', the adjusted auxiliary gradient is then subtracted from the
  weights with a plain step scaled by the optimizer's learning rate.

  Args:
    model: The Keras model being trained
    average_primary_grad: An exponential moving average of the main loss
      gradient for each variable, or None if it has not been started yet
    alpha: The factor for the exponential moving average; with alpha == 1 no
      average is maintained and the current gradient is used as reference
    optimizer: The optimizer being used
    writer: The writer collecting summaries
    mode: The method used for adjusting the auxiliary loss gradient:
      - Single task: the auxiliary loss gradient is ignored
      - Multitask: the auxiliary loss gradient is kept as it is
      - Unweighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Weighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Orthogonal: https://arxiv.org/abs/1801.07593
      - Projection: cf. ICML submission
    step: the number of mini-batches so far
    overall: A boolean indicating whether the previous method should be applied
      to the whole parameter vector
    lam: The weight of the auxiliary task

  Returns:
    A tuple (average_primary_grad, step): the updated exponential moving
    average of the main loss gradient for each variable (None if it was never
    started) and the number of mini-batches processed so far
  """
  with writer.as_default():
    for x1, y1 in ds_train:
      step += 1
      with tf.GradientTape() as tape:
        y1_hat = model(x1)
        primary_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=y1, logits=y1_hat))

      tf.summary.scalar('primary_loss', primary_loss, step)
      primary_grad = tape.gradient(primary_loss, model.variables)
      if mode == 'Single task':
        optimizer.apply_gradients(zip(primary_grad, model.variables))
        continue

      # The auxiliary "gradient" is the variables themselves, i.e. the gradient
      # of 0.5*sum(v**2) over all variables (an L2 penalty). An earlier variant
      # multiplied each variable by (rank - 1) so that rank-1 variables (the
      # biases) were excluded.
      auxiliary_grad = model.variables
      if alpha != 1:
        if average_primary_grad is None:
          # Variables without a gradient stay None, consistently with the
          # guard used in the update below.
          average_primary_grad = [None if g is None else alpha*g
                                  for g in primary_grad]
        else:
          for i, g in enumerate(primary_grad):
            if g is not None:
              average_primary_grad[i] = ((1 - alpha)*average_primary_grad[i]
                                         + alpha*g)

      # Must be evaluated before the optimizer step: the auxiliary gradient is
      # read from the variables' current values.
      censored_auxiliary_grad = combined_grads(primary_grad,
                                               average_primary_grad,
                                               auxiliary_grad,
                                               mode,
                                               overall=overall,
                                               lam=lam)
      optimizer.apply_gradients(zip(primary_grad, model.variables))
      learning_rate = optimizer.learning_rate
      model.set_weights([w - (learning_rate*g).numpy()
                         for w, g in zip(model.get_weights(),
                                         censored_auxiliary_grad)])

  return average_primary_grad, step

def get_metrics(dataset,
                model,
                writer,
                step):
  """Computes and logs the primary loss on a held-out split

  Args:
    dataset: 'val' to evaluate on the validation split; any other value
      selects the test split
    model: The Keras model being evaluated
    writer: The writer collecting summaries
    step: The step under which the summary is recorded

  Returns:
    The mean sigmoid cross-entropy over the selected split, as a numpy value
  """
  if dataset == 'val':
    features, labels = x_val, y_val
  else:
    features, labels = x_test, y_test

  with writer.as_default():
    logits = model(features)
    per_example = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                          logits=logits)
    primary_loss = tf.reduce_mean(per_example)
    tf.summary.scalar('primary_loss', primary_loss, step)
  return primary_loss.numpy()

In [0]:
def run_experiment(name, model, alpha, mode, overall, lam):
  """Trains the model until early stopping

  Training proceeds one epoch at a time and stops once the validation loss has
  failed to improve for 5 consecutive epochs. A checkpoint is written each
  time the validation loss improves, and the most recent one is restored
  before the final evaluation.

  Args:
    name: The name to be used for the Tensorboard log files
    model: The Keras model being trained
    alpha: The factor for the exponential moving average
    mode: The method used for adjusting the auxiliary loss gradient:
      - Single task: the auxiliary loss gradient is ignored
      - Multitask: the auxiliary loss gradient is kept as it is
      - Unweighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Weighted cosine: cf. https://arxiv.org/abs/1812.02224
      - Orthogonal: https://arxiv.org/abs/1801.07593
      - Projection: cf. ICML submission
    overall: A boolean indicating whether the previous method should be applied to the whole parameter vector
    lam: The weight of the auxiliary task

  Returns:
    The performance metrics on the test set for the best model on the validation set
  """
  train_writer = tf.summary.create_file_writer('./log/train/' + name,
                                               flush_millis=10000)
  val_writer = tf.summary.create_file_writer('./log/val/' + name,
                                             flush_millis=10000)
  test_writer = tf.summary.create_file_writer('./log/test/' + name,
                                              flush_millis=10000)
  optimizer = tf.keras.optimizers.Adam()
  # Start from an empty checkpoint directory so that the checkpoint restored
  # at the end can only come from this experiment.
  checkpoint_dir = 'checkpoint'
  shutil.rmtree(checkpoint_dir, ignore_errors=True)
  checkpoint_prefix = os.path.join(checkpoint_dir, 'model.ckpt')
  root = tf.train.Checkpoint(optimizer=optimizer, model=model)

  average_primary_grad = None
  # `np.inf` is the spelling available in every NumPy version (the `np.Inf`
  # alias was removed in NumPy 2.0).
  step, not_better, best_loss = 0, 0, np.inf
  while not_better < 5:
    average_primary_grad, step = train_iteration(model,
                                                 average_primary_grad,
                                                 alpha,
                                                 optimizer,
                                                 train_writer,
                                                 mode,
                                                 step,
                                                 overall=overall,
                                                 lam=lam)
    val_loss = get_metrics('val', model, val_writer, step)
    if val_loss < best_loss:
      not_better, best_loss = 0, val_loss
      root.save(file_prefix=checkpoint_prefix)
    else:
      not_better += 1

  # Checkpoints are only written on improvement, so the latest one is the best.
  root.restore(tf.train.latest_checkpoint(checkpoint_dir))
  return get_metrics('test', model, test_writer, step)

In [0]:
results_file = results_dir + "experiment-4.csv"

# The header is written without a trailing newline and every result row is
# later appended as '\n' + row, so the number of newline characters in the
# file equals the number of experiments already completed.
if not os.path.isfile(results_file):
    with open(results_file, 'w') as file:
        file.write('run,mode,overall,alpha,lam,loss')

# Index of the first experiment still to run. Counting newlines in Python
# gives the same number as `cat | wc -l`, without depending on a POSIX shell
# or on shell-escaping the file name.
with open(results_file) as file:
    i = file.read().count('\n')

def _is_valid_combination(mode, lam, alpha):
  """Tells whether a (mode, lam, alpha) combination should be run.

  - 'Single task' ignores the auxiliary loss, so it runs once per run, with
    lam == 0 and alpha == 1.
  - Every other mode needs a non-zero lam.
  - 'Multitask' does not use the moving average, so only alpha == 1 is kept.
  """
  if mode == 'Single task':
    return lam == 0 and alpha == 1
  if lam == 0:
    return False
  if mode == 'Multitask':
    return alpha == 1
  return True


# Every experiment to run, as (run, mode, overall, lam, alpha) tuples, in the
# order in which they are executed and written to the results file.
parameters = [(run, mode, overall, lam, alpha)
              for run in range(50)
              for lam in [0, 300, 100, 30, 10, 3, 1]
              for overall in [False]
              for mode in ['Single task',
                           'Multitask',
                           'Projection',
                           'Weighted cosine',
                           'Unweighted cosine',
                           'Orthogonal']
              for alpha in [0.01, 1]
              if _is_valid_combination(mode, lam, alpha)]

# Runs the remaining experiments, starting at index `i`, and appends one CSV
# row per finished experiment so that an interrupted session can be resumed.
while i < len(parameters):
  run, mode, overall, lam, alpha = parameters[i]
  name = f'{run}-{mode}-{overall}-{lam:f}-{alpha:f}'

  # Fresh network initialised with the weights stored for this run.
  model = bigger_model()
  _ = model(x_val[:1, :])
  model.load_weights(f'{models_dir}/model_{run}.h5')

  result = run_experiment(name, model, alpha, mode, overall, lam)
  row = f'{run},{mode},{overall},{alpha},{lam},{result!s}'
  with open(results_file, 'a') as file:
    file.write('\n' + row)
  print(row)
  i += 1