<a href="https://colab.research.google.com/github/zosimanoz/head_pose_metalearning/blob/master/maml_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive/ (Colab-only helper) so later cells can
# work with files stored there.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import os
# Switch the working directory to the project folder on the mounted Drive.
# Relative paths used later in the notebook (e.g. 'trained_models/embeddings_full.npz'
# and CHECKPOINT_PATH) are written relative to this folder.
# NOTE(review): this is a hard-coded absolute path; adjust it when running outside this Drive layout.
os.chdir("/content/drive/My Drive/hpemaml/")

In [None]:
import json
import os
import random
import urllib.request
from collections import defaultdict
from copy import deepcopy
from statistics import mean, stdev
from urllib.error import HTTPError

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchvision
from IPython.display import set_matplotlib_formats
from PIL import Image
from torchvision import transforms
from torchvision.datasets import CIFAR100, SVHN
from tqdm.auto import tqdm

In [None]:
# PyTorch Lightning

try:
    import pytorch_lightning as pl
except ModuleNotFoundError: # Google Colab does not have PyTorch Lightning installed by default. Hence, we do it here if necessary
    !pip install pytorch-lightning=1.4.5
    import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

# Import tensorboard
%load_ext tensorboard

pl.__version__

'1.4.5'

In [None]:
# Root directory for Lightning output and for the optional pretrained "<ModelClass>.ckpt"
# file looked up by train_model. The path is relative, so it is resolved against the
# current working directory of whichever process uses it.
CHECKPOINT_PATH = "../final_maml_head_pose_checkpoint"

In [None]:
class FewShotBatchSampler:
    """Batch sampler that yields index lists forming N-way / K-shot episodes.

    Each yielded batch contains ``N_way * K_shot`` dataset indices, where
    ``K_shot`` is doubled when ``include_query`` is set so that one batch holds
    both the support and the query half of an episode.
    """

    def __init__(self, dataset_targets, N_way, K_shot, include_query=False, shuffle=True, shuffle_once=False):
        """
        Inputs
            dataset_targets - Class label of every dataset element (anything torch.tensor accepts)
            N_way - Number of class slots per batch
            K_shot - Number of examples per class slot (doubled if include_query is set)
            include_query - If True, every batch carries a support half and a query half
            shuffle - If True, examples and the class order are reshuffled on every iteration
            shuffle_once - If True, examples and the class order are shuffled once at construction
        """
        super().__init__()
        self.dataset_targets = torch.tensor(dataset_targets)
        self.N_way = N_way
        self.include_query = include_query
        self.shuffle = shuffle
        # With a query set, every class slot must deliver twice as many examples.
        self.K_shot = K_shot * 2 if self.include_query else K_shot
        self.batch_size = self.N_way * self.K_shot  # Total number of indices per batch

        # Group the dataset indices by class label.
        self.classes = torch.unique(self.dataset_targets).tolist()
        self.num_classes = len(self.classes)
        self.indices_per_class = {
            label: torch.where(self.dataset_targets == label)[0] for label in self.classes
        }
        # How many full K-shot groups each class can provide.
        self.batches_per_class = {
            label: self.indices_per_class[label].shape[0] // self.K_shot for label in self.classes
        }

        # One entry per available K-shot group; batches consume N_way consecutive entries.
        self.iterations = sum(self.batches_per_class.values()) // self.N_way
        self.class_list = []
        for label in self.classes:
            self.class_list.extend([label] * self.batches_per_class[label])

        if shuffle_once or self.shuffle:
            self.shuffle_data()
        else:
            # Deterministic order (used for evaluation): interleave the classes
            # round-robin instead of shuffling them.
            interleave_keys = []
            for position, label in enumerate(self.classes):
                for repeat in range(self.batches_per_class[label]):
                    interleave_keys.append(position + repeat * self.num_classes)
            self.class_list = np.array(self.class_list)[np.argsort(interleave_keys)].tolist()

    def shuffle_data(self):
        """Permute the examples inside every class, then shuffle the class order.

        The class shuffle does not prevent one class from filling several slots
        of the same batch.
        """
        for label in self.classes:
            order = torch.randperm(self.indices_per_class[label].shape[0])
            self.indices_per_class[label] = self.indices_per_class[label][order]
        random.shuffle(self.class_list)

    def __iter__(self):
        """Yield ``self.iterations`` lists of dataset indices, one list per episode."""
        if self.shuffle:
            self.shuffle_data()

        cursor = defaultdict(int)  # Next unused position inside each class' index tensor
        for step in range(self.iterations):
            first_slot = step * self.N_way
            chosen_classes = self.class_list[first_slot : first_slot + self.N_way]
            picked = []
            for label in chosen_classes:
                begin = cursor[label]
                picked.extend(self.indices_per_class[label][begin : begin + self.K_shot])
                cursor[label] = begin + self.K_shot
            if self.include_query:
                # Even positions first, odd positions second, so the batch can be
                # split into two halves with a single chunk(2).
                picked = picked[::2] + picked[1::2]
            yield picked

    def __len__(self):
        """Number of batches produced per iteration."""
        return self.iterations

In [None]:
class TaskBatchSampler(object):
    """Groups several few-shot episodes into one meta-batch of tasks.

    Wraps a FewShotBatchSampler and concatenates ``batch_size`` consecutive
    episodes into a single flat index list; the matching collate function
    splits the loaded data back into one chunk per task.
    """

    def __init__(self, dataset_targets, batch_size, N_way, K_shot, include_query=False, shuffle=True):
        """
        Inputs
            dataset_targets - Class label of every dataset element
            batch_size - Number of tasks (episodes) per meta-batch
            N_way, K_shot, include_query, shuffle - Forwarded to FewShotBatchSampler
        """
        super().__init__()
        self.batch_sampler = FewShotBatchSampler(dataset_targets, N_way, K_shot, include_query, shuffle)
        self.task_batch_size = batch_size
        self.local_batch_size = self.batch_sampler.batch_size

    def __iter__(self):
        """Yield flat index lists, each covering ``task_batch_size`` episodes.

        Episodes left over at the end that do not fill a complete meta-batch
        are not yielded.
        """
        pending = []
        for episode_count, episode_indices in enumerate(self.batch_sampler, start=1):
            pending.extend(episode_indices)
            if episode_count % self.task_batch_size == 0:
                yield pending
                pending = []

    def __len__(self):
        """Number of complete meta-batches per iteration."""
        return len(self.batch_sampler) // self.task_batch_size

    def get_collate_fn(self):
        """Return a collate function that turns loaded items into per-task tensors.

        Every item is unpacked as ``(img, gt, folder, target)``: ``img`` and ``gt``
        are NumPy arrays and ``target`` is a whitespace-separated integer string.
        The result is a list with one ``(imgs, targets, yprAnglesGt)`` tuple per task.
        """
        def collate_fn(item_list):
            def stack_field(to_array):
                # Convert one field of every item to a tensor and stack along dim 0.
                return torch.stack(
                    [torch.from_numpy(to_array(img, gt, target)) for img, gt, folder, target in item_list],
                    dim=0,
                )

            imgs = stack_field(lambda img, gt, target: img)
            targets = stack_field(lambda img, gt, target: np.fromstring(target, dtype=int, sep=' '))
            yprAnglesGt = stack_field(lambda img, gt, target: gt)

            # Cut the flat meta-batch back into one chunk per task.
            n_tasks = self.task_batch_size
            per_task = zip(
                imgs.chunk(n_tasks, dim=0),
                targets.chunk(n_tasks, dim=0),
                yprAnglesGt.chunk(n_tasks, dim=0),
            )
            return list(per_task)
        return collate_fn

In [None]:
filename = 'trained_models/embeddings_full.npz'
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects; only load
# archives from a trusted source.
# The context manager closes the archive once 'arr_0' has been read.
with np.load(filename, allow_pickle=True) as n_embeddings:
    arr = n_embeddings['arr_0']

# Two-digit class ids '01' .. '20'.
classes = [str(i).zfill(2) for i in range(1,21)]
known_classes = set(classes)

# Keep (field 0, field 1, field 2, class id) for every row whose class id is known.
# Characters 13-14 of the third field carry the class id
# (presumably the person folder inside the sample path -- TODO confirm).
records = []
skipped_rows = 0
for row in arr:
    class_id = row[2][13:15]
    if class_id not in known_classes:
        # Previously such a row silently re-appended the record of the preceding
        # row (or raised NameError when it was the very first row).
        skipped_rows += 1
        continue
    records.append((row[0], row[1], row[2], class_id))

if skipped_rows:
    print(f"Skipped {skipped_rows} rows whose class id is not in `classes`")

# Fill an object array cell by cell: the four fields have different shapes, and
# letting np.array() infer a ragged layout is rejected by newer NumPy versions.
n_arr = np.empty((len(records), 4), dtype=object)
for row_idx, record in enumerate(records):
    for col_idx, value in enumerate(record):
        n_arr[row_idx, col_idx] = value

  # Remove the CWD from sys.path while we load stuff.


In [None]:
# Zero-padded two-digit class ids '01' .. '20'.
classes = [str(i).zfill(2) for i in range(1,21)]

In [None]:
# Split by class id (column 3 of every row): ids up to 15 go to the training
# split, the remaining ids go to the validation split.
train_set = [row for row in n_arr if int(row[3]) <= 15]
val_set = [row for row in n_arr if int(row[3]) > 15]

train_set_n = np.array(train_set)
val_set_n = np.array(val_set)

print(train_set_n.shape)
print(val_set_n.shape)

# Integer class label of every sample, in the same order as the split itself;
# these lists are what the batch samplers receive as dataset targets.
train_targets = [int(row[3]) for row in train_set_n]
val_targets = [int(row[3]) for row in val_set_n]

(10581, 4)
(2638, 4)


In [None]:
# Episode layout shared by the training and the validation loader
N_WAY = 5
K_SHOT = 1


def _make_maml_loader(samples, targets, task_batch_size, shuffle):
    """Build a TaskBatchSampler and the DataLoader that consumes it."""
    sampler = TaskBatchSampler(targets,
                               batch_size=task_batch_size,
                               N_way=N_WAY,
                               K_shot=K_SHOT,
                               include_query=True,
                               shuffle=shuffle)
    loader = data.DataLoader(samples,
                             batch_sampler=sampler,
                             collate_fn=sampler.get_collate_fn(),
                             num_workers=2)
    return sampler, loader


# Training set: 16 tasks per meta-batch, reshuffled on every pass
train_maml_sampler, train_maml_loader = _make_maml_loader(train_set, train_targets, 16, True)

# Validation set: no parameter update happens here, so one task per batch is
# enough; the order stays fixed
val_maml_sampler, val_maml_loader = _make_maml_loader(val_set, val_targets, 1, False)

In [None]:
import torch
import torch.nn as nn

class NetSimple(nn.Module):
    """Fully connected regressor: four hidden stacks followed by a linear output.

    Every hidden stack is Linear -> Dropout(0.25) -> LeakyReLU(0.1) -> BatchNorm1d.
    The sub-module names (linear1 .. linear4, output) and the layer order inside
    each stack are kept stable so that existing state_dict keys still match.
    """

    def __init__(self, num_feature=200, num_hidden=1000, num_output=3):
        """
        Inputs
            num_feature - Size of the input feature vector
            num_hidden - Width of the first hidden stack
            num_output - Number of regression outputs
        """
        super(NetSimple, self).__init__()

        self.num_feature = num_feature
        self.num_hidden = num_hidden
        self.num_output = num_output

        # Only the first stack follows num_hidden; the later stacks are fixed at
        # 1000 units, exactly as before.
        self.linear1 = self._hidden_block(self.num_feature, self.num_hidden)
        self.linear2 = self._hidden_block(self.num_hidden, 1000)
        self.linear3 = self._hidden_block(1000, 1000)
        self.linear4 = self._hidden_block(1000, 1000)

        self.output = nn.Sequential(
            torch.nn.Linear(1000, self.num_output),
        )

    @staticmethod
    def _hidden_block(in_features, out_features):
        """Build one Linear -> Dropout -> LeakyReLU -> BatchNorm1d stack."""
        return nn.Sequential(
            torch.nn.Linear(in_features, out_features),
            torch.nn.Dropout(p=0.25),
            torch.nn.LeakyReLU(0.1),
            torch.nn.BatchNorm1d(out_features, eps=1e-05, momentum=0.01, affine=True)
        )

    def forward(self, x):
        """Map a (batch, num_feature) tensor to a (batch, num_output) tensor."""
        x = self.linear1(x)
        x = self.linear2(x)
        x = self.linear3(x)
        x = self.linear4(x)
        x = self.output(x)
        return x

In [None]:
def split_batch(imgs, targets, yprAngles):
    """Split each tensor along dim 0 into a support half and a query half.

    Returns the 6-tuple
    (support_imgs, query_imgs, support_targets, query_targets, support_gt, query_gt).
    """
    halves = []
    for tensor in (imgs, targets, yprAngles):
        support_part, query_part = tensor.chunk(2, dim=0)
        halves.extend((support_part, query_part))
    return tuple(halves)

In [None]:
class MAMLNetwork(pl.LightningModule):
    """First-order MAML training wrapper around NetSimple.

    For every task the base model is copied, the copy is fine-tuned on the
    support set (inner loop), and the gradient of the query loss with respect
    to the fine-tuned copy is used as the gradient of the base model
    (first-order approximation).
    """

    def __init__(self, config):
        """
        Inputs
            config - Dict with the keys
                lr - Learning rate of the outer-loop SGD optimizer
                lr_inner - Learning rate of the inner-loop SGD optimizer
                num_inner_steps - Number of inner-loop updates to perform
        """
        super().__init__()
        self.save_hyperparameters()
        self.model = NetSimple()
        self.lr = config["lr"]
        self.lr_inner = config["lr_inner"]
        self.num_inner_steps = config["num_inner_steps"]

        self.mse_criterion = nn.MSELoss()
        self.mae_criterion = nn.L1Loss()
        self.training_losses = []
        self.val_losses = []

    def configure_optimizers(self):
        """Outer-loop SGD optimizer with a step-wise learning-rate decay."""
        optimizer = optim.SGD(self.parameters(), lr=self.lr)
        # NOTE(review): the milestones only take effect for runs longer than 140 epochs.
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[140,180], gamma=0.1)
        return [optimizer], [scheduler]

    def run_model(self, local_model, imgs, support_gt):
        """Run ``local_model`` on ``imgs`` and compute the L1 losses against ``support_gt``.

        Returns (loss, preds, loss_y, loss_p, loss_r): ``loss`` is the mean absolute
        error over all outputs, the remaining three are the mean absolute errors of
        output columns 0, 1 and 2.
        """
        preds = local_model(imgs)
        preds_f = preds.float()
        gt_f = support_gt.float()
        loss = self.mae_criterion(preds_f, gt_f)
        # Per-output losses over the whole batch. They used to look at sample 0 only,
        # which made the logged per-angle curves depend on a single example.
        # Assumes preds and support_gt are both (batch, 3) -- TODO confirm for support_gt.
        loss_y = self.mae_criterion(preds_f[:, 0], gt_f[:, 0])
        loss_p = self.mae_criterion(preds_f[:, 1], gt_f[:, 1])
        loss_r = self.mae_criterion(preds_f[:, 2], gt_f[:, 2])
        return loss, preds, loss_y, loss_p, loss_r

    def predict_single(self, imgs, support_gt):
        """Fine-tune a copy of the base model on (imgs, support_gt) and return predictions.

        The returned predictions come from the forward pass of the last
        fine-tuning step, i.e. before that step's parameter update is applied.
        """
        local_model = deepcopy(self.model)
        local_model.train()
        local_optim = optim.SGD(local_model.parameters(), lr=self.lr_inner)
        local_optim.zero_grad()
        # NOTE(review): the step count is fixed at 5 here, whereas adapt_few_shot
        # uses self.num_inner_steps -- confirm that this is intended.
        for _ in range(5):
            loss, pred, loss_y, loss_p, loss_r = self.run_model(local_model, imgs, support_gt)
            loss.backward()
            local_optim.step()
            # Reset gradients
            local_optim.zero_grad()

        return pred

    def adapt_few_shot(self, support_imgs, support_targets, support_gt):
        """Inner loop: return a copy of the base model fine-tuned on the support set.

        ``support_targets`` is accepted for interface compatibility but not used.
        """
        # Create inner-loop model and optimizer
        local_model = deepcopy(self.model)
        local_model.train()
        local_optim = optim.SGD(local_model.parameters(), lr=self.lr_inner)
        local_optim.zero_grad()

        # Optimize inner loop model on support set
        for _ in range(self.num_inner_steps):
            loss, pred, loss_y, loss_p, loss_r = self.run_model(local_model, support_imgs, support_gt)
            loss.backward()
            local_optim.step()
            # Reset gradients so that only the query loss contributes later on
            local_optim.zero_grad()

        return local_model

    def outer_loop(self, batch, mode="train"):
        """Process one meta-batch of tasks, update the base model in train mode, log losses.

        ``batch`` is a list of (imgs, targets, yprAngles) tuples, one per task.
        Logs ``{mode}_loss`` and the per-output ``{mode}_loss_y/_p/_r`` averaged
        over the tasks. Returns None.
        """
        losses = []
        losses_y = []
        losses_p = []
        losses_r = []

        self.model.zero_grad()
        task_weight = 1.0 / len(batch)

        # Determine gradients for batch of tasks
        for task_batch in batch:
            imgs, targets, yprAngles = task_batch
            support_imgs, query_imgs, support_targets, query_targets, support_gt, query_gt = split_batch(imgs, targets, yprAngles)

            # Perform inner loop adaptation
            local_model = self.adapt_few_shot(support_imgs, support_targets, support_gt)

            # Determine loss of query set
            loss, preds, loss_y, loss_p, loss_r = self.run_model(local_model, query_imgs, query_gt)

            # Calculate gradients for query set loss
            if mode == "train":
                loss.backward()

                # First-order approximation: the gradient of the fine-tuned copy stands in
                # for the gradient of the base model. Gradients are ACCUMULATED (as a mean
                # over the tasks); plain assignment kept only the last task's gradient, so
                # all other tasks of the meta-batch were ignored by the outer update.
                for p_global, p_local in zip(self.model.parameters(), local_model.parameters()):
                    if p_local.grad is None:
                        continue
                    task_grad = p_local.grad.detach() * task_weight
                    if p_global.grad is None:
                        p_global.grad = task_grad
                    else:
                        p_global.grad.add_(task_grad)

            losses.append(loss.detach())
            losses_y.append(loss_y.detach())
            losses_p.append(loss_p.detach())
            losses_r.append(loss_r.detach())

        # Perform update of base model
        if mode == "train":
            opt = self.optimizers()
            opt.step()
            opt.zero_grad()

        self.log(f"{mode}_loss", sum(losses) / len(losses), on_epoch=True)
        self.log(f"{mode}_loss_y", sum(losses_y) / len(losses_y), on_epoch=True)
        self.log(f"{mode}_loss_p", sum(losses_p) / len(losses_p), on_epoch=True)
        self.log(f"{mode}_loss_r", sum(losses_r) / len(losses_r), on_epoch=True)

    def training_step(self, batch, batch_idx):
        """Run the outer loop; the optimizer step is performed inside outer_loop."""
        self.outer_loop(batch, mode="train")
        return None  # Returning None means we skip the default training optimizer steps by PyTorch Lightning

    def validation_step(self, batch, batch_idx):
        """Evaluate a meta-batch; fine-tuning needs gradients, so they are re-enabled locally."""
        # The context manager restores the previous grad mode even if outer_loop raises.
        with torch.enable_grad():
            self.outer_loop(batch, mode="val")


In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)

pl.__version__

Device: cuda:0


'1.4.5'

In [None]:
!pip --quiet install "ray[tune]"

In [None]:
from ray.tune.integration.pytorch_lightning import TuneReportCallback

def train_model(config, model_class, train_loader, val_loader, max_epochs=20):
    """Train ``model_class(config)`` and report ``val_loss`` to Ray Tune as ``loss``.

    Inputs
        config - Hyperparameter dict handed to ``model_class``
        model_class - LightningModule subclass to instantiate
        train_loader - DataLoader used for training
        val_loader - DataLoader used for validation
        max_epochs - Maximum number of training epochs (defaults to the former fixed value 20)

    Returns the model restored from the best checkpoint (lowest ``val_loss``).
    If ``<CHECKPOINT_PATH>/<model_class.__name__>.ckpt`` exists, that checkpoint is
    loaded and returned without any training.
    """
    # Name under which Tune sees the metric -> name under which Lightning logs it
    metrics = {"loss": "val_loss"}

    trainer = pl.Trainer(
        default_root_dir=os.path.join(CHECKPOINT_PATH, model_class.__name__),
        gpus=1 if str(device) == "cuda:0" else 0,
        log_every_n_steps=10,
        max_epochs=max_epochs,
        callbacks=[
            ModelCheckpoint(save_weights_only=True, mode="min", monitor="val_loss"),
            LearningRateMonitor("epoch"),
            TuneReportCallback(metrics, on="validation_end"),
        ],
        progress_bar_refresh_rate=0,
    )

    trainer.logger._default_hp_metric = None

    # Check whether pretrained model exists. If yes, load it and skip training
    pretrained_filename = os.path.join(
        CHECKPOINT_PATH, model_class.__name__ + ".ckpt")
    if os.path.isfile(pretrained_filename):
        print(f"Found pretrained model at {pretrained_filename}, loading...")
        # Automatically loads the model with the saved hyperparameters
        return model_class.load_from_checkpoint(pretrained_filename)

    pl.seed_everything(42)  # To be reproducible
    model = model_class(config)
    trainer.fit(model, train_loader, val_loader)

    # Load best checkpoint after training. best_model_path is an empty string when
    # no checkpoint was written; in that case keep the in-memory model instead of
    # failing inside load_from_checkpoint.
    best_model_path = trainer.checkpoint_callback.best_model_path
    if best_model_path:
        model = model_class.load_from_checkpoint(best_model_path)

    return model

In [None]:
# Hyper parameter

import tempfile
from ray import tune

num_samples = 10    # Number of hyperparameter configurations Tune samples
num_epochs = 10     # Defined here but not passed to train_model or tune.run in this cell
gpus_per_trial = 1 # set this to higher if using GPU

# Search space: the outer and the inner learning rate draw from the same grid.
_lr_grid = [0.1, 0.01, 0.001, 0.0001]
config = dict(
    lr=tune.choice(list(_lr_grid)),
    lr_inner=tune.choice(list(_lr_grid)),
    num_inner_steps=tune.choice([3, 5, 8, 10]),
)

# Bind the model class and the data loaders; Tune supplies `config` per trial.
trainable = tune.with_parameters(
    train_model,
    model_class=MAMLNetwork,
    train_loader=train_maml_loader,
    val_loader=val_maml_loader,
)

_trial_resources = {"cpu": 1, "gpu": gpus_per_trial}

# Minimise the "loss" metric reported by the trials.
analysis = tune.run(
    trainable,
    name="train_model",
    config=config,
    num_samples=num_samples,
    metric="loss",
    mode="min",
    resources_per_trial=_trial_resources,
)

print(analysis.best_config)


Trial name,status,loc,lr,lr_inner,num_inner_steps
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3
train_model_24539_00001,PENDING,,0.001,0.0001,5
train_model_24539_00002,PENDING,,0.001,0.0001,8
train_model_24539_00003,PENDING,,0.0001,0.1,3
train_model_24539_00004,PENDING,,0.01,0.01,3
train_model_24539_00005,PENDING,,0.01,0.01,8
train_model_24539_00006,PENDING,,0.001,0.0001,10
train_model_24539_00007,PENDING,,0.01,0.01,8
train_model_24539_00008,PENDING,,0.01,0.01,3
train_model_24539_00009,PENDING,,0.01,0.1,10


[2m[36m(ImplicitFunc pid=5510)[0m GPU available: True, used: True
[2m[36m(ImplicitFunc pid=5510)[0m TPU available: False, using: 0 TPU cores
[2m[36m(ImplicitFunc pid=5510)[0m IPU available: False, using: 0 IPUs
[2m[36m(ImplicitFunc pid=5510)[0m Global seed set to 42
[2m[36m(ImplicitFunc pid=5510)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Trial name,status,loc,lr,lr_inner,num_inner_steps
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3
train_model_24539_00001,PENDING,,0.001,0.0001,5
train_model_24539_00002,PENDING,,0.001,0.0001,8
train_model_24539_00003,PENDING,,0.0001,0.1,3
train_model_24539_00004,PENDING,,0.01,0.01,3
train_model_24539_00005,PENDING,,0.01,0.01,8
train_model_24539_00006,PENDING,,0.001,0.0001,10
train_model_24539_00007,PENDING,,0.01,0.01,8
train_model_24539_00008,PENDING,,0.01,0.01,3
train_model_24539_00009,PENDING,,0.01,0.1,10


[2m[36m(ImplicitFunc pid=5510)[0m 
[2m[36m(ImplicitFunc pid=5510)[0m   | Name          | Type    | Params
[2m[36m(ImplicitFunc pid=5510)[0m ------------------------------------------
[2m[36m(ImplicitFunc pid=5510)[0m 0 | model         | Net     | 427 K 
[2m[36m(ImplicitFunc pid=5510)[0m 1 | mse_criterion | MSELoss | 0     
[2m[36m(ImplicitFunc pid=5510)[0m 2 | mae_criterion | L1Loss  | 0     
[2m[36m(ImplicitFunc pid=5510)[0m ------------------------------------------
[2m[36m(ImplicitFunc pid=5510)[0m 427 K     Trainable params
[2m[36m(ImplicitFunc pid=5510)[0m 0         Non-trainable params
[2m[36m(ImplicitFunc pid=5510)[0m 427 K     Total params
[2m[36m(ImplicitFunc pid=5510)[0m 1.708     Total estimated model params size (MB)
[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")
[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")
[2m[36m(ImplicitFunc pid=5510)[0m   "`Trainer.running_sanit

Trial name,status,loc,lr,lr_inner,num_inner_steps
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3
train_model_24539_00001,PENDING,,0.001,0.0001,5
train_model_24539_00002,PENDING,,0.001,0.0001,8
train_model_24539_00003,PENDING,,0.0001,0.1,3
train_model_24539_00004,PENDING,,0.01,0.01,3
train_model_24539_00005,PENDING,,0.01,0.01,8
train_model_24539_00006,PENDING,,0.001,0.0001,10
train_model_24539_00007,PENDING,,0.01,0.01,8
train_model_24539_00008,PENDING,,0.01,0.01,3
train_model_24539_00009,PENDING,,0.01,0.1,10


Trial name,status,loc,lr,lr_inner,num_inner_steps
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3
train_model_24539_00001,PENDING,,0.001,0.0001,5
train_model_24539_00002,PENDING,,0.001,0.0001,8
train_model_24539_00003,PENDING,,0.0001,0.1,3
train_model_24539_00004,PENDING,,0.01,0.01,3
train_model_24539_00005,PENDING,,0.01,0.01,8
train_model_24539_00006,PENDING,,0.001,0.0001,10
train_model_24539_00007,PENDING,,0.01,0.01,8
train_model_24539_00008,PENDING,,0.01,0.01,3
train_model_24539_00009,PENDING,,0.01,0.1,10


Trial name,status,loc,lr,lr_inner,num_inner_steps
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3
train_model_24539_00001,PENDING,,0.001,0.0001,5
train_model_24539_00002,PENDING,,0.001,0.0001,8
train_model_24539_00003,PENDING,,0.0001,0.1,3
train_model_24539_00004,PENDING,,0.01,0.01,3
train_model_24539_00005,PENDING,,0.01,0.01,8
train_model_24539_00006,PENDING,,0.001,0.0001,10
train_model_24539_00007,PENDING,,0.01,0.01,8
train_model_24539_00008,PENDING,,0.01,0.01,3
train_model_24539_00009,PENDING,,0.01,0.1,10


Trial name,status,loc,lr,lr_inner,num_inner_steps
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3
train_model_24539_00001,PENDING,,0.001,0.0001,5
train_model_24539_00002,PENDING,,0.001,0.0001,8
train_model_24539_00003,PENDING,,0.0001,0.1,3
train_model_24539_00004,PENDING,,0.01,0.01,3
train_model_24539_00005,PENDING,,0.01,0.01,8
train_model_24539_00006,PENDING,,0.001,0.0001,10
train_model_24539_00007,PENDING,,0.01,0.01,8
train_model_24539_00008,PENDING,,0.01,0.01,3
train_model_24539_00009,PENDING,,0.01,0.1,10


Trial name,status,loc,lr,lr_inner,num_inner_steps
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3
train_model_24539_00001,PENDING,,0.001,0.0001,5
train_model_24539_00002,PENDING,,0.001,0.0001,8
train_model_24539_00003,PENDING,,0.0001,0.1,3
train_model_24539_00004,PENDING,,0.01,0.01,3
train_model_24539_00005,PENDING,,0.01,0.01,8
train_model_24539_00006,PENDING,,0.001,0.0001,10
train_model_24539_00007,PENDING,,0.01,0.01,8
train_model_24539_00008,PENDING,,0.01,0.01,3
train_model_24539_00009,PENDING,,0.01,0.1,10


[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")
[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")


Trial name,status,loc,lr,lr_inner,num_inner_steps
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3
train_model_24539_00001,PENDING,,0.001,0.0001,5
train_model_24539_00002,PENDING,,0.001,0.0001,8
train_model_24539_00003,PENDING,,0.0001,0.1,3
train_model_24539_00004,PENDING,,0.01,0.01,3
train_model_24539_00005,PENDING,,0.01,0.01,8
train_model_24539_00006,PENDING,,0.001,0.0001,10
train_model_24539_00007,PENDING,,0.01,0.01,8
train_model_24539_00008,PENDING,,0.01,0.01,3
train_model_24539_00009,PENDING,,0.01,0.1,10


Result for train_model_24539_00000:
  date: 2022-01-24_15-28-08
  done: false
  experiment_id: 33e8d81ad38b44adbf5bb0778168d613
  hostname: dfa2a0b682a2
  iterations_since_restore: 1
  loss: 20.71133804321289
  node_ip: 172.28.0.2
  pid: 5510
  time_since_restore: 36.749401330947876
  time_this_iter_s: 36.749401330947876
  time_total_s: 36.749401330947876
  timestamp: 1643038088
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '24539_00000'
  


[2m[36m(ImplicitFunc pid=5510)[0m   "`Trainer.running_sanity_check` has been renamed to `Trainer.sanity_checking` and will be removed in v1.5."
[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")
[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,1.0,36.7494,20.7113
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,1.0,36.7494,20.7113
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,1.0,36.7494,20.7113
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,1.0,36.7494,20.7113
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,1.0,36.7494,20.7113
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")
[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,1.0,36.7494,20.7113
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Result for train_model_24539_00000:
  date: 2022-01-24_15-28-39
  done: false
  experiment_id: 33e8d81ad38b44adbf5bb0778168d613
  hostname: dfa2a0b682a2
  iterations_since_restore: 2
  loss: 19.072946548461914
  node_ip: 172.28.0.2
  pid: 5510
  time_since_restore: 67.76050162315369
  time_this_iter_s: 31.01110029220581
  time_total_s: 67.76050162315369
  timestamp: 1643038119
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: '24539_00000'
  


[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")
[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,2.0,67.7605,19.0729
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,2.0,67.7605,19.0729
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,2.0,67.7605,19.0729
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,2.0,67.7605,19.0729
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,2.0,67.7605,19.0729
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")
[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,2.0,67.7605,19.0729
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Result for train_model_24539_00000:
  date: 2022-01-24_15-29-11
  done: false
  experiment_id: 33e8d81ad38b44adbf5bb0778168d613
  hostname: dfa2a0b682a2
  iterations_since_restore: 3
  loss: 16.01274299621582
  node_ip: 172.28.0.2
  pid: 5510
  time_since_restore: 99.3283793926239
  time_this_iter_s: 31.567877769470215
  time_total_s: 99.3283793926239
  timestamp: 1643038151
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '24539_00000'
  


[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")
[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,3.0,99.3284,16.0127
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,3.0,99.3284,16.0127
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,3.0,99.3284,16.0127
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,3.0,99.3284,16.0127
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,3.0,99.3284,16.0127
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")
[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,3.0,99.3284,16.0127
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Result for train_model_24539_00000:
  date: 2022-01-24_15-29-42
  done: false
  experiment_id: 33e8d81ad38b44adbf5bb0778168d613
  hostname: dfa2a0b682a2
  iterations_since_restore: 4
  loss: 14.470976829528809
  node_ip: 172.28.0.2
  pid: 5510
  time_since_restore: 130.98850560188293
  time_this_iter_s: 31.660126209259033
  time_total_s: 130.98850560188293
  timestamp: 1643038182
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: '24539_00000'
  


[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")
[2m[36m(ImplicitFunc pid=5510)[0m   help="the ip address of the worker's node")


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,4.0,130.989,14.471
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,4.0,130.989,14.471
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,4.0,130.989,14.471
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,4.0,130.989,14.471
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,


Trial name,status,loc,lr,lr_inner,num_inner_steps,iter,total time (s),loss
train_model_24539_00000,RUNNING,172.28.0.2:5510,0.1,0.1,3,4.0,130.989,14.471
train_model_24539_00001,PENDING,,0.001,0.0001,5,,,
train_model_24539_00002,PENDING,,0.001,0.0001,8,,,
train_model_24539_00003,PENDING,,0.0001,0.1,3,,,
train_model_24539_00004,PENDING,,0.01,0.01,3,,,
train_model_24539_00005,PENDING,,0.01,0.01,8,,,
train_model_24539_00006,PENDING,,0.001,0.0001,10,,,
train_model_24539_00007,PENDING,,0.01,0.01,8,,,
train_model_24539_00008,PENDING,,0.01,0.01,3,,,
train_model_24539_00009,PENDING,,0.01,0.1,10,,,
