In [1]:
! pip install ray 



In [2]:
!pip install tensorboardX



In [3]:
!pip install wandb



In [4]:
from functools import partial
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

import wandb




In [5]:
def load_data(data_dir="./data"):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    trainset = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=transform)

    testset = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=transform)

    return trainset, testset


In [6]:
class Net(nn.Module):
    def __init__(self, l1=120, l2=84):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, l1)
        self.fc2 = nn.Linear(l1, l2)
        self.fc3 = nn.Linear(l2, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [7]:
def train_cifar(config, checkpoint_dir=None, data_dir=None):
    net = Net(config["l1"], config["l2"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    trainset, testset = load_data(data_dir)

    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs])

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    wandb.init(project='torch-turn', entity='teamlab')
    wandb.watch(net)

    for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        wandb.log({"val_loss": val_loss})
        wandb.log({"loss": loss})

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")

In [8]:
def test_accuracy(net, device="cpu"):
    trainset, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2)

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

In [9]:
from ray.tune.suggest.bayesopt import BayesOptSearch
from ray.tune.suggest.hyperopt import HyperOptSearch

def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    
    data_dir = os.path.abspath("./data")
    load_data(data_dir)
    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    

    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))


if __name__ == "__main__":
    # You can change the number of GPUs per trial here:
    wandb.login(key="0a25ae829bf4e2a6cd2acfdd4e65e6a26cd9927e")
    main(num_samples=10, max_num_epochs=10, gpus_per_trial=0)

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Files already downloaded and verified
Files already downloaded and verified


2021-08-12 16:42:51,312	INFO services.py:1247 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2021-08-12 16:42:53,144	INFO registry.py:67 -- Detected unknown callable for trainable. Converting to class.


== Status ==
Memory usage on this node: 1.4/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/7.32 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-08-12_16-42-53
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+---------------------+----------+-------+--------------+------+------+-------------+
| Trial name          | status   | loc   |   batch_size |   l1 |   l2 |          lr |
|---------------------+----------+-------+--------------+------+------+-------------|
| DEFAULT_569cb_00000 | RUNNING  |       |            2 |  256 |   64 | 0.000448082 |
| DEFAULT_569cb_00001 | PENDING  |       |            8 |    4 |  128 | 0.0115576   |
| DEFAULT_569cb_00002 | PENDING  |       |            4 |   64 |   64 | 0.00224206  |
| DEFAULT_569cb_00003 | PENDING  |       |           16 |   32 |  128 | 0.0219596   |
| DE

[2m[36m(pid=1830)[0m   cpuset_checked))
[2m[36m(pid=1830)[0m wandb: Currently logged in as: teamlab (use `wandb login --relogin` to force relogin)


== Status ==
Memory usage on this node: 1.7/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/7.32 GiB heap, 0.0/3.66 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-08-12_16-42-53
Number of trials: 10/10 (9 PENDING, 1 RUNNING)
+---------------------+----------+-------+--------------+------+------+-------------+
| Trial name          | status   | loc   |   batch_size |   l1 |   l2 |          lr |
|---------------------+----------+-------+--------------+------+------+-------------|
| DEFAULT_569cb_00000 | RUNNING  |       |            2 |  256 |   64 | 0.000448082 |
| DEFAULT_569cb_00001 | PENDING  |       |            8 |    4 |  128 | 0.0115576   |
| DEFAULT_569cb_00002 | PENDING  |       |            4 |   64 |   64 | 0.00224206  |
| DEFAULT_569cb_00003 | PENDING  |       |           16 |   32 |  128 | 0.0219596   |
| DE

[2m[36m(pid=1830)[0m 2021-08-12 16:42:59.360560: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


[2m[36m(pid=1830)[0m 
[2m[36m(pid=1830)[0m signal only works in main thread


[2m[36m(pid=1830)[0m wandb: Tracking run with wandb version 0.12.0
[2m[36m(pid=1830)[0m wandb: Syncing run cool-moon-14
[2m[36m(pid=1830)[0m wandb:  View project at https://wandb.ai/teamlab/torch-turn
[2m[36m(pid=1830)[0m wandb:  View run at https://wandb.ai/teamlab/torch-turn/runs/wr825zte
[2m[36m(pid=1830)[0m wandb: Run data is saved locally in /root/ray_results/DEFAULT_2021-08-12_16-42-53/DEFAULT_569cb_00000_0_batch_size=2,l1=256,l2=64,lr=0.00044808_2021-08-12_16-42-53/wandb/run-20210812_164257-wr825zte
[2m[36m(pid=1830)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(pid=1830)[0m   cpuset_checked))
[2m[36m(pid=1830)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[2m[36m(pid=1830)[0m [1,  2000] loss: 2.299
[2m[36m(pid=1830)[0m [1,  4000] loss: 1.088
[2m[36m(pid=1830)[0m [1,  6000] loss: 0.667
[2m[36m(pid=1830)[0m [1,  8000] loss: 0.465
[2m[36m(pid=1830)[0m [1, 10000] loss: 0.348
[2m[36m(pid=1830)[0m [1, 12000] loss: 0.279
[2m[36m(pid=1830)[0m [1, 14000] loss: 0.234
[2m[36m(pid=1830)[0m [1, 16000] loss: 0.198
[2m[36m(pid=1830)[0m [1, 18000] loss: 0.175
[2m[36m(pid=1830)[0m [1, 20000] loss: 0.152
Result for DEFAULT_569cb_00000:
  accuracy: 0.4538
  date: 2021-08-12_16-45-00
  done: false
  experiment_id: f22f79575e524e7497254dc48fa791da
  hostname: 8f95047dacfb
  iterations_since_restore: 1
  loss: 1.5119301937788725
  node_ip: 172.28.0.2
  pid: 1830
  should_checkpoint: true
  time_since_restore: 124.35959243774414
  time_this_iter_s: 124.35959243774414
  time_total_s: 124.35959243774414
  timestamp: 1628786700
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 569cb_00000
  
== Status ==
Memory usage 

[2m[36m(pid=1831)[0m   cpuset_checked))
[2m[36m(pid=1831)[0m wandb: Currently logged in as: teamlab (use `wandb login --relogin` to force relogin)
[2m[36m(pid=1831)[0m 2021-08-12 17:02:54.110624: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


[2m[36m(pid=1831)[0m 
[2m[36m(pid=1831)[0m signal only works in main thread


[2m[36m(pid=1831)[0m wandb: Tracking run with wandb version 0.12.0
[2m[36m(pid=1831)[0m wandb: Syncing run faithful-elevator-15
[2m[36m(pid=1831)[0m wandb:  View project at https://wandb.ai/teamlab/torch-turn
[2m[36m(pid=1831)[0m wandb:  View run at https://wandb.ai/teamlab/torch-turn/runs/3ibaphoj
[2m[36m(pid=1831)[0m wandb: Run data is saved locally in /root/ray_results/DEFAULT_2021-08-12_16-42-53/DEFAULT_569cb_00001_1_batch_size=8,l1=4,l2=128,lr=0.011558_2021-08-12_16-42-53/wandb/run-20210812_170252-3ibaphoj
[2m[36m(pid=1831)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(pid=1831)[0m   cpuset_checked))
[2m[36m(pid=1831)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=1830)[0m   len(cache))


[2m[36m(pid=1831)[0m [1,  2000] loss: 2.126
[2m[36m(pid=1831)[0m [1,  4000] loss: 0.975
Result for DEFAULT_569cb_00001:
  accuracy: 0.2872
  date: 2021-08-12_17-03-34
  done: true
  experiment_id: 5d4058695df346bc80bbe70bad245393
  hostname: 8f95047dacfb
  iterations_since_restore: 1
  loss: 1.8457046394348144
  node_ip: 172.28.0.2
  pid: 1831
  should_checkpoint: true
  time_since_restore: 43.89766979217529
  time_this_iter_s: 43.89766979217529
  time_total_s: 43.89766979217529
  timestamp: 1628787814
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 569cb_00001
  
== Status ==
Memory usage on this node: 1.8/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 8.000: -1.1552953161671426 | Iter 4.000: -1.1661095013778293 | Iter 2.000: -1.3351672032438218 | Iter 1.000: -1.6788174166068435
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/7.32 GiB heap, 0.0/3.66 GiB objects (0.0/2.0 CPU_group_25fbcaa92d434f489b6d715902470bb5, 0.0/2.0 CPU_group_0_25fbcaa92d434f489b



[2m[36m(pid=2860)[0m Files already downloaded and verified
[2m[36m(pid=2860)[0m Files already downloaded and verified


[2m[36m(pid=1831)[0m   len(cache))
[2m[36m(pid=2860)[0m   cpuset_checked))
[2m[36m(pid=2860)[0m wandb: Currently logged in as: teamlab (use `wandb login --relogin` to force relogin)
[2m[36m(pid=2860)[0m 2021-08-12 17:03:40.348287: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


[2m[36m(pid=2860)[0m 
[2m[36m(pid=2860)[0m signal only works in main thread


[2m[36m(pid=2860)[0m wandb: Tracking run with wandb version 0.12.0
[2m[36m(pid=2860)[0m wandb: Syncing run autumn-shape-16
[2m[36m(pid=2860)[0m wandb:  View project at https://wandb.ai/teamlab/torch-turn
[2m[36m(pid=2860)[0m wandb:  View run at https://wandb.ai/teamlab/torch-turn/runs/2togpsno
[2m[36m(pid=2860)[0m wandb: Run data is saved locally in /root/ray_results/DEFAULT_2021-08-12_16-42-53/DEFAULT_569cb_00002_2_batch_size=4,l1=64,l2=64,lr=0.0022421_2021-08-12_17-02-48/wandb/run-20210812_170338-2togpsno
[2m[36m(pid=2860)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(pid=2860)[0m   cpuset_checked))
[2m[36m(pid=2860)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[2m[36m(pid=2860)[0m [1,  2000] loss: 2.129
[2m[36m(pid=2860)[0m [1,  4000] loss: 0.894
[2m[36m(pid=2860)[0m [1,  6000] loss: 0.549
[2m[36m(pid=2860)[0m [1,  8000] loss: 0.394
[2m[36m(pid=2860)[0m [1, 10000] loss: 0.301
Result for DEFAULT_569cb_00002:
  accuracy: 0.4639
  date: 2021-08-12_17-04-43
  done: false
  experiment_id: d7d16f9034bf43159ad792fbb9bea3f2
  hostname: 8f95047dacfb
  iterations_since_restore: 1
  loss: 1.4907208232164384
  node_ip: 172.28.0.2
  pid: 2860
  should_checkpoint: true
  time_since_restore: 67.13392901420593
  time_this_iter_s: 67.13392901420593
  time_total_s: 67.13392901420593
  timestamp: 1628787883
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 569cb_00002
  
== Status ==
Memory usage on this node: 1.8/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 8.000: -1.1552953161671426 | Iter 4.000: -1.1661095013778293 | Iter 2.000: -1.3351672032438218 | Iter 1.000: -1.5119301937788725
Resources requested: 2.0/2 CPUs,



[2m[36m(pid=3080)[0m Files already downloaded and verified
[2m[36m(pid=3080)[0m Files already downloaded and verified


[2m[36m(pid=3080)[0m   cpuset_checked))
[2m[36m(pid=3080)[0m wandb: Currently logged in as: teamlab (use `wandb login --relogin` to force relogin)
[2m[36m(pid=2860)[0m   len(cache))
[2m[36m(pid=3080)[0m 2021-08-12 17:05:52.928254: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


[2m[36m(pid=3080)[0m 
[2m[36m(pid=3080)[0m signal only works in main thread


[2m[36m(pid=3080)[0m wandb: Tracking run with wandb version 0.12.0
[2m[36m(pid=3080)[0m wandb: Syncing run young-elevator-17
[2m[36m(pid=3080)[0m wandb:  View project at https://wandb.ai/teamlab/torch-turn
[2m[36m(pid=3080)[0m wandb:  View run at https://wandb.ai/teamlab/torch-turn/runs/2hhr007r
[2m[36m(pid=3080)[0m wandb: Run data is saved locally in /root/ray_results/DEFAULT_2021-08-12_16-42-53/DEFAULT_569cb_00003_3_batch_size=16,l1=32,l2=128,lr=0.02196_2021-08-12_17-03-34/wandb/run-20210812_170551-2hhr007r
[2m[36m(pid=3080)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(pid=3080)[0m   cpuset_checked))
[2m[36m(pid=3080)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[2m[36m(pid=3080)[0m [1,  2000] loss: 1.851
Result for DEFAULT_569cb_00003:
  accuracy: 0.3859
  date: 2021-08-12_17-06-23
  done: true
  experiment_id: 8c440d271461419d8111ae0310d165e9
  hostname: 8f95047dacfb
  iterations_since_restore: 1
  loss: 1.7040302667617797
  node_ip: 172.28.0.2
  pid: 3080
  should_checkpoint: true
  time_since_restore: 34.601794958114624
  time_this_iter_s: 34.601794958114624
  time_total_s: 34.601794958114624
  timestamp: 1628787983
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 569cb_00003
  
== Status ==
Memory usage on this node: 1.8/12.7 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 8.000: -1.1552953161671426 | Iter 4.000: -1.1661095013778293 | Iter 2.000: -1.3558600577306001 | Iter 1.000: -1.607980230270326
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/7.32 GiB heap, 0.0/3.66 GiB objects (0.0/2.0 CPU_group_0_25fbcaa92d434f489b6d715902470bb5, 0.0/2.0 CPU_group_25fbcaa92d434f489b6d715902470bb5, 0.0/1.0 accelerator_type:K80)



[2m[36m(pid=3220)[0m Files already downloaded and verified


[2m[36m(pid=3080)[0m   len(cache))


[2m[36m(pid=3220)[0m Files already downloaded and verified


[2m[36m(pid=3220)[0m   cpuset_checked))
[2m[36m(pid=3220)[0m wandb: Currently logged in as: teamlab (use `wandb login --relogin` to force relogin)
[2m[36m(pid=3220)[0m 2021-08-12 17:06:29.680368: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


[2m[36m(pid=3220)[0m 
[2m[36m(pid=3220)[0m signal only works in main thread


[2m[36m(pid=3220)[0m wandb: Tracking run with wandb version 0.12.0
[2m[36m(pid=3220)[0m wandb: Syncing run woven-energy-18
[2m[36m(pid=3220)[0m wandb:  View project at https://wandb.ai/teamlab/torch-turn
[2m[36m(pid=3220)[0m wandb:  View run at https://wandb.ai/teamlab/torch-turn/runs/1qcu259a
[2m[36m(pid=3220)[0m wandb: Run data is saved locally in /root/ray_results/DEFAULT_2021-08-12_16-42-53/DEFAULT_569cb_00004_4_batch_size=4,l1=4,l2=16,lr=0.0093997_2021-08-12_17-05-46/wandb/run-20210812_170628-1qcu259a
[2m[36m(pid=3220)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(pid=3220)[0m   cpuset_checked))
[2m[36m(pid=3220)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[2m[36m(pid=3220)[0m [1,  2000] loss: 2.173
[2m[36m(pid=3220)[0m [1,  4000] loss: 1.061
[2m[36m(pid=3220)[0m [1,  6000] loss: 0.727
[2m[36m(pid=3220)[0m [1,  8000] loss: 0.567
[2m[36m(pid=3220)[0m [1, 10000] loss: 0.453
Result for DEFAULT_569cb_00004:
  accuracy: 0.1007
  date: 2021-08-12_17-07-31
  done: true
  experiment_id: 3caaa7f2a82143fead5355a19cc96d29
  hostname: 8f95047dacfb
  iterations_since_restore: 1
  loss: 2.3094754077911377
  node_ip: 172.28.0.2
  pid: 3220
  should_checkpoint: true
  time_since_restore: 65.76523661613464
  time_this_iter_s: 65.76523661613464
  time_total_s: 65.76523661613464
  timestamp: 1628788051
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 569cb_00004
  
== Status ==
Memory usage on this node: 1.8/12.7 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 8.000: -1.1552953161671426 | Iter 4.000: -1.1661095013778293 | Iter 2.000: -1.3558600577306001 | Iter 1.000: -1.7040302667617797
Resources requested: 2.0/2 CPUs, 



[2m[36m(pid=3360)[0m Files already downloaded and verified
[2m[36m(pid=3360)[0m Files already downloaded and verified


[2m[36m(pid=3360)[0m   cpuset_checked))
[2m[36m(pid=3360)[0m wandb: Currently logged in as: teamlab (use `wandb login --relogin` to force relogin)
[2m[36m(pid=3220)[0m   len(cache))
[2m[36m(pid=3360)[0m 2021-08-12 17:07:37.764608: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


[2m[36m(pid=3360)[0m 
[2m[36m(pid=3360)[0m signal only works in main thread


[2m[36m(pid=3360)[0m wandb: Tracking run with wandb version 0.12.0
[2m[36m(pid=3360)[0m wandb: Syncing run zesty-dragon-19
[2m[36m(pid=3360)[0m wandb:  View project at https://wandb.ai/teamlab/torch-turn
[2m[36m(pid=3360)[0m wandb:  View run at https://wandb.ai/teamlab/torch-turn/runs/2ja43ynk
[2m[36m(pid=3360)[0m wandb: Run data is saved locally in /root/ray_results/DEFAULT_2021-08-12_16-42-53/DEFAULT_569cb_00005_5_batch_size=2,l1=128,l2=256,lr=0.0049082_2021-08-12_17-06-23/wandb/run-20210812_170736-2ja43ynk
[2m[36m(pid=3360)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(pid=3360)[0m   cpuset_checked))
[2m[36m(pid=3360)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[2m[36m(pid=3360)[0m [1,  2000] loss: 2.139
[2m[36m(pid=3360)[0m [1,  4000] loss: 0.968
[2m[36m(pid=3360)[0m [1,  6000] loss: 0.652
[2m[36m(pid=3360)[0m [1,  8000] loss: 0.482
[2m[36m(pid=3360)[0m [1, 10000] loss: 0.382
[2m[36m(pid=3360)[0m [1, 12000] loss: 0.320
[2m[36m(pid=3360)[0m [1, 14000] loss: 0.274
[2m[36m(pid=3360)[0m [1, 16000] loss: 0.240
[2m[36m(pid=3360)[0m [1, 18000] loss: 0.215
[2m[36m(pid=3360)[0m [1, 20000] loss: 0.189
Result for DEFAULT_569cb_00005:
  accuracy: 0.2902
  date: 2021-08-12_17-09-36
  done: true
  experiment_id: 674a69e584544874bcde40a753b1952c
  hostname: 8f95047dacfb
  iterations_since_restore: 1
  loss: 1.8889173994570971
  node_ip: 172.28.0.2
  pid: 3360
  should_checkpoint: true
  time_since_restore: 122.42659735679626
  time_this_iter_s: 122.42659735679626
  time_total_s: 122.42659735679626
  timestamp: 1628788176
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 569cb_00005
  
== Status ==
Memory usage o



[2m[36m(pid=3497)[0m Files already downloaded and verified
[2m[36m(pid=3497)[0m Files already downloaded and verified


[2m[36m(pid=3497)[0m   cpuset_checked))
[2m[36m(pid=3497)[0m wandb: Currently logged in as: teamlab (use `wandb login --relogin` to force relogin)
[2m[36m(pid=3360)[0m   len(cache))
[2m[36m(pid=3497)[0m 2021-08-12 17:09:42.464594: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


[2m[36m(pid=3497)[0m 
[2m[36m(pid=3497)[0m signal only works in main thread


[2m[36m(pid=3497)[0m wandb: Tracking run with wandb version 0.12.0
[2m[36m(pid=3497)[0m wandb: Syncing run worthy-sound-20
[2m[36m(pid=3497)[0m wandb:  View project at https://wandb.ai/teamlab/torch-turn
[2m[36m(pid=3497)[0m wandb:  View run at https://wandb.ai/teamlab/torch-turn/runs/14i5l0zg
[2m[36m(pid=3497)[0m wandb: Run data is saved locally in /root/ray_results/DEFAULT_2021-08-12_16-42-53/DEFAULT_569cb_00006_6_batch_size=2,l1=128,l2=8,lr=0.014305_2021-08-12_17-07-31/wandb/run-20210812_170940-14i5l0zg
[2m[36m(pid=3497)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(pid=3497)[0m   cpuset_checked))
[2m[36m(pid=3497)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[2m[36m(pid=3497)[0m [1,  2000] loss: 2.298
[2m[36m(pid=3497)[0m [1,  4000] loss: 1.158
[2m[36m(pid=3497)[0m [1,  6000] loss: 0.773
[2m[36m(pid=3497)[0m [1,  8000] loss: 0.580
[2m[36m(pid=3497)[0m [1, 10000] loss: 0.464
[2m[36m(pid=3497)[0m [1, 12000] loss: 0.386
[2m[36m(pid=3497)[0m [1, 14000] loss: 0.331
[2m[36m(pid=3497)[0m [1, 16000] loss: 0.290
[2m[36m(pid=3497)[0m [1, 18000] loss: 0.257
[2m[36m(pid=3497)[0m [1, 20000] loss: 0.231
Result for DEFAULT_569cb_00006:
  accuracy: 0.0971
  date: 2021-08-12_17-11-48
  done: true
  experiment_id: 821f2f26f5674e649e2c703b1aa1e322
  hostname: 8f95047dacfb
  iterations_since_restore: 1
  loss: 2.3269634853839873
  node_ip: 172.28.0.2
  pid: 3497
  should_checkpoint: true
  time_since_restore: 129.57758283615112
  time_this_iter_s: 129.57758283615112
  time_total_s: 129.57758283615112
  timestamp: 1628788308
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 569cb_00006
  
== Status ==
Memory usage o



[2m[36m(pid=3637)[0m Files already downloaded and verified
[2m[36m(pid=3637)[0m Files already downloaded and verified


[2m[36m(pid=3637)[0m   cpuset_checked))
[2m[36m(pid=3637)[0m wandb: Currently logged in as: teamlab (use `wandb login --relogin` to force relogin)
[2m[36m(pid=3497)[0m   len(cache))
[2m[36m(pid=3637)[0m 2021-08-12 17:11:54.255996: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


[2m[36m(pid=3637)[0m 
[2m[36m(pid=3637)[0m signal only works in main thread


[2m[36m(pid=3637)[0m wandb: Tracking run with wandb version 0.12.0
[2m[36m(pid=3637)[0m wandb: Syncing run frosty-dream-21
[2m[36m(pid=3637)[0m wandb:  View project at https://wandb.ai/teamlab/torch-turn
[2m[36m(pid=3637)[0m wandb:  View run at https://wandb.ai/teamlab/torch-turn/runs/2vcm9c5t
[2m[36m(pid=3637)[0m wandb: Run data is saved locally in /root/ray_results/DEFAULT_2021-08-12_16-42-53/DEFAULT_569cb_00007_7_batch_size=16,l1=256,l2=4,lr=0.0020752_2021-08-12_17-09-36/wandb/run-20210812_171152-2vcm9c5t
[2m[36m(pid=3637)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(pid=3637)[0m   cpuset_checked))
[2m[36m(pid=3637)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[2m[36m(pid=3637)[0m [1,  2000] loss: 2.181
Result for DEFAULT_569cb_00007:
  accuracy: 0.3254
  date: 2021-08-12_17-12-26
  done: false
  experiment_id: b97013c48fce40bc850695c2592fd1b7
  hostname: 8f95047dacfb
  iterations_since_restore: 1
  loss: 1.7713550579071045
  node_ip: 172.28.0.2
  pid: 3637
  should_checkpoint: true
  time_since_restore: 36.14110779762268
  time_this_iter_s: 36.14110779762268
  time_total_s: 36.14110779762268
  timestamp: 1628788346
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 569cb_00007
  
== Status ==
Memory usage on this node: 1.8/12.7 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 8.000: -1.1552953161671426 | Iter 4.000: -1.1661095013778293 | Iter 2.000: -1.3558600577306001 | Iter 1.000: -1.8085298486709593
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/7.32 GiB heap, 0.0/3.66 GiB objects (0.0/2.0 CPU_group_0_25fbcaa92d434f489b6d715902470bb5, 0.0/2.0 CPU_group_25fbcaa92d434f489b6d715902470bb5, 0.0/1.0 accelerator_type:K80)




[2m[36m(pid=3857)[0m Files already downloaded and verified


[2m[36m(pid=3637)[0m   len(cache))


[2m[36m(pid=3857)[0m Files already downloaded and verified


[2m[36m(pid=3857)[0m   cpuset_checked))
[2m[36m(pid=3857)[0m wandb: Currently logged in as: teamlab (use `wandb login --relogin` to force relogin)
[2m[36m(pid=3857)[0m 2021-08-12 17:13:03.912150: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


[2m[36m(pid=3857)[0m 
[2m[36m(pid=3857)[0m signal only works in main thread


[2m[36m(pid=3857)[0m wandb: Tracking run with wandb version 0.12.0
[2m[36m(pid=3857)[0m wandb: Syncing run valiant-terrain-22
[2m[36m(pid=3857)[0m wandb:  View project at https://wandb.ai/teamlab/torch-turn
[2m[36m(pid=3857)[0m wandb:  View run at https://wandb.ai/teamlab/torch-turn/runs/2lsap9n4
[2m[36m(pid=3857)[0m wandb: Run data is saved locally in /root/ray_results/DEFAULT_2021-08-12_16-42-53/DEFAULT_569cb_00008_8_batch_size=8,l1=32,l2=32,lr=0.086172_2021-08-12_17-11-48/wandb/run-20210812_171302-2lsap9n4
[2m[36m(pid=3857)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(pid=3857)[0m   cpuset_checked))
[2m[36m(pid=3857)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


[2m[36m(pid=3857)[0m [1,  2000] loss: 2.326
[2m[36m(pid=3857)[0m [1,  4000] loss: 1.164
Result for DEFAULT_569cb_00008:
  accuracy: 0.0986
  date: 2021-08-12_17-13-45
  done: true
  experiment_id: 7e0db994183d4baf82351bb3f9400e4a
  hostname: 8f95047dacfb
  iterations_since_restore: 1
  loss: 2.3193477834701537
  node_ip: 172.28.0.2
  pid: 3857
  should_checkpoint: true
  time_since_restore: 44.77334547042847
  time_this_iter_s: 44.77334547042847
  time_total_s: 44.77334547042847
  timestamp: 1628788425
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 569cb_00008
  
== Status ==
Memory usage on this node: 1.8/12.7 GiB
Using AsyncHyperBand: num_stopped=9
Bracket: Iter 8.000: -1.1552953161671426 | Iter 4.000: -1.1661095013778293 | Iter 2.000: -1.3765529122173785 | Iter 1.000: -1.8457046394348144
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/7.32 GiB heap, 0.0/3.66 GiB objects (0.0/2.0 CPU_group_0_25fbcaa92d434f489b6d715902470bb5, 0.0/2.0 CPU_group_25fbcaa92d434f489b



[2m[36m(pid=3997)[0m Files already downloaded and verified
[2m[36m(pid=3997)[0m Files already downloaded and verified


[2m[36m(pid=3997)[0m   cpuset_checked))
[2m[36m(pid=3997)[0m wandb: Currently logged in as: teamlab (use `wandb login --relogin` to force relogin)
[2m[36m(pid=3997)[0m 2021-08-12 17:13:50.794034: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


[2m[36m(pid=3997)[0m 
[2m[36m(pid=3997)[0m signal only works in main thread


[2m[36m(pid=3997)[0m wandb: Tracking run with wandb version 0.12.0
[2m[36m(pid=3997)[0m wandb: Syncing run celestial-field-23
[2m[36m(pid=3997)[0m wandb:  View project at https://wandb.ai/teamlab/torch-turn
[2m[36m(pid=3997)[0m wandb:  View run at https://wandb.ai/teamlab/torch-turn/runs/2hsmsm68
[2m[36m(pid=3997)[0m wandb: Run data is saved locally in /root/ray_results/DEFAULT_2021-08-12_16-42-53/DEFAULT_569cb_00009_9_batch_size=2,l1=256,l2=4,lr=0.00045892_2021-08-12_17-12-58/wandb/run-20210812_171349-2hsmsm68
[2m[36m(pid=3997)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(pid=3997)[0m   cpuset_checked))
[2m[36m(pid=3997)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)
[2m[36m(pid=3857)[0m   len(cache))


[2m[36m(pid=3997)[0m [1,  2000] loss: 2.314
[2m[36m(pid=3997)[0m [1,  4000] loss: 1.145
[2m[36m(pid=3997)[0m [1,  6000] loss: 0.735
[2m[36m(pid=3997)[0m [1,  8000] loss: 0.506
[2m[36m(pid=3997)[0m [1, 10000] loss: 0.367
[2m[36m(pid=3997)[0m [1, 12000] loss: 0.298
[2m[36m(pid=3997)[0m [1, 14000] loss: 0.249
[2m[36m(pid=3997)[0m [1, 16000] loss: 0.211
[2m[36m(pid=3997)[0m [1, 18000] loss: 0.186
[2m[36m(pid=3997)[0m [1, 20000] loss: 0.166
Result for DEFAULT_569cb_00009:
  accuracy: 0.3678
  date: 2021-08-12_17-15-48
  done: false
  experiment_id: f3d62598d51f49edb7ffcd3d605c6057
  hostname: 8f95047dacfb
  iterations_since_restore: 1
  loss: 1.650761429449916
  node_ip: 172.28.0.2
  pid: 3997
  should_checkpoint: true
  time_since_restore: 121.2159595489502
  time_this_iter_s: 121.2159595489502
  time_total_s: 121.2159595489502
  timestamp: 1628788548
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 569cb_00009
  
== Status ==
Memory usage on t

2021-08-12 17:17:47,157	INFO tune.py:550 -- Total run time: 2094.02 seconds (2093.59 seconds for the tuning loop).


Result for DEFAULT_569cb_00009:
  accuracy: 0.4744
  date: 2021-08-12_17-17-46
  done: true
  experiment_id: f3d62598d51f49edb7ffcd3d605c6057
  hostname: 8f95047dacfb
  iterations_since_restore: 2
  loss: 1.4194301681835204
  node_ip: 172.28.0.2
  pid: 3997
  should_checkpoint: true
  time_since_restore: 239.6976499557495
  time_this_iter_s: 118.48169040679932
  time_total_s: 239.6976499557495
  timestamp: 1628788666
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 569cb_00009
  
== Status ==
Memory usage on this node: 1.8/12.7 GiB
Using AsyncHyperBand: num_stopped=10
Bracket: Iter 8.000: -1.1552953161671426 | Iter 4.000: -1.1661095013778293 | Iter 2.000: -1.3979915402004495 | Iter 1.000: -1.8085298486709593
Resources requested: 2.0/2 CPUs, 0/1 GPUs, 0.0/7.32 GiB heap, 0.0/3.66 GiB objects (0.0/2.0 CPU_group_0_25fbcaa92d434f489b6d715902470bb5, 0.0/2.0 CPU_group_25fbcaa92d434f489b6d715902470bb5, 0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2021-

[2m[36m(pid=3997)[0m   len(cache))


Files already downloaded and verified
Files already downloaded and verified


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Best trial test set accuracy: 0.611
