In [12]:
!pip install -U ray

Collecting ray
[?25l  Downloading https://files.pythonhosted.org/packages/ea/ed/ff896981d4ac684614236f73c1a20cde5f6cb0e2a590c182f62b22706ab4/ray-1.0.0-cp36-cp36m-manylinux1_x86_64.whl (22.9MB)
[K     |████████████████████████████████| 22.9MB 133kB/s 
Collecting colorful
[?25l  Downloading https://files.pythonhosted.org/packages/b0/8e/e386e248266952d24d73ed734c2f5513f34d9557032618c8910e605dfaf6/colorful-0.5.4-py2.py3-none-any.whl (201kB)
[K     |████████████████████████████████| 204kB 42.7MB/s 
Collecting aiohttp
[?25l  Downloading https://files.pythonhosted.org/packages/7c/39/7eb5f98d24904e0f6d3edb505d4aa60e3ef83c0a58d6fe18244a51757247/aiohttp-3.6.2-cp36-cp36m-manylinux1_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 44.5MB/s 
[?25hCollecting aiohttp-cors
  Downloading https://files.pythonhosted.org/packages/13/e7/e436a0c0eb5127d8b491a9b83ecd2391c6ff7dcd5548dfaec2080a2340fd/aiohttp_cors-0.7.0-py3-none-any.whl
Collecting gpustat
[?25l  Downloading https://fil

In [1]:
import os
import time
import copy

from collections import OrderedDict

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchvision
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler

from torch.utils.tensorboard import SummaryWriter

import matplotlib.pyplot as plt
import numpy as np

print(torch.cuda.is_available())

True


In [19]:
from functools import partial
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

In [5]:
"""
Create train, valid, test iterators for CIFAR-10 [1].
Easily extended to MNIST, CIFAR-100 and Imagenet.
[1]: https://discuss.pytorch.org/t/feedback-on-pytorch-for-kaggle-competitions/2252/4
"""

def plot_images(images, cls_true, cls_pred=None):
    """
    Adapted from https://github.com/Hvass-Labs/TensorFlow-Tutorials/
    """
    fig, axes = plt.subplots(3, 3)

    for i, ax in enumerate(axes.flat):
        # plot img
        ax.imshow(images[i, :, :, :], interpolation='spline16')

        # show true & predicted classes
        cls_true_name = label_names[cls_true[i]]
        if cls_pred is None:
            xlabel = "{0} ({1})".format(cls_true_name, cls_true[i])
        else:
            cls_pred_name = label_names[cls_pred[i]]
            xlabel = "True: {0}\nPred: {1}".format(
                cls_true_name, cls_pred_name
            )
        ax.set_xlabel(xlabel)
        ax.set_xticks([])
        ax.set_yticks([])

    plt.show()

def get_train_valid_loader(data_dir,
                           batch_size,
                           augment,
                           random_seed,
                           valid_size=0.1,
                           shuffle=True,
                           show_sample=False,
                           num_workers=4,
                           pin_memory=False):
    """
    Utility function for loading and returning train and valid
    multi-process iterators over the CIFAR-10 dataset. A sample
    9x9 grid of the images can be optionally displayed.
    If using CUDA, num_workers should be set to 1 and pin_memory to True.
    Params
    ------
    - data_dir: path directory to the dataset.
    - batch_size: how many samples per batch to load.
    - augment: whether to apply the data augmentation scheme
      mentioned in the paper. Only applied on the train split.
    - random_seed: fix seed for reproducibility.
    - valid_size: percentage split of the training set used for
      the validation set. Should be a float in the range [0, 1].
    - shuffle: whether to shuffle the train/validation indices.
    - show_sample: plot 9x9 sample grid of the dataset.
    - num_workers: number of subprocesses to use when loading the dataset.
    - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
      True if using GPU.
    Returns
    -------
    - train_loader: training set iterator.
    - valid_loader: validation set iterator.
    """
    error_msg = "[!] valid_size should be in the range [0, 1]."
    assert ((valid_size >= 0) and (valid_size <= 1)), error_msg

    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # define transforms
    valid_transform = transforms.Compose([
            transforms.ToTensor(),
            normalize,
    ])
    if augment:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        train_transform = transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])

    # load the dataset
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=train_transform,
    )

    valid_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=valid_transform,
    )

    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler,
        num_workers=num_workers, pin_memory=pin_memory,
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler,
        num_workers=num_workers, pin_memory=pin_memory,
    )

    # visualize some images
    if show_sample:
        sample_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=9, shuffle=shuffle,
            num_workers=num_workers, pin_memory=pin_memory,
        )
        data_iter = iter(sample_loader)
        images, labels = data_iter.next()
        X = images.numpy().transpose([0, 2, 3, 1])
        plot_images(X, labels)

    loaders = {"Train":train_loader, "Valid":valid_loader} 
    dataset_sizes = {"Train": len(train_dataset), "Valid": len(train_dataset) * valid_size}
    return loaders, dataset_sizes


def get_test_loader(data_dir,
                    batch_size,
                    shuffle=True,
                    num_workers=4,
                    pin_memory=False):
    """
    Utility function for loading and returning a multi-process
    test iterator over the CIFAR-10 dataset.
    If using CUDA, num_workers should be set to 1 and pin_memory to True.
    Params
    ------
    - data_dir: path directory to the dataset.
    - batch_size: how many samples per batch to load.
    - shuffle: whether to shuffle the dataset after every epoch.
    - num_workers: number of subprocesses to use when loading the dataset.
    - pin_memory: whether to copy tensors into CUDA pinned memory. Set it to
      True if using GPU.
    Returns
    -------
    - data_loader: test set iterator.
    """
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )

    # define transform
    transform = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    dataset = datasets.CIFAR10(
        root=data_dir, train=False,
        download=True, transform=transform,
    )

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle,
        num_workers=num_workers, pin_memory=pin_memory,
    )
    loader = {"Test":data_loader}
    dataset_sizes = {"Test": len(dataset)}
    return loader, dataset_sizes

In [6]:
# Training the model
def train_model(model, dataloaders, dataset_sizes, criterion, optimizer, tb_logger, num_epochs, scheduler=None):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['Train', 'Valid']:
            if phase == 'Train':
                if scheduler:
                    scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(DEVICE)
                labels = labels.to(DEVICE)
                # DEVICE --> where the computation is happening
                # Torch.dtypes --> cuda tensors, cpu tensors

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'Train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'Train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            #Log with Tensorboard
            if phase=='Train':
                tb_logger.add_scalar('train_Loss',float(epoch_loss), epoch+1)
                tb_logger.add_scalar('train_Accuracy', float(epoch_acc), epoch+1)
            elif phase=='Valid':
                tb_logger.add_scalar('valid_Loss',float(epoch_loss), epoch+1)
                tb_logger.add_scalar('valid_Accuracy', float(epoch_acc), epoch+1)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'Valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return best_model, epoch_loss, best_acc

def test_model(model, dataloaders, dataset_sizes, criterion):

    since = time.time()
    
    with torch.no_grad():
        for phase in ['Test']:
            model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(DEVICE)
                labels = labels.to(DEVICE)

                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)
                loss = criterion(outputs, labels)

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Test Acc: {:4f}'.format(epoch_acc))

    return epoch_acc

In [28]:
class Net(nn.Module):
    def __init__(self, c1=32, c2=64, c3=64, l1=64):
        super(Net, self).__init__()

        self.conv_block = nn.Sequential(OrderedDict([
          ('conv1', nn.Conv2d(3, c1, 3)),
          ('relu1', nn.ReLU()),
          ('pool1', nn.MaxPool2d(2,2)),
          ('conv2', nn.Conv2d(c1,c2,3)),
          ('relu2', nn.ReLU()),
          ('pool2', nn.MaxPool2d(2,2)),
          ('conv3', nn.Conv2d(c2,c2,3)),
          ('relu3', nn.ReLU()),
          ('flatten', nn.Flatten())
        ]))
        self.fc_block = nn.Sequential(OrderedDict([
          ('fc1', nn.Linear(1024, l1)),
          ('relu1', nn.ReLU()),
          ('fc2', nn.Linear(l1, 10))
        ]))

    def forward(self, x):
        x = self.conv_block(x)
        x = self.fc_block(x)
        return x

In [8]:
SEED=42
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data_dir="./data"
batch_size=64
augment=False

valid_size=0.2
shuffle=True

learning_rate = 0.001

trainloaders, train_dataset_sizes = get_train_valid_loader(data_dir=data_dir, batch_size=batch_size,
                       augment=augment, random_seed=SEED, valid_size=valid_size,
                       shuffle=shuffle, show_sample=False, num_workers=1, pin_memory=True)
testloader, test_dataset_sizes = get_test_loader(data_dir=data_dir, batch_size=64, shuffle=False,
                             num_workers=1, pin_memory=True)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Files already downloaded and verified


In [10]:
LOG_DIR = 'runs'
os.makedirs(LOG_DIR, exist_ok=True)

In [None]:
model = Net().to(DEVICE)

In [None]:
if DEVICE == "cuda":
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
model = model.to(DEVICE)

In [21]:
def train(config, checkpoint_dir = None, data_dir="./data"):

    net = Net(config["c1"], config["c2"], config["c3"], config["l1"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    scheduler = None
    if config["optim"] == "SGD":
        optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)
    elif config["optim"] == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    # The `checkpoint_dir` parameter gets passed by Ray Tune when a checkpoint
    # should be restored.
    if checkpoint_dir:
        checkpoint = os.path.join(checkpoint_dir, "checkpoint")
        model_state, optimizer_state = torch.load(checkpoint)
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    augment=False
    valid_size=0.2
    shuffle=True
    trainloaders, train_dataset_sizes = get_train_valid_loader(data_dir=data_dir, batch_size=config["batchsize"],
                       augment=augment, random_seed=SEED, valid_size=valid_size,
                       shuffle=shuffle, show_sample=False, num_workers=8, pin_memory=True)
    testloader, test_dataset_sizes = get_test_loader(data_dir=data_dir, batch_size=64, shuffle=False,
                             num_workers=8, pin_memory=True)
    
    tb_logger = SummaryWriter(os.path.join(LOG_DIR,"test_exp2"))
    num_epochs = 10

    model, val_loss, val_acc = train_model(net, trainloaders, train_dataset_sizes, criterion, optimizer, tb_logger, num_epochs)
    
    test_acc = test_model(net, testloader, test_dataset_sizes, criterion)

    tune.report(loss=val_loss, accuracy=val_acc)
    tb_logger.close()


    




In [26]:
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=0):
    data_dir = os.path.abspath("./data")
    # load_data(data_dir)  # Download data for all trials before starting the run
    config = {
        "c1": tune.choice([16, 32, 64]),
        "c2": tune.choice([32, 64, 128]),
        "c3": tune.choice([64, 128, 256]),        
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    result = tune.run(
        partial(train, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    checkpoint_path = os.path.join(best_trial.checkpoint.value, "checkpoint")

    model_state, optimizer_state = torch.load(checkpoint_path)
    best_trained_model.load_state_dict(model_state)

    testloader, test_dataset_sizes = get_test_loader(data_dir=data_dir, batch_size=64, shuffle=False,
                             num_workers=8, pin_memory=True)
    test_acc = test_model(best_trained_model, testloader, test_dataset_sizes, criterion)
    print("Best trial test set accuracy: {}".format(test_acc))

In [29]:
main()

2020-10-10 12:13:17,428	INFO registry.py:65 -- Detected unknown callable for trainable. Converting to class.


== Status ==
Memory usage on this node: 2.8/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2/2 CPUs, 0/1 GPUs, 0.0/7.13 GiB heap, 0.0/2.44 GiB objects (0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT
Number of trials: 10 (9 PENDING, 1 RUNNING)
+---------------------+----------+-------+--------------+------+------+------+------+-------------+
| Trial name          | status   | loc   |   batch_size |   c1 |   c2 |   c3 |   l1 |          lr |
|---------------------+----------+-------+--------------+------+------+------+------+-------------|
| DEFAULT_fab5d_00000 | RUNNING  |       |            2 |   16 |  128 |  128 |   64 | 0.00398859  |
| DEFAULT_fab5d_00001 | PENDING  |       |            8 |   32 |   64 |  128 |  128 | 0.0133473   |
| DEFAULT_fab5d_00002 | PENDING  |       |            4 |   32 |  128 |   64 |    8 | 0.00536858  |
| DEFAULT_fab5d_00003 | PENDING  

[2m[36m(pid=1318)[0m 2020-10-10 12:13:19,390	ERROR function_runner.py:233 -- Runner Thread raised error.
[2m[36m(pid=1318)[0m Traceback (most recent call last):
[2m[36m(pid=1318)[0m   File "/usr/local/lib/python3.6/dist-packages/ray/tune/function_runner.py", line 227, in run
[2m[36m(pid=1318)[0m     self._entrypoint()
[2m[36m(pid=1318)[0m   File "/usr/local/lib/python3.6/dist-packages/ray/tune/function_runner.py", line 290, in entrypoint
[2m[36m(pid=1318)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=1318)[0m   File "/usr/local/lib/python3.6/dist-packages/ray/tune/function_runner.py", line 499, in _trainable_func
[2m[36m(pid=1318)[0m     output = train_func(config, checkpoint_dir=checkpoint_dir)
[2m[36m(pid=1318)[0m   File "<ipython-input-21-6d676047dfc4>", line 14, in train
[2m[36m(pid=1318)[0m KeyError: 'optim'
[2m[36m(pid=1318)[0m Exception in thread Thread-2:
[2m[36m(pid=1318)[0m Traceback (most recent call last):
[2m[36m(pid=1318)

== Status ==
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.13 GiB heap, 0.0/2.44 GiB objects (0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT
Number of trials: 10 (3 ERROR, 7 PENDING)
+---------------------+----------+-------+--------------+------+------+------+------+-------------+
| Trial name          | status   | loc   |   batch_size |   c1 |   c2 |   c3 |   l1 |          lr |
|---------------------+----------+-------+--------------+------+------+------+------+-------------|
| DEFAULT_fab5d_00000 | ERROR    |       |            2 |   16 |  128 |  128 |   64 | 0.00398859  |
| DEFAULT_fab5d_00001 | ERROR    |       |            8 |   32 |   64 |  128 |  128 | 0.0133473   |
| DEFAULT_fab5d_00002 | ERROR    |       |            4 |   32 |  128 |   64 |    8 | 0.00536858  |
| DEFAULT_fab5d_00003 | PENDING  | 

[2m[36m(pid=1380)[0m 2020-10-10 12:13:25,743	ERROR function_runner.py:233 -- Runner Thread raised error.
[2m[36m(pid=1380)[0m Traceback (most recent call last):
[2m[36m(pid=1380)[0m   File "/usr/local/lib/python3.6/dist-packages/ray/tune/function_runner.py", line 227, in run
[2m[36m(pid=1380)[0m     self._entrypoint()
[2m[36m(pid=1380)[0m   File "/usr/local/lib/python3.6/dist-packages/ray/tune/function_runner.py", line 290, in entrypoint
[2m[36m(pid=1380)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=1380)[0m   File "/usr/local/lib/python3.6/dist-packages/ray/tune/function_runner.py", line 499, in _trainable_func
[2m[36m(pid=1380)[0m     output = train_func(config, checkpoint_dir=checkpoint_dir)
[2m[36m(pid=1380)[0m   File "<ipython-input-21-6d676047dfc4>", line 14, in train
[2m[36m(pid=1380)[0m KeyError: 'optim'
[2m[36m(pid=1380)[0m Exception in thread Thread-2:
[2m[36m(pid=1380)[0m Traceback (most recent call last):
[2m[36m(pid=1380)

== Status ==
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.13 GiB heap, 0.0/2.44 GiB objects (0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT
Number of trials: 10 (6 ERROR, 4 PENDING)
+---------------------+----------+-------+--------------+------+------+------+------+-------------+
| Trial name          | status   | loc   |   batch_size |   c1 |   c2 |   c3 |   l1 |          lr |
|---------------------+----------+-------+--------------+------+------+------+------+-------------|
| DEFAULT_fab5d_00000 | ERROR    |       |            2 |   16 |  128 |  128 |   64 | 0.00398859  |
| DEFAULT_fab5d_00001 | ERROR    |       |            8 |   32 |   64 |  128 |  128 | 0.0133473   |
| DEFAULT_fab5d_00002 | ERROR    |       |            4 |   32 |  128 |   64 |    8 | 0.00536858  |
| DEFAULT_fab5d_00003 | ERROR    | 

[2m[36m(pid=1442)[0m 2020-10-10 12:13:32,152	ERROR function_runner.py:233 -- Runner Thread raised error.
[2m[36m(pid=1442)[0m Traceback (most recent call last):
[2m[36m(pid=1442)[0m   File "/usr/local/lib/python3.6/dist-packages/ray/tune/function_runner.py", line 227, in run
[2m[36m(pid=1442)[0m     self._entrypoint()
[2m[36m(pid=1442)[0m   File "/usr/local/lib/python3.6/dist-packages/ray/tune/function_runner.py", line 290, in entrypoint
[2m[36m(pid=1442)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=1442)[0m   File "/usr/local/lib/python3.6/dist-packages/ray/tune/function_runner.py", line 499, in _trainable_func
[2m[36m(pid=1442)[0m     output = train_func(config, checkpoint_dir=checkpoint_dir)
[2m[36m(pid=1442)[0m   File "<ipython-input-21-6d676047dfc4>", line 14, in train
[2m[36m(pid=1442)[0m KeyError: 'optim'
[2m[36m(pid=1442)[0m Exception in thread Thread-2:
[2m[36m(pid=1442)[0m Traceback (most recent call last):
[2m[36m(pid=1442)

== Status ==
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.13 GiB heap, 0.0/2.44 GiB objects (0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT
Number of trials: 10 (9 ERROR, 1 PENDING)
+---------------------+----------+-------+--------------+------+------+------+------+-------------+
| Trial name          | status   | loc   |   batch_size |   c1 |   c2 |   c3 |   l1 |          lr |
|---------------------+----------+-------+--------------+------+------+------+------+-------------|
| DEFAULT_fab5d_00000 | ERROR    |       |            2 |   16 |  128 |  128 |   64 | 0.00398859  |
| DEFAULT_fab5d_00001 | ERROR    |       |            8 |   32 |   64 |  128 |  128 | 0.0133473   |
| DEFAULT_fab5d_00002 | ERROR    |       |            4 |   32 |  128 |   64 |    8 | 0.00536858  |
| DEFAULT_fab5d_00003 | ERROR    | 

[2m[36m(pid=1503)[0m 2020-10-10 12:13:38,580	ERROR function_runner.py:233 -- Runner Thread raised error.
[2m[36m(pid=1503)[0m Traceback (most recent call last):
[2m[36m(pid=1503)[0m   File "/usr/local/lib/python3.6/dist-packages/ray/tune/function_runner.py", line 227, in run
[2m[36m(pid=1503)[0m     self._entrypoint()
[2m[36m(pid=1503)[0m   File "/usr/local/lib/python3.6/dist-packages/ray/tune/function_runner.py", line 290, in entrypoint
[2m[36m(pid=1503)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=1503)[0m   File "/usr/local/lib/python3.6/dist-packages/ray/tune/function_runner.py", line 499, in _trainable_func
[2m[36m(pid=1503)[0m     output = train_func(config, checkpoint_dir=checkpoint_dir)
[2m[36m(pid=1503)[0m   File "<ipython-input-21-6d676047dfc4>", line 14, in train
[2m[36m(pid=1503)[0m KeyError: 'optim'
[2m[36m(pid=1503)[0m Exception in thread Thread-2:
[2m[36m(pid=1503)[0m Traceback (most recent call last):
[2m[36m(pid=1503)

== Status ==
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.13 GiB heap, 0.0/2.44 GiB objects (0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT
Number of trials: 10 (10 ERROR)
+---------------------+----------+-------+--------------+------+------+------+------+-------------+
| Trial name          | status   | loc   |   batch_size |   c1 |   c2 |   c3 |   l1 |          lr |
|---------------------+----------+-------+--------------+------+------+------+------+-------------|
| DEFAULT_fab5d_00000 | ERROR    |       |            2 |   16 |  128 |  128 |   64 | 0.00398859  |
| DEFAULT_fab5d_00001 | ERROR    |       |            8 |   32 |   64 |  128 |  128 | 0.0133473   |
| DEFAULT_fab5d_00002 | ERROR    |       |            4 |   32 |  128 |   64 |    8 | 0.00536858  |
| DEFAULT_fab5d_00003 | ERROR    |       |   

TuneError: ignored