In [10]:
import numpy as np
import torch
import torch.optim as optim
from torchvision import datasets

from ray import tune
from ray.tune import track
from ray.tune.schedulers import ASHAScheduler
from ray.tune.examples.mnist_pytorch import get_data_loaders, ConvNet, train, test

In [17]:
def train_mnist(config):
    train_loader, test_loader = get_data_loaders()
    print(train_loader)
    model = ConvNet()
    print(model.parameters(), model)
    optimizer = optim.SGD(model.parameters(), lr=config["lr"])
    for i in range(10):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)
        tune.track.log(mean_accuracy=acc)


In [19]:
analysis = tune.run(
    train_mnist, config={"lr": tune.grid_search([0.01, 0.1, 0.3]),
                        "kernel_size": tune.grid_search([(3, 3), (5, 5)])})

print("Best config: ", analysis.get_best_config(metric="mean_accuracy"))

# Get a dataframe for analyzing trial results.
df = analysis.dataframe()

Trial name,status,loc,kernel_size,lr
train_mnist_00000,RUNNING,,"(3, 3)",0.01
train_mnist_00001,PENDING,,"(5, 5)",0.01
train_mnist_00002,PENDING,,"(3, 3)",0.1
train_mnist_00003,PENDING,,"(5, 5)",0.1
train_mnist_00004,PENDING,,"(3, 3)",0.3
train_mnist_00005,PENDING,,"(5, 5)",0.3


[2m[36m(pid=19257)[0m 2020-06-22 17:18:34,792	INFO trainable.py:217 -- Getting current IP.
[2m[36m(pid=19257)[0m <torch.utils.data.dataloader.DataLoader object at 0x7f993c442390>
[2m[36m(pid=19257)[0m <generator object Module.parameters at 0x7f993c4325c8> ConvNet(
[2m[36m(pid=19257)[0m   (conv1): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1))
[2m[36m(pid=19257)[0m   (fc): Linear(in_features=192, out_features=10, bias=True)
[2m[36m(pid=19257)[0m )
Result for train_mnist_00000:
  date: 2020-06-22_17-18-35
  done: false
  experiment_id: 283fd6f7a5fc460a90983af4e80292e0
  experiment_tag: 0_kernel_size=(3, 3),lr=0.01
  hostname: Ravis-MacBook-Pro.local
  iterations_since_restore: 1
  mean_accuracy: 0.096875
  node_ip: 10.9.12.130
  pid: 19257
  time_since_restore: 0.5220098495483398
  time_this_iter_s: 0.5220098495483398
  time_total_s: 0.5220098495483398
  timestamp: 1592871515
  timesteps_since_restore: 0
  training_iteration: 0
  trial_id: '00000'
  


Trial name,status,loc,kernel_size,lr,acc,total time (s),iter
train_mnist_00000,RUNNING,10.9.12.130:19257,"(3, 3)",0.01,0.096875,0.52201,0.0
train_mnist_00001,RUNNING,,"(5, 5)",0.01,,,
train_mnist_00002,RUNNING,,"(3, 3)",0.1,,,
train_mnist_00003,RUNNING,,"(5, 5)",0.1,,,
train_mnist_00004,PENDING,,"(3, 3)",0.3,,,
train_mnist_00005,PENDING,,"(5, 5)",0.3,,,


[2m[36m(pid=19258)[0m 2020-06-22 17:18:35,868	INFO trainable.py:217 -- Getting current IP.
[2m[36m(pid=19260)[0m 2020-06-22 17:18:36,068	INFO trainable.py:217 -- Getting current IP.
[2m[36m(pid=19258)[0m <torch.utils.data.dataloader.DataLoader object at 0x7ffd0ec3f240>
[2m[36m(pid=19258)[0m <generator object Module.parameters at 0x7ffd0ec305c8> ConvNet(
[2m[36m(pid=19258)[0m   (conv1): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1))
[2m[36m(pid=19258)[0m   (fc): Linear(in_features=192, out_features=10, bias=True)
[2m[36m(pid=19258)[0m )
[2m[36m(pid=19264)[0m 2020-06-22 17:18:36,343	INFO trainable.py:217 -- Getting current IP.
[2m[36m(pid=19260)[0m <torch.utils.data.dataloader.DataLoader object at 0x7f82a8442400>
[2m[36m(pid=19260)[0m <generator object Module.parameters at 0x7f82a84325c8> ConvNet(
[2m[36m(pid=19260)[0m   (conv1): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1))
[2m[36m(pid=19260)[0m   (fc): Linear(in_features=192, out_features=10, bias

Trial name,status,loc,kernel_size,lr,acc,total time (s),iter
train_mnist_00000,RUNNING,10.9.12.130:19257,"(3, 3)",0.01,0.44375,4.00553,4.0
train_mnist_00001,RUNNING,10.9.12.130:19258,"(5, 5)",0.01,0.28125,3.92747,6.0
train_mnist_00002,RUNNING,10.9.12.130:19260,"(3, 3)",0.1,0.86875,4.32102,4.0
train_mnist_00003,RUNNING,10.9.12.130:19264,"(5, 5)",0.1,0.740625,1.95942,3.0
train_mnist_00004,PENDING,,"(3, 3)",0.3,,,
train_mnist_00005,PENDING,,"(5, 5)",0.3,,,


Result for train_mnist_00000:
  date: 2020-06-22_17-18-40
  done: false
  experiment_id: 283fd6f7a5fc460a90983af4e80292e0
  experiment_tag: 0_kernel_size=(3, 3),lr=0.01
  hostname: Ravis-MacBook-Pro.local
  iterations_since_restore: 6
  mean_accuracy: 0.565625
  node_ip: 10.9.12.130
  pid: 19257
  time_since_restore: 5.123963832855225
  time_this_iter_s: 1.1184308528900146
  time_total_s: 5.123963832855225
  timestamp: 1592871520
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '00000'
  
Result for train_mnist_00001:
  date: 2020-06-22_17-18-40
  done: false
  experiment_id: 2f61430d42674bb6a495e44ee048ad40
  experiment_tag: 1_kernel_size=(5, 5),lr=0.01
  hostname: Ravis-MacBook-Pro.local
  iterations_since_restore: 8
  mean_accuracy: 0.29375
  node_ip: 10.9.12.130
  pid: 19258
  time_since_restore: 4.3568761348724365
  time_this_iter_s: 0.42940521240234375
  time_total_s: 4.3568761348724365
  timestamp: 1592871520
  timesteps_since_restore: 0
  training_iteration: 7
 

2020-06-22 17:18:42,838	INFO logger.py:271 -- Removed the following hyperparameter values when logging to tensorboard: {'kernel_size': (5, 5)}


Result for train_mnist_00002:
  date: 2020-06-22_17-18-41
  done: false
  experiment_id: 8e52dedfed2f4b968bc0eb4c51d7e934
  experiment_tag: 2_kernel_size=(3, 3),lr=0.1
  hostname: Ravis-MacBook-Pro.local
  iterations_since_restore: 6
  mean_accuracy: 0.784375
  node_ip: 10.9.12.130
  pid: 19260
  time_since_restore: 4.871011972427368
  time_this_iter_s: 0.5499899387359619
  time_total_s: 4.871011972427368
  timestamp: 1592871521
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '00002'
  


2020-06-22 17:18:45,371	INFO logger.py:271 -- Removed the following hyperparameter values when logging to tensorboard: {'kernel_size': (5, 5)}


[2m[36m(pid=19269)[0m 2020-06-22 17:18:45,835	INFO trainable.py:217 -- Getting current IP.


Trial name,status,loc,kernel_size,lr,acc,total time (s),iter
train_mnist_00000,RUNNING,10.9.12.130:19257,"(3, 3)",0.01,0.65625,6.76496,7.0
train_mnist_00001,TERMINATED,,"(5, 5)",0.01,0.415625,6.35257,9.0
train_mnist_00002,RUNNING,10.9.12.130:19260,"(3, 3)",0.1,0.884375,9.29618,9.0
train_mnist_00003,TERMINATED,,"(5, 5)",0.1,0.878125,8.28695,9.0
train_mnist_00004,RUNNING,,"(3, 3)",0.3,,,
train_mnist_00005,RUNNING,,"(5, 5)",0.3,,,


2020-06-22 17:18:46,294	INFO logger.py:271 -- Removed the following hyperparameter values when logging to tensorboard: {'kernel_size': (3, 3)}


[2m[36m(pid=19269)[0m <torch.utils.data.dataloader.DataLoader object at 0x7ffab6c432b0>
[2m[36m(pid=19269)[0m <generator object Module.parameters at 0x7ffab6c335c8> ConvNet(
[2m[36m(pid=19269)[0m   (conv1): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1))
[2m[36m(pid=19269)[0m   (fc): Linear(in_features=192, out_features=10, bias=True)
[2m[36m(pid=19269)[0m )
[2m[33m(pid=raylet)[0m E0622 17:18:46.377467 95387072 node_manager.cc:3537] Failed to send get core worker stats request: IOError: 14: Socket closed
Result for train_mnist_00000:
  date: 2020-06-22_17-18-45
  done: false
  experiment_id: 283fd6f7a5fc460a90983af4e80292e0
  experiment_tag: 0_kernel_size=(3, 3),lr=0.01
  hostname: Ravis-MacBook-Pro.local
  iterations_since_restore: 9
  mean_accuracy: 0.7
  node_ip: 10.9.12.130
  pid: 19257
  time_since_restore: 10.060168981552124
  time_this_iter_s: 3.295210123062134
  time_total_s: 10.060168981552124
  timestamp: 1592871525
  timesteps_since_restore: 0
  training_it

2020-06-22 17:18:47,292	INFO logger.py:271 -- Removed the following hyperparameter values when logging to tensorboard: {'kernel_size': (3, 3)}


[2m[36m(pid=19266)[0m 2020-06-22 17:18:47,908	INFO trainable.py:217 -- Getting current IP.
[2m[36m(pid=19266)[0m <torch.utils.data.dataloader.DataLoader object at 0x7fd49ac41470>
[2m[36m(pid=19266)[0m <generator object Module.parameters at 0x7fd49ac325c8> ConvNet(
[2m[36m(pid=19266)[0m   (conv1): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1))
[2m[36m(pid=19266)[0m   (fc): Linear(in_features=192, out_features=10, bias=True)
[2m[36m(pid=19266)[0m )
Result for train_mnist_00005:
  date: 2020-06-22_17-18-48
  done: false
  experiment_id: 7f581634ba0349e59559c953074fd3dd
  experiment_tag: 5_kernel_size=(5, 5),lr=0.3
  hostname: Ravis-MacBook-Pro.local
  iterations_since_restore: 1
  mean_accuracy: 0.646875
  node_ip: 10.9.12.130
  pid: 19266
  time_since_restore: 0.3607819080352783
  time_this_iter_s: 0.3607819080352783
  time_total_s: 0.3607819080352783
  timestamp: 1592871528
  timesteps_since_restore: 0
  training_iteration: 0
  trial_id: '00005'
  


2020-06-22 17:18:51,085	INFO logger.py:271 -- Removed the following hyperparameter values when logging to tensorboard: {'kernel_size': (3, 3)}


Trial name,status,loc,kernel_size,lr,acc,total time (s),iter
train_mnist_00000,TERMINATED,,"(3, 3)",0.01,0.7375,11.8712,9
train_mnist_00001,TERMINATED,,"(5, 5)",0.01,0.415625,6.35257,9
train_mnist_00002,TERMINATED,,"(3, 3)",0.1,0.884375,9.29618,9
train_mnist_00003,TERMINATED,,"(5, 5)",0.1,0.878125,8.28695,9
train_mnist_00004,TERMINATED,,"(3, 3)",0.3,0.934375,4.60133,9
train_mnist_00005,RUNNING,10.9.12.130:19266,"(5, 5)",0.3,0.8625,2.34869,5


2020-06-22 17:18:52,951	INFO logger.py:271 -- Removed the following hyperparameter values when logging to tensorboard: {'kernel_size': (5, 5)}


Trial name,status,loc,kernel_size,lr,acc,total time (s),iter
train_mnist_00000,TERMINATED,,"(3, 3)",0.01,0.7375,11.8712,9
train_mnist_00001,TERMINATED,,"(5, 5)",0.01,0.415625,6.35257,9
train_mnist_00002,TERMINATED,,"(3, 3)",0.1,0.884375,9.29618,9
train_mnist_00003,TERMINATED,,"(5, 5)",0.1,0.878125,8.28695,9
train_mnist_00004,TERMINATED,,"(3, 3)",0.3,0.934375,4.60133,9
train_mnist_00005,TERMINATED,,"(5, 5)",0.3,0.915625,4.50016,9


Best config:  {'lr': 0.1, 'kernel_size': (5, 5)}


In [16]:
dfs = analysis.trial_dataframes
[d.mean_accuracy.plot() for d in dfs.values()]

[<matplotlib.axes._subplots.AxesSubplot at 0x7fe8fc1e97b8>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7fe8fc1e97b8>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7fe8fc1e97b8>]