In [1]:
# Inject a CSS rule that stretches elements with class "container" to the full
# browser width. `display` and `HTML` come from the public `IPython.display`
# module; importing them from `IPython.core.display` is deprecated.
# NOTE(review): the ".container" selector presumably targets the classic
# Notebook page layout - confirm it has an effect in the front end in use.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [1]:
from functools import partial
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

import torch.utils.data as data
from collections import OrderedDict
import numpy as np
import pandas as pd
import pickle
from model.model import Model, DropModel
from glob import glob
import random
import matplotlib.pyplot as plt
from datetime import datetime

In [2]:
# Sub-directory names scanned by load_data(); a level's position in this list
# is the integer class index assigned to every sample found under it.
levels = ['level0', 'level1', 'level3']

# Number of coordinates per joint; passed to the datasets as `dim`.
data_dim = 3
# Per-joint channel count after the normalised age is appended to the coordinates.
# Not referenced by the visible code, which recomputes `data_dim + 1` where needed.
in_channels = data_dim + 1 # coord. + age
# Forwarded to DropModel as `out_channels`.
out_channels = 300
# Fixed clip length: `max_frames` for the datasets and `frames` for DropModel.
frames = 96
# Passed to the datasets as `stride`; the dataset constructors do not read it.
stride = 60
# Joint selection applied only when dim == 2 and data_type != "3D".
# NOTE(review): this list has 16 entries while joint_num is 13 - verify that the
# 2D path still matches the (joint_num, dim) buffers the datasets allocate.
re_order_indices= [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16]
# "3D" makes the datasets read files with np.load; any other value uses pickle.
data_type = "3D"
# Number of joints per frame; sizes the dataset buffers and DropModel's num_point.
joint_num = 13

In [3]:
import math

def angle(a, b, c):
    """Return the planar angle at vertex ``b`` measured from ray b->a to ray b->c.

    Only components 0 and 1 of each point are read, so any further components
    (e.g. a z coordinate) do not influence the result.

    Args:
        a, b, c: indexable points with at least two numeric components.

    Returns:
        The angle in degrees; negative differences are wrapped by adding 360,
        so the value lies in [0, 360].
    """
    heading_to_c = math.atan2(c[1] - b[1], c[0] - b[0])
    heading_to_a = math.atan2(a[1] - b[1], a[0] - b[0])
    degrees = math.degrees(heading_to_c - heading_to_a)
    if degrees < 0:
        return degrees + 360
    return degrees

In [4]:
class TrainDataset(data.Dataset):
    """Skeleton sequences resampled to fixed-length clips with a binary label.

    For every input file the constructor:
      1. parses the age from the file stem (second ``_``-separated field, text
         before the first ``m``) and scales it by 1/14
         (NOTE(review): presumably months with 14 as the maximum - confirm);
      2. labels the sample 1 when its entry in ``label_list`` equals
         ``target_level`` and 0 otherwise;
      3. loads the per-frame joints, subtracts joint 7 from every joint of the
         same frame, and accumulates ``angle(joint 2, joint 7, joint 10)``;
      4. min-max normalises each coordinate axis over the whole sequence and
         appends the scaled age as one extra channel;
      5. resamples the sequence to exactly ``max_frames`` frames.

    Args:
        file_list: paths of the skeleton files.
        label_list: class index per file, aligned with ``file_list``.
        target_level: class index treated as the positive class.
        dim: number of coordinates per joint.
        max_frames: length of every produced clip.
        stride: accepted for interface compatibility; not used.

    Raises:
        ValueError: if a file contains no frames.

    Relies on the notebook-level names ``data_type``, ``joint_num``,
    ``re_order_indices`` and ``angle``.
    """

    def __init__(self, file_list, label_list, target_level, dim=2, max_frames=48, stride=30):
        super(TrainDataset, self).__init__()
        self.max_frames = max_frames

        self.clips = []
        self.labels = []
        self.subject_names = []
        self.ages = []
        self.angles = []

        for index, pkl_filename in enumerate(file_list):
            # Only the age field of the file stem is used.
            name = os.path.splitext(os.path.basename(pkl_filename))[0]
            age = int(float(name.split('_')[1].split('m')[0]))
            self.ages.append(np.ones((1, 10)) * age / 14.)

            self.labels.append(1 if label_list[index] == target_level else 0)

            skeleton_dict = self._load_skeletons(pkl_filename)
            num_frames = len(skeleton_dict)
            if num_frames == 0:
                # Fail with a readable message instead of a numpy reduction error.
                raise ValueError("no skeleton frames in {}".format(pkl_filename))

            skeleton_np = np.zeros((num_frames, joint_num, dim))
            angle_sum = 0
            for i in range(num_frames):
                if dim == 2 and data_type != "3D":
                    sk = skeleton_dict[i][re_order_indices]
                else:
                    sk = skeleton_dict[i]
                angle_sum += angle(sk[2], sk[7], sk[10])
                # Express every joint relative to joint 7 of the same frame.
                skeleton_np[i] = sk - skeleton_dict[i][7]

            # Mean angle as a fraction of a full turn, repeated once per joint.
            self.angles.append(np.ones(joint_num) * (angle_sum / 360) / num_frames)

            skeleton_np = self._min_max_normalize(skeleton_np)
            age_np = np.ones((num_frames, joint_num, 1)) * age / 14.
            skeleton_np = np.concatenate((skeleton_np, age_np), axis=2)

            # Each sample is stored as a one-element list, so __getitem__ yields
            # a tensor with a leading axis of size 1.
            self.clips.append([self._resample_frames(skeleton_np, max_frames)])
            self.subject_names.append(name)

    @staticmethod
    def _load_skeletons(path):
        """Read one skeleton file: np.load when data_type is "3D", else pickle."""
        if data_type == "3D":
            return np.load(path)
        # NOTE(review): pickle.load can execute arbitrary code contained in the
        # file; only load files from a trusted source.
        with open(path, "rb") as rfile:
            return pickle.load(rfile)

    @staticmethod
    def _min_max_normalize(skeleton_np):
        """Scale each coordinate axis to [0, 1] over all frames and joints.

        An axis whose values are all equal maps to 0 instead of dividing by zero.
        """
        min_coord = np.min(skeleton_np, axis=(0, 1))
        max_coord = np.max(skeleton_np, axis=(0, 1))
        span = max_coord - min_coord
        span = np.where(span == 0, 1.0, span)
        return (skeleton_np - min_coord) / span

    @staticmethod
    def _resample_frames(skeleton_np, max_frames):
        """Return exactly ``max_frames`` frames taken from ``skeleton_np``.

        Longer sequences are subsampled: the first ``max_frames - r`` picks
        advance by ``step = n // max_frames`` and the remaining ``r = n % max_frames``
        picks advance by ``step + 1``. Shorter (or equal-length) sequences are
        stretched by repeating frames: the first ``max_frames % n`` frames are
        repeated ``max_frames // n + 1`` times and the rest ``max_frames // n`` times.
        """
        num_frames = len(skeleton_np)
        if max_frames < num_frames:
            step = num_frames // max_frames
            compensation = num_frames - max_frames * step
            k = np.arange(max_frames)
            picks = k * step + np.maximum(0, k - (max_frames - compensation))
        else:
            step = max_frames // num_frames
            compensation = max_frames - num_frames * step
            repeats = np.full(num_frames, step)
            repeats[:compensation] += 1
            picks = np.repeat(np.arange(num_frames), repeats)
        return skeleton_np[picks]

    def __getitem__(self, index):
        """Return ``(clips, label, subject_name, head_angle)`` for one sample."""
        # np.asarray first: building a tensor straight from a list of ndarrays
        # takes PyTorch's slow element-wise path.
        clips = torch.tensor(np.asarray(self.clips[index]), dtype=torch.float)
        label = torch.tensor(self.labels[index], dtype=torch.long)
        subject_name = self.subject_names[index]
        head_angle = torch.tensor(self.angles[index], dtype=torch.float)

        return clips, label, subject_name, head_angle

    def __len__(self):
        """Number of samples loaded."""
        return len(self.subject_names)

In [5]:
class TestDataset(data.Dataset):
    """Evaluation counterpart of the training dataset: fixed-length skeleton clips.

    NOTE(review): this class performs the same preprocessing steps as
    TrainDataset; consider sharing a single implementation between the two.

    For every input file the constructor:
      1. parses the age from the file stem (second ``_``-separated field, text
         before the first ``m``) and scales it by 1/14
         (NOTE(review): presumably months with 14 as the maximum - confirm);
      2. labels the sample 1 when its entry in ``label_list`` equals
         ``target_level`` and 0 otherwise;
      3. loads the per-frame joints, subtracts joint 7 from every joint of the
         same frame, and accumulates ``angle(joint 2, joint 7, joint 10)``;
      4. min-max normalises each coordinate axis over the whole sequence and
         appends the scaled age as one extra channel;
      5. resamples the sequence to exactly ``max_frames`` frames.

    Args:
        file_list: paths of the skeleton files.
        label_list: class index per file, aligned with ``file_list``.
        target_level: class index treated as the positive class.
        dim: number of coordinates per joint.
        max_frames: length of every produced clip.
        stride: accepted for interface compatibility; not used.

    Raises:
        ValueError: if a file contains no frames.

    Relies on the notebook-level names ``data_type``, ``joint_num``,
    ``re_order_indices`` and ``angle``.
    """

    def __init__(self, file_list, label_list, target_level, dim=2, max_frames=48, stride=30):
        super(TestDataset, self).__init__()
        self.max_frames = max_frames

        self.clips = []
        self.labels = []
        self.subject_names = []
        self.ages = []
        self.angles = []

        for index, pkl_filename in enumerate(file_list):
            # Only the age field of the file stem is used.
            name = os.path.splitext(os.path.basename(pkl_filename))[0]
            age = int(float(name.split('_')[1].split('m')[0]))
            self.ages.append(np.ones((1, 10)) * age / 14.)

            self.labels.append(1 if label_list[index] == target_level else 0)

            skeleton_dict = self._load_skeletons(pkl_filename)
            num_frames = len(skeleton_dict)
            if num_frames == 0:
                # Fail with a readable message instead of a numpy reduction error.
                raise ValueError("no skeleton frames in {}".format(pkl_filename))

            skeleton_np = np.zeros((num_frames, joint_num, dim))
            angle_sum = 0
            for i in range(num_frames):
                if dim == 2 and data_type != "3D":
                    sk = skeleton_dict[i][re_order_indices]
                else:
                    sk = skeleton_dict[i]
                angle_sum += angle(sk[2], sk[7], sk[10])
                # Express every joint relative to joint 7 of the same frame.
                skeleton_np[i] = sk - skeleton_dict[i][7]

            # Mean angle as a fraction of a full turn, repeated once per joint.
            self.angles.append(np.ones(joint_num) * (angle_sum / 360) / num_frames)

            skeleton_np = self._min_max_normalize(skeleton_np)
            age_np = np.ones((num_frames, joint_num, 1)) * age / 14.
            skeleton_np = np.concatenate((skeleton_np, age_np), axis=2)

            # Each sample is stored as a one-element list, so __getitem__ yields
            # a tensor with a leading axis of size 1.
            self.clips.append([self._resample_frames(skeleton_np, max_frames)])
            self.subject_names.append(name)

    @staticmethod
    def _load_skeletons(path):
        """Read one skeleton file: np.load when data_type is "3D", else pickle."""
        if data_type == "3D":
            return np.load(path)
        # NOTE(review): pickle.load can execute arbitrary code contained in the
        # file; only load files from a trusted source.
        with open(path, "rb") as rfile:
            return pickle.load(rfile)

    @staticmethod
    def _min_max_normalize(skeleton_np):
        """Scale each coordinate axis to [0, 1] over all frames and joints.

        An axis whose values are all equal maps to 0 instead of dividing by zero.
        """
        min_coord = np.min(skeleton_np, axis=(0, 1))
        max_coord = np.max(skeleton_np, axis=(0, 1))
        span = max_coord - min_coord
        span = np.where(span == 0, 1.0, span)
        return (skeleton_np - min_coord) / span

    @staticmethod
    def _resample_frames(skeleton_np, max_frames):
        """Return exactly ``max_frames`` frames taken from ``skeleton_np``.

        Longer sequences are subsampled: the first ``max_frames - r`` picks
        advance by ``step = n // max_frames`` and the remaining ``r = n % max_frames``
        picks advance by ``step + 1``. Shorter (or equal-length) sequences are
        stretched by repeating frames: the first ``max_frames % n`` frames are
        repeated ``max_frames // n + 1`` times and the rest ``max_frames // n`` times.
        """
        num_frames = len(skeleton_np)
        if max_frames < num_frames:
            step = num_frames // max_frames
            compensation = num_frames - max_frames * step
            k = np.arange(max_frames)
            picks = k * step + np.maximum(0, k - (max_frames - compensation))
        else:
            step = max_frames // num_frames
            compensation = max_frames - num_frames * step
            repeats = np.full(num_frames, step)
            repeats[:compensation] += 1
            picks = np.repeat(np.arange(num_frames), repeats)
        return skeleton_np[picks]

    def __getitem__(self, index):
        """Return ``(clips, label, subject_name, head_angle)`` for one sample."""
        # np.asarray first: building a tensor straight from a list of ndarrays
        # takes PyTorch's slow element-wise path.
        clips = torch.tensor(np.asarray(self.clips[index]), dtype=torch.float)
        label = torch.tensor(self.labels[index], dtype=torch.long)
        subject_name = self.subject_names[index]
        head_angle = torch.tensor(self.angles[index], dtype=torch.float)

        return clips, label, subject_name, head_angle

    def __len__(self):
        """Number of samples loaded."""
        return len(self.subject_names)

In [6]:
def load_data(data_dir="../level_data", target_level="level0", split_seed=0):
    """Build the training and evaluation datasets from ``data_dir``.

    Every entry of the notebook-level ``levels`` list is read from
    ``<data_dir>/3D/<level>/*`` and labelled with the level's position in that
    list. Per level, 8 files (target level) or 4 files (other levels) are held
    out for evaluation; the rest form the training pool, which is shuffled.

    Args:
        data_dir: root directory containing the ``3D/<level>`` folders.
        target_level: name of the level treated as the positive class. Its
            index in ``levels`` is what the datasets compare labels against.
        split_seed: seed for the held-out selection. With a fixed seed (the
            default) every call holds out the same files, so an evaluation set
            obtained from a later call was not part of an earlier call's
            training pool. Pass ``None`` to draw the split from the global
            ``random`` module state instead.

    Returns:
        ``(trainset, testset)`` as ``TrainDataset`` and ``TestDataset``.

    Raises:
        ValueError: if ``target_level`` is not in ``levels``, or (from
            ``sample``) if a level folder has fewer files than its hold-out size.
    """
    if target_level not in levels:
        raise ValueError(
            "unknown target_level {!r}; expected one of {}".format(target_level, levels))
    target_index = levels.index(target_level)

    # A private generator keeps the hold-out choice independent of whatever
    # else consumes the global random state.
    splitter = random if split_seed is None else random.Random(split_seed)

    train_labels, trainData = [], []
    eval_labels, evalData = [], []

    for index, level in enumerate(levels):
        input_dir = "{}/3D/{}".format(data_dir, level)
        # glob returns entries in arbitrary order; sort so that an index always
        # refers to the same file and the seeded split is reproducible.
        subjects = sorted(glob(input_dir + "/*"))
        data_num = len(subjects)
        eval_num = 8 if level == target_level else 4

        eval_index = set(splitter.sample(range(data_num), k=eval_num))
        level_train, level_eval = 0, 0
        for i, subject in enumerate(subjects):
            if i in eval_index:
                eval_labels.append(index)
                evalData.append(subject)
                level_eval += 1
            else:
                train_labels.append(index)
                trainData.append(subject)
                level_train += 1
        print("level: {}, train: {}, eval: {}".format(level, level_train, level_eval))

    # Shuffle the training pool; files and labels are permuted together.
    train_index = np.arange(len(train_labels))
    np.random.shuffle(train_index)
    train_labels = [train_labels[i] for i in train_index]
    trainData = [trainData[i] for i in train_index]
    print("total | train: {}, eval: {}".format(len(train_labels), len(eval_labels)))

    trainset = TrainDataset(trainData, train_labels, target_level=target_index,
                            dim=data_dim, max_frames=frames, stride=stride)

    testset = TestDataset(evalData, eval_labels, target_level=target_index,
                          dim=data_dim, max_frames=frames, stride=stride)

    return trainset, testset

In [7]:
def train(config, checkpoint_dir=None, data_dir=None):
    """Ray Tune trainable: fit DropModel and report validation metrics per epoch.

    Args:
        config: trial configuration. ``config["lr"]`` is the SGD learning rate;
            the optional ``config["max_epochs"]`` overrides the default of 80
            epochs.
        checkpoint_dir: directory holding a ``"checkpoint"`` file with a
            ``(model_state, optimizer_state)`` tuple to resume from, or None.
        data_dir: root data directory forwarded to ``load_data``; when None,
            ``load_data``'s own default is used.

    After every epoch a checkpoint is written through ``tune.checkpoint_dir``
    and the mean validation loss and accuracy are sent with ``tune.report``.

    NOTE(review): the model is built with ``num_class=1`` while the labels are
    0/1 and ``CrossEntropyLoss`` needs one logit per class - confirm the width
    of DropModel's output.
    """
    net = DropModel(num_class=1, num_point=joint_num, num_person=1, graph="graph.h36m.Graph",
                           in_channels=data_dim + 1, out_channels=out_channels, frames=frames)

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    # Forwarding None would make load_data look under the literal path "None/3D/...".
    if data_dir is None:
        trainset, testset = load_data()
    else:
        trainset, testset = load_data(data_dir)

    # 80/20 split of the training pool into fit and validation subsets.
    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs])

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=16,
        shuffle=True,
        num_workers=8)
    # Sample order does not affect the summed validation metrics.
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=16,
        shuffle=False,
        num_workers=8)

    log_every = 4  # print the mean training loss every `log_every` mini-batches
    for epoch in range(config.get("max_epochs", 80)):  # loop over the dataset multiple times
        # Training mode for the optimisation pass (validation below switches to eval).
        net.train()
        running_loss = 0.0
        for i, batch in enumerate(trainloader, 0):
            # each batch is [inputs, labels, names, angles]
            inputs, labels, names, angles = batch
            inputs = inputs.to(device)
            labels = labels.to(device)

            # swap axes 1 and 4: the dataset's size-1 leading clip axis moves
            # last and the channel axis moves to position 1
            inputs = inputs.permute(0, 4, 2, 3, 1)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # running_loss covers exactly the last `log_every` batches, so that
            # is the divisor for the printed mean.
            running_loss += loss.item()
            if i % log_every == log_every - 1:
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / log_every))
                running_loss = 0.0

        # Validation pass in eval mode so that mode-dependent layers
        # (NOTE(review): presumably dropout, given the model name) are not
        # applied stochastically while measuring.
        net.eval()
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        with torch.no_grad():
            for batch in valloader:
                inputs, labels, names, angles = batch
                inputs = inputs.to(device)
                labels = labels.to(device)

                inputs = inputs.permute(0, 4, 2, 3, 1)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                val_loss += criterion(outputs, labels).item()
                val_steps += 1

        # A separate name keeps the `checkpoint_dir` argument intact.
        with tune.checkpoint_dir(epoch) as epoch_checkpoint_dir:
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")

In [11]:
def test_accuracy(net, device="cpu", data_dir="../level_data"):
    """Return the fraction of evaluation samples that ``net`` classifies correctly.

    Args:
        net: model to evaluate; must already live on ``device``.
        device: device the input batches are moved to.
        data_dir: root data directory forwarded to ``load_data``.

    Returns:
        ``correct / total`` over the evaluation dataset returned by ``load_data``.

    The model is put in eval mode for the measurement and its previous
    train/eval mode is restored afterwards.

    NOTE(review): the evaluation split is whatever ``load_data`` produces on
    this call; it is only a true hold-out if ``load_data`` yields the same
    split that was used while training.
    """
    trainset, testset = load_data(data_dir)

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2)

    was_training = net.training
    net.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for batch in testloader:
                inputs, labels, names, angles = batch
                inputs = inputs.to(device)
                labels = labels.to(device)

                # swap axes 1 and 4: the dataset's size-1 leading clip axis
                # moves last and the channel axis moves to position 1
                inputs = inputs.permute(0, 4, 2, 3, 1)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        net.train(was_training)

    return correct / total

In [10]:
def main(num_samples=10, max_epoch=50, gpus_per_trial=2):
    """Run the learning-rate grid search with Ray Tune and score the best trial.

    Args:
        num_samples: how many times the search grid is repeated.
        max_epoch: ``max_t`` of the ASHA scheduler. The length of each trial's
            epoch loop is decided inside ``train``, not by this argument.
        gpus_per_trial: GPUs requested per trial; values above 1 also wrap the
            reloaded model in ``nn.DataParallel``.

    Raises:
        RuntimeError: if no trial reported a ``loss`` metric.
    """
    data_dir = os.path.abspath("../level_data")
    # Builds the datasets once in the driver and discards them: the per-level
    # counts are printed and any loading error surfaces before trials start.
    load_data(data_dir)
    # fc = 16, batch_size = 16, lr = 1e-4
    config = {
        "lr": tune.grid_search([1e-4,1e-5,1e-6,1e-7])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_epoch,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["fc", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    result = tune.run(
        partial(train, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    if best_trial is None:
        raise RuntimeError("no trial reported a 'loss' metric; nothing to evaluate")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = DropModel(num_class=1, num_point=joint_num, num_person=1, graph="graph.h36m.Graph",
                           in_channels=data_dim + 1, out_channels=out_channels, frames=frames)
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        # NOTE(review): train() decides on DataParallel from
        # torch.cuda.device_count(); the saved state_dict keys only match if
        # this condition makes the same choice - confirm for the target setup.
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    # map_location lets a checkpoint saved from CUDA tensors load on whatever
    # device this process selected (including CPU-only drivers).
    model_state, _ = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"), map_location=device)
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))


# Entry point: runs the search when this code is executed as the main module.
if __name__ == "__main__":
    # One pass over the learning-rate grid; the scheduler's max_t is 80.
    # You can change the number of GPUs per trial here:
    main(num_samples=1, max_epoch=80, gpus_per_trial=2)

level: level0, train: 59, eval: 8
level: level1, train: 26, eval: 4
level: level3, train: 26, eval: 4
total | train: 111, eval: 16


2022-01-05 11:38:17,077	INFO registry.py:69 -- Detected unknown callable for trainable. Converting to class.


== Status ==
Current time: 2022-01-05 11:38:17 (running for 00:00:00.14)
Memory usage on this node: 9.9/503.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/48 CPUs, 0/3 GPUs, 0.0/482.93 GiB heap, 0.0/9.31 GiB objects (0.0/1.0 accelerator_type:V100S)
Result logdir: /home/auser03/ray_results/DEFAULT_2022-01-05_11-38-17
Number of trials: 4/4 (4 PENDING)
+---------------------+----------+-------+--------+
| Trial name          | status   | loc   |     lr |
|---------------------+----------+-------+--------|
| DEFAULT_eb5fb_00000 | PENDING  |       | 0.0001 |
| DEFAULT_eb5fb_00001 | PENDING  |       | 1e-05  |
| DEFAULT_eb5fb_00002 | PENDING  |       | 1e-06  |
| DEFAULT_eb5fb_00003 | PENDING  |       | 1e-07  |
+---------------------+----------+-------+--------+


[2m[36m(ImplicitFunc pid=14016)[0m 13
== Status ==
Current time: 2

[2m[36m(ImplicitFunc pid=14016)[0m [7,     4] loss: 0.672
Result for DEFAULT_eb5fb_00000:
  accuracy: 0.5217391304347826
  date: 2022-01-05_11-38-46
  done: false
  experiment_id: c61ad25ae62a49cfa894502fdf7d78d0
  hostname: 612c8c920c35
  iterations_since_restore: 7
  loss: 0.6842364370822906
  node_ip: 172.17.0.2
  pid: 14016
  should_checkpoint: true
  time_since_restore: 28.29875946044922
  time_this_iter_s: 2.704444408416748
  time_total_s: 28.29875946044922
  timestamp: 1641353926
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: eb5fb_00000
  
[2m[36m(ImplicitFunc pid=14016)[0m [8,     4] loss: 0.675
== Status ==
Current time: 2022-01-05 11:38:49 (running for 00:00:32.16)
Memory usage on this node: 13.2/503.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: -0.7012248039245605 | Iter 4.000: -0.6967127025127411 | Iter 2.000: -0.7148052453994751 | Iter 1.000: -0.6978976726531982
Resources 

[2m[36m(ImplicitFunc pid=14016)[0m [15,     4] loss: 0.667
Result for DEFAULT_eb5fb_00000:
  accuracy: 0.5217391304347826
  date: 2022-01-05_11-39-07
  done: false
  experiment_id: c61ad25ae62a49cfa894502fdf7d78d0
  hostname: 612c8c920c35
  iterations_since_restore: 15
  loss: 0.6944402158260345
  node_ip: 172.17.0.2
  pid: 14016
  should_checkpoint: true
  time_since_restore: 49.31320023536682
  time_this_iter_s: 2.6134769916534424
  time_total_s: 49.31320023536682
  timestamp: 1641353947
  timesteps_since_restore: 0
  training_iteration: 15
  trial_id: eb5fb_00000
  
[2m[36m(ImplicitFunc pid=14016)[0m [16,     4] loss: 0.686
== Status ==
Current time: 2022-01-05 11:39:10 (running for 00:00:53.07)
Memory usage on this node: 13.2/503.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 32.000: None | Iter 16.000: -0.6847175359725952 | Iter 8.000: -0.7012248039245605 | Iter 4.000: -0.6967127025127411 | Iter 2.000: -0.7148052453994751 | Iter 1.000: -0.6978976

[2m[36m(ImplicitFunc pid=14016)[0m [23,     4] loss: 0.665
Result for DEFAULT_eb5fb_00000:
  accuracy: 0.6521739130434783
  date: 2022-01-05_11-39-28
  done: false
  experiment_id: c61ad25ae62a49cfa894502fdf7d78d0
  hostname: 612c8c920c35
  iterations_since_restore: 23
  loss: 0.6667390465736389
  node_ip: 172.17.0.2
  pid: 14016
  should_checkpoint: true
  time_since_restore: 70.09257626533508
  time_this_iter_s: 2.597081184387207
  time_total_s: 70.09257626533508
  timestamp: 1641353968
  timesteps_since_restore: 0
  training_iteration: 23
  trial_id: eb5fb_00000
  
[2m[36m(ImplicitFunc pid=14016)[0m [24,     4] loss: 0.670
== Status ==
Current time: 2022-01-05 11:39:30 (running for 00:01:13.85)
Memory usage on this node: 13.2/503.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 32.000: None | Iter 16.000: -0.6847175359725952 | Iter 8.000: -0.7012248039245605 | Iter 4.000: -0.6967127025127411 | Iter 2.000: -0.7148052453994751 | Iter 1.000: -0.69789767

[2m[36m(ImplicitFunc pid=14016)[0m [31,     4] loss: 0.647
Result for DEFAULT_eb5fb_00000:
  accuracy: 0.4782608695652174
  date: 2022-01-05_11-39-49
  done: false
  experiment_id: c61ad25ae62a49cfa894502fdf7d78d0
  hostname: 612c8c920c35
  iterations_since_restore: 31
  loss: 0.7194889485836029
  node_ip: 172.17.0.2
  pid: 14016
  should_checkpoint: true
  time_since_restore: 90.90459895133972
  time_this_iter_s: 2.5329577922821045
  time_total_s: 90.90459895133972
  timestamp: 1641353989
  timesteps_since_restore: 0
  training_iteration: 31
  trial_id: eb5fb_00000
  
[2m[36m(ImplicitFunc pid=14016)[0m [32,     4] loss: 0.656
== Status ==
Current time: 2022-01-05 11:39:51 (running for 00:01:34.67)
Memory usage on this node: 13.2/503.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 32.000: -0.6861645579338074 | Iter 16.000: -0.6847175359725952 | Iter 8.000: -0.7012248039245605 | Iter 4.000: -0.6967127025127411 | Iter 2.000: -0.7148052453994751 | Iter 1.

[2m[36m(ImplicitFunc pid=14016)[0m [39,     4] loss: 0.634
Result for DEFAULT_eb5fb_00000:
  accuracy: 0.5217391304347826
  date: 2022-01-05_11-40-09
  done: false
  experiment_id: c61ad25ae62a49cfa894502fdf7d78d0
  hostname: 612c8c920c35
  iterations_since_restore: 39
  loss: 0.7622119188308716
  node_ip: 172.17.0.2
  pid: 14016
  should_checkpoint: true
  time_since_restore: 111.65450429916382
  time_this_iter_s: 2.570871353149414
  time_total_s: 111.65450429916382
  timestamp: 1641354009
  timesteps_since_restore: 0
  training_iteration: 39
  trial_id: eb5fb_00000
  
[2m[36m(ImplicitFunc pid=14016)[0m [40,     4] loss: 0.610
== Status ==
Current time: 2022-01-05 11:40:12 (running for 00:01:55.42)
Memory usage on this node: 13.2/503.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 32.000: -0.6861645579338074 | Iter 16.000: -0.6847175359725952 | Iter 8.000: -0.7012248039245605 | Iter 4.000: -0.6967127025127411 | Iter 2.000: -0.7148052453994751 | Iter 1

[2m[36m(ImplicitFunc pid=14016)[0m [47,     4] loss: 0.644
Result for DEFAULT_eb5fb_00000:
  accuracy: 0.43478260869565216
  date: 2022-01-05_11-40-30
  done: false
  experiment_id: c61ad25ae62a49cfa894502fdf7d78d0
  hostname: 612c8c920c35
  iterations_since_restore: 47
  loss: 0.6618370413780212
  node_ip: 172.17.0.2
  pid: 14016
  should_checkpoint: true
  time_since_restore: 132.44556522369385
  time_this_iter_s: 2.6437506675720215
  time_total_s: 132.44556522369385
  timestamp: 1641354030
  timesteps_since_restore: 0
  training_iteration: 47
  trial_id: eb5fb_00000
  
[2m[36m(ImplicitFunc pid=14016)[0m [48,     4] loss: 0.611
== Status ==
Current time: 2022-01-05 11:40:33 (running for 00:02:16.24)
Memory usage on this node: 13.2/503.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 32.000: -0.6861645579338074 | Iter 16.000: -0.6847175359725952 | Iter 8.000: -0.7012248039245605 | Iter 4.000: -0.6967127025127411 | Iter 2.000: -0.7148052453994751 | Iter

[2m[36m(ImplicitFunc pid=14016)[0m [55,     4] loss: 0.597
Result for DEFAULT_eb5fb_00000:
  accuracy: 0.5652173913043478
  date: 2022-01-05_11-40-51
  done: false
  experiment_id: c61ad25ae62a49cfa894502fdf7d78d0
  hostname: 612c8c920c35
  iterations_since_restore: 55
  loss: 0.7060602009296417
  node_ip: 172.17.0.2
  pid: 14016
  should_checkpoint: true
  time_since_restore: 153.13500332832336
  time_this_iter_s: 2.5839312076568604
  time_total_s: 153.13500332832336
  timestamp: 1641354051
  timesteps_since_restore: 0
  training_iteration: 55
  trial_id: eb5fb_00000
  
[2m[36m(ImplicitFunc pid=14016)[0m [56,     4] loss: 0.593
== Status ==
Current time: 2022-01-05 11:40:54 (running for 00:02:36.95)
Memory usage on this node: 13.2/503.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: None | Iter 32.000: -0.6861645579338074 | Iter 16.000: -0.6847175359725952 | Iter 8.000: -0.7012248039245605 | Iter 4.000: -0.6967127025127411 | Iter 2.000: -0.7148052453994751 | Iter 

[2m[36m(ImplicitFunc pid=14016)[0m [63,     4] loss: 0.594
Result for DEFAULT_eb5fb_00000:
  accuracy: 0.5217391304347826
  date: 2022-01-05_11-41-12
  done: false
  experiment_id: c61ad25ae62a49cfa894502fdf7d78d0
  hostname: 612c8c920c35
  iterations_since_restore: 63
  loss: 0.6687841415405273
  node_ip: 172.17.0.2
  pid: 14016
  should_checkpoint: true
  time_since_restore: 174.0817379951477
  time_this_iter_s: 2.542228937149048
  time_total_s: 174.0817379951477
  timestamp: 1641354072
  timesteps_since_restore: 0
  training_iteration: 63
  trial_id: eb5fb_00000
  
[2m[36m(ImplicitFunc pid=14016)[0m [64,     4] loss: 0.580
== Status ==
Current time: 2022-01-05 11:41:14 (running for 00:02:57.84)
Memory usage on this node: 13.2/503.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: -0.6240427196025848 | Iter 32.000: -0.6861645579338074 | Iter 16.000: -0.6847175359725952 | Iter 8.000: -0.7012248039245605 | Iter 4.000: -0.6967127025127411 | Iter 2.000: -0.714805245399

[2m[36m(ImplicitFunc pid=14016)[0m [71,     4] loss: 0.568
Result for DEFAULT_eb5fb_00000:
  accuracy: 0.4782608695652174
  date: 2022-01-05_11-41-33
  done: false
  experiment_id: c61ad25ae62a49cfa894502fdf7d78d0
  hostname: 612c8c920c35
  iterations_since_restore: 71
  loss: 0.7556345164775848
  node_ip: 172.17.0.2
  pid: 14016
  should_checkpoint: true
  time_since_restore: 194.9252576828003
  time_this_iter_s: 2.6080586910247803
  time_total_s: 194.9252576828003
  timestamp: 1641354093
  timesteps_since_restore: 0
  training_iteration: 71
  trial_id: eb5fb_00000
  
[2m[36m(ImplicitFunc pid=14016)[0m [72,     4] loss: 0.583
== Status ==
Current time: 2022-01-05 11:41:35 (running for 00:03:18.67)
Memory usage on this node: 13.2/503.5 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 64.000: -0.6240427196025848 | Iter 32.000: -0.6861645579338074 | Iter 16.000: -0.6847175359725952 | Iter 8.000: -0.7012248039245605 | Iter 4.000: -0.6967127025127411 | Iter 2.000: -0.71480524539

[2m[36m(ImplicitFunc pid=14016)[0m [79,     4] loss: 0.550
Result for DEFAULT_eb5fb_00000:
  accuracy: 0.4782608695652174
  date: 2022-01-05_11-41-54
  done: false
  experiment_id: c61ad25ae62a49cfa894502fdf7d78d0
  hostname: 612c8c920c35
  iterations_since_restore: 79
  loss: 0.7838318943977356
  node_ip: 172.17.0.2
  pid: 14016
  should_checkpoint: true
  time_since_restore: 215.84570574760437
  time_this_iter_s: 2.716395139694214
  time_total_s: 215.84570574760437
  timestamp: 1641354114
  timesteps_since_restore: 0
  training_iteration: 79
  trial_id: eb5fb_00000
  
[2m[36m(ImplicitFunc pid=14016)[0m [80,     4] loss: 0.618
Result for DEFAULT_eb5fb_00000:
  accuracy: 0.6521739130434783
  date: 2022-01-05_11-41-56
  done: true
  experiment_id: c61ad25ae62a49cfa894502fdf7d78d0
  hostname: 612c8c920c35
  iterations_since_restore: 80
  loss: 0.634435772895813
  node_ip: 172.17.0.2
  pid: 14016
  should_checkpoint: true
  time_since_restore: 218.41146421432495
  time_this_iter_s: 

[2m[36m(ImplicitFunc pid=14022)[0m level: level0, train: 59, eval: 8
[2m[36m(ImplicitFunc pid=14022)[0m level: level1, train: 26, eval: 4
[2m[36m(ImplicitFunc pid=14022)[0m level: level3, train: 26, eval: 4
[2m[36m(ImplicitFunc pid=14022)[0m total | train: 111, eval: 16
== Status ==
Current time: 2022-01-05 11:42:17 (running for 00:04:00.43)
Memory usage on this node: 12.6/503.5 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 64.000: -0.6240427196025848 | Iter 32.000: -0.6861645579338074 | Iter 16.000: -0.6847175359725952 | Iter 8.000: -0.7012248039245605 | Iter 4.000: -0.6967127025127411 | Iter 2.000: -0.7148052453994751 | Iter 1.000: -0.7125451564788818
Resources requested: 2.0/48 CPUs, 2.0/3 GPUs, 0.0/482.93 GiB heap, 0.0/9.31 GiB objects (0.0/1.0 accelerator_type:V100S)
Result logdir: /home/auser03/ray_results/DEFAULT_2022-01-05_11-38-17
Number of trials: 4/4 (1 PENDING, 1 RUNNING, 2 TERMINATED)
+---------------------+------------+------------------+--------+-----

[2m[36m(ImplicitFunc pid=14025)[0m level: level0, train: 59, eval: 8
[2m[36m(ImplicitFunc pid=14025)[0m level: level1, train: 26, eval: 4
[2m[36m(ImplicitFunc pid=14025)[0m level: level3, train: 26, eval: 4
[2m[36m(ImplicitFunc pid=14025)[0m total | train: 111, eval: 16
== Status ==
Current time: 2022-01-05 11:42:39 (running for 00:04:22.03)
Memory usage on this node: 12.7/503.5 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 64.000: -0.6240427196025848 | Iter 32.000: -0.6861645579338074 | Iter 16.000: -0.6847175359725952 | Iter 8.000: -0.7012248039245605 | Iter 4.000: -0.6979185938835144 | Iter 2.000: -0.7032419443130493 | Iter 1.000: -0.6978976726531982
Resources requested: 2.0/48 CPUs, 2.0/3 GPUs, 0.0/482.93 GiB heap, 0.0/9.31 GiB objects (0.0/1.0 accelerator_type:V100S)
Result logdir: /home/auser03/ray_results/DEFAULT_2022-01-05_11-38-17
Number of trials: 4/4 (1 RUNNING, 3 TERMINATED)
+---------------------+------------+------------------+--------+----------+-----

2022-01-05 11:42:52,563	INFO tune.py:626 -- Total run time: 275.49 seconds (275.35 seconds for the tuning loop).


Result for DEFAULT_eb5fb_00003:
  accuracy: 0.5217391304347826
  date: 2022-01-05_11-42-52
  done: true
  experiment_id: 91a7cde17b4c4df1b3fb61bfdb73b59e
  hostname: 612c8c920c35
  iterations_since_restore: 4
  loss: 0.7214157581329346
  node_ip: 172.17.0.2
  pid: 14025
  should_checkpoint: true
  time_since_restore: 20.168113470077515
  time_this_iter_s: 2.8163154125213623
  time_total_s: 20.168113470077515
  timestamp: 1641354172
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: eb5fb_00003
  
== Status ==
Current time: 2022-01-05 11:42:52 (running for 00:04:35.36)
Memory usage on this node: 13.1/503.5 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 64.000: -0.6240427196025848 | Iter 32.000: -0.6861645579338074 | Iter 16.000: -0.6847175359725952 | Iter 8.000: -0.7012248039245605 | Iter 4.000: -0.6991244852542877 | Iter 2.000: -0.6916786432266235 | Iter 1.000: -0.6868937909603119
Resources requested: 0/48 CPUs, 0/3 GPUs, 0.0/482.93 GiB heap, 0.0/9.31 GiB objects (

NameError: name 'test_data' is not defined