In [1]:
# 1. import everything you need (e.g., model, dataset, etc.)
# 2. Use Random Search to find the best hyperparams(e.g., batch_size, augmentation, optimizer, loss_fn, etc.)
# 3. Report the best train configuration
# 4. Save the best values (for hyperparams)
# 5. Use the found values in "train.ipynb" to train the model with them

# Code Cells for GoogleColab
> This way we can execute our code from github without any hassles, just:
>> 1. add all the packages needed (that is not in Colab) in "requirements.txt"
>> 2. The github repo must be public, if the Colab account (e.g., <golab_pro_owner>@gmail.com) has not been granted access to that repo.

In [2]:
# # Mount the Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Clone the Github Repo
# !git clone https://github.com/tekboart/semantic-segmentaion-pytorch

In [4]:
# %pwd

In [5]:
# Go to the repo's main dir
# %cd semantic-segmentaion-pytorch/

In [6]:
# Install the required packages
# !pip install -r requirements.txt

In [None]:
# Allow Tensorboard to write to the tmp directory:
# !export TMPDIR=/tmp/$USER; mkdir -p $TMPDIR; tensorboard --logdir=~/ray_results

In [7]:
# reload modules
from importlib import reload

# load pretrained segmentation models (written in pytorch)
import segmentation_models_pytorch as smp
from segmentation_models_pytorch.encoders import get_preprocessing_fn

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
from torch.utils.data import DataLoader

# torchvision
from torchviz import make_dot
import torchvision.transforms.v2 as TF

# torchmetrics
from torchmetrics.classification import Dice, BinaryJaccardIndex

# Ray Tune
import ray
from ray import tune
from ray.air import Checkpoint, session
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.optuna import OptunaSearch
from ray.air.config import ScalingConfig

# Serialize/Deserialize Json files
import json

# Data Augmentation
import albumentations as A
import albumentations.augmentations.functional as F
from albumentations.pytorch import ToTensorV2

# get data/time with desired format
from datetime import datetime

time_format = "%Y.%m.%d@%H-%M-%S"

# work with images
import cv2
from PIL import Image

# slice Iterables and turn to GEN
from itertools import islice

# keep numpy use to a min
# as we store our torch.Tensors to GPU Vram but numpy in RAM (it only supports CPU)
import pandas as pd
import numpy as np
import random

# to have a progress bar
from tqdm import tqdm

# To use pretrained segmentation models (implement in PyTorch)

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint
# uncomment if wan't to globally aloow sns to handle plot style
# it adds unwanted style (i.e., grid) to .imshow()
# so better to use it a context manager to style only what I want
# >>> with sns.axes_style('darkgrid'):
# >>>     plt.imshow(...)
# sns.set_theme(
#     context="notebook",
#     style="darkgrid",
#     palette="deep",
#     font="sans-serif",
#     font_scale=1,
#     color_codes=True,
#     rc={'axes.grid': False},
# )

# OS/File/Path management
import sys
import os

# Misc
from functools import partial

# load my custom Classes/Functions/etc.
from utils.training import fit_fn
from utils.dataset import get_loaders, SegmentaionDataset
from utils.models.unet import UnetScratch
from utils.visualization import (
    image_mask_plot,
    ImageAntiStandardize,
    plot_metrics
)
from utils.metrics import (
    AccuracyBinarySegment,
    DiceBinarySegment,
    JaccardBinarySegment,
    PrecisionBinarySegment,
    RecallBinarySegment,
    F1BinarySegment,
    DiceBCELoss,
)

2023-07-14 11:54:24.069911: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Remove sources of non-determinism

In [8]:
# use to seed the RNG for all devices (both CPU and CUDA).
torch.manual_seed(0)
# for custom operators, you might need to set python seed as well:
random.seed(0)
# If you or any of the libraries you are using rely on NumPy, you can seed the global NumPy RNG with:
np.random.seed(0)

# configure PyTorch to use deterministic algorithms instead of nondeterministic ones 
# A CAVEAT: throws an error if an operation is known to be nondeterministic (and without a deterministic alternative).
# that is why we used warn_only to avoid raising Error
torch.use_deterministic_algorithms(True, warn_only=True)
# CUDA convolution determinism
torch.backends.cudnn.deterministic = True

# -- DataLoader
# make it deterministic but allow it to random data order
def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

DATA_LOADER_GEN = torch.Generator()
DATA_LOADER_GEN.manual_seed(0)

<torch._C.Generator at 0x7f208ff76d10>

# Define the Search Hyperparameters Space
> that is, the hyperparameters and their correspondign values to search for.

In [9]:
# Create the Config dict
config = {
    'lr': tune.loguniform(1e-4, 1e-3),
    'scheduler_factor': tune.choice([.5, .3, .1]),
    'batch_size': tune.choice([4, 8, 16, 32]),
    'image_size': tune.choice([352, 256, 160]),  # sizes must be divisible by 32 
    'pretrained_arch': tune.choice([smp.UnetPlusPlus, smp.FPN, smp.DeepLabV3Plus]),
    #TODO: Maybe remove ResNet152 as it's both (1) comp expensive and (2) can overfit very easily (considering its #params)
    'pretrained_encoder': tune.choice(['mobilenet_v2', 'timm-mobilenetv3_large_100', 'timm-efficientnet-b8']),
    # 'loss_fn': tune.choice([nn.BCEWithLogitsLoss(), DiceBCELoss(from_logits=True)]),
    # 'train_augmentation': tune.choice([
    #     #TODO: As my last model's val & test metrics were very different, then maybe I need stronger augmentations for the model to generalize better.
    #     # We didn't included the resize/Flip/etc. that we know would cause no problem. so:
    #     # Just use this A.Compose([*hyper_config['train_augmentation'], A.HorizontalFlip(...), A.Resize(..), etc.])
    #     [
    #         A.Rotate(limit=5, p=0.5),  # Use only when the img_height==img_width
    #         A.RandomRotate90(p=0.5),  # Use only when the img_height==img_width
    #         A.Transpose(p=0.5),  # Use only when the img_height==img_width
    #         # A.CenterCrop(
    #             #TODO: Remember to add vars image_height & image_width in the train func
    #             # int(0.9 * image_height),
    #             # int(0.9 * image_width),
    #             # p=0.3,
    #         # ),
    #         A.ColorJitter(
    #             brightness=0.3,
    #             contrast=0.05,
    #             saturation=0.1,
    #             hue=0.05,
    #             p=0.8,
    #             # always_apply=True,
    #         ),
    #         A.ImageCompression(quality_lower=90, quality_upper=100, p=0.3),
    #         A.RGBShift(
    #             r_shift_limit=5,
    #             g_shift_limit=5,
    #             b_shift_limit=2,
    #             p=0.8,
    #             # always_apply=True,
    #         ),
    #     ],
    #     [
    #         A.Rotate(limit=5, p=0.5),  # Use only when the img_height==img_width
    #         A.RandomRotate90(p=0.5),  # Use only when the img_height==img_width
    #         A.Transpose(p=0.5),  # Use only when the img_height==img_width
    #         A.ColorJitter(
    #             brightness=0.3,
    #             contrast=0.05,
    #             saturation=0.1,
    #             hue=0.05,
    #             p=0.8,
    #             # always_apply=True,
    #         ),
    #         A.RGBShift(
    #             r_shift_limit=5,
    #             g_shift_limit=5,
    #             b_shift_limit=2,
    #             p=0.8,
    #             # always_apply=True,
    #         ),
    #     ],
    #     [
    #         # means use no fancy Augmentation. Just some flip and 90deg rotations.
    #     ]
    # ])

}

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device used for calculation (CPU\Cuda):", device)

Device used for calculation (CPU\Cuda): cuda:0


# Create The Model Func
> Everything needed for trainin a model needs to be within its scope (should not use any var from outside)

In [11]:
# Create a model_creator_for_hyperserach to create + Compile a model (to be ready to be trained)
# TODO: Can I make it reusable for other models as well?
# A: I don't think
def model_creator(config):
    """
    instantiate + Compile + train (aka .fit()) a network

    Parameters
    ----------
    hyper_config: dict
        Includes all the hyperparmeters needed to be tuned.
    """
    DATA_LOADER_GEN = torch.Generator()
    DATA_LOADER_GEN.manual_seed(0)

    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    device = torch.device("cuda:0")
    # print("Device used for calculation (CPU\Cuda):", device)

    # Load all the needed variables (Ray Tune asks all the used variables to be inside this func scope)
    hyper_params = {
        # "device": str(device),
        "lr": 1e-3,
        "lr_finetune": 1e-4,  # for transfer learning (phase 2) (= lr / 1000)
        "epochs": 10,  #XXX: Set the #epochs to at least 10 for the real run in Colab
        "epochs_finetune": 10,  # for transfer learning (phase 2)
        "batch_size": 16,
        "num_workers": 2,
        # Use Height == Width to use 90-rotations/transpose in data aug
        "image_height": 352,
        "image_width": 352,
        "input_channels": 3,
        "num_classes": 1,
        "data_format": "channels_first",
        "pin_mem": True,
        "scheduler_step": 5,
        "scheduler_factor": 0.5,
        "scheduler_factor_finetune": 0.5,  # for transfer learning (phase 2)
        "train_img_dir": os.path.join("data", "traincrop", "img"),
        "train_mask_dir": os.path.join("data", "traincrop", "mask"),
        "val_img_dir": os.path.join("data", "valcrop", "img"),
        "val_mask_dir": os.path.join("data", "valcrop", "mask"),
        "test_img_dir": os.path.join("data", "testcrop", "img"),
        "test_mask_dir": os.path.join("data", "testcrop", "mask"),
        "pretrained_model_encoder": "timm-mobilenetv3_large_100",
    }

    # Load a Pretrained Model
    # TODO: Load the preprocess_fn before, so no need to create it for each run
    model_arch = config["pretrained_arch"]

    # create the model
    model = model_arch(
        # choose encoder
        encoder_name=config["pretrained_encoder"],
        # use `imagenet` pre-trained weights for encoder initialization
        encoder_weights="imagenet",
        # model input channels (1 for gray-scale images, 3 for RGB, etc.)
        in_channels=hyper_params["input_channels"],
        # model output channels (number of classes in your dataset)
        classes=hyper_params["num_classes"],
    )

    preprocess_input = get_preprocessing_fn(
        config["pretrained_encoder"], pretrained="imagenet"
    )

    # Create Data Augmentation
    train_transform = A.Compose(
        [
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.1),
            A.Rotate(limit=5, p=0.5),  # Use only when the img_height==img_width
            A.RandomRotate90(p=0.5),  # Use only when the img_height==img_width
            A.Transpose(p=0.5),  # Use only when the img_height==img_width
            A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.0, p=0.5),
            A.CenterCrop(
                int(0.95 * hyper_params["image_height"]),
                int(0.95 * hyper_params["image_width"]),
                p=0.3,
            ),
            A.ColorJitter(
                brightness=0.0,
                contrast=0.0,
                saturation=0.1,
                hue=0.05,
                p=0.8,
            ),
            A.ImageCompression(quality_lower=90, quality_upper=100, p=0.1),
            A.RGBShift(
                r_shift_limit=5,
                g_shift_limit=5,
                b_shift_limit=2,
                p=0.8,
            ),
            A.Resize(
                height=hyper_params["image_height"], width=hyper_params["image_width"]
            ),
        ],
    )

    # we don't want TTA, just some resize, normalization, etc.
    val_transform = A.Compose(
        [
            A.Resize(height=config["image_size"], width=config["image_size"]),
        ],
    )

    # Create the Datasets
    train_ds = SegmentaionDataset(
        image_dir=hyper_params["train_img_dir"],
        mask_dir=hyper_params["train_mask_dir"],
        transform=train_transform,
        preprocess_fn=preprocess_input,
        mask_suffix="",
        subset=[0, 10],
    )
    val_ds = SegmentaionDataset(
        image_dir=hyper_params["val_img_dir"],
        mask_dir=hyper_params["val_mask_dir"],
        transform=val_transform,
        preprocess_fn=preprocess_input,
        mask_suffix="",
        subset=[0, 5],
    )
    train_loader, val_loader = get_loaders(
        train_ds,
        val_ds,
        batch_size=config["batch_size"],
        num_workers=hyper_params["num_workers"],
        pin_memory=hyper_params["pin_mem"],
        worker_init_fn=seed_worker,
        generator=DATA_LOADER_GEN,
    )

    # Define Metrics
    metrics = {
        "jaccard (IOU)": JaccardBinarySegment(from_logits=True),
        "recall": RecallBinarySegment(from_logits=True),
        "precision": PrecisionBinarySegment(from_logits=True),
        "dice (F1-Score)": DiceBinarySegment(from_logits=True),
    }

    # Define Loss_fn, Optimizer, and Scheduler
    loss_fn = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode="min",
        factor=config["scheduler_factor"],
        # factor=hyper_params["scheduler_factor"],
        patience=2,
        min_lr=1e-6,
        threshold=1e-3,
    )

    # train the model (and get the history)
    # TODO: must change the fit_fn to use session.report(...)
    # TODO: use verbose=False to make the output of ray tune clean
    history = fit_fn(
        model,
        train_loader,
        optimizer,
        loss_fn,
        scheduler,
        metrics=metrics,
        val_loader=val_loader,
        epochs=hyper_params["epochs"],
        device=device,
        ray_tune=True,
    )

In [12]:
# Create a partial func of the model_creator (using functools.partial)
# as we only change the hyperparams of interest for each run of Ray Tune (i.e., hyperparam_tuner)

# Create Tuner Func

In [13]:
def hyperparam_tuner(
    config: dict,
    num_samples: int = 10,
    max_num_epochs: int = 10,
    tune_metric: str = "val_loss",
    tune_metric_mode: str = "min",
):
    """
    Given a model and a set of hyperparameters,
    tries to find the best combination and return the best performing model.

    Parameters
    ----------
    hyper_config: dict
    max_num_epochs: int
    num_samples: int
        Define the total number of samples/combinatin to use to train the model.
        In other words, the number of models to train for finding the best hyperparameters.
    tune_metric: str
        The name of the metric (e.g., val_loss) by which the performance of mdoels are assessed.
    tune_metric_mode: str
        Either "min" (default) or "max", depending on the chosen tune_metric. Specifies whether "min" values are desired, or vice versa.
    """
    # use ASHAScheduler to stop training early on if the model's plight is doomed
    scheduler_ray = ASHAScheduler(
        # metric="loss",  # this is specified in the Tuner's tune.TuneConifg, so only one must be present.
        # mode="min",  # this is specified in the Tuner's tune.TuneConifg, so only one must be present.
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2,
    )

    # method 1 (less control): from Pytorch Tutorials
    # results = tune.run(
    #     model_creator,
    #     resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
    #     config=hyper_config,
    #     num_samples=num_samples,
    #     scheduler=scheduler_ray,
    # )

    # method 2 (more control): from Ray Tune official documentaions
    tuner = tune.Tuner(
        # Use Only CPU
        # model_creator,
        # Use only GPU
        # tune.with_resources(model_creator, {"gpu": 1}),
        # Use both CPU & GPU --> the numbers define how many model to be trained in parallel
        tune.with_resources(model_creator, {"gpu": 1, "cpu": 0}),  #FIXME: change the "gpu" and "cou" values for Colab Pro
        param_space=config,
        tune_config=tune.TuneConfig(
            num_samples=num_samples,
            metric=tune_metric,
            mode=tune_metric_mode,
            search_alg=OptunaSearch(),  # Optuna Works better than Random Search (BOn Pytorch official YouTube)
            scheduler=scheduler_ray,
            chdir_to_trial_dir=False,  # if True (default) the Ray tune will change the current dir, thus no relative path (e.g., path to our data) doesn't work.
        ),
    )
    results = tuner.fit()

    return results

# Run the Tuner

In [14]:
if __name__ == "__main__":
    train_timestamp = datetime.today().strftime(time_format)
    # FIXME: chane num_samples=20 and max_num_epochs=10 for Colab Pro
    results = hyperparam_tuner(
        config=config,
        num_samples=60,  #FIXME: Change this to 60
        max_num_epochs=10,
        # tune_metric="val_dice (F1-Score)",
        # tune_metric_mode="max",
        tune_metric="val_loss",
        tune_metric_mode="min",
    )

2023-07-14 11:54:28,509	INFO worker.py:1636 -- Started a local Ray instance.
2023-07-14 11:54:29,821	INFO tune.py:226 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.
  return ot.distributions.LogUniformDistribution(
[I 2023-07-14 11:54:29,846] A new study created in memory with name: optuna


0,1
Current time:,2023-07-14 11:55:33
Running for:,00:01:03.71
Memory:,15.7/31.2 GiB

Trial name,# failures,error file
model_creator_1614ecc4,1,"/home/tekboart/ray_results/model_creator_2023-07-14_11-54-26/model_creator_1614ecc4_3_batch_size=4,image_size=352,lr=0.0003,pretrained_arch=ref_ph_c9fd45ce,pretrained_encoder=timm-efficientne_2023-07-14_11-54-49/error.txt"
model_creator_303487b8,1,"/home/tekboart/ray_results/model_creator_2023-07-14_11-54-26/model_creator_303487b8_5_batch_size=8,image_size=256,lr=0.0002,pretrained_arch=ref_ph_d08b895f,pretrained_encoder=timm-efficientne_2023-07-14_11-55-01/error.txt"

Trial name,status,loc,batch_size,image_size,lr,pretrained_arch,pretrained_encoder,scheduler_factor,iter,total time (s),val_jaccard (IOU),val_recall,val_precision
model_creator_186d809e,TERMINATED,192.168.1.103:641971,2,352,0.000701042,<class 'segment_f240,timm-mobilenetv_0a80,0.1,10.0,12.8294,0.103169,0.983342,0.103305
model_creator_7742e1ea,TERMINATED,192.168.1.103:641971,4,256,0.000884811,<class 'segment_f240,mobilenet_v2,0.1,1.0,1.64262,0.0738334,0.999216,0.0738349
model_creator_b23d6090,TERMINATED,192.168.1.103:644942,4,256,0.000334998,<class 'segment_f240,timm-efficientnet-b8,0.3,2.0,12.9775,0.0738971,0.999956,0.0738972
model_creator_34bb705e,TERMINATED,192.168.1.103:645647,8,352,0.000101363,<class 'segment_b2d0,timm-mobilenetv_0a80,0.3,10.0,11.2311,0.277246,0.36927,0.526631
model_creator_1614ecc4,ERROR,192.168.1.103:641971,4,352,0.000332456,<class 'segment_5c80,timm-efficientnet-b8,0.1,,,,,
model_creator_303487b8,ERROR,192.168.1.103:644942,8,256,0.000244315,<class 'segment_b2d0,timm-efficientnet-b8,0.5,,,,,


Epoch 1:   0%|          | 0/5 [00:00<?, ?it/s]
Epoch 1:  40%|████      | 2/5 [00:00<00:00, 18.75it/s]


[2m[36m(model_creator pid=641971)[0m ---------------------------------- epoch 1/10 ---------------------------------


Epoch 1: 100%|██████████| 5/5 [00:00<00:00, 26.89it/s]


[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m loss:                0.795886  
[2m[36m(model_creator pid=641971)[0m dice (F1-Score):     0.12      
[2m[36m(model_creator pid=641971)[0m precision:           0.07      
[2m[36m(model_creator pid=641971)[0m recall:              0.69      
[2m[36m(model_creator pid=641971)[0m jaccard (IOU):       0.07      


Trial name,date,done,hostname,iterations_since_restore,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,training_iteration,trial_id,val_dice (F1-Score),val_jaccard (IOU),val_loss,val_precision,val_recall
model_creator_1614ecc4,2023-07-14_11-54-50,,manjaro-dev,,192.168.1.103,641971,,,,1689323090,,1614ecc4,,,,,
model_creator_186d809e,2023-07-14_11-54-49,True,manjaro-dev,10.0,192.168.1.103,641971,12.829359531402588,1.230342149734497,12.829359531402588,1689323089,10.0,186d809e,0.1811168616016706,0.103168665120999,1.2298118472099304,0.1033045214911301,0.9833421508471172
model_creator_303487b8,2023-07-14_11-55-14,,manjaro-dev,,192.168.1.103,644942,,,,1689323114,,303487b8,,,,,
model_creator_34bb705e,2023-07-14_11-55-33,True,manjaro-dev,10.0,192.168.1.103,645647,11.23106050491333,1.0824763774871826,11.23106050491333,1689323133,10.0,34bb705e,0.43413046002388,0.2772456109523773,0.1430406719446182,0.5266313552856445,0.3692695796489715
model_creator_7742e1ea,2023-07-14_11-54-50,True,manjaro-dev,1.0,192.168.1.103,641971,1.642615556716919,1.642615556716919,1.642615556716919,1689323090,1.0,7742e1ea,0.1360522434115409,0.0738334152847528,29.7331485748291,0.0738349184393882,0.9992161691188812
model_creator_b23d6090,2023-07-14_11-55-14,True,manjaro-dev,2.0,192.168.1.103,644942,12.977480173110962,5.5364062786102295,12.977480173110962,1689323114,2.0,b23d6090,0.1361655853688717,0.0738970972597599,4.14430034160614,0.0738971810787916,0.9999564588069916


Epoch 2:   0%|          | 0/5 [00:00<?, ?it/s]
Epoch 2:  60%|██████    | 3/5 [00:00<00:00, 24.29it/s]


[2m[36m(model_creator pid=641971)[0m val_loss:            2.232531  
[2m[36m(model_creator pid=641971)[0m val_dice (F1-Score): 0.13      
[2m[36m(model_creator pid=641971)[0m val_precision:       0.07      
[2m[36m(model_creator pid=641971)[0m val_recall:          0.92      
[2m[36m(model_creator pid=641971)[0m val_jaccard (IOU):   0.07      
[2m[36m(model_creator pid=641971)[0m ---------------------------------- epoch 2/10 ---------------------------------


Epoch 2: 100%|██████████| 5/5 [00:00<00:00, 26.43it/s]


[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m loss:                0.642723  
[2m[36m(model_creator pid=641971)[0m dice (F1-Score):     0.20      
[2m[36m(model_creator pid=641971)[0m precision:           0.11      
[2m[36m(model_creator pid=641971)[0m recall:              0.89      
[2m[36m(model_creator pid=641971)[0m jaccard (IOU):       0.11      


Epoch 3:   0%|          | 0/5 [00:00<?, ?it/s]
Epoch 3:  60%|██████    | 3/5 [00:00<00:00, 23.62it/s]


[2m[36m(model_creator pid=641971)[0m val_loss:            2.548792  
[2m[36m(model_creator pid=641971)[0m val_dice (F1-Score): 0.14      
[2m[36m(model_creator pid=641971)[0m val_precision:       0.07      
[2m[36m(model_creator pid=641971)[0m val_recall:          0.99      
[2m[36m(model_creator pid=641971)[0m val_jaccard (IOU):   0.07      
[2m[36m(model_creator pid=641971)[0m ---------------------------------- epoch 3/10 ---------------------------------


Epoch 3: 100%|██████████| 5/5 [00:00<00:00, 27.86it/s]


[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m loss:                0.520741  
[2m[36m(model_creator pid=641971)[0m dice (F1-Score):     0.33      
[2m[36m(model_creator pid=641971)[0m precision:           0.21      
[2m[36m(model_creator pid=641971)[0m recall:              0.92      
[2m[36m(model_creator pid=641971)[0m jaccard (IOU):       0.21      


Epoch 4:   0%|          | 0/5 [00:00<?, ?it/s]
Epoch 4:  60%|██████    | 3/5 [00:00<00:00, 24.41it/s]


[2m[36m(model_creator pid=641971)[0m val_loss:            3.716569  
[2m[36m(model_creator pid=641971)[0m val_dice (F1-Score): 0.13      
[2m[36m(model_creator pid=641971)[0m val_precision:       0.07      
[2m[36m(model_creator pid=641971)[0m val_recall:          1.00      
[2m[36m(model_creator pid=641971)[0m val_jaccard (IOU):   0.07      
[2m[36m(model_creator pid=641971)[0m ---------------------------------- epoch 4/10 ---------------------------------


Epoch 4: 100%|██████████| 5/5 [00:00<00:00, 27.56it/s]


[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m loss:                0.417163  
[2m[36m(model_creator pid=641971)[0m dice (F1-Score):     0.55      
[2m[36m(model_creator pid=641971)[0m precision:           0.43      
[2m[36m(model_creator pid=641971)[0m recall:              0.94      
[2m[36m(model_creator pid=641971)[0m jaccard (IOU):       0.41      


Epoch 5:   0%|          | 0/5 [00:00<?, ?it/s]


[2m[36m(model_creator pid=641971)[0m val_loss:            3.837365  
[2m[36m(model_creator pid=641971)[0m val_dice (F1-Score): 0.13      
[2m[36m(model_creator pid=641971)[0m val_precision:       0.07      
[2m[36m(model_creator pid=641971)[0m val_recall:          0.99      
[2m[36m(model_creator pid=641971)[0m val_jaccard (IOU):   0.07      
[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m >>> lr_rate was decayed to: 0.000070
[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m ---------------------------------- epoch 5/10 ---------------------------------


Epoch 5: 100%|██████████| 5/5 [00:00<00:00, 26.44it/s]


[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m loss:                0.379964  
[2m[36m(model_creator pid=641971)[0m dice (F1-Score):     0.59      
[2m[36m(model_creator pid=641971)[0m precision:           0.50      
[2m[36m(model_creator pid=641971)[0m recall:              0.90      
[2m[36m(model_creator pid=641971)[0m jaccard (IOU):       0.44      


Epoch 6:   0%|          | 0/5 [00:00<?, ?it/s]


[2m[36m(model_creator pid=641971)[0m val_loss:            2.539305  
[2m[36m(model_creator pid=641971)[0m val_dice (F1-Score): 0.14      
[2m[36m(model_creator pid=641971)[0m val_precision:       0.08      
[2m[36m(model_creator pid=641971)[0m val_recall:          0.99      
[2m[36m(model_creator pid=641971)[0m val_jaccard (IOU):   0.08      
[2m[36m(model_creator pid=641971)[0m ---------------------------------- epoch 6/10 ---------------------------------


Epoch 6: 100%|██████████| 5/5 [00:00<00:00, 28.37it/s]


[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m loss:                0.378052  
[2m[36m(model_creator pid=641971)[0m dice (F1-Score):     0.57      
[2m[36m(model_creator pid=641971)[0m precision:           0.47      
[2m[36m(model_creator pid=641971)[0m recall:              0.91      
[2m[36m(model_creator pid=641971)[0m jaccard (IOU):       0.42      


Epoch 7:   0%|          | 0/5 [00:00<?, ?it/s]
Epoch 7:  60%|██████    | 3/5 [00:00<00:00, 25.44it/s]


[2m[36m(model_creator pid=641971)[0m val_loss:            2.301511  
[2m[36m(model_creator pid=641971)[0m val_dice (F1-Score): 0.14      
[2m[36m(model_creator pid=641971)[0m val_precision:       0.08      
[2m[36m(model_creator pid=641971)[0m val_recall:          0.99      
[2m[36m(model_creator pid=641971)[0m val_jaccard (IOU):   0.08      
[2m[36m(model_creator pid=641971)[0m ---------------------------------- epoch 7/10 ---------------------------------


Epoch 7: 100%|██████████| 5/5 [00:00<00:00, 28.01it/s]


[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m loss:                0.355753  
[2m[36m(model_creator pid=641971)[0m dice (F1-Score):     0.68      
[2m[36m(model_creator pid=641971)[0m precision:           0.57      
[2m[36m(model_creator pid=641971)[0m recall:              0.94      
[2m[36m(model_creator pid=641971)[0m jaccard (IOU):       0.55      


Epoch 8:   0%|          | 0/5 [00:00<?, ?it/s]
Epoch 8:  60%|██████    | 3/5 [00:00<00:00, 22.79it/s]


[2m[36m(model_creator pid=641971)[0m val_loss:            1.867247  
[2m[36m(model_creator pid=641971)[0m val_dice (F1-Score): 0.15      
[2m[36m(model_creator pid=641971)[0m val_precision:       0.08      
[2m[36m(model_creator pid=641971)[0m val_recall:          0.99      
[2m[36m(model_creator pid=641971)[0m val_jaccard (IOU):   0.08      
[2m[36m(model_creator pid=641971)[0m ---------------------------------- epoch 8/10 ---------------------------------


Epoch 8: 100%|██████████| 5/5 [00:00<00:00, 27.53it/s]


[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m loss:                0.358053  
[2m[36m(model_creator pid=641971)[0m dice (F1-Score):     0.63      
[2m[36m(model_creator pid=641971)[0m precision:           0.55      
[2m[36m(model_creator pid=641971)[0m recall:              0.90      
[2m[36m(model_creator pid=641971)[0m jaccard (IOU):       0.51      


Epoch 9:   0%|          | 0/5 [00:00<?, ?it/s]
Epoch 9:  40%|████      | 2/5 [00:00<00:00, 18.82it/s]


[2m[36m(model_creator pid=641971)[0m val_loss:            1.979770  
[2m[36m(model_creator pid=641971)[0m val_dice (F1-Score): 0.14      
[2m[36m(model_creator pid=641971)[0m val_precision:       0.08      
[2m[36m(model_creator pid=641971)[0m val_recall:          0.99      
[2m[36m(model_creator pid=641971)[0m val_jaccard (IOU):   0.08      
[2m[36m(model_creator pid=641971)[0m ---------------------------------- epoch 9/10 ---------------------------------


Epoch 9: 100%|██████████| 5/5 [00:00<00:00, 27.73it/s]


[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m loss:                0.344987  
[2m[36m(model_creator pid=641971)[0m dice (F1-Score):     0.66      
[2m[36m(model_creator pid=641971)[0m precision:           0.57      
[2m[36m(model_creator pid=641971)[0m recall:              0.90      
[2m[36m(model_creator pid=641971)[0m jaccard (IOU):       0.53      


Epoch 10:   0%|          | 0/5 [00:00<?, ?it/s]
Epoch 10:  20%|██        | 1/5 [00:00<00:00,  9.46it/s]


[2m[36m(model_creator pid=641971)[0m val_loss:            1.458895  
[2m[36m(model_creator pid=641971)[0m val_dice (F1-Score): 0.16      
[2m[36m(model_creator pid=641971)[0m val_precision:       0.09      
[2m[36m(model_creator pid=641971)[0m val_recall:          0.99      
[2m[36m(model_creator pid=641971)[0m val_jaccard (IOU):   0.09      
[2m[36m(model_creator pid=641971)[0m --------------------------------- epoch 10/10 ---------------------------------


Epoch 10: 100%|██████████| 5/5 [00:00<00:00, 25.33it/s]


[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m loss:                0.337612  
[2m[36m(model_creator pid=641971)[0m dice (F1-Score):     0.73      
[2m[36m(model_creator pid=641971)[0m precision:           0.63      
[2m[36m(model_creator pid=641971)[0m recall:              0.93      
[2m[36m(model_creator pid=641971)[0m jaccard (IOU):       0.59      


2023-07-14 11:54:49,134	INFO tensorboardx.py:275 -- Removed the following hyperparameter values when logging to tensorboard: {'pretrained_arch': ('__ref_ph', 'c70cbae7')}


[2m[36m(model_creator pid=641971)[0m val_loss:            1.229812  
[2m[36m(model_creator pid=641971)[0m val_dice (F1-Score): 0.18      
[2m[36m(model_creator pid=641971)[0m val_precision:       0.10      
[2m[36m(model_creator pid=641971)[0m val_recall:          0.98      
[2m[36m(model_creator pid=641971)[0m val_jaccard (IOU):   0.10      


Epoch 1:   0%|          | 0/3 [00:00<?, ?it/s]


[2m[36m(model_creator pid=641971)[0m ---------------------------------- epoch 1/10 ---------------------------------


Epoch 1: 100%|██████████| 3/3 [00:00<00:00, 15.12it/s]


[2m[36m(model_creator pid=641971)[0m 
[2m[36m(model_creator pid=641971)[0m loss:                1.141170  
[2m[36m(model_creator pid=641971)[0m dice (F1-Score):     0.09      
[2m[36m(model_creator pid=641971)[0m precision:           0.05      
[2m[36m(model_creator pid=641971)[0m recall:              0.93      
[2m[36m(model_creator pid=641971)[0m jaccard (IOU):       0.05      


2023-07-14 11:54:50,791	INFO tensorboardx.py:275 -- Removed the following hyperparameter values when logging to tensorboard: {'pretrained_arch': ('__ref_ph', 'c70cbae7')}


[2m[36m(model_creator pid=641971)[0m val_loss:            29.733149 
[2m[36m(model_creator pid=641971)[0m val_dice (F1-Score): 0.14      
[2m[36m(model_creator pid=641971)[0m val_precision:       0.07      
[2m[36m(model_creator pid=641971)[0m val_recall:          1.00      
[2m[36m(model_creator pid=641971)[0m val_jaccard (IOU):   0.07      


Epoch 1:   0%|          | 0/3 [00:00<?, ?it/s]
Epoch 1:  33%|███▎      | 1/3 [00:00<00:00,  7.10it/s]


[2m[36m(model_creator pid=641971)[0m ---------------------------------- epoch 1/10 ---------------------------------


Epoch 1: 100%|██████████| 3/3 [00:00<00:00, 15.12it/s]
2023-07-14 11:54:55,422	ERROR tune_controller.py:873 -- Trial task failed for trial model_creator_1614ecc4
Traceback (most recent call last):
  File "/home/tekboart/.local/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/tekboart/.local/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 18, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/tekboart/.local/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/tekboart/.local/lib/python3.11/site-packages/ray/_private/worker.py", line 2540, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=641971, ip=192.168

[2m[36m(model_creator pid=644942)[0m ---------------------------------- epoch 1/10 ---------------------------------


Epoch 1: 100%|██████████| 3/3 [00:00<00:00, 14.78it/s]


[2m[36m(model_creator pid=644942)[0m 
[2m[36m(model_creator pid=644942)[0m loss:                0.815731  
[2m[36m(model_creator pid=644942)[0m dice (F1-Score):     0.10      
[2m[36m(model_creator pid=644942)[0m precision:           0.06      
[2m[36m(model_creator pid=644942)[0m recall:              0.72      
[2m[36m(model_creator pid=644942)[0m jaccard (IOU):       0.06      
[2m[36m(model_creator pid=644942)[0m val_loss:            7.508049  
[2m[36m(model_creator pid=644942)[0m val_dice (F1-Score): 0.14      
[2m[36m(model_creator pid=644942)[0m val_precision:       0.07      
[2m[36m(model_creator pid=644942)[0m val_recall:          1.00      
[2m[36m(model_creator pid=644942)[0m val_jaccard (IOU):   0.07      
[2m[36m(model_creator pid=644942)[0m ---------------------------------- epoch 2/10 ---------------------------------


Epoch 2:   0%|          | 0/3 [00:00<?, ?it/s]
Epoch 2:  33%|███▎      | 1/3 [00:00<00:00,  6.74it/s]
Epoch 2: 100%|██████████| 3/3 [00:00<00:00, 14.49it/s]


[2m[36m(model_creator pid=644942)[0m 
[2m[36m(model_creator pid=644942)[0m loss:                0.684503  
[2m[36m(model_creator pid=644942)[0m dice (F1-Score):     0.17      
[2m[36m(model_creator pid=644942)[0m precision:           0.09      
[2m[36m(model_creator pid=644942)[0m recall:              0.94      
[2m[36m(model_creator pid=644942)[0m jaccard (IOU):       0.09      


2023-07-14 11:55:14,396	INFO tensorboardx.py:275 -- Removed the following hyperparameter values when logging to tensorboard: {'pretrained_arch': ('__ref_ph', 'c70cbae7')}


[2m[36m(model_creator pid=644942)[0m val_loss:            4.144300  
[2m[36m(model_creator pid=644942)[0m val_dice (F1-Score): 0.14      
[2m[36m(model_creator pid=644942)[0m val_precision:       0.07      
[2m[36m(model_creator pid=644942)[0m val_recall:          1.00      
[2m[36m(model_creator pid=644942)[0m val_jaccard (IOU):   0.07      


Epoch 1:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(model_creator pid=644942)[0m ---------------------------------- epoch 1/10 ---------------------------------


Epoch 1: 100%|██████████| 2/2 [00:00<00:00,  8.40it/s]
2023-07-14 11:55:16,618	ERROR tune_controller.py:873 -- Trial task failed for trial model_creator_303487b8
Traceback (most recent call last):
  File "/home/tekboart/.local/lib/python3.11/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
             ^^^^^^^^^^^^^^^
  File "/home/tekboart/.local/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 18, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/tekboart/.local/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/tekboart/.local/lib/python3.11/site-packages/ray/_private/worker.py", line 2540, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=644942, ip=192.168

[2m[36m(model_creator pid=645647)[0m ---------------------------------- epoch 1/10 ---------------------------------


Epoch 1: 100%|██████████| 2/2 [00:00<00:00,  9.44it/s]


[2m[36m(model_creator pid=645647)[0m 
[2m[36m(model_creator pid=645647)[0m loss:                2.553252  
[2m[36m(model_creator pid=645647)[0m dice (F1-Score):     0.07      
[2m[36m(model_creator pid=645647)[0m precision:           0.04      
[2m[36m(model_creator pid=645647)[0m recall:              0.94      
[2m[36m(model_creator pid=645647)[0m jaccard (IOU):       0.04      


Epoch 2:   0%|          | 0/2 [00:00<?, ?it/s]
Epoch 2:  50%|█████     | 1/2 [00:00<00:00,  5.37it/s]


[2m[36m(model_creator pid=645647)[0m val_loss:            1.330276  
[2m[36m(model_creator pid=645647)[0m val_dice (F1-Score): 0.12      
[2m[36m(model_creator pid=645647)[0m val_precision:       0.06      
[2m[36m(model_creator pid=645647)[0m val_recall:          0.85      
[2m[36m(model_creator pid=645647)[0m val_jaccard (IOU):   0.06      
[2m[36m(model_creator pid=645647)[0m ---------------------------------- epoch 2/10 ---------------------------------


Epoch 2: 100%|██████████| 2/2 [00:00<00:00,  9.40it/s]


[2m[36m(model_creator pid=645647)[0m 
[2m[36m(model_creator pid=645647)[0m loss:                2.458742  
[2m[36m(model_creator pid=645647)[0m dice (F1-Score):     0.07      
[2m[36m(model_creator pid=645647)[0m precision:           0.04      
[2m[36m(model_creator pid=645647)[0m recall:              0.87      
[2m[36m(model_creator pid=645647)[0m jaccard (IOU):       0.04      


Epoch 3:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(model_creator pid=645647)[0m val_loss:            0.481332  
[2m[36m(model_creator pid=645647)[0m val_dice (F1-Score): 0.18      
[2m[36m(model_creator pid=645647)[0m val_precision:       0.11      
[2m[36m(model_creator pid=645647)[0m val_recall:          0.43      
[2m[36m(model_creator pid=645647)[0m val_jaccard (IOU):   0.10      
[2m[36m(model_creator pid=645647)[0m ---------------------------------- epoch 3/10 ---------------------------------


Epoch 3: 100%|██████████| 2/2 [00:00<00:00,  9.97it/s]


[2m[36m(model_creator pid=645647)[0m 
[2m[36m(model_creator pid=645647)[0m loss:                0.732139  
[2m[36m(model_creator pid=645647)[0m dice (F1-Score):     0.08      
[2m[36m(model_creator pid=645647)[0m precision:           0.04      
[2m[36m(model_creator pid=645647)[0m recall:              0.40      
[2m[36m(model_creator pid=645647)[0m jaccard (IOU):       0.04      


Epoch 4:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(model_creator pid=645647)[0m val_loss:            0.256219  
[2m[36m(model_creator pid=645647)[0m val_dice (F1-Score): 0.09      
[2m[36m(model_creator pid=645647)[0m val_precision:       0.14      
[2m[36m(model_creator pid=645647)[0m val_recall:          0.07      
[2m[36m(model_creator pid=645647)[0m val_jaccard (IOU):   0.05      
[2m[36m(model_creator pid=645647)[0m ---------------------------------- epoch 4/10 ---------------------------------


Epoch 4: 100%|██████████| 2/2 [00:00<00:00,  7.81it/s]


[2m[36m(model_creator pid=645647)[0m 
[2m[36m(model_creator pid=645647)[0m loss:                0.471068  
[2m[36m(model_creator pid=645647)[0m dice (F1-Score):     0.10      
[2m[36m(model_creator pid=645647)[0m precision:           0.11      
[2m[36m(model_creator pid=645647)[0m recall:              0.14      
[2m[36m(model_creator pid=645647)[0m jaccard (IOU):       0.06      


Epoch 5:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(model_creator pid=645647)[0m val_loss:            0.223094  
[2m[36m(model_creator pid=645647)[0m val_dice (F1-Score): 0.04      
[2m[36m(model_creator pid=645647)[0m val_precision:       0.19      
[2m[36m(model_creator pid=645647)[0m val_recall:          0.02      
[2m[36m(model_creator pid=645647)[0m val_jaccard (IOU):   0.02      
[2m[36m(model_creator pid=645647)[0m ---------------------------------- epoch 5/10 ---------------------------------


Epoch 5: 100%|██████████| 2/2 [00:00<00:00,  8.31it/s]


[2m[36m(model_creator pid=645647)[0m 
[2m[36m(model_creator pid=645647)[0m loss:                0.185487  
[2m[36m(model_creator pid=645647)[0m dice (F1-Score):     0.19      
[2m[36m(model_creator pid=645647)[0m precision:           0.23      
[2m[36m(model_creator pid=645647)[0m recall:              0.17      
[2m[36m(model_creator pid=645647)[0m jaccard (IOU):       0.11      


Epoch 6:   0%|          | 0/2 [00:00<?, ?it/s]
Epoch 6:  50%|█████     | 1/2 [00:00<00:00,  5.59it/s]


[2m[36m(model_creator pid=645647)[0m val_loss:            0.219459  
[2m[36m(model_creator pid=645647)[0m val_dice (F1-Score): 0.03      
[2m[36m(model_creator pid=645647)[0m val_precision:       0.19      
[2m[36m(model_creator pid=645647)[0m val_recall:          0.02      
[2m[36m(model_creator pid=645647)[0m val_jaccard (IOU):   0.02      
[2m[36m(model_creator pid=645647)[0m ---------------------------------- epoch 6/10 ---------------------------------


Epoch 6: 100%|██████████| 2/2 [00:00<00:00,  9.82it/s]


[2m[36m(model_creator pid=645647)[0m 
[2m[36m(model_creator pid=645647)[0m loss:                0.142049  
[2m[36m(model_creator pid=645647)[0m dice (F1-Score):     0.23      
[2m[36m(model_creator pid=645647)[0m precision:           0.48      
[2m[36m(model_creator pid=645647)[0m recall:              0.18      
[2m[36m(model_creator pid=645647)[0m jaccard (IOU):       0.14      


Epoch 7:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(model_creator pid=645647)[0m val_loss:            0.206738  
[2m[36m(model_creator pid=645647)[0m val_dice (F1-Score): 0.05      
[2m[36m(model_creator pid=645647)[0m val_precision:       0.21      
[2m[36m(model_creator pid=645647)[0m val_recall:          0.03      
[2m[36m(model_creator pid=645647)[0m val_jaccard (IOU):   0.02      
[2m[36m(model_creator pid=645647)[0m ---------------------------------- epoch 7/10 ---------------------------------


Epoch 7: 100%|██████████| 2/2 [00:00<00:00, 10.49it/s]


[2m[36m(model_creator pid=645647)[0m 
[2m[36m(model_creator pid=645647)[0m loss:                0.149899  
[2m[36m(model_creator pid=645647)[0m dice (F1-Score):     0.20      
[2m[36m(model_creator pid=645647)[0m precision:           0.59      
[2m[36m(model_creator pid=645647)[0m recall:              0.12      
[2m[36m(model_creator pid=645647)[0m jaccard (IOU):       0.11      


Epoch 8:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(model_creator pid=645647)[0m val_loss:            0.185922  
[2m[36m(model_creator pid=645647)[0m val_dice (F1-Score): 0.11      
[2m[36m(model_creator pid=645647)[0m val_precision:       0.29      
[2m[36m(model_creator pid=645647)[0m val_recall:          0.07      
[2m[36m(model_creator pid=645647)[0m val_jaccard (IOU):   0.06      
[2m[36m(model_creator pid=645647)[0m ---------------------------------- epoch 8/10 ---------------------------------


Epoch 8:  50%|█████     | 1/2 [00:00<00:00,  4.83it/s]
Epoch 8: 100%|██████████| 2/2 [00:00<00:00,  8.59it/s]


[2m[36m(model_creator pid=645647)[0m 
[2m[36m(model_creator pid=645647)[0m loss:                0.101207  
[2m[36m(model_creator pid=645647)[0m dice (F1-Score):     0.49      
[2m[36m(model_creator pid=645647)[0m precision:           0.78      
[2m[36m(model_creator pid=645647)[0m recall:              0.37      
[2m[36m(model_creator pid=645647)[0m jaccard (IOU):       0.33      


Epoch 9:   0%|          | 0/2 [00:00<?, ?it/s]
Epoch 9:  50%|█████     | 1/2 [00:00<00:00,  5.65it/s]


[2m[36m(model_creator pid=645647)[0m val_loss:            0.170239  
[2m[36m(model_creator pid=645647)[0m val_dice (F1-Score): 0.19      
[2m[36m(model_creator pid=645647)[0m val_precision:       0.37      
[2m[36m(model_creator pid=645647)[0m val_recall:          0.13      
[2m[36m(model_creator pid=645647)[0m val_jaccard (IOU):   0.10      
[2m[36m(model_creator pid=645647)[0m ---------------------------------- epoch 9/10 ---------------------------------


Epoch 9: 100%|██████████| 2/2 [00:00<00:00,  9.82it/s]


[2m[36m(model_creator pid=645647)[0m 
[2m[36m(model_creator pid=645647)[0m loss:                0.089101  
[2m[36m(model_creator pid=645647)[0m dice (F1-Score):     0.69      
[2m[36m(model_creator pid=645647)[0m precision:           0.78      
[2m[36m(model_creator pid=645647)[0m recall:              0.62      
[2m[36m(model_creator pid=645647)[0m jaccard (IOU):       0.54      


Epoch 10:   0%|          | 0/2 [00:00<?, ?it/s]


[2m[36m(model_creator pid=645647)[0m val_loss:            0.157735  
[2m[36m(model_creator pid=645647)[0m val_dice (F1-Score): 0.28      
[2m[36m(model_creator pid=645647)[0m val_precision:       0.44      
[2m[36m(model_creator pid=645647)[0m val_recall:          0.20      
[2m[36m(model_creator pid=645647)[0m val_jaccard (IOU):   0.16      
[2m[36m(model_creator pid=645647)[0m --------------------------------- epoch 10/10 ---------------------------------


Epoch 10: 100%|██████████| 2/2 [00:00<00:00,  9.44it/s]


[2m[36m(model_creator pid=645647)[0m 
[2m[36m(model_creator pid=645647)[0m loss:                0.070277  
[2m[36m(model_creator pid=645647)[0m dice (F1-Score):     0.66      
[2m[36m(model_creator pid=645647)[0m precision:           0.68      
[2m[36m(model_creator pid=645647)[0m recall:              0.67      
[2m[36m(model_creator pid=645647)[0m jaccard (IOU):       0.49      


2023-07-14 11:55:33,561	INFO tensorboardx.py:275 -- Removed the following hyperparameter values when logging to tensorboard: {'pretrained_arch': ('__ref_ph', 'd08b895f')}
2023-07-14 11:55:33,571	ERROR tune.py:1107 -- Trials did not complete: [model_creator_1614ecc4, model_creator_303487b8]
2023-07-14 11:55:33,571	INFO tune.py:1111 -- Total run time: 63.75 seconds (63.70 seconds for the tuning loop).
- /home/tekboart/ray_results/model_creator_2023-07-14_11-54-26/model_creator_1614ecc4_3_batch_size=4,image_size=352,lr=0.0003,pretrained_arch=ref_ph_c9fd45ce,pretrained_encoder=timm-efficientne_2023-07-14_11-54-49
- /home/tekboart/ray_results/model_creator_2023-07-14_11-54-26/model_creator_303487b8_5_batch_size=8,image_size=256,lr=0.0002,pretrained_arch=ref_ph_d08b895f,pretrained_encoder=timm-efficientne_2023-07-14_11-55-01


[2m[36m(model_creator pid=645647)[0m val_loss:            0.143041  
[2m[36m(model_creator pid=645647)[0m val_dice (F1-Score): 0.43      
[2m[36m(model_creator pid=645647)[0m val_precision:       0.53      
[2m[36m(model_creator pid=645647)[0m val_recall:          0.37      
[2m[36m(model_creator pid=645647)[0m val_jaccard (IOU):   0.28      


# Report the best found hyperparams
> You can find the important reports of ... in:
* ~/ray_results

> Just make sure to delete them if don't need them anymore

> The files include:
1. The result.json (of the best model)
1. The saved checkpoint (of the best model)
1. The parameters (of the best model)

In [70]:
# Get the configs for the best performing model
best_trial = results.get_best_result("val_loss", 'min', "last")
print(f"Best trial config: {best_trial.config}")

# Define the used metrics
metrics = ["loss", "jaccard (IOU)", "dice (F1-Score)", "recall", "precision"]
# add "val_" at the beginning
metrics_val = [f'val_{metric}' for metric in metrics]

for metric in metrics_val:
    print(f"Best trial final {metric}: {best_trial.metrics[metric]}")

Best trial config: {'lr': 0.00010136333446857242, 'scheduler_factor': 0.3, 'batch_size': 8, 'image_size': 352, 'pretrained_arch': <class 'segmentation_models_pytorch.decoders.fpn.model.FPN'>, 'pretrained_encoder': 'timm-mobilenetv3_large_100'}
Best trial final val_loss: 0.14304067194461823
Best trial final val_jaccard (IOU): 0.2772456109523773
Best trial final val_dice (F1-Score): 0.43413046002388
Best trial final val_recall: 0.36926957964897156
Best trial final val_precision: 0.5266313552856445


In [69]:
# print the results as a DataFrame
df_results = results.get_dataframe()
df_results.insert(loc=0, column="Model", value=range(1, len(df_results.index) + 1))
df_results.to_csv(f"outputs//{train_timestamp}@history_training_train_val.csv", index=False)
print(" Metrics During Training ".center(79, " "))
display(df_results)
# del df_results

                            Metrics During Training                            


Unnamed: 0,Model,val_jaccard (IOU),val_recall,val_precision,val_dice (F1-Score),val_loss,time_this_iter_s,done,training_iteration,trial_id,...,node_ip,time_since_restore,iterations_since_restore,config/batch_size,config/image_size,config/lr,config/pretrained_arch,config/pretrained_encoder,config/scheduler_factor,logdir
0,1,0.103169,0.983342,0.103305,0.181117,1.229812,1.230342,True,10,186d809e,...,192.168.1.103,12.82936,10,2,352,0.000701,<class 'segmentation_models_pytorch.decoders.u...,timm-mobilenetv3_large_100,0.1,/home/tekboart/ray_results/model_creator_2023-...
1,2,0.073833,0.999216,0.073835,0.136052,29.733149,1.642616,True,1,7742e1ea,...,192.168.1.103,1.642616,1,4,256,0.000885,<class 'segmentation_models_pytorch.decoders.u...,mobilenet_v2,0.1,/home/tekboart/ray_results/model_creator_2023-...
2,3,0.073897,0.999956,0.073897,0.136166,4.1443,5.536406,True,2,b23d6090,...,192.168.1.103,12.97748,2,4,256,0.000335,<class 'segmentation_models_pytorch.decoders.u...,timm-efficientnet-b8,0.3,/home/tekboart/ray_results/model_creator_2023-...
3,4,0.277246,0.36927,0.526631,0.43413,0.143041,1.082476,True,10,34bb705e,...,192.168.1.103,11.231061,10,8,352,0.000101,<class 'segmentation_models_pytorch.decoders.f...,timm-mobilenetv3_large_100,0.3,/home/tekboart/ray_results/model_creator_2023-...


# Plot the Metrics (for all of the trained models)

In [68]:
# Obtain a trial dataframe from all run trials of this `tune.run` call.
# dfs = {result: results.get_dataframe()[result] for result in results.get_dataframe()}

# plt.plot(dfs['val_loss'])

# Plot by epoch
# ax = None  # This plots everything on the same plot
# for d in dfs.values():
    # ax = d['val_loss'].plot(ax=ax, legend=False)
    # plt.plot(d['val_loss'])
# ax.set_xlabel('Epochs')
# ax.set_ylabel("val_loss")

# Save the results
> Results are the config and metrics of all the trained models (not just the best one)

> We can use results later on to report different performance in table.

In [None]:
# save hyperparams as a JSON file
#TODO: Check the type of results
hyper_params_export_name = (
    f"outputs{os.sep}hyperparams_search{os.sep}{train_timestamp}@ray_tuner_hyperparams.json"
)
with open(hyper_params_export_name, "w") as f:
    json.dump(dict(map(lambda x: (x[0], str(x[1])), results)), f)

# save the hyperparams to a csv file
# df_hyperparam = pd.DataFrame(hyper_params, index=[0]).T
# df_hyperparam.to_csv(f'outputs/hyperparams/{start_train_time}@hyperparams.csv', index=True, header=None)
# print('The HyperParameters'.ljust(79, " "))
# display(df_hyperparam)
# del df_hyperparam