In [1]:
# Print the GPU status table (driver/CUDA version, memory usage, utilisation)
# so the state of the device is recorded before any job is launched.
!nvidia-smi

Mon Jul  3 11:28:16 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:06:00.0 Off |                  N/A |
| 23%   32C    P8     9W / 250W |      0MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Setup environment

In [2]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell is executed.
%autoreload 2

## Setup config

In [37]:
import os

# ---- Experiment identity ---------------------------------------------------
workspace_dir = '/nfs/Workspace/CardiacSeg'
model_name = 'unet3d'
data_name = 'mmwhs'
exp_name = 'exp_2_tn2'
data_dict_file_name = 'exp_2.json'

# Value forwarded to tune.py as --tune_mode.
tune_mode = 'train'

# ---- Absolute locations inside the workspace -------------------------------
# Tuning results live under <workspace>/exps/exps/<model>/<data>/tune_results.
root_exp_dir = os.path.join(
    workspace_dir, 'exps', 'exps', model_name, data_name, 'tune_results'
)

# The dataset directory is used directly as the data directory.
root_data_dir = os.path.join(workspace_dir, 'dataset', data_name)
data_dir = root_data_dir

# JSON file describing the data split for this experiment.
data_dicts_json = os.path.join(
    workspace_dir, 'exps', 'data_dicts', data_name, data_dict_file_name
)

# ---- Relative output folders (passed to the scripts as-is) ----------------
model_dir, log_dir, eval_dir = (
    os.path.join('./', folder) for folder in ('models', 'logs', 'evals')
)

best_checkpoint, final_checkpoint = (
    os.path.join(model_dir, ckpt_name)
    for ckpt_name in ('best_model.pth', 'final_model.pth')
)

# ---- Pre-training experiment ----------------------------------------------
pretrain_exp_name = 'exp_50'
pretrain_data_name = 'image_cas'
pretrain_model_dir = os.path.join(
    workspace_dir, 'exps', 'exps', model_name,
    pretrain_data_name, 'pretrain', pretrain_exp_name, 'models'
)
pretrain_checkpoint = os.path.join(pretrain_model_dir, 'model_bestValRMSE.pt')

# Create the tuning-results directory (and any missing parents); a no-op if
# it already exists.
os.makedirs(root_exp_dir, exist_ok=True)

# Move the kernel's working directory to the parent of `tune_results`
# (<workspace_dir>/exps/exps/<model_name>/<data_name>). `!` commands in later
# cells are started from this directory.
%cd {root_exp_dir}/../

/nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a1/mmwhs


## Train UNETCNX

In [39]:
# Launch tune.py with the settings from the config cell (deep supervision on).
# The project root is interpolated from `workspace_dir` so the path is defined
# in exactly one place.
# NOTE: the continued lines are joined into one shell command line, so
# everything after the first `#` below is ignored. To enable an optional flag,
# remove its `#` and keep it above any flag that is still commented out.
!PYTHONPATH={workspace_dir} /opt/conda/bin/python {workspace_dir}/expers/tune.py \
--tune_mode={tune_mode} \
--exp_name={exp_name} \
--data_name={data_name} \
--data_dir={data_dir} \
--root_exp_dir={root_exp_dir} \
--model_name={model_name} \
--model_dir={model_dir} \
--log_dir={log_dir} \
--eval_dir={eval_dir} \
--start_epoch=0 \
--val_every=20 \
--max_early_stop_count=20 \
--max_epoch=8000 \
--data_dicts_json={data_dicts_json} \
--pin_memory \
--out_channels=8 \
--patch_size=4 \
--feature_size=48 \
--drop_rate=0.1 \
--depths 3 3 9 3 \
--kernel_size 7 \
--exp_rate 4 \
--norm_name='layer' \
--a_min=-70 \
--a_max=677 \
--space_x=1.0 \
--space_y=1.0 \
--space_z=1.0 \
--roi_x=96 \
--roi_y=96 \
--roi_z=96 \
--optim='AdamW' \
--lr=7e-4 \
--weight_decay=5e-4 \
--checkpoint={final_checkpoint} \
--use_init_weights \
--deep_sup \
--infer_post_process \
# --resume_tuner \
# --test_mode \
# --save_eval_csv

test mode
resume tuner form /nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a1/mmwhs/tune_results
run test mode ...
cuda is available
model: unetcnx_a1
patch size: 4
ker size: 7
exp rate: 4
feature sizes: [48, 96, 192, 384]
depths: [3, 3, 9, 3]
drop rate: 0.1
use init weights: True
is conv stem: False
use init weights
use deep sup
loss: dice ce loss
optimzer: AdamW
{'lr': 0.0007, 'weight_decay': 0.0005}
=> loaded checkpoint '/nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a1/mmwhs/tune_results/exp_2_tn2/main_ba545_00000_0_exp=exp_exp_2_tn2_2023-06-02_13-38-16/models/best_model.pth' (epoch 881) (bestacc 0.9047536253929138) (early stop count 0)
load json from /nfs/Workspace/CardiacSeg/exps/data_dicts/mmwhs/exp_2.json
train files (8): ['ct_train_1001_image', 'ct_train_1002_image', 'ct_train_1003_image', 'ct_train_1004_image', 'ct_train_1005_image', 'ct_train_1006_image', 'ct_train_1007_image', 'ct_train_1008_image']
val files (2): ['ct_train_1009_image', 'ct_train_1010_image']
test files (10): ['

## Train other model

In [None]:
# Launch tune.py with the settings from the config cell (no --deep_sup flag).
# The project root is interpolated from `workspace_dir` so the path is defined
# in exactly one place.
# NOTE: the continued lines are joined into one shell command line, so
# everything after the first `#` below is ignored. To enable an optional flag,
# remove its `#` and keep it above any flag that is still commented out.
!PYTHONPATH={workspace_dir} /opt/conda/bin/python {workspace_dir}/expers/tune.py \
--tune_mode={tune_mode} \
--exp_name={exp_name} \
--data_name={data_name} \
--data_dir={data_dir} \
--root_exp_dir={root_exp_dir} \
--model_name={model_name} \
--model_dir={model_dir} \
--log_dir={log_dir} \
--eval_dir={eval_dir} \
--start_epoch=0 \
--val_every=20 \
--max_early_stop_count=20 \
--max_epoch=8000 \
--data_dicts_json={data_dicts_json} \
--pin_memory \
--out_channels=8 \
--patch_size=4 \
--feature_size=48 \
--drop_rate=0.1 \
--depths 3 3 9 3 \
--kernel_size 7 \
--exp_rate 4 \
--norm_name='layer' \
--a_min=-70 \
--a_max=677 \
--space_x=1.0 \
--space_y=1.0 \
--space_z=1.0 \
--roi_x=96 \
--roi_y=96 \
--roi_z=96 \
--optim='AdamW' \
--lr=7e-4 \
--weight_decay=5e-4 \
--checkpoint={final_checkpoint} \
--use_init_weights \
--infer_post_process \
# --resume_tuner \
# --test_mode \
# --save_eval_csv

## Analysis

In [5]:
# Run the tuning-analysis script on the results stored under `root_exp_dir`
# for experiment `exp_name`. The project root comes from `workspace_dir` so
# the path is defined in exactly one place.
!PYTHONPATH={workspace_dir} /opt/conda/bin/python {workspace_dir}/expers/tune_anal.py \
--exp_name={exp_name} \
--local_dir={root_exp_dir}

Loading results from /nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a0/mmwhs/tune_results/tmp...
2023-05-05 05:57:27,424 - No `self.trials`. Drawing logdirs from checkpoint file. This may result in some information that is out of sync, as checkpointing is periodic.
Trial 7ffb2_00000:  {'exp': {'exp': 'tmp'}} 0.85568607 0.22231789

Best trial 7ffb2_00000: 
config: {'exp': {'exp': 'tmp'}}
tt_dice: 0.85568607
tt_hd95: 21.87371429569567
inf_dice: 0.22231789
inf_hd95: 114.227495408918
best log dir: /nfs/Workspace/CardiacSeg/exps/exps/unetcnx_a0/mmwhs/tune_results/tmp/main_7ffb2_00000_0_exp=exp_tmp_2023-04-23_03-32-40
final early stop count: 20
final epoch: 1920
best val dice: 0.9036121964454651
[0m