In [1]:
import numpy as np
import os
import pandas as pd
import albumentations as A
from albumentations.pytorch import ToTensorV2
import wandb
from dotenv import load_dotenv

from src.model.model import save_model, load_model
from src.dataset.df import df_dataset, check_dataset
from src.utils.common import set_seed
from src.utils.optim_thresh import calc_optim_thresh
from src.experiment.experiment import train, valid
from src.experiment.initialize import init_dataset, init_model, init_exp

In [2]:
# Experiment configuration. In the cells shown here the class object itself
# (never an instance) is passed to the project helpers as `cfg`.
class cfg:
    # debug=True subsamples the dataframe to 10000 rows and shortens training
    # to 5 epochs (both handled in later cells).
    debug = False
    # When True, check_dataset(df, cfg) is run after the dataframe is built.
    check_dataset = False

    # = data CFG ====================================================

    # Root directory of the pre-cropped dataset; consumed by df_dataset(cfg).
    dataset_path = "/kaggle/working/dataset/cropped_xy_256_128_z_5_5/"
    # Dataset implementation selector -- presumably resolved inside
    # init_dataset; verify against src.experiment.initialize.
    dataset = "base2d"
    # Probability with which filter_dataset keeps a row whose "sum" is not
    # positive (i.e. a crop without any label).
    negative_sample_rate = 0.1
    # = experiment CFG =================================================

    # Project name and free-text notes -- presumably forwarded to wandb by
    # init_exp; TODO confirm.
    project = "SenNet"
    # The experiment name is the name of the current working directory; it is
    # also used to build the checkpoint paths in the training cell.
    exp_name = os.path.basename(os.getcwd())
    notes = "basel 3_axis negative_sample=0.1"

    # = model CFG ======================================================

    # Architecture / encoder identifiers consumed by init_model(cfg).
    model_arch = "Unet"
    backbone = "efficientnet-b0"
    # NOTE(review): presumably the number of input slices and output mask
    # channels (the dataset directory name ends in "z_5_5") -- confirm.
    in_chans = 5
    target_size = 5

    # = training CFG ===================================================

    # Number of passes of the train/valid loop; overridden to 5 when debug=True.
    epochs = 20

    train_batch_size = 64
    # Validation uses the same batch size as training.
    valid_batch_size = train_batch_size

    # Names of the loss and metric -- presumably looked up by init_model.
    loss = "DiceLoss"
    metrics = "Dice"
    lr = 1e-4
    # 25 candidate values: 2, 6, ..., 98. Presumably percent thresholds scanned
    # by calc_optim_thresh -- TODO confirm the unit.
    thresholds_to_test = range(2, 101, 4)
    # NOTE(review): presumably the DataLoader worker count; 24 workers is
    # memory-hungry and not portable across machines -- confirm it fits the host.
    num_workers = 24

    # = augmentation ===================================================

    # Within this cell image_size is referenced only by the commented-out
    # RandomResizedCrop below.
    image_size = 256
    # Training transforms. All augmentations are currently disabled, so training
    # and validation apply the same tensor conversion only.
    train_aug = [
        # A.RandomResizedCrop(image_size, image_size, scale=(0.8, 1.25)),
        # A.ShiftScaleRotate(p=0.75),
        # A.OneOf(
        #     [
        #         A.GaussNoise(var_limit=[10, 50]),
        #         A.GaussianBlur(),
        #         A.MotionBlur(),
        #     ],
        #     p=0.4,
        # ),
        # A.GridDistortion(num_steps=5, distort_limit=0.3, p=0.5),
        ToTensorV2(transpose_mask=True),
    ]

    # Validation transforms: tensor conversion only.
    valid_aug = [
        ToTensorV2(transpose_mask=True),
    ]


# Load environment variables (e.g. service credentials) from the key file
# instead of hard-coding them in the notebook.
load_dotenv("/kaggle/key.env")
# Project helper called with its defaults -- presumably seeds the RNGs used
# below (filter_dataset and DataFrame.sample draw from NumPy's global RNG).
set_seed()

In [3]:
def filter_dataset(df, negative_sample_rate=None):
    """Down-sample rows that contain no label.

    Rows whose ``sum`` column is positive are always kept. Every other row is
    kept only when a uniform draw from [0, 1) falls below
    ``negative_sample_rate`` (with the default rate of 0.1, roughly 90% of the
    label-free rows are discarded).

    Args:
        df: DataFrame with a numeric ``sum`` column. It is not modified.
        negative_sample_rate: Probability of keeping a row without labels.
            When omitted, ``cfg.negative_sample_rate`` is used.

    Returns:
        A new DataFrame holding the surviving rows with a fresh 0..n-1 index.
    """
    if negative_sample_rate is None:
        negative_sample_rate = cfg.negative_sample_rate

    # One draw per row from NumPy's global RNG. The draws are kept in a local
    # array rather than written into a helper column, so the caller's frame is
    # left untouched.
    draws = np.random.rand(len(df))
    keep = (df["sum"] > 0).to_numpy() | (draws < negative_sample_rate)
    return df[keep].reset_index(drop=True)


# Build the sample table, then thin out the rows that carry no label.
df = df_dataset(cfg)
df = filter_dataset(df)

if cfg.debug:
    # Debug runs work on a random subset of at most 10000 rows. Capping at the
    # available row count avoids the ValueError that DataFrame.sample raises
    # when asked for more rows than exist.
    df = df.sample(min(10000, len(df))).reset_index(drop=True)
display(df)

if cfg.check_dataset:
    check_dataset(df, cfg)

Unnamed: 0,image_path,label_path,fname,kidney,x,y,z,std,sum,fold0,fold1
0,/kaggle/working/dataset/cropped_xy_256_128_z_5...,/kaggle/working/dataset/cropped_xy_256_128_z_5...,x0_y0_z120_std0033_sum0,kidney_1_dense,0,0,120,33,0,train,valid
1,/kaggle/working/dataset/cropped_xy_256_128_z_5...,/kaggle/working/dataset/cropped_xy_256_128_z_5...,x0_y0_z140_std0033_sum0,kidney_1_dense,0,0,140,33,0,train,valid
2,/kaggle/working/dataset/cropped_xy_256_128_z_5...,/kaggle/working/dataset/cropped_xy_256_128_z_5...,x0_y0_z225_std0033_sum0,kidney_1_dense,0,0,225,33,0,train,valid
3,/kaggle/working/dataset/cropped_xy_256_128_z_5...,/kaggle/working/dataset/cropped_xy_256_128_z_5...,x0_y0_z240_std0035_sum0,kidney_1_dense,0,0,240,35,0,train,valid
4,/kaggle/working/dataset/cropped_xy_256_128_z_5...,/kaggle/working/dataset/cropped_xy_256_128_z_5...,x0_y0_z260_std0036_sum0,kidney_1_dense,0,0,260,36,0,train,valid
...,...,...,...,...,...,...,...,...,...,...,...
143589,/kaggle/working/dataset/cropped_xy_256_128_z_5...,/kaggle/working/dataset/cropped_xy_256_128_z_5...,x896_y256_z975_std0257_sum899,kidney_3_sparse,896,256,975,257,899,,train
143590,/kaggle/working/dataset/cropped_xy_256_128_z_5...,/kaggle/working/dataset/cropped_xy_256_128_z_5...,x896_y256_z980_std0259_sum3087,kidney_3_sparse,896,256,980,259,3087,,train
143591,/kaggle/working/dataset/cropped_xy_256_128_z_5...,/kaggle/working/dataset/cropped_xy_256_128_z_5...,x896_y256_z985_std0259_sum5991,kidney_3_sparse,896,256,985,259,5991,,train
143592,/kaggle/working/dataset/cropped_xy_256_128_z_5...,/kaggle/working/dataset/cropped_xy_256_128_z_5...,x896_y256_z990_std0262_sum7311,kidney_3_sparse,896,256,990,262,7311,,train


In [4]:
# Split on the fold-1 assignment. Rows whose fold1 value is neither "train"
# nor "valid" (e.g. missing) do not appear in the recombined frame.
df_train = df[df["fold1"] == "train"]
df_valid = df[df["fold1"] == "valid"]

# Validate on a random subset of at most 10000 rows. The cap matters in debug
# mode, where the whole frame has already been cut to 10000 rows and the
# validation part is therefore smaller than the requested sample size.
n_valid = min(10000, len(df_valid))
df_valid = df_valid.sample(n_valid).reset_index(drop=True)

# ignore_index gives the combined frame unique 0..n-1 labels; without it the
# validation rows (relabelled from 0) collide with train-row labels.
df = pd.concat([df_train, df_valid], ignore_index=True)

In [5]:
if cfg.debug:
    print("!!!Debug mode!!!\n")
    cfg.epochs = 5

# Only fold 1 is trained in this experiment.
for fold in range(1, 2):
    train_dataloader, valid_dataloader = init_dataset(fold, df, cfg)
    model, scaler, criterion, optimizer, scheduler, metrics = init_model(cfg)
    slacknotify = init_exp(fold, cfg)

    # Everything after experiment initialisation is guarded so that the wandb
    # run is always closed, even when training or evaluation raises (for
    # example a DataLoader worker failure).
    try:
        path_best = f"./{cfg.exp_name}/{cfg.exp_name}_best_fold{fold}.pth"
        path_last = f"./{cfg.exp_name}/{cfg.exp_name}_last_fold{fold}.pth"

        # Model selection is driven by the validation loss (lower is better).
        best_loss = float("inf")
        for epoch in range(cfg.epochs):
            train(model, train_dataloader, optimizer, criterion, scheduler, scaler, epoch, cfg)
            loss, pred_list, true_list = valid(model, valid_dataloader, criterion, epoch, cfg)

            # NOTE: the Slack messages label the value "score", but it is the
            # validation loss.
            if loss < best_loss:
                print(f"loss : {loss:.4f}\tSAVED MODEL\n")
                slacknotify.send_reply(f"epoch : {epoch}\tscore : {loss:.4f}\tBEST")
                best_loss = loss
                save_model(model, cfg, path_best, loss=loss)
            else:
                print(f"loss : {loss:.4f}\n")
                slacknotify.send_reply(f"epoch : {epoch}\tscore : {loss:.4f}")

        # Threshold search on the predictions of the final epoch, stored with
        # the "last" checkpoint.
        last_score, last_thresh = calc_optim_thresh(pred_list, true_list, metrics, cfg)
        save_model(model, cfg, path_last, loss=loss, score=last_score, thresh=last_thresh)
        wandb.config.update({"last_score": last_score, "last_thresh": last_thresh})

        # Re-evaluate the best checkpoint (log=False is passed to valid for this
        # extra pass) and re-save it together with its own score and threshold.
        best_model = load_model(model, path_best)
        loss, pred_list, true_list = valid(best_model, valid_dataloader, criterion, epoch, cfg, log=False)

        best_score, best_thresh = calc_optim_thresh(pred_list, true_list, metrics, cfg)
        save_model(best_model, cfg, path_best, loss=loss, score=best_score, thresh=best_thresh)
        wandb.config.update({"best_score": best_score, "best_thresh": best_thresh})

        slacknotify.send_reply(
            f"{cfg.exp_name}_fold{fold} training finished\nbest score : {best_score:.4f} last score : {last_score:.4f}",
            True,
        )
    except BaseException:
        # Close the run with a non-zero exit code so it is recorded as failed,
        # then let the original error propagate unchanged.
        if wandb.run:
            wandb.finish(exit_code=1)
        raise
    else:
        if wandb.run:
            wandb.finish()

model_arch:  Unet
backbone:  efficientnet-b0


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mwelshonionman[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 0/20  Mem : 7.24GB  LR : 1.00E-04  Loss: 0.3581: 100%|██████████| 1385/1385 [05:35<00:00,  4.13it/s]
Val Loss: 0.2506: 100%|██████████| 157/157 [00:48<00:00,  3.24it/s]


loss : 0.2506	SAVED MODEL



Epoch 1/20  Mem : 9.4GB  LR : 1.00E-03  Loss: 0.0717: 100%|██████████| 1385/1385 [05:26<00:00,  4.24it/s]
Val Loss: 0.2018: 100%|██████████| 157/157 [00:38<00:00,  4.09it/s]


loss : 0.2018	SAVED MODEL



Epoch 2/20  Mem : 9.4GB  LR : 1.00E-04  Loss: 0.0499: 100%|██████████| 1385/1385 [05:03<00:00,  4.56it/s]
Val Loss: 0.1982: 100%|██████████| 157/157 [00:38<00:00,  4.13it/s]


loss : 0.1982	SAVED MODEL



Epoch 3/20  Mem : 9.4GB  LR : 9.94E-05  Loss: 0.0480: 100%|██████████| 1385/1385 [05:05<00:00,  4.53it/s]
Val Loss: 0.1987: 100%|██████████| 157/157 [00:37<00:00,  4.19it/s]


loss : 0.1987



Epoch 4/20  Mem : 9.4GB  LR : 9.76E-05  Loss: 0.0463: 100%|██████████| 1385/1385 [05:03<00:00,  4.56it/s]
Val Loss: 0.1895: 100%|██████████| 157/157 [00:37<00:00,  4.17it/s]


loss : 0.1895	SAVED MODEL



Epoch 5/20  Mem : 9.4GB  LR : 9.46E-05  Loss: 0.0447: 100%|██████████| 1385/1385 [05:05<00:00,  4.53it/s]
Val Loss: 0.1965: 100%|██████████| 157/157 [00:37<00:00,  4.15it/s]


loss : 0.1965



Epoch 6/20  Mem : 9.4GB  LR : 9.05E-05  Loss: 0.0434: 100%|██████████| 1385/1385 [05:06<00:00,  4.53it/s]
Val Loss: 0.1828: 100%|██████████| 157/157 [00:38<00:00,  4.03it/s]


loss : 0.1828	SAVED MODEL



Epoch 7/20  Mem : 9.4GB  LR : 8.54E-05  Loss: 0.0421: 100%|██████████| 1385/1385 [05:11<00:00,  4.45it/s]
Val Loss: 0.1823: 100%|██████████| 157/157 [00:37<00:00,  4.19it/s]


loss : 0.1823	SAVED MODEL



Epoch 8/20  Mem : 9.4GB  LR : 7.94E-05  Loss: 0.0412: 100%|██████████| 1385/1385 [05:11<00:00,  4.45it/s]
Val Loss: 0.1972: 100%|██████████| 157/157 [00:38<00:00,  4.09it/s]


loss : 0.1972



Epoch 9/20  Mem : 9.4GB  LR : 7.27E-05  Loss: 0.0401: 100%|██████████| 1385/1385 [05:09<00:00,  4.47it/s]
Val Loss: 0.2103: 100%|██████████| 157/157 [00:37<00:00,  4.24it/s]


loss : 0.2103



Epoch 10/20  Mem : 9.4GB  LR : 6.55E-05  Loss: 0.0390: 100%|██████████| 1385/1385 [04:53<00:00,  4.72it/s]
Val Loss: 0.1924: 100%|██████████| 157/157 [00:41<00:00,  3.76it/s]


loss : 0.1924



Epoch 11/20  Mem : 9.4GB  LR : 5.79E-05  Loss: 0.0379: 100%|██████████| 1385/1385 [04:48<00:00,  4.80it/s]
Val Loss: 0.1805: 100%|██████████| 157/157 [00:36<00:00,  4.30it/s]


loss : 0.1805	SAVED MODEL



Epoch 12/20  Mem : 9.4GB  LR : 5.01E-05  Loss: 0.0369: 100%|██████████| 1385/1385 [04:47<00:00,  4.82it/s]
Val Loss: 0.1839: 100%|██████████| 157/157 [00:35<00:00,  4.41it/s]


loss : 0.1839



Epoch 13/20  Mem : 9.4GB  LR : 4.22E-05  Loss: 0.0361: 100%|██████████| 1385/1385 [04:44<00:00,  4.86it/s]
Val Loss: 0.1771: 100%|██████████| 157/157 [00:35<00:00,  4.49it/s]


loss : 0.1771	SAVED MODEL



Epoch 14/20  Mem : 9.4GB  LR : 3.46E-05  Loss: 0.0355: 100%|██████████| 1385/1385 [04:43<00:00,  4.88it/s]
Val Loss: 0.1811: 100%|██████████| 157/157 [00:35<00:00,  4.38it/s]


loss : 0.1811



Epoch 15/20  Mem : 9.4GB  LR : 2.74E-05  Loss: 0.0348: 100%|██████████| 1385/1385 [04:44<00:00,  4.87it/s]
Val Loss: 0.1811: 100%|██████████| 157/157 [00:35<00:00,  4.40it/s]


loss : 0.1811



Epoch 16/20  Mem : 9.4GB  LR : 2.07E-05  Loss: 0.0343: 100%|██████████| 1385/1385 [04:43<00:00,  4.89it/s]
Val Loss: 0.1814: 100%|██████████| 157/157 [00:35<00:00,  4.45it/s]


loss : 0.1814



Epoch 17/20  Mem : 9.4GB  LR : 1.47E-05  Loss: 0.0339: 100%|██████████| 1385/1385 [04:42<00:00,  4.90it/s]
Val Loss: 0.1847: 100%|██████████| 157/157 [00:35<00:00,  4.46it/s]


loss : 0.1847



Epoch 18/20  Mem : 9.4GB  LR : 9.64E-06  Loss: 0.0336: 100%|██████████| 1385/1385 [04:56<00:00,  4.68it/s]
Val Loss: 0.1852: 100%|██████████| 157/157 [00:36<00:00,  4.29it/s]


loss : 0.1852



Epoch 19/20  Mem : 9.4GB  LR : 5.54E-06  Loss: 0.0335: 100%|██████████| 1385/1385 [04:48<00:00,  4.81it/s]
Val Loss: 0.1849: 100%|██████████| 157/157 [00:36<00:00,  4.36it/s]


loss : 0.1849



100%|██████████| 25/25 [03:43<00:00,  8.93s/it]
Val Loss: 0.1772:  76%|███████▋  | 120/157 [00:32<00:10,  3.69it/s]


OSError: Caught OSError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/kaggle/src/dataset/common.py", line 25, in __getitem__
    image = np.load(self.df["image_path"][idx]).astype(np.float32)
  File "/usr/local/lib/python3.10/dist-packages/numpy/lib/npyio.py", line 405, in load
    fid = stack.enter_context(open(os_fspath(file), "rb"))
OSError: [Errno 12] Cannot allocate memory: '/kaggle/working/dataset/cropped_xy_256_128_z_5_5//image/kidney_1_dense_axis1/x896_y0_z870_std0206_sum2030.npy'
