In [1]:
import os
# Debug switch: makes CUDA kernel launches synchronous so that a device-side
# assert is reported at the call that caused it. It is set before `import torch`
# below so it is in place before CUDA is initialised.
# NOTE(review): this slows training; presumably meant to be removed once the
# loss assertion further down is resolved — confirm.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
%load_ext autoreload
%autoreload 2
import torch
import torch.nn as nn
import sys
sys.path.append('../scripts/')
from unet_custom_implementation import Unet, Unet_Leaky, UNet
import torchvision
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import wandb
import segmentation_models_pytorch as smp
from tqdm import tqdm
from torch.functional import F

In [2]:
# Pick the compute device once for the whole notebook.
# `device` is always defined (falls back to the CPU), so cells that call
# `.to(device)` also work on a machine without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# True only when more than one CUDA device is visible; device_count() is 0
# when CUDA is unavailable, so this is False on CPU-only machines.
multiple_gpus = torch.cuda.device_count() > 1

In [3]:
# Load the segmentation CSV and collapse it to one row per image: every
# 'EncodedPixels' entry that shares an 'ImageId' is gathered into a list.
csv_file = (
    pd.read_csv('../../ship_data/train_ship_segmentations_v2.csv')
    .groupby('ImageId')['EncodedPixels']
    .apply(list)
    .reset_index()
)
# Plain-Python copies of the two columns.
image_ids = csv_file['ImageId'].to_list()
pixels = csv_file['EncodedPixels'].to_list()

In [4]:
# Derive the on-disk locations for every image id:
#   fixed_inputs -> the JPEG under train_v2/
#   mask_paths   -> the PNG under masks_v1/train/, named after the part of the
#                   id that precedes the first '.'
csv_file['fixed_inputs'] = csv_file['ImageId'].map(
    lambda image_id: f'../../ship_data/train_v2/{image_id}'
)
csv_file['mask_paths'] = csv_file['ImageId'].map(
    lambda image_id: f"../../ship_data/masks_v1/train/{image_id.split('.')[0]}.png"
)

In [5]:
# Exclude image 6384c3e78.jpg from the dataset (presumably an unreadable file —
# confirm). The row is matched on 'ImageId' so the filter does not depend on
# the directory prefix stored in 'fixed_inputs' ('../../ship_data/train_v2/').
csv_file = csv_file[csv_file['ImageId'] != '6384c3e78.jpg']

In [6]:
def split_datasets(csv_file, test_size = 0.01, random_state = 42):
    """Split a frame into train / validation / test subsets.

    The test subset is carved off first; the validation subset is then carved
    off the remainder with the same ``test_size``, so validation is
    ``test_size`` of what is left after the test split.

    Args:
        csv_file: Frame (or any array-like accepted by ``train_test_split``).
        test_size: Fraction or absolute size forwarded to both splits.
        random_state: Seed forwarded to both splits (defaults to the value the
            notebook has always used, so existing calls are unchanged).

    Returns:
        Tuple ``(train, val, test)``.
    """
    remainder, test = train_test_split(
        csv_file, test_size=test_size, random_state=random_state
    )
    train, val = train_test_split(
        remainder, test_size=test_size, random_state=random_state
    )
    return train, val, test

In [7]:
# Split the per-image frame with the default test_size (0.01) into the three
# subsets used by the dataset/loader cells below.
train, val, test = split_datasets(csv_file)

In [8]:
class GetData(Dataset):
    """Dataset of (image, mask) tensor pairs read from the paths in a frame.

    Args:
        csv_file: Frame with a 'fixed_inputs' column (JPEG paths) and a
            'mask_paths' column (mask image paths).
        image_size: ``(height, width)`` every image and mask is resized to.
            Defaults to ``(512, 512)``.
    """

    def __init__(self, csv_file: pd.DataFrame, image_size=(512, 512)):
        self.img_paths = csv_file['fixed_inputs'].values.tolist()
        self.mask_paths = csv_file['mask_paths'].values.tolist()
        self.image_size = tuple(image_size)

    def __len__(self) -> int:
        """Number of (image, mask) pairs."""
        return len(self.img_paths)

    def __getitem__(self, index):
        """Load, resize and normalise the pair at ``index``.

        Returns:
            ``(img, mask)``: ``img`` is the decoded JPEG divided by 255;
            ``mask`` is 1.0 where the resized mask pixel equals 255 and 0.0
            everywhere else.
        """
        read_mode = torchvision.io.ImageReadMode
        resize = torchvision.transforms.functional.resize

        # Force a fixed channel count (3 for images, 1 for masks) so that
        # samples always stack into a batch regardless of how a file was saved.
        img = torchvision.io.decode_jpeg(
            torchvision.io.read_file(self.img_paths[index]), mode=read_mode.RGB
        )
        mask = torchvision.io.decode_image(
            torchvision.io.read_file(self.mask_paths[index]), mode=read_mode.GRAY
        )

        img = resize(img, self.image_size)
        # Nearest-neighbour keeps mask values unblended; an interpolating resize
        # followed by the strict "== 1" threshold below would discard every
        # blended edge pixel and can erase thin objects entirely.
        mask = resize(
            mask,
            self.image_size,
            interpolation=torchvision.transforms.InterpolationMode.NEAREST,
        )

        img = img / 255
        mask = mask / 255
        mask = torch.where(mask < 1.0, 0.0, 1.0)
        return img, mask

In [9]:
# Loss function
def dice_bce_loss(inputs, targets, smooth = 1e-5, from_logits = False):
    """Soft Dice loss plus mean binary cross-entropy over all elements.

    Args:
        inputs: Model predictions. With ``from_logits=False`` (default) these
            must already be probabilities in [0, 1]; values outside that range
            make ``binary_cross_entropy`` fail (on CUDA as a device-side
            assert). With ``from_logits=True`` they are raw scores and the
            sigmoid is applied here.
        targets: Ground-truth mask with the same number of elements as
            ``inputs``; cast to the dtype of ``inputs``.
        smooth: Added to the Dice numerator and denominator so that two empty
            masks give a Dice coefficient of 1 instead of 0/0.
        from_logits: Treat ``inputs`` as logits (uses the numerically stable
            ``binary_cross_entropy_with_logits``).

    Returns:
        Scalar tensor ``(1 - dice) + bce``.
    """
    # reshape (not view) so non-contiguous tensors are accepted as well.
    inputs = inputs.reshape(-1)
    targets = targets.reshape(-1).to(inputs.dtype)

    # BCE is averaged over every element (pixel), not over images.
    if from_logits:
        bce = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction='mean'
        )
        inputs = torch.sigmoid(inputs)
    else:
        bce = nn.functional.binary_cross_entropy(inputs, targets, reduction='mean')

    intersection = (inputs * targets).sum()
    dice = (2. * intersection + smooth) / (inputs.sum() + targets.sum() + smooth)
    return (1 - dice) + bce

In [10]:
# IOU metric
def iou_score(inputs, targets, thres = 0.5, smooth=1e-5):
    """Intersection-over-union between predictions and a target mask.

    Args:
        inputs: Predicted scores. No sigmoid is applied here.
        targets: Ground-truth mask with the same number of elements.
        thres: Predictions strictly greater than this become 1.0, the rest
            0.0. Pass ``None`` to skip thresholding and compute a soft IoU.
        smooth: Added to numerator and denominator; two empty masks therefore
            score 1 rather than 0/0.

    Returns:
        Scalar tensor ``(intersection + smooth) / (union + smooth)`` computed
        over all elements at once (not per image).
    """
    if thres is not None:
        inputs = (inputs > thres).float()
    # reshape (not view) so non-contiguous tensors are accepted as well.
    inputs = inputs.reshape(-1)
    targets = targets.reshape(-1)
    intersection = torch.sum(inputs * targets)
    union = torch.sum(inputs + targets) - intersection
    return (intersection + smooth) / (union + smooth)

In [11]:
def train_model(model, train_set, val_set, epochs, lr=0.001):
    """Train ``model`` with Adam and report loss / IoU for both phases.

    Every epoch runs a training pass over ``train_set`` followed by a
    gradient-free evaluation pass over ``val_set``. Per-phase means of
    ``dice_bce_loss`` and ``iou_score`` (averaged over batches) are printed.

    Args:
        model: Module whose output is passed directly to ``dice_bce_loss``
            and ``iou_score``.
        train_set: Iterable of ``(img, label)`` training batches.
        val_set: Iterable of ``(img, label)`` validation batches.
        epochs: Number of epochs.
        lr: Adam learning rate (default 0.001).

    Returns:
        None. The model is moved to the global ``device`` and updated in place.

    NOTE(review): a later cell defines another function with this same name,
    which replaces this one once it is executed.
    """
    # Weights & Biases logging and nn.DataParallel wrapping are not enabled here.
    model = model.to(device)
    datadict = {
        'train': train_set,
        'val' : val_set
    }
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # train(False) is equivalent to eval()
            running_loss, running_iou = 0.0, 0.0
            with tqdm(datadict[phase], unit='batch') as tepoch:
                tepoch.set_description(f'Epoch {epoch + 1}/{epochs} [{phase}]')
                for img, label in tepoch:
                    img = img.to(device)
                    label = label.to(device)
                    if is_train:
                        optimizer.zero_grad()
                    # Gradients are tracked only during the training phase.
                    with torch.set_grad_enabled(is_train):
                        outputs = model(img)
                        loss = dice_bce_loss(outputs, label)
                        iou = iou_score(outputs, label)
                        if is_train:
                            loss.backward()
                            optimizer.step()
                    running_loss += loss.item()
                    running_iou += iou.item()
                    tepoch.set_postfix(loss = loss.item(), iou = iou.item())
            # Guard against an empty loader so the averages never divide by zero.
            n_batches = max(len(datadict[phase]), 1)
            phase_loss = running_loss / n_batches
            phase_iou = running_iou / n_batches
            label_prefix = 'Train' if is_train else 'Val'
            print(f'{label_prefix} Loss: {phase_loss}')
            print(f'{label_prefix} IOU: {phase_iou}')

In [12]:
# Loaders over the full train / validation splits.
# The worker count is capped at the number of CPUs so the notebook also runs
# on machines with fewer than 22 cores.
num_workers = min(22, os.cpu_count() or 1)
train_dataset = GetData(train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=num_workers)
val_dataset = GetData(val)
# Validation metrics are averaged over all batches, so shuffling adds nothing;
# a fixed order keeps evaluation runs comparable.
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=num_workers)

In [28]:
# Instantiate two of the U-Net variants imported from unet_custom_implementation
# so their outputs can be compared in the cells below.
# NOTE(review): the meaning of the positional argument 3 is not visible in this
# notebook (presumably a channel count) — confirm against the module.
model = Unet(3)
model_1 = UNet()

In [29]:
# Push one random 1x3x512x512 input through each model; both calls unpack two
# returned tensors.
# NOTE(review): the training loops pass the model output straight to the loss
# as a single tensor, so a two-tensor return looks like a temporary debugging
# state of the models — confirm.
x, y = model(torch.randn(1, 3, 512, 512))
x_, y_ = model_1(torch.randn(1, 3, 512, 512))

In [30]:
# Shape of the first tensor returned by UNet() for the 1x3x512x512 input.
x_.shape

torch.Size([1, 512, 64, 64])

In [31]:
# Shape of the second tensor returned by UNet() for the 1x3x512x512 input.
y_.shape

torch.Size([1, 512, 64, 64])

In [26]:
# Shape of `x` (first tensor unpacked from the Unet(3) call).
# NOTE(review): this cell's execution count (26) is lower than that of the
# cells defining `model` and `x` (28, 29), so the recorded output stems from an
# earlier kernel state — re-run in order before relying on it.
x.shape

torch.Size([1, 512, 68, 68])

In [27]:
# Shape of `y` (second tensor unpacked from the Unet(3) call).
# NOTE(review): executed (count 27) before the cells that define `model` and
# `y` (28, 29); the recorded output stems from an earlier kernel state.
y.shape

torch.Size([1, 512, 72, 72])

In [13]:
# Train UNet(num_classes=1) for 10 epochs on the full loaders.
# NOTE(review): the recorded run aborts with the CUDA assertion
# `input_val >= zero && input_val <= one`, i.e. values outside [0, 1] reached
# binary_cross_entropy inside dice_bce_loss. Presumably this model returns raw
# logits — confirm, then apply a sigmoid or use a logits-based BCE.
train_model(UNet(num_classes=1), train_loader, val_loader, 10)

../aten/src/ATen/native/cuda/Loss.cu:118: operator(): block: [118,0,0], thread: [32,0,0] Assertion `input_val >= zero && input_val <= one` failed.
../aten/src/ATen/native/cuda/Loss.cu:118: operator(): block: [118,0,0], thread: [33,0,0] Assertion `input_val >= zero && input_val <= one` failed.
../aten/src/ATen/native/cuda/Loss.cu:118: operator(): block: [118,0,0], thread: [34,0,0] Assertion `input_val >= zero && input_val <= one` failed.
../aten/src/ATen/native/cuda/Loss.cu:118: operator(): block: [118,0,0], thread: [35,0,0] Assertion `input_val >= zero && input_val <= one` failed.
../aten/src/ATen/native/cuda/Loss.cu:118: operator(): block: [118,0,0], thread: [36,0,0] Assertion `input_val >= zero && input_val <= one` failed.
../aten/src/ATen/native/cuda/Loss.cu:118: operator(): block: [118,0,0], thread: [37,0,0] Assertion `input_val >= zero && input_val <= one` failed.
../aten/src/ATen/native/cuda/Loss.cu:118: operator(): block: [118,0,0], thread: [38,0,0] Assertion `input_val >= zero 

RuntimeError: CUDA error: device-side assert triggered

In [14]:
# Inspect the shape of `x`.
# NOTE(review): the recorded output is a batch of 32 single-channel 512x512
# maps, which no visible cell assigns to `x`; presumably captured from a
# debugging variant of train_model (cf. its commented-out
# `return outputs, label`) — confirm or remove this cell.
x.shape

torch.Size([32, 1, 512, 512])

In [16]:
# Inspect the shape of `y`.
# NOTE(review): as with `x` in the preceding cell, no visible cell produces a
# 32x1x512x512 tensor named `y`; this relies on hidden kernel state — confirm
# or remove.
y.shape

torch.Size([32, 1, 512, 512])

In [19]:
# Check that flattening both tensors (as the loss and metric do) yields the
# same number of elements.
print(x.view(-1).shape)
print(y.view(-1).shape)

torch.Size([8388608])
torch.Size([8388608])


In [20]:
def train_model(model, train_set, val_set, epochs, lr=0.01, momentum=0.01):
    """Train-only loop with SGD that records per-batch loss and IoU.

    NOTE(review): this redefines ``train_model`` from an earlier cell; once
    this cell runs, the earlier train/val version is no longer reachable.

    Args:
        model: Module to train; wrapped in ``nn.DataParallel`` and moved to
            the global ``device``.
        train_set: Iterable of ``(img, label)`` training batches.
        val_set: Accepted for signature compatibility; not used, because this
            variant has no validation phase.
        epochs: Number of passes over ``train_set``.
        lr: SGD learning rate (default 0.01).
        momentum: SGD momentum (default 0.01).
            NOTE(review): 0.01 is unusually small for momentum; 0.9 may have
            been intended — confirm.

    Returns:
        ``(loss_, iou_)``: two lists with one entry per epoch, each entry being
        the list of per-batch values (Python floats) from that epoch.
    """
    model = nn.DataParallel(model)
    model = model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    loss_, iou_ = [], []
    for epoch in range(epochs):
        model.train()
        running_loss, running_iou = [], []
        with tqdm(train_set, unit='batch') as tepoch:
            for img, label in tepoch:
                img = img.to(device)
                label = label.to(device)
                optimizer.zero_grad()
                with torch.set_grad_enabled(True):
                    outputs = model(img)
                    # dice_bce_loss returns one scalar tensor; it must not be
                    # tuple-unpacked.
                    loss = dice_bce_loss(outputs, label)
                    iou = iou_score(outputs, label)
                    loss.backward()
                    optimizer.step()
                running_loss.append(loss.item())
                running_iou.append(iou.item())
                tepoch.set_postfix(loss = loss.item(), iou = iou.item())
        loss_.append(running_loss)
        iou_.append(running_iou)
    return loss_, iou_

In [21]:
# Quick-experiment loaders: at most the first 10,000 rows of each split, with
# large batches. These rebind train_loader / val_loader from the earlier
# full-data cell.
# The worker count is capped at the number of CPUs so the notebook also runs
# on machines with fewer than 22 cores.
num_workers = min(22, os.cpu_count() or 1)
train_dataset = GetData(train.iloc[:10000])
train_loader = DataLoader(train_dataset, batch_size=482, shuffle=True, num_workers=num_workers)
val_dataset = GetData(val.iloc[:10000])
# Evaluation data does not need shuffling; a fixed order keeps runs comparable.
val_loader = DataLoader(val_dataset, batch_size=482, shuffle=False, num_workers=num_workers)

In [23]:
# Run the train-only loop for 10 epochs on the subset loaders; `loss` and `iou`
# receive the per-epoch lists of per-batch values returned by train_model.
# NOTE(review): the recorded log shows four finished epochs followed by
# DataLoader worker-shutdown errors, so this run looks interrupted; the values
# inspected in the cells below presumably come from another execution — confirm.
loss, iou = train_model(Unet(3), train_loader, val_loader, 10)

100%|██████████| 21/21 [00:21<00:00,  1.02s/batch, iou=4.75e-9, loss=1.61]
100%|██████████| 21/21 [00:21<00:00,  1.00s/batch, iou=5.26e-9, loss=1.54]
100%|██████████| 21/21 [00:21<00:00,  1.01s/batch, iou=6.14e-9, loss=1.47]
100%|██████████| 21/21 [00:21<00:00,  1.01s/batch, iou=5.15e-9, loss=1.37]
  0%|          | 0/21 [00:00<?, ?batch/s]Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fa7c910cdc0>
Traceback (most recent call last):
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__
    self._shutdown_workers()
  File "/home/paperspace/.local/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1493, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProces

In [27]:
# Sum of the per-batch losses recorded during the last epoch.
sum(loss[-1])

21.312373638153076

In [28]:
# Sum of the per-batch losses recorded during the second-to-last epoch, for
# comparison with the last epoch.
sum(loss[-2])

21.419628858566284

In [30]:
# Sum of the per-batch IoU values recorded during the last epoch.
# NOTE(review): the recorded total (~9e-8 over the whole epoch) is on the order
# of the metric's smoothing term, i.e. effectively zero overlap between
# thresholded predictions and masks — investigate before tuning further.
sum(iou[-1])

8.845404120272349e-08

In [31]:
# Sum of the per-batch IoU values recorded during the second-to-last epoch, for
# comparison with the last epoch.
sum(iou[-2])

8.887729507911502e-08