In [1]:
import segmentation_models_pytorch as smp
import torch
import torchvision
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import PIL
import wandb

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cuda', index=0)

In [4]:
csv_file = pd.read_csv('../../files/train_ship_segmentations_v2.csv')
csv_file = csv_file.groupby('ImageId')['EncodedPixels'].apply(list).reset_index()
image_ids, pixels = csv_file['ImageId'].values.tolist(), csv_file['EncodedPixels'].values.tolist()

In [5]:
csv_file['fixed_inputs'] = csv_file['ImageId'].apply(lambda x: '../../files/train_v2/' + x)
csv_file['mask_paths'] = csv_file['ImageId'].apply(lambda x: '../../files/masks_v1/train/' + x.split('.')[0] + '.' + 'png')

In [6]:
csv_file['fixed_inputs'] = csv_file['ImageId'].apply(lambda x: '../../files/train_v2/' + x)
csv_file['mask_paths'] = csv_file['ImageId'].apply(lambda x: '../../files/masks_v1/train/' + x.split('.')[0] + '.' + 'png')

In [7]:
for x in tqdm(csv_file['fixed_inputs'].values.tolist()):
    if os.path.exists(x) == False:
        print(x)

100%|██████████| 192556/192556 [00:00<00:00, 423318.74it/s]


In [8]:
for x in tqdm(csv_file['mask_paths'].values.tolist()):
    if os.path.exists(x) == False:
        print(x)

100%|██████████| 192556/192556 [00:00<00:00, 401489.76it/s]


In [9]:
csv_file['fixed_inputs'].values.tolist()[0]

'../../files/train_v2/00003e153.jpg'

In [10]:
csv_file = csv_file[csv_file['fixed_inputs'] != '../../files/train_v2/6384c3e78.jpg']

In [11]:
def split_datasets(csv_file, test_size = 0.01):
    train, test = train_test_split(csv_file, test_size = test_size, random_state=42)
    train, val = train_test_split(train, test_size = test_size, random_state=42)
    return train, val, test

In [12]:
train, val, test = split_datasets(csv_file)

In [13]:
class Version1Dataset(Dataset):
    def __init__(self, csv_file):
        self.input_images = csv_file['fixed_inputs'].values
        self.mask_images = csv_file['mask_paths'].values
    
    def __len__(self):
        return len(self.input_images)
    
    def __getitem__(self, idx):
        img = torchvision.io.read_file(self.input_images[idx])
        img = torchvision.io.decode_jpeg(img, torchvision.io.ImageReadMode.RGB)
        mask = torchvision.io.read_file(self.mask_images[idx])
        mask = torchvision.io.decode_image(mask, torchvision.io.ImageReadMode.GRAY)
        img = torchvision.transforms.Resize((512, 512))(img)
        mask = torchvision.transforms.Resize((512, 512))(mask)
        img = img / 255
        mask = mask / 255
        return img, mask

In [14]:
train_dataset = Version1Dataset(train)
train_dataloader = DataLoader(train_dataset, shuffle = True, batch_size = 32)

In [15]:
def get_model():
    model = smp.Unet(
        encoder_name="inceptionv4",
        encoder_weights=None,
        in_channels=3,             
        classes=1,                
    )
    return model

In [16]:
# create loss, optimizer and other stuff

In [17]:
# Loss function
def dice_bce_loss(inputs, targets, smooth = 1e-5):
    # remove if your model inherently handles sigmoid
    number_of_pixels = inputs.shape[0] * (512 * 512 * 3)
    sigmoid = nn.Sigmoid()
    inputs = sigmoid(inputs)
    inputs = inputs.view(-1)
    targets = targets.view(-1)
    intersection = (inputs * targets).sum()
    dice_loss = (2. * intersection + smooth) / (inputs.sum() + targets.sum() + smooth)
    dice_loss = 1 - dice_loss
    # Pixel wise log loss is calculated not number of images
    # I checked reduce by mean is correct measure.
    BCE = nn.functional.binary_cross_entropy(inputs, targets, reduce='mean')
    final = dice_loss + BCE
    return final, number_of_pixels

In [18]:
# IOU metric
# SMOOTH = 1e-5
def iou_score(inputs, targets, smooth=1e-5):
    inputs = (inputs > 0.5).float()
    inputs = inputs.view(-1)
    targets = targets.view(-1)
    intersection = torch.sum(inputs * targets)
    unioun = torch.sum(inputs + targets) - intersection
    # TP = torch.sum(torch.logical_and(inputs == 1, targets == 1))
    # FP = torch.sum(torch.logical_and(inputs == 1, targets == 0))
    # FN = torch.sum(torch.logical_and(inputs == 0, targets == 1))
    iou = (intersection + smooth) / (unioun + smooth)
    return iou

In [19]:
# gather_datasets
train_dataset = Version1Dataset(train)
train_dataloader = DataLoader(train_dataset, shuffle = True, batch_size = 88, num_workers=24, prefetch_factor=2)
val_dataset = Version1Dataset(val)
val_dataloader = DataLoader(val_dataset, shuffle = False, batch_size = 128)
test_dataset = Version1Dataset(test)
test_dataloader = DataLoader(test_dataset, shuffle = False, batch_size = 128)

In [20]:
len(train_dataloader)

2145

In [21]:
train_image_size = 1000 * (512 * 512 * 3)
val_image_size = len(val) * (512 * 512 * 3)
train_batches = len(train_dataloader)
val_batches = len(val_dataloader)

In [22]:
# model = nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4])

In [23]:
# x = torch.rand((32, 512, 512, 3)).to(device)

In [24]:
# next(model.parameters()).is_cuda

In [25]:
# model(x)

In [26]:
def train_model(model, train_dataset, val_dataset, epochs = 10):
    data_pointers = {
        'train': train_dataset,
        'val': val_dataset
    }
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    if next(model.parameters()).is_cuda == False:
        # model = nn.DataParallel(model)
        model = model.to(device)
    for epoch in range(epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train() # set model to train phase
            else:
                model.eval() # set model to eval phase
            running_loss = 0.0
            running_iou = 0.0
            # TODO: Implement IOU score as metric
            count = 0
            for imgs, labels in tqdm(data_pointers[phase]):
                imgs = imgs.to(device)
                labels = labels.to(device)

                # init optimizer
                optimizer.zero_grad()

                with torch.set_grad_enabled(phase=='train'):
                    outputs = model(imgs)
                    loss, _ = dice_bce_loss(outputs, labels)
                    # iou = iou_score(outputs, labels)
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                running_loss += loss.item()
                # running_iou += iou 
                # if count % 10 == 0:
                #     print(count)
                # count += 1

            if phase == 'train':
                epoch_loss = running_loss / train_batches
                # epoch_iou = running_iou / train_batches
            else:
                epoch_loss = running_loss / val_batches
                # epoch_iou = running_iou / val_batches
            print(f'{phase} Loss: {float(epoch_loss)}')
            print(f'{phase} IOU: {float(epoch_iou)}')
        if epoch == 2:
            break

In [27]:
# train_model(model, train_dataloader, val_dataloader)

Data augmentation with some changes in code base

In [28]:
def get_positive_samples(csv_file):
    sample_list = []
    for x in csv_file['EncodedPixels'].values.tolist():
        if type(x[0]) == str:
            sample_list.append(1)
        else:
            sample_list.append(-1)
    csv_file['sample_type'] = sample_list
    return csv_file

In [29]:
train_updated = get_positive_samples(train)

In [30]:
class AugDataset(Dataset):
    def __init__(self, csv_file):
        self.input_images = csv_file['fixed_inputs'].values
        self.mask_images = csv_file['mask_paths'].values
        self.mask_type = csv_file['sample_type'].values
        self.negative_index = np.where(self.mask_type == -1)[0]
        self.brightness_factors = np.random.uniform(1.0, 2.0, size = len(csv_file))
        self.contrast_factors = np.random.uniform(2.0, 3.5, size = len(csv_file))

    def __len__(self):
        return len(self.input_images)
    
    def change_every_epoch(self):
        new_values = np.random.randint(0, 2, size=(self.__len__()))
        new_values[self.negative_index] = -1
        self.mask_type = new_values
        self.brightness_factors = np.random.uniform(1.0, 2.0, size = self.__len__())
        self.contrast_factors = np.random.uniform(2.0, 3.5, size = self.__len__())


    def aug(self, img, mask, brightness_factor, contrast_factor):
        img = torchvision.transforms.functional.hflip(img)
        mask = torchvision.transforms.functional.hflip(mask)
        img = torchvision.transforms.functional.adjust_brightness(img, brightness_factor)
        img = torchvision.transforms.functional.adjust_contrast(img, contrast_factor)
        return img, mask

    def __getitem__(self, idx):
        # All positive sample images will be augmented with 
        # flip horizontally
        # adjusting brightness
        # adjusting contrast
        img = torchvision.io.read_file(self.input_images[idx])
        img = torchvision.io.decode_jpeg(img)
        mask = torchvision.io.read_file(self.mask_images[idx])
        mask = torchvision.io.decode_image(mask)
        if self.mask_type[idx] != -1:
            if self.mask_type[idx] == 1:
                img, mask = self.aug(img, mask, self.brightness_factors[idx], self.contrast_factors[idx])
        img = torchvision.transforms.functional.resize(img, (512, 512))
        mask = torchvision.transforms.functional.resize(mask, (512, 512))
        img = img / 255
        mask = mask / 255
        mask = torch.where(mask < 1.0, 0.0, 1.0)
        return img, mask

In [31]:
# data = AugDataset(train)
# train_loader = DataLoader(data, shuffle=False)
# img_1 = train_loader.dataset.__getitem__(6)
# train_loader.dataset.change_every_epoch()
# train_loader = DataLoader(data, shuffle=False)
# img_2 = train_loader.dataset.__getitem__(6)

In [32]:
# def train_model(model, train_dataset, val_dataset, weights_path, load_weights = None, epochs = 100):
#     # If fails reduce protobuf to lower version pip install protobuf==3.19
#     wandb.init(project='ship-segmentation-pytorch-wb',
#                config = {
#                    'arch' : 'Unet- Incepv4',
#                    'epochs' : 10
#                }
#                )
#     data_pointers = {
#         'train': train_dataset,
#         'val': val_dataset
#     }
#     model = get_model()
#     if load_weights != None:
#         model.load_state_dict(torch.load(load_weights))
#     optimizer = optim.SGD(model.parameters(), lr=0.001)
#     if next(model.parameters()).is_cuda == False:
#         model = nn.DataParallel(model)
#         model = model.to(device)
#     for epoch in range(epochs):
#         for phase in ['train', 'val']:
#             if phase == 'train':
#                 model.train() # set model to train phase
#             else:
#                 model.eval() # set model to eval phase
#             running_loss = 0.0
#             running_iou = 0.0
#             # TODO: Implement IOU score as metric
#             count = 0
#             for imgs, labels in tqdm(data_pointers[phase]):
#                 imgs = imgs.to(device)
#                 labels = labels.to(device)

#                 # init optimizer
#                 optimizer.zero_grad()

#                 with torch.set_grad_enabled(phase=='train'):
#                     outputs = model(imgs)
#                     loss, _ = dice_bce_loss(outputs, labels)
#                     iou = iou_score(outputs, labels)
#                     if phase == 'train':
#                         loss.backward()
#                         optimizer.step()
#                 running_loss += loss.item()
#                 running_iou += iou 
#                 # if count % 10 == 0:
#                 #     print(count)
#                 # count += 1

#             if phase == 'train':
#                 epoch_loss = running_loss / len(train_dataset)
#                 epoch_iou = running_iou / len(train_dataset)
#                 wandb.log(
#                     {'epoch_loss' : epoch_loss,
#                     'epoch_iou' : epoch_iou}
#                 )
#             else:
#                 epoch_loss = running_loss / len(val_dataset)
#                 epoch_iou = running_iou / len(val_dataset)
#                 wandb.log(
#                     {'val_epoch_loss' : epoch_loss,
#                     'val_epoch_iou' : epoch_iou}
#                 )
#             print(f'{phase} Loss: {float(epoch_loss)}')
#             print(f'{phase} IOU: {float(epoch_iou)}')
#         train_dataset.dataset.change_every_epoch()
#         if os.path.exists(weights_path) == False:
#             os.makedirs(weights_path)
#         # torch.save(model.state_dict(), f'{weights_path}{epoch}.pth')
#         torch.save({
#             'epoch': epoch,
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': optimizer.state_dict(),
#             'loss': loss
#         }, f'{weights_path}{epoch}.pth')
#         # if epoch == 2:
#         #     break

In [33]:
# # gather_datasets
# train_dataset = AugDataset(train_updated)
# train_dataloader = DataLoader(train_dataset, shuffle = True, batch_size = 280, num_workers=120, prefetch_factor=4)
# val_dataset = Version1Dataset(val)
# val_dataloader = DataLoader(val_dataset, shuffle = False, batch_size = 64)
# test_dataset = Version1Dataset(test)
# test_dataloader = DataLoader(test_dataset, shuffle = False, batch_size = 64)

In [34]:
# gather_datasets
train_batch_size = 240
num_workers = 120
train_dataset = AugDataset(train_updated)
train_dataloader = DataLoader(train_dataset, shuffle = True, batch_size = train_batch_size, num_workers=num_workers, prefetch_factor=4)
val_dataset = Version1Dataset(val)
val_dataloader = DataLoader(val_dataset, shuffle = False, batch_size = 64)
test_dataset = Version1Dataset(test)
test_dataloader = DataLoader(test_dataset, shuffle = False, batch_size = 64)

In [35]:
def train_model(model, train_dataset, val_dataset, weights_path, train_batch_size, num_workers, load_weights = None, epochs = 2):
    # If fails reduce protobuf to lower version pip install protobuf==3.19
    wandb.init(project='ship-segmentation-pytorch-wb',
               config = {
                   'arch' : 'Unet- Incepv4',
                   'epochs' : epochs,
                   'where_weights_stored' : weights_path,
                   'weights_loaded': False if load_weights == None else load_weights,
                   'optimizer' : 'SGD',
                   'lr_schdular' : False,
                   'Data augmentation' : True,
                   'batch_size': train_batch_size,
                   'num_workers' : num_workers
               }
               )
    data_pointers = {
        'train': train_dataset,
        'val': val_dataset
    }
    model = get_model()
    if load_weights != None:
        model.load_state_dict(torch.load(load_weights))
    optimizer = optim.SGD(model.parameters(), lr=0.001)
    if next(model.parameters()).is_cuda == False:
        model = nn.DataParallel(model)
        model = model.to(device)
    for epoch in range(epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train() # set model to train phase
            else:
                model.eval() # set model to eval phase
            running_loss = 0.0
            running_iou = 0.0
            with tqdm(data_pointers[phase], unit='batch') as tepoch:
                for imgs, labels in tepoch:
                    tepoch.set_description(f'Epoch: {epoch}')
                    imgs = imgs.to(device)
                    labels = labels.to(device)
                    optimizer.zero_grad()
                    with torch.set_grad_enabled(phase=='train'):
                        outputs = model(imgs)
                        loss, _ = dice_bce_loss(outputs, labels)
                        iou = iou_score(outputs, labels)
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()
                    running_loss += loss.item()
                    running_iou += iou.item()
                    tepoch.set_postfix(loss = loss.item(), iou = iou.item())
            # Please remove wandb.log and put it outside of this scope. Make sure every epoch run only one time.
            if phase == 'train':
                epoch_loss = running_loss / len(train_dataset)
                epoch_iou = running_iou / len(train_dataset)
                wandb.log(
                    {'epoch_loss' : epoch_loss,
                    'epoch_iou' : epoch_iou}
                )
            else:
                epoch_loss = running_loss / len(val_dataset)
                epoch_iou = running_iou / len(val_dataset)
                wandb.log(
                    {'val_epoch_loss' : epoch_loss,
                    'val_epoch_iou' : epoch_iou}
                )
            print(f'{phase} Loss: {float(epoch_loss)}')
            print(f'{phase} IOU: {float(epoch_iou)}')
        train_dataset.dataset.change_every_epoch()
        if os.path.exists(weights_path) == False:
            os.makedirs(weights_path)
        # Please refer this link for saving and loading models
        # Link: https://pytorch.org/tutorials/beginner/saving_loading_models.html
        torch.save({
            'epoch': epoch,
            # Please change this..... if you are using single GPU to model.state_dict().
            'model_state_dict': model.module.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss
        }, f'{weights_path}{epoch}.pth')

In [36]:
train_model(get_model(), train_dataloader, val_dataloader, '../../weights/torch_final_model_v2/', train_batch_size=train_batch_size, num_workers=num_workers, epochs=100)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myashchks87[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch: 0: 100%|██████████| 787/787 [16:47<00:00,  1.28s/batch, iou=4.61e-10, loss=1.06]  


train Loss: 1.154797849994451
train IOU: 0.00047935742841316727


Epoch: 0: 100%|██████████| 30/30 [01:03<00:00,  2.13s/batch, iou=6.94e-10, loss=1.06]


val Loss: 1.059402048587799
val IOU: 5.194180742925288e-10


Epoch: 1: 100%|██████████| 787/787 [15:04<00:00,  1.15s/batch, iou=6.91e-10, loss=1.03] 


train Loss: 1.0437772743583786
train IOU: 1.259647800058385e-07


Epoch: 1: 100%|██████████| 30/30 [01:01<00:00,  2.05s/batch, iou=6.94e-10, loss=1.03]


val Loss: 1.0297101060549418
val IOU: 5.194511395097597e-10


Epoch: 2: 100%|██████████| 787/787 [14:29<00:00,  1.10s/batch, iou=5.28e-10, loss=1.02]


train Loss: 1.0262706759924205
train IOU: 2.0306791644979353e-08


Epoch: 2: 100%|██████████| 30/30 [01:00<00:00,  2.02s/batch, iou=6.94e-10, loss=1.02]


val Loss: 1.0211864471435548
val IOU: 5.194511395097597e-10


Epoch: 3: 100%|██████████| 787/787 [14:52<00:00,  1.13s/batch, iou=3.56e-10, loss=1.02] 


train Loss: 1.018673309680013
train IOU: 1.596524331098667e-10


Epoch: 3: 100%|██████████| 30/30 [00:57<00:00,  1.90s/batch, iou=6.94e-10, loss=1.02]


val Loss: 1.016065486272176
val IOU: 5.194511395097597e-10


Epoch: 4: 100%|██████████| 787/787 [14:43<00:00,  1.12s/batch, iou=3.31e-10, loss=1.01] 


train Loss: 1.0139154781986222
train IOU: 1.588845461998719e-10


Epoch: 4: 100%|██████████| 30/30 [00:50<00:00,  1.68s/batch, iou=6.94e-10, loss=1.01]


val Loss: 1.0110196987787883
val IOU: 5.194511395097597e-10


Epoch: 5: 100%|██████████| 787/787 [15:05<00:00,  1.15s/batch, iou=3.7e-10, loss=1.01]  


train Loss: 1.009134249584193
train IOU: 1.5959625252730314e-10


Epoch: 5: 100%|██████████| 30/30 [01:02<00:00,  2.09s/batch, iou=6.94e-10, loss=1.01]


val Loss: 1.0102710167566935
val IOU: 5.194511395097597e-10


Epoch: 6: 100%|██████████| 787/787 [16:03<00:00,  1.22s/batch, iou=3.66e-10, loss=0.991]


train Loss: 0.9988958101714884
train IOU: 1.5943512421627063e-10


Epoch: 6: 100%|██████████| 30/30 [01:04<00:00,  2.15s/batch, iou=6.94e-10, loss=1.01] 


val Loss: 1.0073349893093109
val IOU: 5.194507490813294e-10


Epoch: 7: 100%|██████████| 787/787 [15:48<00:00,  1.21s/batch, iou=6.03e-10, loss=0.966] 


train Loss: 0.9712136419634353
train IOU: 1.6018686380877134e-10


Epoch: 7: 100%|██████████| 30/30 [01:03<00:00,  2.10s/batch, iou=6.94e-10, loss=1]    


val Loss: 0.9902433613936107
val IOU: 5.194511395097597e-10


Epoch: 8: 100%|██████████| 787/787 [14:38<00:00,  1.12s/batch, iou=8.51e-10, loss=0.927]


train Loss: 0.921319494480705
train IOU: 1.59657424184835e-10


Epoch: 8: 100%|██████████| 30/30 [00:55<00:00,  1.84s/batch, iou=6.94e-10, loss=0.987]


val Loss: 0.9170106669267019
val IOU: 5.194511395097597e-10


Epoch: 9: 100%|██████████| 787/787 [15:12<00:00,  1.16s/batch, iou=5.04e-10, loss=0.915]


train Loss: 0.830247030930719
train IOU: 1.5972922751509382e-10


Epoch: 9: 100%|██████████| 30/30 [01:04<00:00,  2.15s/batch, iou=6.94e-10, loss=0.933]


val Loss: 0.845662393172582
val IOU: 5.194511395097597e-10


Epoch: 10: 100%|██████████| 787/787 [16:07<00:00,  1.23s/batch, iou=3.59e-10, loss=0.86]  


train Loss: 0.763179057283656
train IOU: 1.5944791700618148e-10


Epoch: 10: 100%|██████████| 30/30 [01:04<00:00,  2.14s/batch, iou=6.94e-10, loss=0.711]


val Loss: 0.751601121822993
val IOU: 5.194511395097597e-10


Epoch: 11: 100%|██████████| 787/787 [15:55<00:00,  1.21s/batch, iou=3.95e-10, loss=0.704] 


train Loss: 0.7155984892324087
train IOU: 1.5938118125506166e-10


Epoch: 11: 100%|██████████| 30/30 [01:04<00:00,  2.15s/batch, iou=6.94e-10, loss=0.672]


val Loss: 0.7100715835889181
val IOU: 5.194511395097597e-10


Epoch: 12: 100%|██████████| 787/787 [14:43<00:00,  1.12s/batch, iou=4.4e-10, loss=0.614]  


train Loss: 0.6832775172163298
train IOU: 1.5945522532785486e-10


Epoch: 12: 100%|██████████| 30/30 [00:57<00:00,  1.92s/batch, iou=6.94e-10, loss=0.594]


val Loss: 0.6775505284468333
val IOU: 5.194511395097597e-10


Epoch: 13: 100%|██████████| 787/787 [15:08<00:00,  1.15s/batch, iou=6.07e-10, loss=0.609] 


train Loss: 0.6487475935809949
train IOU: 1.5916729131407776e-10


Epoch: 13: 100%|██████████| 30/30 [01:02<00:00,  2.08s/batch, iou=6.94e-10, loss=0.599]


val Loss: 0.6602074980735779
val IOU: 5.194511395097597e-10


Epoch: 14:  60%|█████▉    | 469/787 [10:41<05:51,  1.11s/batch, iou=1.48e-10, loss=0.63]  