In [1]:
import os
print(os.listdir("../input/carvana-image-masking-challenge/"))

import zipfile
import shutil

DATASET_DIR = '../input/carvana-image-masking-challenge/'
WORKING_DIR = '/kaggle/working/'

if len(os.listdir(WORKING_DIR)) <= 1:

    with zipfile.ZipFile(DATASET_DIR + 'train.zip', 'r') as zip_file:
        zip_file.extractall(WORKING_DIR)

    with zipfile.ZipFile(DATASET_DIR + 'train_masks.zip', 'r') as zip_file:
        zip_file.extractall(WORKING_DIR)
    
    print(
        len(os.listdir(WORKING_DIR + 'train')),
        len(os.listdir(WORKING_DIR + 'train_masks'))
    )
    
    # Move some of the images (5088-4600) to the validation directory
    train_dir = WORKING_DIR + 'train/'
    val_dir = WORKING_DIR + 'val/'
    os.mkdir(val_dir)
    for file in sorted(os.listdir(train_dir))[4600:]:
      shutil.move(train_dir + file, val_dir)
    
    # move their masks as well
    masks_dir = WORKING_DIR + 'train_masks/'
    val_masks_dir = WORKING_DIR + 'val_masks/'
    os.mkdir(val_masks_dir)
    for file in sorted(os.listdir(masks_dir))[4600:]:
      shutil.move(masks_dir + file, val_masks_dir)

    os.mkdir(WORKING_DIR + 'saved_images')

['train_masks.zip', '29bb3ece3180_11.jpg', 'train_masks.csv.zip', 'train.zip', 'metadata.csv.zip', 'sample_submission.csv.zip', 'test.zip', 'test_hq.zip', 'train_hq.zip']
5088 5088


## Utils

In [2]:
def check_accuracy_binary(loader,model,device):
    num_correct = 0
    num_pixels = 0
    dice_score = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device).unsqueeze(1)
            preds = torch.sigmoid(model(x))
            preds = (preds > 0.5).float()
            num_correct += (preds == y).sum()
            num_pixels += torch.numel(preds)
            dice_score += (2 * (preds * y).sum()) / ((preds + y).sum() + 1e-8)

    print(
        f'Got {num_correct}/{num_pixels} with acc {num_correct/num_pixels*100:.2f}'
    )
    print(f'Dice score: {dice_score/len(loader)}')
    model.train()
    return dice_score/len(loader)

from torchvision.utils import save_image


def save_predictions_as_imgs(loader, model, device, folder="saved_images/"):
    if not os.path.exists(folder):
        os.makedirs(folder)
    num_examples = 0
    
    model.eval()
    for idx, (x, y) in enumerate(loader):
        x = x.to(device=device)
        with torch.no_grad():
            preds = torch.sigmoid(model(x))
            preds = (preds > 0.5).float()
        
        for i in range(preds.size(0)):  # Iterate over each image in the batch
            torchvision.utils.save_image(preds[i], os.path.join(folder, f"pred_{idx}_{i}.png"))
            torchvision.utils.save_image(y[i].unsqueeze(0), os.path.join(folder, f"mask_{idx}_{i}.png"))  # unsqueeze adds a channel dimension to the tensor
            num_examples += 1
            if num_examples == 10:
                model.train()
                return

- Tips
    - What is dice score?
        - Dice score is a metric to evaluate the performance of a segmentation model. It is defined as the intersection of the predicted mask and the ground truth mask divided by the average of the number of pixels in the predicted mask and the ground truth mask.
        - formula : $Dice = \frac{2 \times |X \cap Y|}{|X| + |Y|}$

# Semantic Segmentation with UNet

- We will build a model from scratch, and set up the data loading pipeline which will contain data augmentation using `albumentations` library.

- we will train the model on the [Carvana Image Masking Challenge](https://www.kaggle.com/c/carvana-image-masking-challenge) dataset.

## Unet Architecture

- The U-Net architecture is introduced in the paper titled [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597) by Olaf Ronneberger, Philipp Fischer, and Thomas Brox.
    - they designed this architecture for biomedical image segmentation, but it can be used for any image segmentation task.

- The U-Net architecture is symmetric, and it consists of two parts:
    - Contracting path (Encoder)
        - The contracting path is a typical convolutional network that consists of repeated application of convolutions, downsampling, and ReLU activations.
    - Expansive path (Decoder)
        - The expansive path consists of upsampling, concatenation with the corresponding cropped feature map from the contracting path, followed by convolution, and ReLU activation.
    - Skip connections
        - The skip connections are the concatenation of feature maps from contracting path with the corresponding feature maps in the expansive path.

- The U-Net architecture is shown below:
    - ![unet](https://lmb.informatik.uni-freiburg.de/people/ronneber/u-net/u-net-architecture.png)

- the pattern they followed in the paper is:
    - 2 3x3 convolutions with ReLU activation followed by 2x2 max pooling with stride 2 for downsampling. (this is repeated 4 times)
    - 2 3x3 convolutions with ReLU activation for upsampling. (this is repeated 4 times)
    - skip connections are added between the corresponding feature maps in the contracting and expansive paths.
    - There is a 1x1 convolution at the end of the network, which is used to map each 64-component feature vector to the desired number of classes (1x1 convolution preserves the area and changes the depth).

- Some things we will do that are different from the original U-Net architecture:
    - We will use `Same` padding in the convolution layers to keep the spatial dimensions the same, unlike the original architecture which uses `Valid` padding.
        - as a result, They did cropping in the skip connections to concatenate feature maps of the same spatial level (because the dimensions did not match)
        - but for use the dimensions will match, so we will not need cropping and we will simply add the feature maps from the contracting path to the expansive path.
        - The carvana winners used `Same` padding in their implementation (so it doesn't seem to affect the performance much).
    - Instead of the transposed convolution, we might use a bilinear upsampling layer
        - This layer will upsample the input by a factor of 2. It uses bilinear interpolation to upsample the input.
        - in Gans, in Pro Gans, they used bilinear upsampling instead of transposed convolution because it produces better results.

In [3]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

## Model

- we will use the class `DoubleConv` to define a block of two 3x3 convolutions with ReLU activation. because this block is used multiple times in the network.

In [4]:
class DoubleConv(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(DoubleConv, self).__init__()
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1,bias=False), # same padding, we set bias to False because we will use a batchnorm layer after this (which cancels out the bias)
            nn.BatchNorm2d(out_channels), # batchnorm layer, not in the original U-Net
            nn.ReLU(inplace=True), # inplace true means it will modify the input directly, without allocating any additional output. It can sometimes slightly decrease the memory usage, but may not always be a valid operation (because the original input is destroyed)
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1,bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )
    def forward(self, x):
        return self.double_conv(x)
    

class UNet(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(UNet, self).__init__()
        # the channels of the Unet are hardcoded, but you can change them to your needs
        self.features = [64, 128, 256, 512] 
        self.downs = nn.ModuleList()
        self.ups = nn.ModuleList()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2) # max pooling layer

        # Encoder
        for feature in self.features:
            self.downs.append(DoubleConv(in_channels, feature))
            # update in_channels for the next layer
            in_channels = feature
            # we will not add the max pooling layer here, because we want to save the outputs before the max pooling layer (for the skip connections)

        # the bottleneck layer (which is the bottom of the U)
        # this is a single level with no symmetrical upsampling (that is why we did not include 1024 in the features list) and we did it outside the loops
        self.bottleneck = DoubleConv(self.features[-1], self.features[-1]*2) 

        # Decoder
        for feature in reversed(self.features):
            # the shift introduced because of the bottleneck layer will cause each transpose convolution to take 2*feature as input (check the diagram)
            self.ups.append(nn.ConvTranspose2d(feature*2, feature, kernel_size=2, stride=2))  # kernel size 2, stride will double the width and height
            # the double conv will take double the number of features as input because of the concatenation of the skip connection
            self.ups.append(DoubleConv(feature*2, feature)) 

        # the final layer, which is a 1x1 convolution
        self.final_layer = nn.Conv2d(self.features[0], out_channels, kernel_size=1)

    def forward(self, x):
        skip_connections = []

        # Encoder
        for level in self.downs:
            x = level(x)
            skip_connections.append(x)
            x = self.pool(x)

        # Bottleneck
        x = self.bottleneck(x)

        # reverse the skip connections (because we want to concatenate the layers in the reverse order)
        skip_connections = skip_connections[::-1]

        # Decoder
        for i in range(0, len(self.ups), 2): # iterate with a step of 2 because we have 2 layers for each level
            x = self.ups[i](x)
            # add the skip connection
            skip_connection = skip_connections[i//2] # i//2 because we move here with a step of 2, but we want to move with a step of 1 in the skip connections list
            
            # Defensive Act: check if the dimensions of the skip connection and the x are not the same (that might result when choosing an input image which size is not divisible by 2^4), which will lead to an odd number of pixels, which max pool will floor to the nearest integer, and the transpose convolution will double the width and height to that integer, which will cause the dimensions to be different by 1 pixel
            if x.shape != skip_connection.shape:
                # interpolate x to the size of the skip connection
                x = F.interpolate(x, size=skip_connection.shape[2:], mode='bilinear', align_corners=True)

            # concatenate the skip connection
            x = torch.cat((skip_connection, x), dim=1) # dim=1 because we want to concatenate along the channels
            x = self.ups[i+1](x) # the double conv layer

        # Final layer
        x = self.final_layer(x)

        return x
    

# test the model
dummy = torch.randn((3, 3, 160, 160))                        
model = UNet(in_channels=3, out_channels=1)
model(dummy).shape

torch.Size([3, 1, 160, 160])

- Tips
    - We used something called ModuleList to store the layers in the network. this is different from a regular python list because it registers the layers in the network, so they can be used in the forward pass.
        - long story short, it is compatible with the PyTorch model functions and used when we want to store layers in a list.

    - ConvTranspose2d is used for upsampling in the expansive path. it takes the following arguments:

    - We had to choose an input size that is divisible by 16 (because we will divide by 2 four times in the contracting path).
        - so 2*2*2*2 = 16
        - the reason for that is that if it is not divisible by 16, we will have an odd number of pixels, and the max pool layer will floor it when it reduces the size by half, leading to a mismatch in the dimensions when we add the skip connections later
        - other thing we can do is to resize the skip connections or the upsampled feature maps to match the dimensions. (which we did above)

        

## Dataset

In [5]:
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import numpy as np


class CarvanaDataset(Dataset):
    def __init__(self, image_dir, mask_dir, transform=None):
        self.image_dir = image_dir
        self.mask_dir = mask_dir
        self.transform = transform
        self.images = os.listdir(image_dir)

    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, index):
        img_path = os.path.join(self.image_dir, self.images[index])
        mask_path = os.path.join(self.mask_dir, self.images[index].replace(".jpg", "_mask.gif"))
        # read the image and the mask (label), and store them in the form of numpy arrays (for the albumentations library)
        image = np.array(Image.open(img_path).convert("RGB")) # we might not need to do this because the images are loaded as RGB by default
        mask = np.array(Image.open(mask_path).convert("L"), dtype=np.float32) # convert("L") will convert the image to grayscale
        # binarize the mask
        mask[mask == 255.0] = 1.0

        # apply the transformations if they exist
        if self.transform is not None:
            augmentations = self.transform(image=image, mask=mask)
            image = augmentations["image"]
            mask = augmentations["mask"]
            
        return image, mask 

In [6]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

train_image_dir = '/kaggle/working/train'
train_mask_dir = '/kaggle/working/train_masks'
val_image_dir = '/kaggle/working/val'
val_mask_dir = '/kaggle/working/val_masks'
## Dataset hyperparameters
batch_size = 16
image_height = 320
image_width = 480
pin_memory = True

train_transform = A.Compose(
    [
        A.Resize(height=image_height, width=image_width),
        A.Rotate(limit=35, p=1.0),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.1),
        # this will only divide by 255 (since mean = 0 and std = 1)
        A.Normalize(
            mean=[0.0, 0.0, 0.0],
            std=[1.0, 1.0, 1.0],
            max_pixel_value=255.0,
        ),
        ToTensorV2(),
    ],
)

# for the validation transforms, we will only resize and normalize without any augmentations
val_transform = A.Compose(
    [
        A.Resize(height=image_height, width=image_width),
        A.Normalize(
            mean=[0.0, 0.0, 0.0],
            std=[1.0, 1.0, 1.0],
            max_pixel_value=255.0,
        ),
        ToTensorV2(),
    ],
)


train_dataset = CarvanaDataset(image_dir=train_image_dir,mask_dir=train_mask_dir, transform=train_transform)
val_dataset = CarvanaDataset(image_dir=val_image_dir,mask_dir=val_mask_dir, transform=val_transform)

In [7]:
len(train_dataset), len(val_dataset)

(4600, 488)

In [8]:
train_dataset[0][0].shape, train_dataset[0][1].shape

(torch.Size([3, 320, 480]), torch.Size([320, 480]))

In [9]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=pin_memory)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)

In [10]:
for images, masks in train_loader:
    print(images.shape, masks.shape)
    break

torch.Size([16, 3, 320, 480]) torch.Size([16, 320, 480])


## Initializations

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_accuracy = 0

## Model hyperparameters
in_channels = 3
out_channels = 1
learning_rate = 1e-4
num_epochs = 20

model = UNet(in_channels=in_channels, out_channels=out_channels).to(device)
# criterion = nn.CrossEntropyLoss() # we will use this loss function if we have multiple classes (out_channels > 1)
criterion = nn.BCEWithLogitsLoss() # binary cross entropy with logits loss (it expects the logits, and it will apply the sigmoid function by itself), the sigmoid is applied because we have only 1 channel, and each pixel will have a value between 0 and 1 that represents the probability of that pixel being class 1
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# scalar
scaler = torch.cuda.amp.GradScaler() # this will help us to use mixed precision training

use_scheduler = True
if use_scheduler:
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1)


# if we want to load a model and continue training
load_model = False
if load_model:
    checkpoint = torch.load("model.pth")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    best_accuracy = checkpoint["best_accuracy"]
    print("=> Loaded model with accuracy {:.2f}".format(best_accuracy))

## Train

In [12]:
for epoch in range(num_epochs):
    ## Training phase 
    model.train()
    tk0 = tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch+1} Training")

    train_loss = 0
    train_exapmles = 0

    # loop on the train loader
    for batch_idx, (images, masks) in enumerate(tk0):
        images = images.to(device)
        masks = masks.float().unsqueeze(1).to(device) # add a channel dimension to the mask (since it is a single channel image)

        
        # we will use Float16 training to speed up the training process
        with torch.cuda.amp.autocast():
            # forward pass
            preds = model(images)

            # calculate the loss
            loss = criterion(preds, masks)
            train_loss += loss.item()
            train_exapmles += images.size(0)
        
        
        # backpropagation
        optimizer.zero_grad()
        scaler.scale(loss).backward() # scale the loss to avoid underflow or overflow
        
        # update the weights
        scaler.step(optimizer)
        scaler.update()

        # update the progress bar
        tk0.set_postfix(loss=(train_loss/train_exapmles))

    ## Validation phase
    model.eval()
    tk1 = tqdm(val_loader, total=len(val_loader), desc=f"Epoch {epoch+1} Validation")

    val_loss = 0
    val_examples = 0

    with torch.no_grad():
        for batch_idx, (images, masks) in enumerate(tk1):
            images = images.to(device)
            masks = masks.float().unsqueeze(1).to(device)

            # forward pass
            preds = model(images)

            # calculate the loss
            loss = criterion(preds, masks)
            val_loss += loss.item()
            val_examples += images.size(0)

            # update the progress bar
            tk1.set_postfix(loss=(val_loss/val_examples))

    # save the model if the accuracy is improved
    accuracy = check_accuracy_binary(val_loader, model, device)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "best_accuracy": best_accuracy,
        }
        torch.save(checkpoint, "model.pth")

    print(f"Epoch {epoch+1}, train loss: {train_loss/train_exapmles}, val loss: {val_loss/val_examples}, val accuracy: {accuracy}")

    # save the predictions as images every 5 epochs
    if epoch % 5 == 0:
        save_predictions_as_imgs(val_loader, model, device, folder="saved_images/")
    
    if use_scheduler:
        scheduler.step()        

Epoch 1 Training: 100%|██████████| 288/288 [06:46<00:00,  1.41s/it, loss=0.0136]
Epoch 1 Validation: 100%|██████████| 31/31 [00:22<00:00,  1.35it/s, loss=0.0131]


Got 72261742/74956800 with acc 96.40
Dice score: 0.9224391579627991
Epoch 1, train loss: 0.013601007944215898, val loss: 0.013110907656736061, val accuracy: 0.9224391579627991


Epoch 2 Training: 100%|██████████| 288/288 [06:47<00:00,  1.42s/it, loss=0.00719]
Epoch 2 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.33it/s, loss=0.00577]


Got 74513868/74956800 with acc 99.41
Dice score: 0.9863258004188538
Epoch 2, train loss: 0.007192053386698599, val loss: 0.0057733867653324954, val accuracy: 0.9863258004188538


Epoch 3 Training: 100%|██████████| 288/288 [06:48<00:00,  1.42s/it, loss=0.00476]
Epoch 3 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.32it/s, loss=0.00416]


Got 74470110/74956800 with acc 99.35
Dice score: 0.9849904775619507
Epoch 3, train loss: 0.00476276604215736, val loss: 0.00415501083232096, val accuracy: 0.9849904775619507


Epoch 4 Training: 100%|██████████| 288/288 [06:48<00:00,  1.42s/it, loss=0.00338]
Epoch 4 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.33it/s, loss=0.00295]


Got 74577623/74956800 with acc 99.49
Dice score: 0.9882967472076416
Epoch 4, train loss: 0.0033772764622193316, val loss: 0.0029505078726616064, val accuracy: 0.9882967472076416


Epoch 5 Training: 100%|██████████| 288/288 [06:48<00:00,  1.42s/it, loss=0.00265]
Epoch 5 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.33it/s, loss=0.00242]


Got 74639942/74956800 with acc 99.58
Dice score: 0.9901397824287415
Epoch 5, train loss: 0.002651454665414665, val loss: 0.002415576781772199, val accuracy: 0.9901397824287415


Epoch 6 Training: 100%|██████████| 288/288 [06:51<00:00,  1.43s/it, loss=0.00218]
Epoch 6 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.32it/s, loss=0.00201]


Got 74675280/74956800 with acc 99.62
Dice score: 0.9912691116333008
Epoch 6, train loss: 0.0021785037041358326, val loss: 0.0020084960813649367, val accuracy: 0.9912691116333008


Epoch 7 Training: 100%|██████████| 288/288 [06:50<00:00,  1.43s/it, loss=0.0019]
Epoch 7 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.32it/s, loss=0.00184]


Got 74686774/74956800 with acc 99.64
Dice score: 0.9916195869445801
Epoch 7, train loss: 0.001904298901638907, val loss: 0.0018442287987678267, val accuracy: 0.9916195869445801


Epoch 8 Training: 100%|██████████| 288/288 [06:48<00:00,  1.42s/it, loss=0.00175]
Epoch 8 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.33it/s, loss=0.00171]


Got 74682511/74956800 with acc 99.63
Dice score: 0.9915029406547546
Epoch 8, train loss: 0.001752724326013223, val loss: 0.001714758479540221, val accuracy: 0.9915029406547546


Epoch 9 Training: 100%|██████████| 288/288 [06:46<00:00,  1.41s/it, loss=0.00164]
Epoch 9 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.33it/s, loss=0.00163]


Got 74694405/74956800 with acc 99.65
Dice score: 0.9918637871742249
Epoch 9, train loss: 0.0016440162127432617, val loss: 0.0016264824662357569, val accuracy: 0.9918637871742249


Epoch 10 Training: 100%|██████████| 288/288 [06:48<00:00,  1.42s/it, loss=0.0016]
Epoch 10 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.33it/s, loss=0.00162]


Got 74697385/74956800 with acc 99.65
Dice score: 0.991949737071991
Epoch 10, train loss: 0.001601790799556867, val loss: 0.0016157866692262106, val accuracy: 0.991949737071991


Epoch 11 Training: 100%|██████████| 288/288 [06:48<00:00,  1.42s/it, loss=0.00194]
Epoch 11 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.33it/s, loss=0.0015]


Got 74572095/74956800 with acc 99.49
Dice score: 0.9880622625350952
Epoch 11, train loss: 0.0019438471997399692, val loss: 0.0014950417089047003, val accuracy: 0.9880622625350952


Epoch 12 Training: 100%|██████████| 288/288 [06:49<00:00,  1.42s/it, loss=0.00127]
Epoch 12 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.33it/s, loss=0.00106]


Got 74676511/74956800 with acc 99.63
Dice score: 0.9913032054901123
Epoch 12, train loss: 0.0012730310189173273, val loss: 0.0010636865817865388, val accuracy: 0.9913032054901123


Epoch 13 Training: 100%|██████████| 288/288 [06:48<00:00,  1.42s/it, loss=0.00102]
Epoch 13 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.33it/s, loss=0.000932]


Got 74672365/74956800 with acc 99.62
Dice score: 0.9911538362503052
Epoch 13, train loss: 0.0010211100536601052, val loss: 0.0009315157095428373, val accuracy: 0.9911538362503052


Epoch 14 Training: 100%|██████████| 288/288 [06:49<00:00,  1.42s/it, loss=0.000889]
Epoch 14 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.33it/s, loss=0.000817]


Got 74706178/74956800 with acc 99.67
Dice score: 0.9922254085540771
Epoch 14, train loss: 0.0008886557240444033, val loss: 0.0008171223901731313, val accuracy: 0.9922254085540771


Epoch 15 Training: 100%|██████████| 288/288 [06:49<00:00,  1.42s/it, loss=0.00087]
Epoch 15 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.32it/s, loss=0.000765]


Got 74710090/74956800 with acc 99.67
Dice score: 0.9923355579376221
Epoch 15, train loss: 0.0008698460360503067, val loss: 0.0007648017986479109, val accuracy: 0.9923355579376221


Epoch 16 Training: 100%|██████████| 288/288 [06:49<00:00,  1.42s/it, loss=0.000744]
Epoch 16 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.31it/s, loss=0.000705]


Got 74723318/74956800 with acc 99.69
Dice score: 0.9927541017532349
Epoch 16, train loss: 0.0007443755142309743, val loss: 0.0007048305257635771, val accuracy: 0.9927541017532349


Epoch 17 Training: 100%|██████████| 288/288 [06:54<00:00,  1.44s/it, loss=0.000702]
Epoch 17 Validation: 100%|██████████| 31/31 [00:24<00:00,  1.28it/s, loss=0.000679]


Got 74720271/74956800 with acc 99.68
Dice score: 0.9926553964614868
Epoch 17, train loss: 0.0007019807165488601, val loss: 0.0006793220603808028, val accuracy: 0.9926553964614868


Epoch 18 Training: 100%|██████████| 288/288 [06:49<00:00,  1.42s/it, loss=0.000671]
Epoch 18 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.31it/s, loss=0.00068]


Got 74720166/74956800 with acc 99.68
Dice score: 0.9926551580429077
Epoch 18, train loss: 0.0006712732379041288, val loss: 0.0006799222748787677, val accuracy: 0.9926551580429077


Epoch 19 Training: 100%|██████████| 288/288 [06:51<00:00,  1.43s/it, loss=0.000647]
Epoch 19 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.29it/s, loss=0.000645]


Got 74730316/74956800 with acc 99.70
Dice score: 0.9929762482643127
Epoch 19, train loss: 0.0006469412642004697, val loss: 0.0006449894166597333, val accuracy: 0.9929762482643127


Epoch 20 Training: 100%|██████████| 288/288 [06:52<00:00,  1.43s/it, loss=0.000639]
Epoch 20 Validation: 100%|██████████| 31/31 [00:23<00:00,  1.32it/s, loss=0.000643]


Got 74729886/74956800 with acc 99.70
Dice score: 0.9929567575454712
Epoch 20, train loss: 0.0006387507787946126, val loss: 0.0006434852066526159, val accuracy: 0.9929567575454712


## Submission

In [13]:
import numpy as np
import pandas as pd
import time

def rle_encode(img):
    '''
    img: numpy array, 1 - mask, 0 - background
    Returns run length as string formated
    '''
    pixels = img.flatten()
    pixels[0] = 0
    pixels[-1] = 0
    runs = np.where(pixels[1:] != pixels[:-1])[0] + 2
    runs[1::2] -= runs[:-1:2]
    
    return ' '.join(str(x) for x in runs)

In [14]:
if 'test' not in os.listdir(WORKING_DIR):
    if os.path.isfile(WORKING_DIR + 'submission.csv'):
        os.remove(WORKING_DIR + 'submission.csv')
    with zipfile.ZipFile(DATASET_DIR + 'test.zip', 'r') as zip_file:
        zip_file.extractall(WORKING_DIR)
if 'test_images' not in os.listdir(WORKING_DIR):
    os.mkdir(WORKING_DIR + 'test_images')

In [15]:
TEST_DIR = WORKING_DIR + 'test'
THRESHOLD = 0.5
import torchvision.transforms.functional as TF

# Dataset
class CarvanaTestDataset(Dataset):
    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        self.images = sorted(os.listdir(image_dir))

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        img_name = self.images[index]
        img_path = os.path.join(self.image_dir, self.images[index])
        image = np.array(Image.open(img_path).convert('RGB'))

        if self.transform is not None:
            augmentations = self.transform(image=image)
            image = augmentations['image']

        return img_name, image


test_transform = A.Compose(
    [
        A.Resize(height=image_height, width=image_width),
        A.Normalize(
            mean=[0.0, 0.0, 0.0],
            std=[1.0, 1.0, 1.0],
            max_pixel_value=255.0,
        ),
        ToTensorV2(), 
    ]
)
   
test_set = CarvanaTestDataset(
    image_dir=TEST_DIR,
    transform=test_transform
)    


test_loader = DataLoader(
    test_set, batch_size=batch_size, shuffle=False
)
    
# Model
checkpoint = torch.load(WORKING_DIR + 'model.pth')

model = UNet(in_channels=3, out_channels=1).to(device)
model.load_state_dict(checkpoint['state_dict'])

model.eval()

# Predictions
all_predictions = []
for img_names, x in tqdm(test_loader):
    x = x.to(device)
    with torch.no_grad():
        preds = torch.sigmoid(model(x))
        preds = (preds > THRESHOLD).float()   
    preds = TF.resize(
        preds, size=(1280, 1918), interpolation=TF.InterpolationMode.NEAREST
    )
    
    # Encoding
    for idx in range(len(img_names)):
        encoding = rle_encode(preds[idx].squeeze().cpu())
        all_predictions.append([img_names[idx], encoding])

100%|██████████| 6254/6254 [1:05:20<00:00,  1.60it/s]


In [16]:
shutil.rmtree(WORKING_DIR + 'test')
sub = pd.DataFrame(all_predictions)
sub.columns = ['img', 'rle_mask']
sub.to_csv(os.path.join(WORKING_DIR, 'submission.csv'), index=False)

In [17]:
import os
from IPython.display import FileLink

os.chdir('/kaggle/working')
FileLink('submission.csv')