# End-to-End Image Segmentation with U-Net in PyTorch



Designed, built, and trained a deep learning model from scratch for semantic segmentation of objects in the PASCAL VOC 2012 dataset. Developed a complete, end-to-end data pipeline, from data acquisition and preprocessing to model training, evaluation, and visualization in a cloud-based environment.

In [1]:
from google.colab import drive
import os
import glob
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import torchvision

In [2]:
# Mount Google Drive to access the Kaggle API key
drive.mount('/content/drive')

!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/ZidioProject/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

print("Key from Google Drive is complete!")

Mounted at /content/drive
Key from Google Drive is complete!


In [3]:
# Download the PASCAL VOC 2012 dataset from Kaggle
!kaggle datasets download -d sovitrath/voc-2012-segmentation-data -p ./data --unzip

Dataset URL: https://www.kaggle.com/datasets/sovitrath/voc-2012-segmentation-data
License(s): CC0-1.0
Downloading voc-2012-segmentation-data.zip to ./data
 93% 294M/316M [00:00<00:00, 729MB/s] 
100% 316M/316M [00:00<00:00, 784MB/s]


## Step 2: Building the Data Pipeline

In [4]:
#paths
train_image_path = f'./data/voc_2012_segmentation_data/train_images/'
train_label_path = f'./data/voc_2012_segmentation_data/train_labels/'

class SegmentationDataset(Dataset):
    def __init__(self, image_path, label_path):
        self.images = sorted(glob.glob(os.path.join(image_path, '*.jpg')))
        self.labels = sorted(glob.glob(os.path.join(label_path, '*.png')))

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):

        image_path = self.images[idx]
        label_path = self.labels[idx]

        image = Image.open(image_path).convert('RGB')
        mask = Image.open(label_path).convert('L')

        transform = transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.ToTensor()
        ])

        #Prepareconvert to Tensor)
        image = transform(image)
        mask = transform(mask)

        # return the pair
        return image, mask

# verification
train_dataset = SegmentationDataset(train_image_path, train_label_path)
print(f"Dataset ready with {len(train_dataset)} images.")


train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=2
)

first_batch_images, first_batch_masks = next(iter(train_dataloader))

print(f"Shape of one BATCH of images: {first_batch_images.shape}")
print(f"Shape of one BATCH of masks: {first_batch_masks.shape}")

Dataset ready with 1464 images.
Shape of one BATCH of images: torch.Size([4, 3, 256, 256])
Shape of one BATCH of masks: torch.Size([4, 1, 256, 256])


## Step 3: U-Net Model Architecture

In [5]:


class DoubleConv(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(DoubleConv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        return self.conv(x)

In [10]:
class UNET(nn.Module):

    def __init__(self, in_channels=3, out_channels=1, features=[64, 128, 256, 512]):
        super(UNET, self).__init__()

        self.down = nn.ModuleList()
        self.up = nn.ModuleList()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # encoder
        for feature in features:
            self.down.append(DoubleConv(in_channels, feature))
            in_channels = feature

        # decoder
        for feature in reversed(features):
            self.up.append(nn.ConvTranspose2d(feature*2, feature, kernel_size=2, stride=2))
            self.up.append(DoubleConv(feature*2, feature))

        # bottleneck
        self.bottleneck = DoubleConv(features[-1], features[-1]*2)

        # final conv
        self.final_conv = nn.Conv2d(features[0], out_channels, kernel_size=1)

    def forward(self, x):
        skip_connections = []

        #down the U-Net
        for down_block in self.down:
            x = down_block(x)
            skip_connections.append(x)
            x = self.pool(x)

        # bottleneck
        x = self.bottleneck(x)

        skip_connections.reverse()

        # Go up the U-Net
        for idx in range(0, len(self.up), 2):
            x = self.up[idx](x)
            skip_connection_tensor = skip_connections[idx//2]

            if x.shape != skip_connection_tensor.shape:
                x = F.interpolate(x, size=skip_connection_tensor.shape[2:])

            concat_tensor = torch.cat((skip_connection_tensor, x), dim=1)
            x = self.up[idx+1](concat_tensor)

        return self.final_conv(x)

model = UNET(in_channels=3, out_channels=1)

## Step 4: Training the Model

In [11]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Final check, running on device: {DEVICE.upper()}")

LEARNING_RATE = 1e-4
BATCH_SIZE = 16
NUM_EPOCHS = 35

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
model = UNET(in_channels=3, out_channels=1).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.BCEWithLogitsLoss()

#training
def train_fn(loader, model, optimizer, loss_fn, device):
    model.train()
    loop = tqdm(loader, leave=True)
    for batch_idx, (data, targets) in enumerate(loop):
        data = data.to(device=device)
        targets = targets.float().to(device=device)

        predictions = model(data)
        loss = loss_fn(predictions, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loop.set_postfix(loss=loss.item())

#training loop
for epoch in range(NUM_EPOCHS):
    print(f"\n--- Epoch {epoch+1}/{NUM_EPOCHS} ---")
    train_fn(train_loader, model, optimizer, loss_fn, DEVICE)

Final check, running on device: CUDA

--- Epoch 1/35 ---


100%|██████████| 92/92 [01:08<00:00,  1.35it/s, loss=0.39]



--- Epoch 2/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.27it/s, loss=0.389]



--- Epoch 3/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.27it/s, loss=0.425]



--- Epoch 4/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.27it/s, loss=0.295]



--- Epoch 5/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.38]



--- Epoch 6/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.392]



--- Epoch 7/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.312]



--- Epoch 8/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.31]



--- Epoch 9/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.247]



--- Epoch 10/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.363]



--- Epoch 11/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.424]



--- Epoch 12/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.426]



--- Epoch 13/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.261]



--- Epoch 14/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.325]



--- Epoch 15/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.351]



--- Epoch 16/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.673]



--- Epoch 17/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.336]



--- Epoch 18/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.276]



--- Epoch 19/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.258]



--- Epoch 20/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.366]



--- Epoch 21/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.209]



--- Epoch 22/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.27]



--- Epoch 23/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.296]



--- Epoch 24/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.318]



--- Epoch 25/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.311]



--- Epoch 26/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.3]



--- Epoch 27/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.289]



--- Epoch 28/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.194]



--- Epoch 29/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.299]



--- Epoch 30/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.245]



--- Epoch 31/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.337]



--- Epoch 32/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.404]



--- Epoch 33/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.222]



--- Epoch 34/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.22]



--- Epoch 35/35 ---


100%|██████████| 92/92 [01:12<00:00,  1.26it/s, loss=0.223]


## Step 5: Prepeare Test Data

In [12]:
valid_image_path = f'./data/voc_2012_segmentation_data/valid_images/'
valid_labels_path = f'./data/voc_2012_segmentation_data/valid_labels/'



# --- Verification ---
valid_dataset = SegmentationDataset(valid_image_path, valid_labels_path)
print(f"Dataset ready with {len(valid_dataset)} images.")



valid_dataloader = DataLoader(
    dataset=valid_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=2
)

first_batch_images, first_batch_masks = next(iter(valid_dataloader))

print(f"Shape of one BATCH of images: {first_batch_images.shape}")
print(f"Shape of one BATCH of masks: {first_batch_masks.shape}")


Dataset ready with 1449 images.
Shape of one BATCH of images: torch.Size([4, 3, 256, 256])
Shape of one BATCH of masks: torch.Size([4, 1, 256, 256])


## Step 6: Evaluating the Model

In [13]:
def check_accuracy(loader, model, device="cuda"):
    num_correct = 0
    num_pixels = 0
    dice_score = 0

    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            y = y.to(device)
            preds = torch.sigmoid(model(x))
            preds = (preds > 0.5).float()
            num_correct += (preds == y).sum()
            num_pixels += torch.numel(preds)

            dice_score += (2 * (preds * y).sum()) / (
                (preds + y).sum() + 1e-6)

    print(f"Got {num_correct}/{num_pixels} with accuracy {num_correct/num_pixels*100:.2f}%")
    avg_dice_score = dice_score/len(loader)
    print(f"Dice score: {avg_dice_score:.4f}")

In [14]:
check_accuracy(valid_dataloader,model,DEVICE)

Got 63071576/94961664 with accuracy 66.42%
Dice score: 0.2984


## Step 7: Visualizing the Results

In [15]:
def save_predictions_as_imgs(loader, model, folder="saved_images/", device="cuda"):
    model.eval()

    # Get one batch from the loader
    x, y = next(iter(loader))
    x, y = x.to(device), y.to(device)

    # Create the folder if it doesn't exist
    if not os.path.exists(folder):
        os.makedirs(folder)

    with torch.no_grad():
        preds = torch.sigmoid(model(x))
        preds = (preds > 0.5).float()

    # Save the images, ground truth masks, and predictions
    # Each will be a grid of images from the batch
    torchvision.utils.save_image(
        x, f"{folder}/original_images.png")
    torchvision.utils.save_image(
        y, f"{folder}/true_masks.png")
    torchvision.utils.save_image(
        preds, f"{folder}/pred_masks.png")

    model.train()

# --- Run the visualization ---
print("Saving one batch of predictions to the 'saved_images' folder...")
save_predictions_as_imgs(valid_dataloader, model, device=DEVICE)
print("Done. Check the file explorer on the left.")

Saving one batch of predictions to the 'saved_images' folder...
Done. Check the file explorer on the left.
