# Deep Learning-Based WWR and Floor Count Extraction from Façade Images to Improve UBEM

CISBAT 2025

[Ayca Duran](https://systems.arch.ethz.ch/ayca-duran), [Panagiotis Karapiperis](https://www.linkedin.com/in/panagiotis-karapiperis-ethz/), [Christoph Waibel](https://systems.arch.ethz.ch/christoph-waibel), [Arno Schlueter](https://systems.arch.ethz.ch/arno-schlueter)

### FCN Windows Finder - Pretrained on COCO with VOC labels

This notebook trains an FCN model for binary semantic segmentation that parses windows on façades, using the zju_facade_jcst2020 dataset.

In [None]:
# Import libraries
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models, datasets, transforms
from torch.utils.data import Dataset, DataLoader, random_split

# Fix the PyTorch RNG seed so repeated runs start from the same random state
torch.manual_seed(1234)

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Create Dataset Class
class ImageAnnotationDataset(Dataset):
    """Paired RGB image / binary mask dataset read from two directories.

    Every ``.jpg`` / ``.png`` file in ``image_dir`` is matched with the mask
    ``<same base name>.png`` in ``mask_dir``. Both are resized to
    ``target_size``; the image is scaled to [0, 1] and normalized with the
    ImageNet mean/std, the mask is returned as a 0/1 label map.
    """

    def __init__(self, image_dir, mask_dir, target_size=(520, 520)):
        """
        Args:
            image_dir (str): Directory with input RGB images.
            mask_dir (str): Directory with binary PNG masks.
            target_size (tuple): Image/mask resize dimensions (H, W).
        """
        self.image_dir = image_dir
        self.mask_dir = mask_dir
        self.target_size = target_size

        # Sorted so that the index -> file mapping is stable across runs.
        self.image_filenames = sorted(
            f for f in os.listdir(image_dir) if f.endswith(('.jpg', '.png'))
        )

        self.to_tensor = transforms.PILToTensor()
        self.resize = transforms.Resize(target_size)
        # Masks hold class labels, not intensities: the default (bilinear)
        # filter would blend labels along window borders, so masks get their
        # own nearest-neighbour resize.
        self.resize_mask = transforms.Resize(
            target_size, interpolation=transforms.InterpolationMode.NEAREST
        )
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                              std=[0.229, 0.224, 0.225])

    def __len__(self):
        """Return the number of images found in ``image_dir``."""
        return len(self.image_filenames)

    def __getitem__(self, idx):
        """Load, resize and convert the ``idx``-th image/mask pair.

        Returns:
            tuple: ``(image, mask)`` where ``image`` is a normalized float
            tensor of shape (3, H, W) and ``mask`` is a uint8 tensor of shape
            (H, W) containing only 0 and 1.

        Raises:
            FileNotFoundError: If the image has no matching ``.png`` mask.
        """
        filename = self.image_filenames[idx]
        base_name = os.path.splitext(filename)[0]
        image_path = os.path.join(self.image_dir, filename)
        mask_path = os.path.join(self.mask_dir, base_name + ".png")

        # Load and process image. convert() returns an independent, fully
        # loaded copy, so the file handle can be released right away.
        with Image.open(image_path) as img:
            image = img.convert("RGB")
        image = self.resize(image)
        image = self.to_tensor(image).float() / 255.0
        image = self.normalize(image)

        # Load and process mask
        with Image.open(mask_path) as img:
            mask = img.convert("L")  # ensure grayscale
        mask = self.resize_mask(mask)
        mask = self.to_tensor(mask)[0]  # single channel
        # The loss and IoU code in this file need labels in {0, 1}. Mapping
        # every non-zero pixel to 1 leaves 0/1 masks untouched and also
        # accepts masks stored as 0/255.
        mask = (mask > 0).to(mask.dtype)

        return image, mask


## Dataset

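Download the dataset from [Li et al., 2020](https://github.com/lck1201/win_det_heatmaps) and place it in the root folder as `zju_facade_jcst2020/`. The cells below expect `train/` and `test/` subfolders, each containing `images/` and `annotations/`.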

In [None]:
# Locations inside the zju_facade_jcst2020 dataset.
# The 'train' split is used for training and the 'test' split for validation.
dataset_dir = 'zju_facade_jcst2020'

train_image_path, train_ann_path = (
    os.path.join(dataset_dir, 'train', sub) for sub in ('images', 'annotations')
)
val_image_path, val_ann_path = (
    os.path.join(dataset_dir, 'test', sub) for sub in ('images', 'annotations')
)

In [None]:
# Build the training and validation datasets (default 520x520 target size)
train_dataset = ImageAnnotationDataset(
    image_dir=train_image_path,
    mask_dir=train_ann_path,
)
val_dataset = ImageAnnotationDataset(
    image_dir=val_image_path,
    mask_dir=val_ann_path,
)

In [None]:
# Batch iterators: training batches are reshuffled every epoch,
# validation batches are served in a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=2, shuffle=False)

## Model

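We use a fully convolutional network (FCN) with a ResNet-50 backbone, pretrained on a subset of COCO restricted to the Pascal VOC categories. The model and its weights are loaded from torchvision, and the final layer of each head is replaced with a single-channel output for binary segmentation.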

In [None]:
# Load the pretrained FCN-ResNet50 and adapt both heads for binary segmentation
from torchvision.models.segmentation import fcn_resnet50

model = fcn_resnet50(weights='COCO_WITH_VOC_LABELS_V1')

# Replace the last 1x1 convolution of the main and the auxiliary head with a
# single-channel one, keeping the input width of the layer it replaces.
model.classifier[-1] = nn.Conv2d(
    in_channels=model.classifier[-1].in_channels,
    out_channels=1,
    kernel_size=(1, 1),
    stride=(1, 1),
)
model.aux_classifier[-1] = nn.Conv2d(
    in_channels=model.aux_classifier[-1].in_channels,
    out_channels=1,
    kernel_size=(1, 1),
    stride=(1, 1),
)

# Move the whole network, including the new layers, to the target device
model = model.to(device)

In [None]:
# Display the model's immediate child modules, e.g. to check the replaced heads
list(model.children())

In [None]:
# Define Training / Evaluation steps
import evaluate
from torch.nn.functional import binary_cross_entropy_with_logits

# Metric setup: load the "mean_iou" metric once through the `evaluate` package;
# eval_one_epoch() calls iou_metric.compute(...) on the collected predictions.
iou_metric = evaluate.load("mean_iou")

def train_one_epoch(model, dataloader, optimizer, device):
    """Run one optimization pass over ``dataloader``.

    Args:
        model: Network whose forward pass returns a dict with an ``'out'``
            entry of per-pixel logits.
        dataloader: Iterable of ``(images, masks)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        float: Mean of the per-batch binary cross-entropy losses.
    """
    model.train()
    batch_losses = []

    progress = tqdm(dataloader, desc="Training", leave=False)
    for batch_images, batch_masks in progress:
        batch_images = batch_images.to(device)
        targets = batch_masks.to(device).float()  # BCE expects float targets

        optimizer.zero_grad()
        # Drop the channel axis so the logits line up with the masks
        logits = model(batch_images)['out'].squeeze(1)

        batch_loss = binary_cross_entropy_with_logits(logits, targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)


@torch.no_grad()
def eval_one_epoch(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: Network whose forward pass returns a dict with an ``'out'``
            entry of per-pixel logits.
        dataloader: Iterable of ``(images, masks)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        tuple: ``(mean per-batch BCE loss, IoU of class 1)``, the IoU being
        computed over all batches with the module-level ``iou_metric``.
    """
    model.eval()
    running_loss = 0
    predictions, references = [], []

    for images, masks in tqdm(dataloader):
        images = images.to(device)
        targets = masks.to(device).float()

        # Drop the channel axis so the logits line up with the masks
        logits = model(images)['out'].squeeze(1)
        running_loss += binary_cross_entropy_with_logits(logits, targets).item()

        # Threshold the sigmoid probabilities at 0.5 to get hard 0/1 labels
        hard_preds = (torch.sigmoid(logits) > 0.5).long()

        # The metric works on per-image arrays on the CPU
        predictions.extend(hard_preds.cpu().numpy())
        references.extend(targets.cpu().numpy().astype(int))

    # Compute IoU using Hugging Face's evaluate
    metrics = iou_metric.compute(
        predictions=predictions,
        references=references,
        num_labels=2,
        ignore_index=None,
    )

    mean_loss = running_loss / len(dataloader)
    return mean_loss, metrics["per_category_iou"][1]  # IoU of class 1


In [None]:
# Training: optimize for num_epochs and keep the weights with the best validation IoU
from torch import optim

checkpoint_dir = 'trained_models/fcn_resnet50'
os.makedirs(checkpoint_dir, exist_ok=True)
# Only the best weights are kept, so every save goes to the same file.
checkpoint_path = os.path.join(checkpoint_dir, "model_best.pt")

num_epochs = 10  # Adjust
lr = 0.0005


def create_optimizer(net, lr):
    """Return an Adam optimizer over all parameters of ``net``."""
    return optim.Adam(net.parameters(), lr=lr)


optimizer = create_optimizer(model, lr)

train_losses = []
val_losses = []
# The leading 0 is the baseline the first epoch has to beat; entry i (i >= 1)
# is the validation IoU measured after epoch i.
val_ious = [0]

for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_loader, optimizer, device)
    train_losses.append(train_loss)
    val_loss, val_iou = eval_one_epoch(model, val_loader, device)
    val_losses.append(val_loss)
    # Save the model only when it beats every earlier validation IoU
    if val_iou > max(val_ious):
        torch.save(model.state_dict(), checkpoint_path)
    val_ious.append(val_iou)

    print(f"[Epoch {epoch+1}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val IoU (class 1): {val_iou:.4f}")


In [None]:
# Plot validation IoU during training.
# val_ious[0] is the initial 0 baseline, so x = i shows the IoU after epoch i.
xs = list(range(len(val_ious)))
plt.plot(xs, val_ious)
plt.xlabel("Epochs")
plt.ylabel("IoU")
plt.title("Validation IoUs")
plt.savefig('valid_ious.png')
plt.show()