# Setting up required libraries

In [1]:
!pip install pytorch-lightning transformers datasets

In [None]:
import os
import glob
from PIL import Image
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch import nn
from torch.nn.modules.utils import _pair
from torchvision.ops import deform_conv2d

# for equi-rectangular convolution
from torch.nn import ZeroPad2d as zeropad
from src.equi_conv import EquiConv2d, equi_conv2d
#--------------------####

In [2]:
### Configurations of bitmap to labels
id2label = {'0':'rand11','1': 'rand12', '2': ' rand13', '3':'rand14','4':'rand15','5':'rand16',
           '6':'rand17','7':'rand18','8':'rand1','9':'rand2','10':'rand3','11':'rand4','12':'rand5','15':'rand6','13':'rand7','14':'rand8'}

# Checking for gpus

In [3]:
import torch 
if torch.cuda.is_available():
    device = 'cuda:1'
else:
    device = 'cpu'
print(device)

cuda:1


# Loading the dataset and model

In [8]:
from transformers import SegformerFeatureExtractor
from dataset import SemanticSegmentationDataset

feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b1-finetuned-ade-512-512")
feature_extractor.reduce_labels = False
feature_extractor.do_rescale = False
feature_extractor.size = 128             

train_dataset = SemanticSegmentationDataset("dataset/train/", feature_extractor, id2label)
val_dataset = SemanticSegmentationDataset("dataset/test/", feature_extractor, id2label)

batch_size = 8
init_train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
init_val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

Number of images in test set: 566
Number of images in test set: 100


# Applying equi-convolution

In [10]:
train_dataloader = val_dataloader = []
    
for batch in init_train_dataloader:
    images, masks = batch['image'], batch['mask']
    masks = masks[:,:,:,-1]
    convol_img = EquiConv2d(3,3,3,padding=1, bias=False)
    equi1 = convol_img(images)
    equi2 = convol_img(equi1)
    equi3 = convol_img(equi2)
    equi3 = equi3+abs(equi3.min())
    output = equi3/equi3.max()
    output = output.to('cpu').detach()

    encoded_inputs = feature_extractor(output,masks, return_tensors="pt")
    for k,v in encoded_inputs.items():
      encoded_inputs[k].squeeze_()
    train_dataloader.append(encoded_inputs)
    
for batch in init_val_dataloader:
    images, masks = batch['image'], batch['mask']
    masks = masks[:,:,:,-1]
    convol_img = EquiConv2d(3,3,3,padding=1, bias=False)
    equi1 = convol_img(images)
    equi2 = convol_img(equi1)
    equi3 = convol_img(equi2)
    equi3 = equi3+abs(equi3.min())
    output = equi3/equi3.max()
    output = output.to('cpu').detach()
     
    encoded_inputs = feature_extractor(output,masks, return_tensors="pt")
    for k,v in encoded_inputs.items():
      encoded_inputs[k].squeeze_()
    val_dataloader.append(encoded_inputs)

In [11]:
train_dataloader = DataLoader(train_dataloader,batch_size=None)
val_dataloader = DataLoader(val_dataloader,batch_size=None)
print(train_dataloader,val_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x000001E796268910> <torch.utils.data.dataloader.DataLoader object at 0x000001E7966C5610>


# Building segformer fine-tuner for pano data 

In [13]:
from segformer_finetuner import SegformerFinetuner
segformer_finetuner = SegformerFinetuner(
    train_dataset.id2label, 
    train_dataloader=train_dataloader, 
    val_dataloader=val_dataloader, 
    metrics_interval=10,
)

Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/segformer-b1-finetuned-ade-512-512 and are newly initialized because the shapes did not match:
- decode_head.classifier.weight: found shape torch.Size([150, 256, 1, 1]) in the checkpoint and torch.Size([16, 256, 1, 1]) in the model instantiated
- decode_head.classifier.bias: found shape torch.Size([150]) in the checkpoint and torch.Size([16]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.train_mean_iou = load_metric("mean_iou")


# Training the model

In [14]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

early_stop_callback = EarlyStopping(
    monitor="val_loss", 
    min_delta=0.00, 
    patience=3, 
    verbose=False, 
    mode="min",
)

checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_loss")
# auto_lr_find=True
trainer = pl.Trainer(
    callbacks=[early_stop_callback, checkpoint_callback],
    max_epochs=300,
    val_check_interval=len(train_dataloader),
)

trainer.fit(segformer_finetuner)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: d:\Gautam\sun360_extended_dataset\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type                             | Params
-----------------------------------------------------------
0 | model | SegformerForSemanticSegmentation | 13.7 M
-----------------------------------------------------------
13.7 M    Trainable params
0         Non-trainable params
13.7 M    Total params
54.725    Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\user\anaconda3\envs\segment\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=39` in the `DataLoader` to improve performance.


                                                                           

  iou = total_area_intersect / total_area_union
  acc = total_area_intersect / total_area_label
c:\Users\user\anaconda3\envs\segment\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=39` in the `DataLoader` to improve performance.


Epoch 299: 100%|██████████| 84/84 [02:15<00:00,  0.62it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=300` reached.


Epoch 299: 100%|██████████| 84/84 [02:15<00:00,  0.62it/s, v_num=0]


# Log Results in tensorboards

In [3]:
!kill 54285

In [6]:
%load_ext tensorboard
%tensorboard --logdir ../../lightning_logs/

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6007 (pid 54457), started 0:01:55 ago. (Use '!kill 54457' to kill it.)

Tensorboard result is stored in results/log_iou.png in src folder.

## We achieved an validation mean accuracy of 96.97% and an IoU of 0.9455(SOTA) in the sun360 extended dataset.

# Visualizing the prediction map of model

In [None]:
color_map = {
    0:(0,0,0),
    .....,
    14:(113,174,206),
}

def prediction_to_vis(prediction):
    vis_shape = prediction.shape + (3,)
    vis = np.zeros(vis_shape)
    for i,c in color_map.items():
        vis[prediction == i] = color_map[i]
    return Image.fromarray(vis.astype(np.uint8))

for batch in val_dataloader:
    images, masks = batch['pixel_values'], batch['labels']
    outputs = segformer_finetuner.model(images, masks)
        
    loss, logits = outputs[0], outputs[1]

    upsampled_logits = nn.functional.interpolate(
        logits, 
        size=masks.shape[-2:], 
        mode="bilinear", 
        align_corners=False
    )

    predicted = upsampled_logits.argmax(dim=1).cpu().numpy()
    masks = masks.cpu().numpy()

from matplotlib import pyplot as plt
f, axarr = plt.subplots(predicted.shape[0],2, figsize=(20, 32))
for i in range(predicted.shape[0]):
    axarr[i,0].imshow(prediction_to_vis(predicted[i,:,:]))
    axarr[i,1].imshow(prediction_to_vis(masks[i,:,:]))

![Sample Image](final_val_result.jpg)

Final prediction map can be viewed from results/prediction_map_result.jpg