In [1]:
# !git clone https://github.com/tldrafael/DeepLabV3Plus-Pytorch
# !mv DeepLabV3Plus-Pytorch deeplabv3

# !pip install git+https://github.com/huggingface/transformers.git

In [2]:
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '3'
os.environ['WORLD_SIZE'] = '1'

import numpy as np
import matplotlib.pyplot as plt
from seaborn import color_palette
from PIL import Image
import pandas as pd
from glob import iglob
import re
from datetime import datetime
import random
import cv2
from copy import copy
from transformers import Mask2FormerForUniversalSegmentation
from importlib import reload
from glob import iglob
from sklearn.manifold import TSNE

import torch
import torch.nn.functional as F
import torchvision.transforms as T
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.tensorboard import SummaryWriter
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision.io import read_image

import sys
sys.path.append('../src')
import utils as ut
import dataset as ds

In [3]:
# Class-id -> class-name mapping for the 'mocamba' label set.
id2label = ds.IDs('mocamba').id2label
n_classes = len(id2label)
print(id2label)


# Long-side resolution of the pre-resized dataset to train on.
# Other values tried in this notebook: 256, 1024, 1536, 2048.
long = 512

# The training crop is 256x352 at long=512 and scales linearly with `long`.
f = long/512
crop_size = tuple(int(side*f) for side in (256, 352))
print(f'long: {long}, crop_size: {crop_size}')

# Random crop + horizontal flip used as training augmentation.
# NOTE(review): the very same random pipeline is passed below as both
# `transform` and `transform_target`; image and label only stay aligned if
# ut.SimpleDataset applies identical random parameters to both - confirm.
T_crop = T.Compose([
    T.RandomCrop(size=crop_size),
    T.RandomHorizontalFlip(p=.5),
])


# Text files listing the train / validation samples for the chosen resolution.
train_annotation = f'../data/tidyv01-long{long}/trainpaths.txt'
val_annotation = f'../data/tidyv01-long{long}/valpaths.txt'


# Augmented training set; the validation set is built without transforms.
train_ds = ut.SimpleDataset(
    annotation_file=train_annotation,
    transform=T_crop,
    transform_target=T_crop,
)
train_loader = DataLoader(train_ds, shuffle=True, batch_size=8)

val_ds = ut.SimpleDataset(annotation_file=val_annotation)
val_loader = DataLoader(val_ds, batch_size=1)

{0: 'Background', 1: 'Animals', 2: 'Asphalt', 3: "Cat's eyes", 4: 'Cracks', 5: 'Dirt road', 6: 'Ego', 7: 'Hard sand', 8: 'Markings', 9: 'Obstacles', 10: 'People', 11: 'Potholes', 12: 'Retaining walls', 13: 'Soft sand', 14: 'Speed bump', 15: 'Vehicles', 16: 'Wet sand'}
long: 512, crop_size: (256, 352)


In [4]:
# Grab a single training batch to use as a fixed "dummy" sample for the
# qualitative previews further down the notebook.
inp_im, inp_label = next(iter(train_loader))

# Keep only the first `lim` samples (slice first so only those are copied).
lim = 2
dummy_im = inp_im[:lim].clone()
dummy_label = inp_label[:lim].clone()

# Maps class ids to colours; reused by the later visualisation cells.
colorizer = ut.TorchColorizer(len(id2label))

# Undo the input normalisation, colourise the labels and alpha-blend them.
ims = ut.Normalize.reverse(dummy_im)
labels = colorizer(dummy_label)
alpha = .2
blend = (1-alpha)*ims + alpha*labels

# Stack image / label / blend along the height axis, then lay the batch
# samples side by side and move channels last for PIL.
preview = torch.concat([ims, labels, blend], axis=-2)
preview = preview.moveaxis(0,-2).flatten(-2,-1).permute(1,2,0)
preview = ut.float_to_uint8(preview.numpy())
# Image.fromarray(preview)

# Pre-trained Mask2Former (M2F)

In [5]:
# Start from the ADE20k semantic checkpoint. `ignore_mismatched_sizes=True`
# allows the checkpoint tensors whose shapes differ from this label set to be
# re-initialised instead of raising.
pretrained_name = "facebook/mask2former-swin-base-IN21k-ade-semantic"
model = ut.MyMask2Former.from_pretrained(
    pretrained_name,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

# Move the weights to the GPU (trailing `;` hides the module repr).
model.cuda();

Some weights of MyMask2Former were not initialized from the model checkpoint at facebook/mask2former-swin-base-IN21k-ade-semantic and are newly initialized because the shapes did not match:
- class_predictor.weight: found shape torch.Size([151, 256]) in the checkpoint and torch.Size([18, 256]) in the model instantiated
- class_predictor.bias: found shape torch.Size([151]) in the checkpoint and torch.Size([18]) in the model instantiated
- criterion.empty_weight: found shape torch.Size([151]) in the checkpoint and torch.Size([18]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Sanity-check the freshly loaded model on the dummy batch.
# No gradients are needed for a preview, so skip building the autograd graph.
with torch.no_grad():
    segmentation_map = model(dummy_im.cuda())[0].cpu()

colorizer = ut.TorchColorizer(len(id2label))

# Undo the input normalisation, colourise the prediction and alpha-blend.
ims = ut.Normalize.reverse(dummy_im)
labels = colorizer(segmentation_map)
alpha = .2
blend = (1-alpha)*ims + alpha*labels

# Stack image / prediction / blend along the height axis, then lay the batch
# samples side by side and move channels last for PIL.
preview = torch.concat([ims, labels, blend], axis=-2)
preview = preview.moveaxis(0,-2).flatten(-2,-1).permute(1,2,0)
preview = ut.float_to_uint8(preview.numpy())
# Image.fromarray(preview)

# Training

In [7]:
# Pick up any edits made to ../src/utils.py, then rebuild the model from the
# pre-trained checkpoint so training starts from a clean state.
reload(ut)
model = ut.MyMask2Former.from_pretrained(
    "facebook/mask2former-swin-base-IN21k-ade-semantic",
    id2label=id2label,
    ignore_mismatched_sizes=True,
)
model.cuda();
n_classes = len(id2label)
model.n_classes = n_classes
# model = torch.compile(model, fullgraph=True)


# Optimiser plus the project's warm-up scheduler; the training loop steps the
# scheduler every `warmup_step` iterations while `scheduler.fl_warmup` is set.
opt = torch.optim.AdamW(model.parameters(), lr=1e-4)

scheduler = ut.WarmupLR(opt, n_warmup_max=1000)
warmup_step = 1


# One TensorBoard run directory per launch: <timestamp>-<experiment tag>.
time_now = f'{datetime.now().strftime("%Y%m%d_%H%M")}-PAPER-m2f-tidyv01+long{long}'
logdir = os.path.join('../logs', time_now)
# logdir = '20250713_2010-PAPER-m2f-tidyv01+long1024'
writer = SummaryWriter(log_dir=logdir)
writer.flush()


# NOTE(review): the training cell runs autocast with bfloat16; loss scaling is
# normally only required for float16 - confirm whether the scaler is needed.
scaler = torch.amp.GradScaler('cuda')

Some weights of MyMask2Former were not initialized from the model checkpoint at facebook/mask2former-swin-base-IN21k-ade-semantic and are newly initialized because the shapes did not match:
- class_predictor.weight: found shape torch.Size([151, 256]) in the checkpoint and torch.Size([18, 256]) in the model instantiated
- class_predictor.bias: found shape torch.Size([151]) in the checkpoint and torch.Size([18]) in the model instantiated
- criterion.empty_weight: found shape torch.Size([151]) in the checkpoint and torch.Size([18]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Training batch size; the training cell derives its gradient-accumulation
# factor from this value.
bs_train = 8

# Validation: one un-augmented image per batch.
val_ds = ut.SimpleDataset(annotation_file=val_annotation)
val_loader = DataLoader(val_ds, batch_size=1)

# Training: shuffled, multi-worker loader with pinned memory; workers are kept
# alive between passes over the data.
train_loader = DataLoader(
    train_ds,
    batch_size=bs_train,
    shuffle=True,
    num_workers=8,
    pin_memory=True,
    persistent_workers=True,
)

In [9]:
# Optional manual resume: uncomment to restore the model weights, optimizer
# state, best mIoU and iteration counter from a previous run's last checkpoint,
# then set `fl_resume = True` in the training cell so these values are kept.
# state_dict = torch.load('../logs/20250713_2010-PAPER-m2f-tidyv01+long1024/model.last.pth', weights_only=False)
# model.load_state_dict(state_dict['state'])
# opt.load_state_dict(state_dict['opt_state'])
# miou_best = state_dict['best_miou']
# it = state_dict['it']

In [10]:
# Set to True only after running the checkpoint-restore cell, which must
# define `it` and `miou_best`; otherwise training starts from iteration 1.
fl_resume = False

lossi = []   # per-iteration losses since the last evaluation
losses = []  # one averaged loss per evaluation window
if not fl_resume:
    it = 1
    miou_best = 0
    steps_cur = 0

    lr_cur = opt.param_groups[0]["lr"]
    print(f'lr: {lr_cur:.2e}')
    writer.add_scalars('lr', {'nn': lr_cur}, steps_cur)

    # Baseline mIoU before any weight update.
    model.eval()
    with torch.no_grad():
        miou_train = ut.get_CM_fromloader(train_loader, model, n_classes)[0]
        miou_val = ut.get_CM_fromloader(val_loader, model, n_classes)[0]
    print(f'\tmiou-train: {miou_train:.2f}\tmiou-val: {miou_val:.2f}')
    writer.add_scalars('miou', {'train': miou_train}, steps_cur)
    writer.add_scalars('miou', {'val': miou_val}, steps_cur)


# Schedule, expressed for an effective batch size of 8. When `bs_train` is
# smaller, gradients are accumulated over `n_gradacc` micro-batches and every
# iteration-based setting is stretched by the same factor.
print_every_n = 250
iters_total = 10000
lr_decay_iters = (5000, 9000)  # LR is multiplied by 0.1 at these iterations
n_gradacc = max(8//bs_train,1) # bs=8
iters_total *= n_gradacc
print_every_n *= n_gradacc
lr_decay_iters = {m * n_gradacc for m in lr_decay_iters}
print(f'grad-acc: {n_gradacc}')
print(f'iters_total: {iters_total}')
steps_cur = it // print_every_n

model.train()
opt.zero_grad(set_to_none=True)
while it <= iters_total:

    for inp_im, inp_label in train_loader:
        inp_im = inp_im.cuda()
        inp_label = inp_label[:, 0].cuda()

        # Only forward pass goes into the autocast context
        with torch.amp.autocast(dtype=torch.bfloat16, enabled=True, device_type='cuda'):
            segmentation_map, loss = model(inp_im, inp_label)

        # Divide by the accumulation factor so the summed gradients equal the
        # gradient of the mean loss over the effective batch.
        scaler.scale(loss / n_gradacc).backward()
        # Accumulate gradients
        if it % n_gradacc == 0:
            scaler.step(opt)
            scaler.update()
            opt.zero_grad(set_to_none=True)

        # Log the un-divided loss so curves are comparable across n_gradacc.
        lossi.append(loss.item())
        it += 1

        if scheduler.fl_warmup and it % warmup_step == 0:
            scheduler.step()
            print(f'lr: {opt.param_groups[0]["lr"]:.2e}')

        if it % print_every_n == 0:
            steps_cur += 1

            loss_avg = np.mean(lossi)
            losses.append(loss_avg)

            # NOTE(review): `train_loader` is iterated here while the outer
            # loop is still consuming it, and it was built with
            # persistent_workers=True - confirm this does not reset/cut short
            # the running epoch, or evaluate with a separate loader.
            model.eval()
            with torch.no_grad():
                miou_train = ut.get_CM_fromloader(train_loader, model, n_classes)[0]
                miou_val = ut.get_CM_fromloader(val_loader, model, n_classes)[0]
            model.train()

            print(
                f'it: {it}\tloss: {loss_avg:.4f}'
                f'\tmiou-train: {miou_train:.3f}',
                f'\tmiou-val: {miou_val:.3f}'
            )

            writer.add_scalars('objective', {'train': loss_avg}, steps_cur)
            writer.add_scalars('miou', {'train': miou_train}, steps_cur)
            writer.add_scalars('miou', {'val': miou_val}, steps_cur)


            lossi = []

            # Always refresh the "last" checkpoint.
            # NOTE(review): the restore cell reads a 'best_miou' entry, while
            # the current `miou_val` is what gets passed here - confirm which
            # value ut.save_ckpt is expected to store.
            ckpt_last_path = os.path.join(logdir, 'model.last.pth')
            ut.save_ckpt(ckpt_last_path, model, opt, miou_val, it)

            # Keep a separate copy whenever validation mIoU improves.
            if miou_val > miou_best:
                miou_best = miou_val
                ckpt_best_path = os.path.join(logdir, 'model.best.pth')
                ut.save_ckpt(ckpt_best_path, model, opt, miou_val, it)

        # Step decay, checked every iteration so it no longer depends on the
        # milestones being multiples of `print_every_n`.
        if it in lr_decay_iters:
            for pg in opt.param_groups:
                pg['lr'] *= .1

        if it > iters_total:
            break

lr: 1.00e-07
	miou-train: 0.01	miou-val: 0.01
grad-acc: 1
lr: 2.00e-07
lr: 3.00e-07
lr: 4.00e-07
lr: 5.00e-07
lr: 6.00e-07
lr: 6.99e-07
lr: 7.99e-07
lr: 8.99e-07
lr: 9.99e-07
lr: 1.10e-06
lr: 1.20e-06
lr: 1.30e-06
lr: 1.40e-06
lr: 1.50e-06
lr: 1.60e-06
lr: 1.70e-06
lr: 1.80e-06
lr: 1.90e-06
lr: 2.00e-06
lr: 2.10e-06
lr: 2.20e-06
lr: 2.30e-06
lr: 2.40e-06
lr: 2.50e-06
lr: 2.60e-06
lr: 2.70e-06
lr: 2.80e-06
lr: 2.90e-06
lr: 3.00e-06
lr: 3.10e-06
lr: 3.20e-06
lr: 3.30e-06
lr: 3.40e-06
lr: 3.50e-06
lr: 3.60e-06
lr: 3.70e-06
lr: 3.80e-06
lr: 3.90e-06
lr: 4.00e-06
lr: 4.10e-06
lr: 4.20e-06
lr: 4.30e-06
lr: 4.40e-06
lr: 4.50e-06
lr: 4.60e-06
lr: 4.70e-06
lr: 4.80e-06
lr: 4.90e-06
lr: 5.00e-06
lr: 5.10e-06
lr: 5.19e-06
lr: 5.29e-06
lr: 5.39e-06
lr: 5.49e-06
lr: 5.59e-06
lr: 5.69e-06
lr: 5.79e-06
lr: 5.89e-06
lr: 5.99e-06
lr: 6.09e-06
lr: 6.19e-06
lr: 6.29e-06
lr: 6.39e-06
lr: 6.49e-06
lr: 6.59e-06
lr: 6.69e-06
lr: 6.79e-06
lr: 6.89e-06
lr: 6.99e-06
lr: 7.09e-06
lr: 7.19e-06
lr: 7.29e-06
lr: 7.

In [11]:
%%notify -m "Training finished ðŸŽ‰"
print('oi')

UsageError: Cell magic `%%notify` not found.


In [None]:
# for pg in opt.param_groups:
#     print(pg['lr'])
#     pg['lr'] *= .1
#     print(pg['lr'])   

In [None]:
# Qualitative check of the trained model on the fixed dummy batch.
model.eval()
# Inference only: skip building the autograd graph.
with torch.no_grad():
    pred = model(dummy_im.cuda())[0].cpu()

ims = ut.Normalize.reverse(dummy_im)
# labels = colorizer(dummy_label)   # swap in to show the ground truth instead
labels = colorizer(pred)
alpha = .2
blend = (1-alpha)*ims + alpha*labels

# Stack image / prediction / blend along the height axis, then lay the batch
# samples side by side and move channels last for PIL.
preview = torch.concat([ims, labels, blend], axis=-2)
preview = preview.moveaxis(0,-2).flatten(-2,-1).permute(1,2,0)
preview = ut.float_to_uint8(preview.numpy())
Image.fromarray(preview)

# Evaluating model

In [None]:
# Build a fresh model with the same architecture / label set, then overwrite
# its weights with a fine-tuned checkpoint for evaluation.
m2f = ut.MyMask2Former.from_pretrained(
            "facebook/mask2former-swin-base-IN21k-ade-semantic",
            id2label=id2label,
            ignore_mismatched_sizes=True
)

m2f.eval().cuda();


modelpath = '../logs/20240702_1358-m2f-mocambav034+rtk-res1024/model.best.pth'
# map_location='cpu' avoids allocating a second copy of the weights on the
# GPU; load_state_dict copies them into the model's existing parameters.
# weights_only=False matches the resume snippet earlier in this notebook, which
# reads optimizer state and scalars from the same checkpoint format.
# SECURITY: this unpickles arbitrary objects - only load checkpoints you
# produced yourself.
ckpt = torch.load(modelpath, map_location='cpu', weights_only=False)
state_dict = ckpt['state']
m2f.load_state_dict(state_dict)

In [None]:
# Metrics of the loaded checkpoint on `val_loader`: the helper returns the
# mean IoU, the per-class IoU and a third value kept here as `CM_abs`.
miou, CM_iou, CM_abs = ut.get_CM_fromloader(val_loader, m2f, n_classes)
for metric in (miou, CM_iou):
    print(metric.round(2))
# v03.2
# 0.72
# [0.99  nan 0.92 0.23 0.3  0.88 0.9 0.6   nan 0.6  0.77 0.71 0.82 0.8 0.84 0.81 0.71]

# v03
# 0.68
# [0.98  nan 0.91 0.16 0.24 0.63 0.81 0.61  nan 0.49 0.76 0.73 0.78 0.79
#  0.86 0.79 0.58]

# v02
# 0.63
# [0.99  nan 0.91 0.22 0.18 0.61 0.66 0.62  nan 0.41 0.71 0.71 0.7  0.37
#  0.86 0.85 0.59]

# v01
# 0.69
# array([0.99,  nan, 0.95, 0.48, 0.49, 0.7 , 0.73,  nan, 0.43, 0.04, 0.56,
#        0.66, 0.88, 0.85, 0.55, 0.66, 0.97, 0.84, 0.67,  nan, 0.84, 0.77,
#        0.7 ])

In [None]:
# Legacy class names, kept for reference only. They do not line up with the
# current `id2label` mapping, so they must not be used to label `CM_iou`.
# mocamba_classnames = {
#     0: 'background', 1: 'thing-Animals', 2: 'surface-Asphalt', 3: 'sign-Cat-s-Eye', 4: 'damage-Cracks', 5: 'thing-Ego',
#     6: 'surface-Hard-Sand', 7: 'sign-Markings', 8: 'thing-Obstacle', 9: 'thing-People', 10: 'damage-Pothole', 11: 'thing-Retaining-wall',
#     12: 'surface-Soft-Sand', 13: 'surface-Unpaved', 14: 'thing-Vehicles', 15: 'sign-Vertical-Signs', 16: 'surface-Wet-sand'
# }


# Per-class IoU as percentages, one "name:<TAB>value" line per class.
# Uses `id2label` (the mapping the metrics were computed with) because
# `mocamba_classnames` only exists as a comment and raised NameError.
txt = ''.join(
    f'{v}:\t{round(p*100,2)}\n'
    for v, p in zip(id2label.values(), CM_iou)
)

print(txt)

In [None]:
# Qualitative inspection: image | ground truth | prediction | blend, for a few
# randomly drawn samples of the chosen annotation file.
# fpath = 'all-trainpaths-wider.txt'
# fpath = 'all-valpaths-wider.txt'
fpath = '../data/mocamba/ds-mocamba-v0.3.3-long1024/trainpaths.txt'


# Dedicated names so this cell no longer rebinds `val_ds` / `val_loader`
# (which would silently make the metrics cell evaluate on this shuffled list).
vis_ds = ut.SimpleDataset(annotation_file=fpath)
vis_loader = DataLoader(vis_ds, batch_size=1, shuffle=True)
m2f.eval()
alpha = .2
nplot = 10
pad_width = 1024  # every panel is right-padded to this width before tiling

plot_ims = []
# Inference only: skip building the autograd graph.
with torch.no_grad():
    for inp_im, inp_label in vis_loader:
        pred_m2f = m2f(inp_im.cuda())[0].cpu()
        im = ut.Normalize.reverse(inp_im)
        label = colorizer(inp_label)

        pred_m2f = colorizer(pred_m2f)
        blend = (1-alpha)*im + alpha*pred_m2f

        # Pad only the last (width) axis so all panels share the same width.
        panels = [im, label, pred_m2f, blend]
        panels = [F.pad(x, (0, pad_width - x.shape[-1])) for x in panels]

        # Tile the four panels horizontally and move channels last for PIL.
        row = torch.concat(panels, axis=-1)
        row = row.moveaxis(0,-2).flatten(-2,-1).permute(1,2,0)
        plot_ims.append(ut.float_to_uint8(row.numpy()))

        if len(plot_ims) >= nplot:
            break


# One row per sample, stacked vertically.
tmp = np.concatenate(plot_ims, axis=0)
Image.fromarray(tmp)