<a href="https://colab.research.google.com/github/vini-castro/sidewalk_depthanything/blob/main/sidealkcolab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import subprocess
import pkg_resources

def get_numpy_version():
    """Return the installed numpy version string, or None if numpy is absent.

    The lookup reads the installed distribution metadata, so numpy itself is
    never imported by this call.

    Returns:
        The version as a string (e.g. "1.23.1"), or None when no numpy
        distribution is installed.
    """
    # pkg_resources is deprecated by setuptools; importlib.metadata is the
    # standard-library replacement. Imported locally so this cell does not
    # depend on edits to the import lines above.
    from importlib import metadata

    try:
        return metadata.version("numpy")
    except metadata.PackageNotFoundError:
        return None

current_version = get_numpy_version()
print(current_version)

target_version = "1.23.1"

if current_version != target_version:
    print(f"Numpy versão {current_version} detectada. Atualizando para {target_version}...")

    !pip uninstall -y numpy
    !pip3 install mxnet-mkl==1.6.0 numpy==1.23.1

    print("Atualização concluída. Reinicie o ambiente")
else:
    print(f"Numpy já está na versão {target_version}. Nenhuma ação necessária.")


In [None]:
# Start from a clean checkout: remove any previous OneFormer directory,
# clone the OneFormer-Colab repository and make it the working directory.
%cd /content/
!rm -rf OneFormer/
!git clone https://github.com/SHI-Labs/OneFormer-Colab.git
# Rename the clone so the rest of the notebook can use /content/OneFormer.
! mv OneFormer-Colab OneFormer
%cd /content/OneFormer/

In [None]:
import sys, os, distutils.core
# Clone detectron2 and the sidewalk_depthanything helper repository into the
# current working directory.
!git clone 'https://github.com/facebookresearch/detectron2'
!git clone 'https://github.com/vini-castro/sidewalk_depthanything' -q
# Evaluate detectron2's setup.py only to obtain its install_requires list.
# NOTE(review): distutils was removed from the standard library in Python 3.12
# (PEP 632); this relies on the runtime still providing it — confirm.
dist = distutils.core.run_setup("./detectron2/setup.py")
# Install those requirements; each one is quoted so version specifiers
# survive the shell.
!python -m pip install {' '.join([f"'{x}'" for x in dist.install_requires])} --quiet
# Make the cloned detectron2 sources importable by putting them first on sys.path.
sys.path.insert(0, os.path.abspath('./detectron2'))

In [None]:
# Extra packages used by the notebook: OpenCV (upgraded to the latest),
# natten pinned to 0.14.6, everything listed in the requirements.txt of the
# current directory, plus ipython-autotime and imutils.
!pip3 install -U opencv-python --quiet
!pip3 install natten==0.14.6  --quiet

!pip3 install -r requirements.txt --quiet
!pip3 install ipython-autotime --quiet
!pip3 install imutils --quiet

In [None]:
# Set up the default detectron2 logger, then a second logger registered
# under the name "oneformer".
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()
setup_logger(name="oneformer")

# Import libraries
import numpy as np
import cv2
import torch
import math
from google.colab.patches import cv2_imshow
import imutils

# Import detectron2 utilities
from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.data import MetadataCatalog
from demo.defaults import DefaultPredictor
from demo.visualizer import Visualizer, ColorMode


# import OneFormer Project
from oneformer import (
    add_oneformer_config,
    add_common_config,
    add_swin_config,
    add_dinat_config,
    add_convnext_config,
)

In [None]:
# Device that prediction tensors are moved to before being handed to the
# visualizer.
cpu_device = torch.device("cpu")

# Dataset name -> relative path of the config file for the Swin-L backbone.
SWIN_CFG_DICT = {
    "cityscapes": "configs/cityscapes/oneformer_swin_large_IN21k_384_bs16_90k.yaml",
    "ade20k": "configs/ade20k/oneformer_swin_large_IN21k_384_bs16_160k.yaml",
}

# Dataset name -> relative path of the config file for the DiNAT-L backbone.
DINAT_CFG_DICT = {
    "cityscapes": "configs/cityscapes/oneformer_dinat_large_bs16_90k.yaml",
    "ade20k": "configs/ade20k/oneformer_dinat_large_IN21k_384_bs16_160k.yaml",
}

def setup_cfg(dataset, model_path, use_swin):
    """Build the frozen config used to construct the OneFormer predictor.

    Args:
        dataset: Key into SWIN_CFG_DICT / DINAT_CFG_DICT
            ("cityscapes" or "ade20k").
        model_path: Checkpoint path stored in cfg.MODEL.WEIGHTS.
        use_swin: Truthy selects the Swin config table, otherwise DiNAT.

    Returns:
        The config object, frozen, with MODEL.DEVICE set to 'cpu'.

    Raises:
        KeyError: If `dataset` is not a key of the selected table.
    """
    cfg = get_cfg()

    # Register every config extension, in this order, before merging the
    # YAML file.
    extensions = (
        add_deeplab_config,
        add_common_config,
        add_swin_config,
        add_dinat_config,
        add_convnext_config,
        add_oneformer_config,
    )
    for extend in extensions:
        extend(cfg)

    config_table = SWIN_CFG_DICT if use_swin else DINAT_CFG_DICT
    cfg.merge_from_file(config_table[dataset])

    cfg.MODEL.DEVICE = 'cpu'
    cfg.MODEL.WEIGHTS = model_path
    cfg.freeze()
    return cfg

def setup_modules(dataset, model_path, use_swin):
    """Create the predictor and the dataset metadata used for visualisation.

    Args:
        dataset: Dataset key forwarded to setup_cfg.
        model_path: Checkpoint path forwarded to setup_cfg.
        use_swin: Backbone switch forwarded to setup_cfg.

    Returns:
        A `(predictor, metadata)` tuple. `metadata` comes from
        MetadataCatalog for the first entry of cfg.DATASETS.TEST_PANOPTIC,
        or for the placeholder name "__unused" when that list is empty.
    """
    cfg = setup_cfg(dataset, model_path, use_swin)
    predictor = DefaultPredictor(cfg)

    panoptic_sets = cfg.DATASETS.TEST_PANOPTIC
    dataset_name = panoptic_sets[0] if len(panoptic_sets) else "__unused"
    metadata = MetadataCatalog.get(dataset_name)

    # Test the resolved name rather than indexing TEST_PANOPTIC again: the
    # second, unguarded [0] raised IndexError when the list was empty.
    if 'cityscapes_fine_sem_seg_val' in dataset_name:
        from cityscapesscripts.helpers.labels import labels
        # Keep the colour of every label whose trainId is not 255.
        stuff_colors = [k.color for k in labels if k.trainId != 255]
        metadata = metadata.set(stuff_colors=stuff_colors)

    return predictor, metadata

def panoptic_run(img, predictor, metadata):
    """Run panoptic inference on `img` and draw the result.

    Args:
        img: Image array; it is passed to the predictor as-is and to the
            visualizer with its last axis reversed.
        predictor: Callable invoked as `predictor(img, "panoptic")`.
        metadata: Dataset metadata handed to the Visualizer.

    Returns:
        Whatever `draw_panoptic_seg_predictions` returns (drawn at alpha 0.5).
    """
    canvas = Visualizer(img[:, :, ::-1], metadata=metadata, instance_mode=ColorMode.IMAGE)
    seg_map, seg_info = predictor(img, "panoptic")["panoptic_seg"]
    # The segmentation map is moved to the CPU before drawing.
    return canvas.draw_panoptic_seg_predictions(seg_map.to(cpu_device), seg_info, alpha=0.5)

def instance_run(img, predictor, metadata):
    """Run instance inference on `img` and draw the result.

    Args:
        img: Image array; it is passed to the predictor as-is and to the
            visualizer with its last axis reversed.
        predictor: Callable invoked as `predictor(img, "instance")`.
        metadata: Dataset metadata handed to the Visualizer.

    Returns:
        Whatever `draw_instance_predictions` returns (drawn at alpha 0.5).
    """
    canvas = Visualizer(img[:, :, ::-1], metadata=metadata, instance_mode=ColorMode.IMAGE)
    # The instances are moved to the CPU before drawing.
    detected = predictor(img, "instance")["instances"].to(cpu_device)
    return canvas.draw_instance_predictions(predictions=detected, alpha=0.5)

def semantic_run(img, predictor, metadata):
    """Run semantic inference on `img` and draw the result.

    Args:
        img: Image array; it is passed to the predictor as-is and to the
            visualizer with its last axis reversed.
        predictor: Callable invoked as `predictor(img, "semantic")`.
        metadata: Dataset metadata handed to the Visualizer.

    Returns:
        Whatever `draw_sem_seg` returns (drawn at alpha 0.5).
    """
    canvas = Visualizer(img[:, :, ::-1], metadata=metadata, instance_mode=ColorMode.IMAGE)
    scores = predictor(img, "semantic")["sem_seg"]
    # Collapse dim 0 with argmax, then move the result to the CPU for drawing.
    label_map = scores.argmax(dim=0).to(cpu_device)
    return canvas.draw_sem_seg(label_map, alpha=0.5)

def calcula_coeficiente(lat_orig, long_orig, lat_dest, long_dest):
    """Return a heading in degrees, within [0, 360), for one route segment.

    The initial bearing from the origin to the destination is computed from
    the two latitude/longitude pairs (given in degrees), and 90 degrees are
    then subtracted from it.

    Args:
        lat_orig: Origin latitude in degrees.
        long_orig: Origin longitude in degrees.
        lat_dest: Destination latitude in degrees.
        long_dest: Destination longitude in degrees.

    Returns:
        The bearing minus 90 degrees, wrapped into [0, 360).
    """
    phi_a, lam_a, phi_b, lam_b = map(
        math.radians, (lat_orig, long_orig, lat_dest, long_dest)
    )
    d_lam = lam_b - lam_a

    east = math.cos(phi_b) * math.sin(d_lam)
    north = math.cos(phi_a) * math.sin(phi_b) - math.sin(phi_a) * math.cos(phi_b) * math.cos(d_lam)

    forward_deg = math.degrees(math.atan2(east, north))

    # Subtract 90 degrees so the view faces the sidewalk head-on.
    return (forward_deg + 360 - 90) % 360

# Dispatch table: task name -> function that runs that task on an image and
# returns the drawn result.
TASK_INFER = {
    "panoptic": panoptic_run,
    "instance": instance_run,
    "semantic": semantic_run,
}

In [None]:
######
#@markdown We use `DiNAT-L` as the default backbone. To use Swin-L as backbone, select the checkbox below.
# NOTE(review): the value below is True, so Swin-L is what actually gets
# selected by default, despite the form text above naming DiNAT-L as default.
use_swin = True #@param {type: 'boolean'}

In [None]:
import os
import subprocess
# Pick the ADE20K checkpoint matching the selected backbone, download it if
# it is not already present, then build the predictor and metadata.
if use_swin:
    checkpoint = "250_16_swin_l_oneformer_ade20k_160k.pth"
else:
    checkpoint = "250_16_dinat_l_oneformer_ade20k_160k.pth"

if not os.path.exists(checkpoint):
    checkpoint_url = "https://shi-labs.com/projects/oneformer/ade20k/" + checkpoint
    # Download under a temporary name and rename only on success, so an
    # interrupted download can never be mistaken for a complete checkpoint
    # by the exists-check on the next run.
    partial_download = checkpoint + ".part"
    # Argument list instead of a shell string; check=True surfaces a failed
    # download here rather than as a confusing weight-loading error later.
    subprocess.run(["wget", "-O", partial_download, checkpoint_url], check=True)
    os.replace(partial_download, checkpoint)

predictor, metadata = setup_modules("ade20k", checkpoint, use_swin)

In [None]:
# Move the torchhub directory shipped with sidewalk_depthanything up into
# /content/OneFormer/.
!mv /content/OneFormer/sidewalk_depthanything/torchhub /content/OneFormer/

In [None]:
# List the current directory to check the result of the previous steps.
!ls

In [None]:
# If you do not want to use the Google Maps API, list the images you want to
# process below, or use the examples in
# OneFormer/sidewalk_depthanything/assets/examples.
# If you do want to use the Google Maps API, a valid API_KEY is required.

API_KEY = ''
img_path_list = ['./sidewalk_depthanything/assets/examples/passo1.png', './sidewalk_depthanything/assets/examples/passo2.png', './sidewalk_depthanything/assets/examples/passo3.png']

In [None]:
# import requests
# from PIL import Image
# from io import BytesIO
# import matplotlib.pyplot as plt

# origem = 'R. Antônio de Macedo Soares, 1399'
# destino = 'R. Vieira de Morais, 754'

# geocode_url_origem = f'https://maps.googleapis.com/maps/api/geocode/json?address={origem}&key={API_KEY}'
# geocode_url_destino = f'https://maps.googleapis.com/maps/api/geocode/json?address={destino}&key={API_KEY}'
# geocode_response_origem = requests.get(geocode_url_origem)
# geocode_response_destino = requests.get(geocode_url_destino)

# if geocode_response_origem.status_code == 200 and geocode_response_destino.status_code == 200:
#     geocode_data_origem = geocode_response_origem.json()
#     geocode_data_destino = geocode_response_destino.json()

#     if geocode_data_origem['status'] == 'OK' and geocode_data_destino['status'] == 'OK':

#         latitude_origem = geocode_data_origem['results'][0]['geometry']['location']['lat']
#         longitude_origem = geocode_data_origem['results'][0]['geometry']['location']['lng']

#         latitude_destino = geocode_data_destino['results'][0]['geometry']['location']['lat']
#         longitude_destino = geocode_data_destino['results'][0]['geometry']['location']['lng']

#         print(f"Origem (Coordenadas): {latitude_origem}, {longitude_origem}")
#         print(f"Destino (Coordenadas): {latitude_destino}, {longitude_destino}")

#         directions_url = f'https://maps.googleapis.com/maps/api/directions/json?origin={latitude_origem},{longitude_origem}&destination={latitude_destino},{longitude_destino}&key={API_KEY}'
#         directions_response = requests.get(directions_url)

#         if directions_response.status_code == 200:
#             directions_data = directions_response.json()

#             if directions_data['status'] == 'OK':

#                 polyline = directions_data['routes'][0]['overview_polyline']['points']

#                 size = '600x600'
#                 map_type = 'roadmap'
#                 color = '0x0000FF'
#                 weight = '5'

#                 static_map_url = f'https://maps.googleapis.com/maps/api/staticmap?size={size}&maptype={map_type}&path=color:{color}|weight:{weight}|enc:{polyline}&key={API_KEY}'

#                 response = requests.get(static_map_url)

#                 if response.status_code == 200:
#                     image = Image.open(BytesIO(response.content))
#                     plt.imshow(image)
#                     plt.axis('off')
#                     plt.show()
#                 else:
#                     print('Erro ao obter a imagem do mapa:', response.status_code)
#             else:
#                 print('Erro ao obter as direções:', directions_data['status'])
#         else:
#             print('Erro ao obter a rota:', directions_response.status_code)
#     else:
#         print('Erro na geocodificação:', geocode_data_origem['status'], geocode_data_destino['status'])
# else:
#     print('Erro ao obter a geocodificação:', geocode_response_origem.status_code, geocode_response_destino.status_code)


In [None]:
# import requests
# from PIL import Image
# from io import BytesIO
# import matplotlib.pyplot as plt

# origem = 'R. Antônio de Macedo Soares, 1399'
# destino = 'R. Vieira de Morais, 754'

# geocode_url_origem = f'https://maps.googleapis.com/maps/api/geocode/json?address={origem}&key={API_KEY}'
# geocode_url_destino = f'https://maps.googleapis.com/maps/api/geocode/json?address={destino}&key={API_KEY}'

# geocode_response_origem = requests.get(geocode_url_origem)
# geocode_response_destino = requests.get(geocode_url_destino)

# img_path_list = []
# if geocode_response_origem.status_code == 200 and geocode_response_destino.status_code == 200:
#     geocode_data_origem = geocode_response_origem.json()
#     geocode_data_destino = geocode_response_destino.json()

#     if geocode_data_origem['status'] == 'OK' and geocode_data_destino['status'] == 'OK':
#         latitude_origem = geocode_data_origem['results'][0]['geometry']['location']['lat']
#         longitude_origem = geocode_data_origem['results'][0]['geometry']['location']['lng']

#         latitude_destino = geocode_data_destino['results'][0]['geometry']['location']['lat']
#         longitude_destino = geocode_data_destino['results'][0]['geometry']['location']['lng']

#         directions_url = f'https://maps.googleapis.com/maps/api/directions/json?origin={latitude_origem},{longitude_origem}&destination={latitude_destino},{longitude_destino}&key={API_KEY}'
#         directions_response = requests.get(directions_url)

#         if directions_response.status_code == 200:
#             directions_data = directions_response.json()

#             if directions_data['status'] == 'OK':
#                 steps = directions_data['routes'][0]['legs'][0]['steps']

#                 for i in range(len(steps)-1):
#                     start_lat = steps[i]['start_location']['lat']
#                     start_lng = steps[i]['start_location']['lng']
#                     end_lat = steps[i+1]['start_location']['lat']
#                     end_lng = steps[i+1]['start_location']['lng']

#                     middle_lat = (start_lat + end_lat) / 2
#                     middle_lng = (start_lng + end_lng) / 2

#                     heading = calcula_coeficiente(start_lat, start_lng, end_lat, end_lng)
#                     pitch = -10

#                     street_view_url = f'https://maps.googleapis.com/maps/api/streetview?size=600x600&location={middle_lat},{middle_lng}&heading={heading}&pitch={pitch}&key={API_KEY}'

#                     response = requests.get(street_view_url)

#                     if response.status_code == 200:
#                         print(f"Mostrando imagem para o passo {i+1}")
#                         img_step = Image.open(BytesIO(response.content))
#                         img_step.save(f'passo{i+1}.png')
#                         img_path_list.append(f'passo{i+1}.png')

#                         plt.imshow(img_step)
#                         plt.axis('off')
#                         plt.show()
#                     else:
#                         print(f'Erro ao obter a imagem do passo {i+1}:', response.status_code)
#             else:
#                 print('Erro ao obter as direções:', directions_data['status'])
#         else:
#             print('Erro ao obter a rota:', directions_response.status_code)
#     else:
#         print('Erro na geocodificação:', geocode_data_origem['status'], geocode_data_destino['status'])
# else:
#     print('Erro ao obter a geocodificação:', geocode_response_origem.status_code, geocode_response_destino.status_code)


In [None]:
# Task whose drawing function (from TASK_INFER) is used when showing the image.
task = "panoptic"

# category_id looked up among the panoptic segments.
# Presumably the ADE20K "sidewalk" class — TODO confirm against the dataset metadata.
SIDEWALK_CATEGORY_ID = 11


def execute_prediction(img_path, show_img):
    """Run panoptic segmentation on one image and locate the sidewalk segment.

    Args:
        img_path: Path of the image to load; it is resized to 600x600.
        show_img: When truthy, the drawn prediction is also displayed.

    Returns:
        A `(predictions, id_calcada)` tuple: the raw predictor output and the
        `id` of the first segment whose `category_id` equals
        SIDEWALK_CATEGORY_ID, or -1 when no such segment exists.

    Raises:
        FileNotFoundError: If the image is missing or cannot be decoded.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None; without this check
        # the error would surface as an opaque failure inside cv2.resize.
        raise FileNotFoundError(f"Could not read image (missing or undecodable): {img_path}")
    img = cv2.resize(img, (600, 600))
    predictions = predictor(img, "panoptic")

    segments_info = predictions["panoptic_seg"][1]
    id_calcada = next(
        (seg['id'] for seg in segments_info if seg['category_id'] == SIDEWALK_CATEGORY_ID),
        -1,
    )

    if show_img:
        # NOTE(review): pixel values are clamped to [1, 254] before drawing;
        # the reason is not visible in this file.
        img = np.clip(img, 1, 254)
        # The drawing function calls the predictor again on the clamped image.
        out = TASK_INFER[task](img, predictor, metadata).get_image()
        cv2_imshow(out[:, :, ::-1])
    return predictions, id_calcada

In [None]:
import argparse
import cv2
import numpy as np
import os
import torch
import torch.nn.functional as F

from sidewalk_depthanything.pixel_size import pixel_sum
from torchvision.transforms import Compose
from tqdm import tqdm
from sidewalk_depthanything.depth_anything.dpt import DepthAnything
from sidewalk_depthanything.depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet

import sys
def strip_kernel_flag(argv, flag='-f'):
    """Return a copy of `argv` without `flag` and the value that follows it.

    Every occurrence of `flag` is dropped together with the single argument
    after it. A trailing `flag` with no value is dropped on its own.

    Args:
        argv: Sequence of command-line arguments.
        flag: The option to remove (default '-f').

    Returns:
        A new list; `argv` itself is not modified.
    """
    cleaned = []
    remaining = iter(argv)
    for arg in remaining:
        if arg == flag:
            # Consume the flag's value too; the default avoids an error when
            # the flag is the very last argument.
            next(remaining, None)
        else:
            cleaned.append(arg)
    return cleaned


# Notebook kernels are started with "-f <connection file>", which argparse
# below would reject. The previous loop popped from sys.argv while iterating
# over it and raised IndexError when "-f" was the last argument.
sys.argv[:] = strip_kernel_flag(sys.argv)

# Options for the depth-estimation step. Abbreviated option names are
# disabled; with no flags given, the defaults below apply.
parser = argparse.ArgumentParser(allow_abbrev=False)
parser.add_argument(
    '--outdir',
    default='depth_vis',
    type=str,
)
parser.add_argument(
    '--encoder',
    choices=['vits', 'vitb', 'vitl'],
    default='vitl',
    type=str,
)
parser.add_argument(
    '--pred-only',
    action='store_true',
    dest='pred_only',
    help='only display the prediction',
)
parser.add_argument(
    '--grayscale',
    action='store_true',
    dest='grayscale',
    help='do not apply colorful palette',
)
args = parser.parse_args()

# Layout of the side-by-side output image: width of the white strip between
# the raw image and the depth map, and height of the caption band on top.
margin_width = 50
caption_height = 60

# Font settings for the captions drawn on that band.
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 1
font_thickness = 2

# Run the depth model on the GPU when one is available, otherwise on the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

print(args.encoder)
# Load the pretrained Depth Anything weights for the chosen encoder, move the
# model to DEVICE and switch it to eval mode.
depth_anything = DepthAnything.from_pretrained('LiheYoung/depth_anything_{}14'.format(args.encoder)).to(DEVICE).eval()

# Report the model size in millions of parameters.
total_params = sum(param.numel() for param in depth_anything.parameters())
print('Total parameters: {:.2f}M'.format(total_params / 1e6))

# Preprocessing pipeline applied to each {'image': ...} dict before it is fed
# to the depth model: resize, normalise, then PrepareForNet.
transform = Compose([
  # Resize towards 518x518 keeping the aspect ratio, using the 'lower_bound'
  # method and sizes that are multiples of 14, with cubic interpolation.
  Resize(
      width=518,
      height=518,
      resize_target=False,
      keep_aspect_ratio=True,
      ensure_multiple_of=14,
      resize_method='lower_bound',
      image_interpolation_method=cv2.INTER_CUBIC,
  ),
  # Per-channel mean/std normalisation (the commonly used ImageNet statistics).
  NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
  PrepareForNet(),
])

def process_image(img_path):
    """Estimate depth for one or more images and return a per-pixel size map.

    `img_path` may be:
      * an image file;
      * a file whose name ends in ``txt``, listing one image path per line;
      * a directory, whose non-hidden entries are processed in sorted order.

    For every image a visualisation is written to ``args.outdir``: either the
    depth map alone (``--pred-only``) or the raw image next to the depth map
    with captions.

    Args:
        img_path: Path to an image, a list file, or a directory.

    Returns:
        A torch tensor on the CPU holding ``pixel_size / depth_norm`` for each
        pixel of the 600x600 resized image. When several images are
        processed, only the map of the LAST one is returned.

    Raises:
        ValueError: If `img_path` resolves to no files at all.
        FileNotFoundError: If an image is missing or cannot be decoded.
    """
    if os.path.isfile(img_path):
        if img_path.endswith('txt'):
            with open(img_path, 'r') as f:
                filenames = f.read().splitlines()
        else:
            filenames = [img_path]
    else:
        filenames = sorted(
            os.path.join(img_path, name)
            for name in os.listdir(img_path)
            if not name.startswith('.')
        )

    if not filenames:
        # Previously this fell through to the final return and failed with an
        # UnboundLocalError.
        raise ValueError(f"No images to process in: {img_path}")

    os.makedirs(args.outdir, exist_ok=True)

    # Global NumPy print setting, unchanged from the original code; hoisted
    # out of the loop because it does not depend on the image.
    np.set_printoptions(threshold=np.inf, linewidth=np.inf)

    for filename in tqdm(filenames):
        raw_image = cv2.imread(filename)
        if raw_image is None:
            # cv2.imread returns None instead of raising on failure.
            raise FileNotFoundError(f"Could not read image (missing or undecodable): {filename}")
        raw_image = cv2.resize(raw_image, (600, 600))
        image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0

        h, w = image.shape[:2]

        image = transform({'image': image})['image']
        image = torch.from_numpy(image).unsqueeze(0).to(DEVICE)

        with torch.no_grad():
            depth = depth_anything(image)

        # Resize the prediction back to the image size, then rescale it to
        # the 0..255 range.
        depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]
        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
        # NOTE(review): depth_norm is 0 where the rescaled depth equals 1 and
        # slightly negative below that, so the divisions below can produce
        # inf or negative entries — confirm this is intended.
        depth_norm = (depth - 1) / 254
        depth = depth.cpu().numpy().astype(np.uint8)

        # Size estimate derived from two vertically adjacent pixels near the
        # bottom centre of the image.
        # NOTE(review): 175 looks like an empirical calibration constant — confirm.
        value_bottom = depth_norm[h-4, w//2]
        value_above = depth_norm[h-5, w//2]
        pixel_size = 175/(value_above**2) - 175/(value_bottom**2)
        print(f'pixel_size = {pixel_size}')

        object_width_cm = pixel_size / depth_norm

        if args.grayscale:
            depth = np.repeat(depth[..., np.newaxis], 3, axis=-1)
        else:
            depth = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)

        # splitext keeps names without an extension intact; slicing at
        # rfind('.') cut off their last character.
        stem = os.path.splitext(os.path.basename(filename))[0]

        if args.pred_only:
            cv2.imwrite(os.path.join(args.outdir, stem + '_depth.png'), depth)
        else:
            split_region = np.ones((raw_image.shape[0], margin_width, 3), dtype=np.uint8) * 255
            combined_results = cv2.hconcat([raw_image, split_region, depth])

            caption_space = np.ones((caption_height, combined_results.shape[1], 3), dtype=np.uint8) * 255
            captions = ['Raw image', 'Depth Anything']
            segment_width = w + margin_width

            for i, caption in enumerate(captions):
                text_size = cv2.getTextSize(caption, font, font_scale, font_thickness)[0]

                # Centre each caption above its own panel.
                text_x = int((segment_width * i) + (w - text_size[0]) / 2)

                cv2.putText(caption_space, caption, (text_x, 40), font, font_scale, (0, 0, 0), font_thickness)

            final_result = cv2.vconcat([caption_space, combined_results])

            cv2.imwrite(os.path.join(args.outdir, stem + '_img_depth.png'), final_result)

    # Hand the map back on the CPU so NumPy-based consumers work regardless
    # of DEVICE (NumPy cannot convert CUDA tensors). This is a no-op when the
    # tensor is already on the CPU.
    return object_width_cm.cpu()


In [None]:
# Use this method when the sidewalk runs vertically in the image.
import numpy as np

count = 0
for count, img in enumerate(img_path_list, start=1):
    object_width_cm = process_image(img)
    # Pass False to skip displaying the image and reduce the chance of errors.
    panoptic_seg, id_calcada = execute_prediction(img, True)
    panoptic_seg = panoptic_seg["panoptic_seg"][0]

    n_rows, n_cols = panoptic_seg.shape[0], panoptic_seg.shape[1]
    # Only the lower two thirds of the image are scanned.
    first_row = int(n_rows / 3)
    distancias_verticais = []

    # For each image column, add up the per-pixel sizes of the sidewalk
    # pixels; columns without any sidewalk pixel are left out.
    for j in range(n_cols):
        coluna = panoptic_seg[:, j]

        soma = 0
        for i in range(first_row, n_rows):
            if coluna[i] == id_calcada:
                soma += object_width_cm[i][j]
        if soma != 0:
            distancias_verticais.append(soma)

    # np.mean of an empty list emits a RuntimeWarning; report nan explicitly
    # when no sidewalk was found.
    media = np.mean(distancias_verticais) if distancias_verticais else float('nan')
    print(f'Media da calçada da img {count}: {media}')

In [None]:
# Use this method when the sidewalk runs parallel to the viewing direction
# (one sidewalk on each side of the image).
def somas_por_linha(seg, larguras, alvo, linhas, colunas):
    """Sum the per-pixel sizes of the target segment along each image row.

    Args:
        seg: 2-D segmentation map indexed as seg[row, col].
        larguras: Per-pixel size map indexed as larguras[row][col].
        alvo: Segment id to look for.
        linhas: Row indices to scan.
        colunas: Column indices to scan within each row.

    Returns:
        A list with one sum per row that contains at least one matching
        pixel (rows whose sum is 0 are left out).
    """
    somas = []
    for j in linhas:
        linha = seg[j, :]
        soma = 0
        for i in colunas:
            if linha[i] == alvo:
                soma += larguras[j][i]
        if soma != 0:
            somas.append(soma)
    return somas


count = 0
for count, img in enumerate(img_path_list, start=1):
    object_width_cm = process_image(img)
    panoptic_seg, id_calcada = execute_prediction(img, True)
    panoptic_seg = panoptic_seg["panoptic_seg"][0]

    n_rows, n_cols = panoptic_seg.shape[0], panoptic_seg.shape[1]
    meio = n_cols // 2
    # Only the lower two thirds of the image are scanned.
    linhas = range(int(n_rows / 3), n_rows)

    # Right half of the image.
    distancias_direita = somas_por_linha(
        panoptic_seg, object_width_cm, id_calcada, linhas, range(meio, n_cols)
    )
    # np.mean of an empty list emits a RuntimeWarning; report nan explicitly.
    media_direita = np.mean(distancias_direita) if distancias_direita else float('nan')
    print(f'Media da direita da imagem {count}: {media_direita}')

    # Left half of the image.
    distancias_esquerda = somas_por_linha(
        panoptic_seg, object_width_cm, id_calcada, linhas, range(0, meio)
    )
    media_esquerda = np.mean(distancias_esquerda) if distancias_esquerda else float('nan')
    print(f'Media da esquerda da imagem {count}: {media_esquerda}')

In [None]:
# Depth example: display the side-by-side image written by process_image for
# the first example picture.
img_depth = cv2.imread('./depth_vis/passo1_img_depth.png')
cv2_imshow(img_depth)