# MiDaS Depth Estimation Pipeline

In [1]:
!pip install timm



 # Load the Model:

In [1]:
import torch

# Which MiDaS checkpoint to pull from torch.hub; the other variants tried in
# this notebook are "MiDaS_small" and "DPT_Hybrid".
model_type = "DPT_Large"
model = torch.hub.load("intel-isl/MiDaS", model_type)

# Run on the GPU when CUDA is present, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Module.to() and Module.eval() both return the module itself, so chaining them
# leaves `model` bound to the same object and keeps the model as the cell's
# final expression (its repr is still echoed as the cell output).
model.to(device).eval()


Using cache found in /Users/jerrycheng/.cache/torch/hub/intel-isl_MiDaS_master


DPTDepthModel(
  (pretrained): Module(
    (model): VisionTransformer(
      (patch_embed): PatchEmbed(
        (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))
        (norm): Identity()
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (patch_drop): Identity()
      (norm_pre): Identity()
      (blocks): Sequential(
        (0): Block(
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (attn): Attention(
            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
            (q_norm): Identity()
            (k_norm): Identity()
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=1024, out_features=1024, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
          )
          (ls1): Identity()
          (drop_path1): Identity()
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Linear(in_featur

# Preprocess the Input:

In [2]:
import cv2
import numpy as np

def preprocess_image(image_path):
    """Read an image file and turn it into an input tensor for the model.

    The image is converted from OpenCV's BGR order to RGB, resized to
    384x384, moved to channel-first layout, scaled to the 0-1 range and given
    a leading batch dimension before being moved to the global ``device``.

    Args:
        image_path: Path of the image file to read with ``cv2.imread``.

    Returns:
        A float tensor of shape (1, 3, 384, 384) on ``device``.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None;
        # fail here with a clear message instead of inside cvtColor.
        raise FileNotFoundError(f"Could not read image: {image_path!r}")

    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (384, 384))
    # HWC -> CHW, the layout expected by the network.
    img = np.transpose(img, (2, 0, 1))
    # NOTE(review): only a 0-1 rescale is applied here; confirm whether the
    # model's official hub transform also expects mean/std normalisation.
    img = torch.from_numpy(img).float().div(255).unsqueeze(0)
    return img.to(device)


# Predict Depth:

In [3]:
def predict_depth(img_tensor):
    """Run the global ``model`` on ``img_tensor`` and return the result as NumPy.

    Inference happens with gradient tracking disabled. Size-1 dimensions are
    squeezed out of the prediction, which is then copied to the CPU.

    Args:
        img_tensor: Input tensor as produced by ``preprocess_image``.

    Returns:
        The squeezed model output as a NumPy array.
    """
    with torch.no_grad():
        prediction = model(img_tensor)
    return prediction.squeeze().cpu().numpy()


# Post-process & Display:

In [18]:
import cv2
import numpy as np

# def display_depth(depth):
#     # Normalize the depth values between 0 and 255
#     normalized_depth = ((depth - depth.min()) / (depth.max() - depth.min())) * 255.0
#     # Convert to uint8 type
#     depth_uint8 = np.uint8(normalized_depth)
#     # Invert the colormap so that closest points are white and furthest are dark
#     inverted_depth_colormap = cv2.bitwise_not(depth_uint8)
    
#     cv2.imshow('Depth Map', inverted_depth_colormap)
#     cv2.waitKey(0)
#     cv2.destroyAllWindows()

import numpy as np
import cv2
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import plotly.graph_objs as go

def display_depth_with_plotly(depth):
    """Show a depth map as an interactive Plotly 3D scatter plot.

    The map is min-max scaled to 0-255, cast to uint8, bitwise-inverted,
    flipped vertically, sub-sampled every ``STEP`` pixels and plotted with the
    resulting value used both as height (z) and as marker colour.

    Args:
        depth: NumPy array of depth values indexed as [row, column]
            (e.g. the output of ``predict_depth``).

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Normalize the depth values between 0 and 255. A constant (or all-NaN)
    # map has no usable range, so map it to zeros instead of dividing by zero.
    depth_min = depth.min()
    depth_range = depth.max() - depth_min
    if depth_range > 0:
        normalized_depth = ((depth - depth_min) / depth_range) * 255.0
    else:
        normalized_depth = np.zeros_like(depth)

    # Convert to uint8 type
    depth_uint8 = np.uint8(normalized_depth)
    # Invert the values (255 - v) as in the original grey-scale rendering.
    inverted_depth_colormap = cv2.bitwise_not(depth_uint8)

    # Flip vertically so image row 0 ends up at the largest y value.
    depth_vis = np.flipud(inverted_depth_colormap)

    # Create the x, y grid. The z value will be the depth
    x, y = np.meshgrid(np.arange(depth_vis.shape[1]), np.arange(depth_vis.shape[0]))
    z = depth_vis

    # Downsample the depth map for quicker rendering
    STEP = 3
    x = x[::STEP, ::STEP]
    y = y[::STEP, ::STEP]
    z = z[::STEP, ::STEP]

    # Flatten once and reuse the same values for height and colour.
    z_flat = z.flatten()

    # Create a 3D scatter plot
    trace = go.Scatter3d(
        x=x.flatten(),
        y=y.flatten(),
        z=z_flat,
        mode='markers',
        marker=dict(
            size=2,
            color=z_flat,           # colour each marker by its z value
            colorscale='Viridis',   # choose a colorscale
            opacity=0.8
        )
    )

    # Define the layout of the plot
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0, t=0),
        scene=dict(
            xaxis=dict(title='X Axis'),
            yaxis=dict(title='Y Axis'),
            zaxis=dict(title='Depth'),
            aspectmode='data'
        )
    )

    # Create the figure with the trace and layout
    fig = go.Figure(data=[trace], layout=layout)

    # Show the plot
    fig.show()

# Example usage:
# Assuming 'depth_data' is the depth information you have
# display_depth_with_plotly(depth_data)




# Depth without Normalization

In [39]:
import numpy as np
import cv2
import plotly.graph_objs as go

def display_true_depth(depth):
    """Show the raw (un-normalised) depth map as a Plotly 3D scatter plot.

    The depth values are flipped vertically, multiplied by 10, sub-sampled
    every ``STEP`` pixels and used both as height (z) and as marker colour.

    Args:
        depth: NumPy array of depth values indexed as [row, column]
            (e.g. the output of ``predict_depth``).

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Flip vertically so image row 0 ends up at the largest y value. The
    # previous version computed a flipped uint8 copy but only used its shape,
    # so the plotted values were never actually flipped.
    z = np.flipud(depth) * 10  # Use the raw depth for the z values

    # Create the x, y grid matching the depth map's rows and columns.
    x, y = np.meshgrid(np.arange(depth.shape[1]), np.arange(depth.shape[0]))

    # Downsample the depth map for quicker rendering
    STEP = 3
    x = x[::STEP, ::STEP]
    y = y[::STEP, ::STEP]
    z = z[::STEP, ::STEP]

    # Flatten once and reuse the same values for height and colour.
    z_flat = z.flatten()

    # Create a 3D scatter plot
    trace = go.Scatter3d(
        x=x.flatten(),
        y=y.flatten(),
        z=z_flat,
        mode='markers',
        marker=dict(
            size=2,
            color=z_flat,          # colour each marker by its z value
            colorscale='Viridis',  # choose a colorscale
            opacity=0.8
        )
    )

    # Define the layout of the plot
    layout = go.Layout(
        margin=dict(l=0, r=0, b=0, t=0),
        scene=dict(
            xaxis=dict(title='X Axis'),
            yaxis=dict(title='Y Axis'),
            zaxis=dict(title='Depth'),
            aspectmode='data'
        )
    )

    # Create the figure with the trace and layout
    fig = go.Figure(data=[trace], layout=layout)

    # Show the plot
    fig.show()

# Example usage:
# Assuming 'depth_data' is the depth information you have
# display_true_depth(depth_data)


# Pipeline for Image:

In [40]:
def depth_estimation_image(image_path):
    """Run the single-image pipeline: preprocess, predict depth, then plot.

    Args:
        image_path: Path of the image file handed to ``preprocess_image``.

    Returns:
        None. The depth map is shown by ``display_true_depth``.
    """
    display_true_depth(predict_depth(preprocess_image(image_path)))


# Pipeline for Video:

In [37]:
import cv2
import numpy as np
import torch

def display_depth_video(video_path):
    """Estimate depth for every frame of a video, show it and save it.

    Each frame is run through a freshly loaded "DPT_Large" MiDaS model. The
    prediction is min-max scaled to 0-255, inverted, resized to the source
    resolution, written to ``depth_output.mp4`` and shown in an OpenCV window.
    Pressing ``q`` in that window stops processing early.

    Args:
        video_path: Path of the input video passed to ``cv2.VideoCapture``.

    Returns:
        None.

    Raises:
        IOError: If the video cannot be opened.
    """
    # Open the input first so a bad path fails before the model is loaded.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {video_path!r}")

    out = None
    try:
        # Load the model
        model_type = "DPT_Large"  # or "DPT_Hybrid"
        model = torch.hub.load("intel-isl/MiDaS", model_type)
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        model.to(device)
        model.eval()

        # Preprocess the frame: BGR -> RGB, 384x384, CHW, 0-1 floats, batch dim.
        def preprocess_image(frame):
            img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (384, 384))
            img = np.transpose(img, (2, 0, 1))
            img = torch.from_numpy(img).float().div(255).unsqueeze(0)
            return img.to(device)

        # Predict depth without tracking gradients and return a NumPy array.
        def predict_depth(img_tensor):
            with torch.no_grad():
                depth = model(img_tensor)
            depth = depth.squeeze().cpu().numpy()
            return depth

        # Get video properties
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Define the codec and create VideoWriter object
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter('depth_output.mp4', fourcc, fps, (width, height), isColor=True)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            img_tensor = preprocess_image(frame)
            depth = predict_depth(img_tensor)

            # Normalize the depth values between 0 and 255. A constant frame
            # has no range; map it to zeros instead of dividing by zero.
            depth_min = depth.min()
            depth_range = depth.max() - depth_min
            if depth_range > 0:
                normalized_depth = ((depth - depth_min) / depth_range) * 255.0
            else:
                normalized_depth = np.zeros_like(depth)
            depth_uint8 = np.uint8(normalized_depth)
            inverted_depth_colormap = cv2.bitwise_not(depth_uint8)

            # Resize the depth map to original video resolution
            resized_depth_colormap = cv2.resize(inverted_depth_colormap, (width, height), interpolation=cv2.INTER_LINEAR)

            # Convert grayscale to BGR for saving as a video
            bgr_depth_colormap = cv2.cvtColor(resized_depth_colormap, cv2.COLOR_GRAY2BGR)

            # Write the frame to the output video
            out.write(bgr_depth_colormap)

            cv2.imshow('Depth Video', resized_depth_colormap)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the capture, the writer and any window, even when a
        # frame fails part-way through.
        cap.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

# Usage
# display_depth_video("path_to_video.mp4")


# Usage:

In [47]:
# For Image
# Runs preprocess_image -> predict_depth -> display_true_depth on this file
# (the path is relative to the notebook's working directory).
depth_estimation_image("data_set/my_dataset/photos/stair_3.png")


In [None]:
# For Image
# depth_estimation_image("data_set/NYU Depth/nyu2_test/00000_colors.png")

# For Video
# display_depth_video("data_set/my_dataset/videos/recorded-50521047003.mp4")
 


# Test on the EGO4D Dataset

In [None]:
model_type = "DPT_Large"  # or "DPT_Hybrid"

# Pick the execution device: GPU when CUDA is present, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Module.to() and Module.eval() return the module itself, so the chained call
# binds `model` to the same hub-loaded object, moved to `device` in eval mode.
model = torch.hub.load("intel-isl/MiDaS", model_type).to(device).eval()

# Preprocess the frame
def preprocess_image(frame):
    """Convert a BGR frame (or an image path) into an input tensor for the model.

    This definition replaces the path-based ``preprocess_image`` defined
    earlier in the notebook. To keep callers that pass a file path working
    after this cell has run, a string argument is read with ``cv2.imread``.

    Args:
        frame: BGR image array as returned by OpenCV, or a string path to an
            image file.

    Returns:
        A float tensor of shape (1, 3, 384, 384) on the global ``device``.

    Raises:
        FileNotFoundError: If ``frame`` is a path that OpenCV cannot read.
    """
    if isinstance(frame, str):
        image_path = frame
        frame = cv2.imread(image_path)
        if frame is None:
            # cv2.imread returns None for missing or undecodable files.
            raise FileNotFoundError(f"Could not read image: {image_path!r}")

    img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (384, 384))
    # HWC -> CHW, then scale to 0-1 and add the batch dimension.
    img = np.transpose(img, (2, 0, 1))
    img = torch.from_numpy(img).float().div(255).unsqueeze(0)
    return img.to(device)


Using cache found in /Users/jerrycheng/.cache/torch/hub/intel-isl_MiDaS_master


In [None]:
# Helper Functions:

## Loading the progress bar: 
def display_progress_bar(current, total, bar_length=50):
    """Print a one-line text progress bar that overwrites itself.

    The line starts with a carriage return and has no trailing newline, so
    repeated calls redraw the same console line.

    Args:
        current: Number of frames processed so far.
        total: Total number of frames. Values <= 0 are drawn as an empty bar
            instead of raising ``ZeroDivisionError``.
        bar_length: Width of the bar in characters.

    Returns:
        None.
    """
    fraction = current / total if total > 0 else 0.0
    # Clamp so an out-of-range `current` can never draw a bar that is wider
    # than `bar_length` (or has a negative filled part).
    fraction = min(max(fraction, 0.0), 1.0)
    progress = int(fraction * bar_length)
    bar = "#" * progress + "-" * (bar_length - progress)
    # flush so the partial line shows up immediately despite end=''.
    print(f"\r[{bar}] {current}/{total} frames processed", end='', flush=True)



In [None]:
import cv2
import os
import torch
import numpy as np

# def images_to_video_depth_estimation(image_folder, output_video_name, frame_time, model):
# Convert sequence of images to video
# Folder holding the extracted frame images; the code below reads its *.png files.
image_folder = "data_set/ego4d_images/c7344db5-fc90-42b9-9dad-346dd3c5e4f0" 
# Seconds per frame; the video writers below use 1/frame_time as their fps.
frame_time = 0.05 
# File name of the intermediate video assembled from the frames.
output_video_name = "output_depth_video_c734.mp4"

def extract_number_from_filename(filename):
    """Return the integer spelled by all digit characters in ``filename``.

    Every digit in the name is kept in order and concatenated (digits in the
    extension included), so ``"a1b2.mp4"`` yields ``124``. A name without any
    digit yields ``0``.

    Args:
        filename: File name to scan.

    Returns:
        The concatenated digits as an ``int``, or ``0`` when there are none.
    """
    digits = [ch for ch in filename if str.isdigit(ch)]
    if not digits:
        return 0
    return int(''.join(digits))


# Keep only the .png frames and sort them by the number in their filenames.
images = sorted(
    (name for name in os.listdir(image_folder) if name.endswith(".png")),
    key=extract_number_from_filename,
)
if not images:
    # Fail with a clear message instead of an IndexError on images[0].
    raise FileNotFoundError(f"No .png frames found in {image_folder!r}")
total_frames = len(images)

# The first frame defines the output resolution.
first_frame_path = os.path.join(image_folder, images[0])
frame = cv2.imread(first_frame_path)
if frame is None:
    # cv2.imread returns None for files it cannot decode.
    raise IOError(f"Could not read frame: {first_frame_path!r}")
height, width, layers = frame.shape

# 1/frame_time converts seconds-per-frame into frames-per-second.
video = cv2.VideoWriter(output_video_name, cv2.VideoWriter_fourcc(*'mp4v'), 1/frame_time, (width, height))
try:
    for image in images:
        video.write(cv2.imread(os.path.join(image_folder, image)))
finally:
    # Finalise the file even if a frame fails to be written.
    video.release()

# Depth estimation on the video and save depth images
# Re-open the assembled video and create the writer for the depth rendering.
cap = cv2.VideoCapture(output_video_name)
depth_video_name = "depth_" + output_video_name
depth_video = cv2.VideoWriter(depth_video_name, cv2.VideoWriter_fourcc(*'mp4v'), 1/frame_time, (width, height))

print("Processing frames:")
try:
    for i in range(total_frames):
        ret, frame = cap.read()
        if not ret:
            break

        img_tensor = preprocess_image(frame)
        depth = predict_depth(img_tensor)

        # Normalize to 0-255. A constant frame has no range; map it to zeros
        # instead of dividing by zero.
        depth_min = depth.min()
        depth_range = depth.max() - depth_min
        if depth_range > 0:
            normalized_depth = ((depth - depth_min) / depth_range) * 255.0
        else:
            normalized_depth = np.zeros_like(depth)
        depth_uint8 = np.uint8(normalized_depth)
        inverted_depth_colormap = cv2.bitwise_not(depth_uint8)

        # Save the depth image next to its source frame. splitext strips only
        # the final extension, so names containing extra dots stay distinct.
        depth_image_name = os.path.splitext(images[i])[0] + "_depth.jpg"
        cv2.imwrite(os.path.join(image_folder, depth_image_name), inverted_depth_colormap)

        # Resize and save to depth video
        resized_depth_colormap = cv2.resize(inverted_depth_colormap, (width, height), interpolation=cv2.INTER_LINEAR)
        bgr_depth_colormap = cv2.cvtColor(resized_depth_colormap, cv2.COLOR_GRAY2BGR)
        depth_video.write(bgr_depth_colormap)

        # Display depth estimation; pressing 'q' in the window stops early.
        cv2.imshow('Depth Estimation', resized_depth_colormap)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

        # Display visual progress bar
        display_progress_bar(i+1, total_frames)
finally:
    # Release the capture, the writer and the preview window even when a
    # frame fails part-way through.
    cap.release()
    depth_video.release()
    cv2.destroyAllWindows()


# Load the MiDaS model
# model = load_midas_model()  # Assuming you have this function from previous discussions

# Use the function
# images_to_video_depth_estimation("data_set/ego4d_images/1a1cecb8-2476-4966-9786-3fe0dce1ec0f", "output_video_name.mp4", 0.05, model)


Processing frames:
[##################################################] 841/841 frames processed

# Metric Depth Estimation

In [48]:
import torch

# Fetch the docstring of the "DPT_BEiT_L_384" hub entrypoint; the returned
# string is echoed as the cell output.
torch.hub.help("intel-isl/MiDaS", "DPT_BEiT_L_384", force_reload=True)  # Triggers fresh download of MiDaS repo

Downloading: "https://github.com/intel-isl/MiDaS/zipball/master" to /Users/jerrycheng/.cache/torch/hub/master.zip


' # This docstring shows up in hub.help()\n    MiDaS DPT_BEiT_L_384 model for monocular depth estimation\n    pretrained (bool): load pretrained weights into model\n    '

In [52]:
import torch

# GitHub repository that torch.hub resolves the ZoeDepth entrypoints from.
repo = "isl-org/ZoeDepth"
# Zoe_N
# model_zoe_n = torch.hub.load(repo, "ZoeD_N", pretrained=True)

# Zoe_K
# In the recorded run this call downloaded ZoeD_M12_K.pt and then raised
# RuntimeError ("Unexpected key(s) in state_dict ... relative_position_index"),
# so the Zoe_NK load below was never reached.
# NOTE(review): presumably a timm version mismatch with the checkpoint — confirm.
model_zoe_k = torch.hub.load(repo, "ZoeD_K", pretrained=True)

# Zoe_NK
model_zoe_nk = torch.hub.load(repo, "ZoeD_NK", pretrained=True)

Using cache found in /Users/jerrycheng/.cache/torch/hub/isl-org_ZoeDepth_main


Overwriting config with config_version kitti
img_size [384, 768]


Using cache found in /Users/jerrycheng/.cache/torch/hub/intel-isl_MiDaS_master


Params passed to Resize transform:
	width:  768
	height:  384
	resize_target:  True
	keep_aspect_ratio:  True
	ensure_multiple_of:  32
	resize_method:  minimal
Using pretrained resource url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt


Downloading: "https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt" to /Users/jerrycheng/.cache/torch/hub/checkpoints/ZoeD_M12_K.pt
100%|██████████| 1.34G/1.34G [01:22<00:00, 17.6MB/s]


RuntimeError: Error(s) in loading state_dict for ZoeDepth:
	Unexpected key(s) in state_dict: "core.core.pretrained.model.blocks.0.attn.relative_position_index", "core.core.pretrained.model.blocks.1.attn.relative_position_index", "core.core.pretrained.model.blocks.2.attn.relative_position_index", "core.core.pretrained.model.blocks.3.attn.relative_position_index", "core.core.pretrained.model.blocks.4.attn.relative_position_index", "core.core.pretrained.model.blocks.5.attn.relative_position_index", "core.core.pretrained.model.blocks.6.attn.relative_position_index", "core.core.pretrained.model.blocks.7.attn.relative_position_index", "core.core.pretrained.model.blocks.8.attn.relative_position_index", "core.core.pretrained.model.blocks.9.attn.relative_position_index", "core.core.pretrained.model.blocks.10.attn.relative_position_index", "core.core.pretrained.model.blocks.11.attn.relative_position_index", "core.core.pretrained.model.blocks.12.attn.relative_position_index", "core.core.pretrained.model.blocks.13.attn.relative_position_index", "core.core.pretrained.model.blocks.14.attn.relative_position_index", "core.core.pretrained.model.blocks.15.attn.relative_position_index", "core.core.pretrained.model.blocks.16.attn.relative_position_index", "core.core.pretrained.model.blocks.17.attn.relative_position_index", "core.core.pretrained.model.blocks.18.attn.relative_position_index", "core.core.pretrained.model.blocks.19.attn.relative_position_index", "core.core.pretrained.model.blocks.20.attn.relative_position_index", "core.core.pretrained.model.blocks.21.attn.relative_position_index", "core.core.pretrained.model.blocks.22.attn.relative_position_index", "core.core.pretrained.model.blocks.23.attn.relative_position_index". 

In [55]:
!git clone https://github.com/isl-org/ZoeDepth.git && cd ZoeDepth

fatal: destination path 'ZoeDepth' already exists and is not an empty directory.


In [59]:
import torch

# model.load_state_dict(pretrained_weights, strict=False)

# Zoe_N
# Built from the local "ZoeDepth" checkout (source="local") instead of GitHub.
# NOTE(review): pretrained=False is passed, so presumably no ZoeDepth
# checkpoint is loaded and the resulting model is untrained — confirm before
# using it for inference.
model_zoe_n = torch.hub.load("ZoeDepth", "ZoeD_N", source="local", pretrained=False)

img_size [384, 512]


Using cache found in /Users/jerrycheng/.cache/torch/hub/intel-isl_MiDaS_master


Params passed to Resize transform:
	width:  512
	height:  384
	resize_target:  True
	keep_aspect_ratio:  True
	ensure_multiple_of:  32
	resize_method:  minimal
