In [None]:
# Prepare the ImageNet dataset for training.
# Runs the project's organize_imagenet_webdataset.py script on the raw
# imagenet-mini images under experimentation/dataset/temp/imagenet-mini and
# writes the organized dataset under experimentation/dataset.
# NOTE(review): both paths are relative, so this presumably has to be launched
# from the repository root — confirm before running.
! python project_module/src/project_module/theia/scripts/preprocessing/image_datasets/organize_imagenet_webdataset.py --dataset imagenet-mini --imagenet-raw-path experimentation/dataset/temp/imagenet-mini --output-path experimentation/dataset

In [None]:
# Extract features from the ImageNet dataset.
# Runs the feature_extraction module on the "train" split of imagenet-mini
# with the facebook/dinov2-with-registers-base model and writes the result to
# the imagenet-mini/feature folder inside the dataset root.
# NOTE(review): the dataset root and output path are absolute, machine-specific
# paths; adjust them before running this cell on another machine.
! python -m project_module.theia.scripts.preprocessing.feature_extraction --dataset imagenet-mini --dataset-root /home/tomo0530/theia-demo/experimentation/dataset --output-path /home/tomo0530/theia-demo/experimentation/dataset/imagenet-mini/feature --model facebook/dinov2-with-registers-base --split train

In [None]:
# Import
import os
import cv2
import torch
from PIL import Image
import numpy as np
from transformers import AutoModel
from torchvision.io import read_video, write_video
from project_module.theia.decoding import load_feature_stats, prepare_depth_decoder, prepare_mask_generator, decode_everything

In [None]:
# Load the Theia model and the feature statistics of its target models.

# Use the first CUDA device when one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Fetch the pretrained checkpoint and move it onto the selected device.
# trust_remote_code=True permits running the model code shipped with the checkpoint.
theia_model = AutoModel.from_pretrained(
    "theaiinstitute/theia-tiny-patch16-224-cddsv",
    trust_remote_code=True,
).to(device)

# Models whose feature statistics are loaded below.
target_model_names = [
    "google/vit-huge-patch14-224-in21k",
    "facebook/dinov2-large",
    "openai/clip-vit-large-patch14",
    "facebook/sam-vit-huge",
    "LiheYoung/depth-anything-large-hf",
]

# Per-model feature means and variances, read from the stats folder
# (absolute, machine-specific path).
feature_means, feature_vars = load_feature_stats(
    target_model_names,
    stat_file_root="/home/tomo0530/theia-demo/experimentation/dataset/feature_stats",
)

Some weights of ViTModel were not initialized from the model checkpoint at facebook/deit-tiny-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [None]:
# Build the two decoding helpers used by decode_everything later on.

# Checkpoint name for the depth decoder.
depth_anything_model_name = "LiheYoung/depth-anything-large-hf"

# Mask generator plus the SAM model object returned alongside it.
mask_generator, sam_model = prepare_mask_generator(device)

# Only the first returned value (the decoder) is kept; the second is discarded.
depth_anything_decoder, _ = prepare_depth_decoder(depth_anything_model_name, device)

Device set to use cuda:0


### Example Video

In [27]:
# Location of the demo clip (absolute, machine-specific path).
example_video_path = "/home/tomo0530/theia-demo/experimentation/dataset/video/example_video_to_visualize.mp4"

# Read the clip as a frame stack laid out as (time, height, width, channel);
# the other two values returned by read_video are not needed here.
video, _, _ = read_video(example_video_path, output_format="THWC", pts_unit="sec")
video = video.numpy()

# Resize every frame to 224x224 and wrap it as a PIL image.
images = []
for frame in video:
    images.append(Image.fromarray(cv2.resize(frame, (224, 224))))

# Decode the frames with Theia. gt=True presumably makes the call also produce
# the ground-truth decodings returned as the second value — confirm against
# decode_everything.
theia_decode_results, gt_decode_results = decode_everything(
    images=images,
    theia_model=theia_model,
    feature_means=feature_means,
    feature_vars=feature_vars,
    mask_generator=mask_generator,
    sam_model=sam_model,
    depth_anything_decoder=depth_anything_decoder,
    pred_iou_thresh=0.5,
    stability_score_thresh=0.7,
    gt=True,
    device=device,
)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [28]:
# Build the comparison video: for every frame, stack the Theia decoding on top
# of the ground-truth decoding, then stack all frames along a new leading axis.
vis_video = np.stack(
    [np.vstack([tr, gtr]) for tr, gtr in zip(theia_decode_results, gt_decode_results, strict=False)]
)
# Scale to the 0-255 range and clamp before the uint8 cast so that any value
# falling outside [0, 1] saturates instead of wrapping around.
# (Assumes the decode results are floats nominally in [0, 1] — TODO confirm.)
vis_video = torch.from_numpy(np.clip(vis_video * 255.0, 0.0, 255.0)).to(torch.uint8)
vis_save_path = "/home/tomo0530/theia-demo/experimentation/outputs/visualized.mp4"
# Make sure the output folder exists so the write cannot fail on a missing
# directory after the expensive decoding step.
os.makedirs(os.path.dirname(vis_save_path), exist_ok=True)
write_video(vis_save_path, vis_video, fps=10)



### Pizza Pick

In [29]:
# Pizza-pick clip: read the video, resize the frames, decode them with Theia,
# and write a side-by-side (stacked) comparison video.

# Location of the clip (absolute, machine-specific path).
pizza_video_path = "/home/tomo0530/theia-demo/experimentation/dataset/video/pizza_pick.mp4"
# Frames come back laid out as (time, height, width, channel).
video, _, _ = read_video(pizza_video_path, pts_unit="sec", output_format="THWC")
video = video.numpy()
# Resize every frame to 224x224 and wrap it as a PIL image.
images = [Image.fromarray(cv2.resize(im, (224, 224))) for im in video]

# gt=True presumably makes the call also produce the ground-truth decodings
# returned as the second value — confirm against decode_everything.
theia_decode_results, gt_decode_results = decode_everything(
    theia_model=theia_model,
    feature_means=feature_means,
    feature_vars=feature_vars,
    images=images,
    mask_generator=mask_generator,
    sam_model=sam_model,
    depth_anything_decoder=depth_anything_decoder,
    pred_iou_thresh=0.5,
    stability_score_thresh=0.7,
    gt=True,
    device=device,
)

# For every frame, stack the Theia decoding on top of the ground-truth
# decoding, then stack all frames along a new leading axis.
vis_video = np.stack(
    [np.vstack([tr, gtr]) for tr, gtr in zip(theia_decode_results, gt_decode_results, strict=False)]
)
# Scale to the 0-255 range and clamp before the uint8 cast so that any value
# falling outside [0, 1] saturates instead of wrapping around.
# (Assumes the decode results are floats nominally in [0, 1] — TODO confirm.)
vis_video = torch.from_numpy(np.clip(vis_video * 255.0, 0.0, 255.0)).to(torch.uint8)
vis_save_path = "/home/tomo0530/theia-demo/experimentation/outputs/visualized_pizza_pick.mp4"
# Make sure the output folder exists so the write cannot fail on a missing
# directory after the expensive decoding step.
os.makedirs(os.path.dirname(vis_save_path), exist_ok=True)
write_video(vis_save_path, vis_video, fps=10)



In [30]:
# Persist both sets of decoded frames as .npy files.
# NOTE: the variables hold the output of whichever decode cell ran last
# (the pizza-pick clip when the notebook is run top to bottom).
save_targets = {
    "/home/tomo0530/theia-demo/experimentation/outputs/theia_decode_results.npy": theia_decode_results,
    "/home/tomo0530/theia-demo/experimentation/outputs/gt_decode_results.npy": gt_decode_results,
}
for npy_path, decoded in save_targets.items():
    np.save(npy_path, decoded)