# 1. Create the dataset
Assuming you already have the `.r3d` data, set `DATA_PATH` in the cell below to point at it; the following cells then parse the data and build the dataset. This process may take a few minutes. The object detection results and the segmented object images are saved in the `results` folder. To save memory and speed up execution, you can disable visualization by setting `visualize_results=False`.

In [None]:
import sys
import os

import torch
from dataloaders import R3DSemanticDataset, DeticDenseLabelledDataset
from dataloaders.scannet_200_classes import AFF_OBJ_LIST
# Path of the `.r3d` capture to parse; change it to point at your own data.
DATA_PATH = "data/lab_0920.r3d"
# Expose only GPU 0 to CUDA.
# NOTE(review): this is assigned after `import torch`; it presumably only takes
# effect if CUDA has not been initialised yet — confirm, or move it above the
# imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Parse the capture at DATA_PATH; AFF_OBJ_LIST is handed over as the second
# argument of the dataset constructor.
dataset = R3DSemanticDataset(DATA_PATH, AFF_OBJ_LIST)

# NOTE(review): an empty CURL_CA_BUNDLE is commonly used to make HTTPS clients
# skip TLS certificate verification — presumably a workaround for downloads
# triggered while building the labelled dataset. Confirm it is still needed,
# since it weakens transport security for the rest of the session.
os.environ["CURL_CA_BUNDLE"] = ""

# Build the labelled dataset from the parsed capture. Outputs are written to
# the two `results/...` paths given below.
labelled_dataset = DeticDenseLabelledDataset(
    dataset,
    use_extra_classes=False,
    exclude_gt_images=False,
    subsample_prob=0.01,
    visualize_results=True,  # set to False to save memory and run faster
    detic_threshold=0.6,
    visualization_path="results/detic_labelled_results",
    item_coordinates_path="results/object_coordinates",
)

In [None]:
# Write the whole labelled dataset object to disk with torch.save so it can be
# reloaded with torch.load in the visualization section of this notebook.
torch.save(labelled_dataset, "./labeled_dataset.pt")

# 2. Train the model
Now you can run `train.py` to obtain the model weights, or use our weights, which you can download from [Google Drive](https://drive.google.com/file/d/1c7vfFWWDBZEn9XYfaSk7pmghoLD5K7nW/view?usp=drive_link).

# 3. Visualize the results
If you have obtained the model weights and the prepared dataset, you can now visualize the results to evaluate the performance of our model. Start by importing the necessary frameworks and libraries.

In [None]:
import open3d as o3d
import numpy as np
import matplotlib.pyplot as plt
from itertools import chain, cycle
from sentence_transformers import SentenceTransformer, util

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split

import tqdm
import einops

import os
import sys

from dataloaders.real_dataset_heatmap import DeticDenseLabelledDataset
from model.grid_hash_model import GridCLIPModel

from model.misc import MLP

import pandas as pd
import pyntcloud
from pyntcloud import PyntCloud
import clip
from torch.utils.data import Dataset
from scipy.signal import find_peaks

In [None]:
# Set before the models below are created so the tokenizer backend sees it.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Device the CLIP model is loaded onto; other cells reuse it for the trained
# model and its weights.
DEVICE = "cuda"

# you can change this to your own data to see the results for the other scenes
scene = "lab"

# CLIP ViT-B/32 together with the preprocessing object returned by clip.load.
model, preprocess = clip.load("ViT-B/32", device=DEVICE)
# Sentence-embedding model identified by its name, "all-mpnet-base-v2".
sentence_model = SentenceTransformer("all-mpnet-base-v2")

In [None]:
# Natural-language instructions to evaluate. Only the "lab" group is active;
# the other groups are presets kept commented out — uncomment the group you
# want to try (presumably the one matching `scene`; confirm).
queries = [
    # lab
    "take out some food from the refrigerator",
    "warm up the food in the microwave",
    "help me to take the bottle",
    "give me the knife",
    "I want to eat banana",
    "I want to use the yellow pen to write something on the paper",

    # home
    # "take out some food from the refrigerator",
    # "help me input something on the laptop",
    # "take the cup from the table",
    # "where can i seat on chair",

    # scene0670
    # "take out some food from the refrigerator",
    # "take the bottle from the table",
    # "give me the metal bowl",

    # scene0552
    # "take out some food from the refrigerator",
    # "warm up the food in the microwave",

    # scene0753
    # "take the bottle to me from the table",
    # "give me the book",

    # multi tasks
    # "Put the bananas on the table in the refrigerator",
    # "Use the knife to cut the banana",

    # disjunctive sentence
    # "take out some food from the frige",
]

In [None]:
# change this to the path of the labeled dataset
# NOTE(review): torch.load unpickles the file, so only load datasets you trust.
# On PyTorch releases where `weights_only` defaults to True, loading a whole
# dataset object may require passing `weights_only=False` — confirm against
# the installed version.
training_data = torch.load("YOUR PATH HERE")

# Element-wise max / min of the label coordinates over dim 0; the index
# tensors returned alongside the values are discarded.
# presumably `_label_xyz` is (num_points, 3) — TODO confirm
max_coords, _ = training_data._label_xyz.max(dim=0)
min_coords, _ = training_data._label_xyz.min(dim=0)

# Index the dataset once and reuse the sample: only the trailing dimension of
# its two CLIP vectors is needed to size the model.
_first_sample = training_data[0]

label_model = GridCLIPModel(
    image_rep_size=_first_sample["clip_image_vector"].shape[-1],
    affordance_rep_size=_first_sample["clip_affordance_vector"].shape[-1],
    mlp_depth=1,
    mlp_width=600,
    log2_hashmap_size=20,
    num_levels=18,
    level_dim=8,
    per_level_scale=2,
    max_coords=max_coords,
    min_coords=min_coords,
).to(DEVICE)

In [None]:
# change this to the path of the model weights
model_weights_path = "YOUR PATH HERE"
# Load the checkpoint, remapping its stored tensors onto DEVICE.
model_weights = torch.load(model_weights_path, map_location=DEVICE)
# The checkpoint is indexed by key; its "model" entry is the state dict that
# is restored into label_model.
label_model.load_state_dict(model_weights["model"])

In [None]:
class CustomDataset(Dataset):
    """Dataset that pairs each entry of `xyz_data` with its affordance value.

    Both containers are stored as given and indexed with the same index, so
    they are expected to be index-aligned; the reported length is that of
    `xyz_data`.
    """

    def __init__(self, xyz_data, affordance_values):
        """Keep references to the two index-aligned containers."""
        self.xyz_data, self.affordance_values = xyz_data, affordance_values

    def __len__(self):
        """Return the number of entries in `xyz_data`."""
        return len(self.xyz_data)

    def __getitem__(self, index):
        """Return the `(xyz, affordance)` pair stored at `index`."""
        return self.xyz_data[index], self.affordance_values[index]