In [1]:
import zipfile
import pickle
import os
import cv2
import numpy as np

def load_from_zip(zip_path, member="data.pkl"):
    """
    Load a pickled object stored as a member of a .zip archive.

    The original docstring states the archives are created by
    TrajectoryRecorder.save_buffer, which stores the payload as ``data.pkl``;
    that name is therefore the default.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserialising.
    Only call this on archives from a trusted source.

    Args:
        zip_path (str): Path to the .zip file.
        member (str): Name of the pickled file inside the archive.
            Defaults to ``"data.pkl"``.

    Returns:
        object: The Python object that was originally pickled (list, dict, etc.).

    Raises:
        KeyError: If ``member`` is not present in the archive (raised by
            ``ZipFile.open``).
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        with archive.open(member, 'r') as pickled_file:
            return pickle.load(pickled_file)

def _to_bgr_image(flat_image, image_shape):
    """Reshape a raw camera buffer, flip it vertically and convert RGB -> BGR."""
    # flipCode 0 flips around the x-axis (i.e. upside down).
    flipped = cv2.flip(flat_image.reshape(*image_shape), 0)
    return cv2.cvtColor(flipped, cv2.COLOR_RGB2BGR)


def transform_to_list_of_images_dict(data, image_shape=(256, 256, 3)):
    """
    Extract per-step observation dicts (images + object positions) from raw traces.

    Each element of ``data`` is an episode whose first item (``episode[0]``) is a
    trajectory laid out as ``[act1, obs1, act2, obs2, ...]``; observations are
    therefore read from the odd indices. Observations that are ``None`` are skipped.

    Every observation is expected to hold ``"objects_pos"`` (a dict) and may hold
    ``"image_0"`` (agent view) and/or ``"image_1"`` (wrist view).

    Args:
        data: Iterable of episodes as described above.
        image_shape: Shape each raw image buffer is reshaped to before flipping.
            Defaults to ``(256, 256, 3)``, the value previously hard-coded.

    Returns:
        list[list[dict]]: One list per episode; each dict has ``"objects_pos"``
        (a shallow copy of the observation's dict) plus ``"agentview"`` and/or
        ``"wrist_image"`` (vertically flipped BGR images) when the corresponding
        source image is present.
    """
    # Source key in the observation -> key used in the output dict.
    camera_keys = (("image_0", "agentview"), ("image_1", "wrist_image"))

    images_list = []
    for episode in data:
        traj = episode[0]
        obs_list = []
        for i in range(1, len(traj), 2):
            obs = traj[i]
            if obs is None:
                continue
            # Shallow copy: the outer dict is new, the position values are shared.
            img_dict = {"objects_pos": obs["objects_pos"].copy()}
            for source_key, target_key in camera_keys:
                if source_key in obs:
                    img_dict[target_key] = _to_bgr_image(obs[source_key], image_shape)
            obs_list.append(img_dict)
        images_list.append(obs_list)
    return images_list

In [2]:
# yolo imports
from ultralytics import YOLO
from roboflow import Roboflow
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2
import joblib
import pandas as pd

In [3]:
def _box_to_pixels(box, image):
    """Return (x_center, y_center, width, height, confidence) of a YOLO box in integer pixels."""
    x_n, y_n, w_n, h_n = box.xywhn.tolist()[0]
    img_h, img_w = image.shape[0], image.shape[1]
    # Normalised coordinates are scaled by the image size and truncated to int.
    return (
        int(x_n * img_w),
        int(y_n * img_h),
        int(w_n * img_w),
        int(h_n * img_h),
        float(box.conf),
    )


def get_episode_preds_and_ground_truth(episode, yolo_model):
    """
    Pair YOLO detections from both cameras with ground-truth object positions.

    For every step that has both an ``"agentview"`` and a ``"wrist_image"``,
    the model is run on each image. Each agent-view detection whose class label
    is a key of ``obs_step["objects_pos"]`` produces one row; detections of
    classes without ground truth are ignored.

    Args:
        episode: Iterable of step dicts with keys ``"agentview"``,
            ``"wrist_image"`` and ``"objects_pos"``. Steps missing either image
            are skipped.
        yolo_model: Callable as ``yolo_model(image, verbose=False)`` returning a
            sequence whose first element exposes ``.boxes``; ``yolo_model.names``
            maps class id to label.

    Returns:
        list[dict]: One dict per matched agent-view detection with keys
        ``px_cam1, py_cam1, w_cam1, h_cam1, conf_cam1, cls`` (agent view),
        ``px_cam2, py_cam2, w_cam2, h_cam2, conf_cam2`` (wrist view, all 0 when
        the wrist camera has no detection of the same class),
        ``ee_x, ee_y, ee_z`` (the ``"gripper"`` entry of ``objects_pos``, or
        ``None`` when it is absent/None) and ``world_x, world_y, world_z``
        (ground-truth position of the detected class).
    """
    info_list = []
    for obs_step in episode:
        if "agentview" not in obs_step or "wrist_image" not in obs_step:
            continue
        agentview = obs_step["agentview"]
        wrist_image = obs_step["wrist_image"]

        agentview_results = yolo_model(agentview, verbose=False)[0]
        wrist_results = yolo_model(wrist_image, verbose=False)[0]

        # dict with object names as keys and positions as values
        objects_info = obs_step["objects_pos"]
        # .get() so a missing gripper entry yields None coordinates instead of a KeyError.
        ee_pos = objects_info.get("gripper")

        for agent_box in agentview_results.boxes:
            cls_id = int(agent_box.cls)
            cls = yolo_model.names[cls_id]
            if cls not in objects_info:
                # No ground truth for this class: emit nothing. (Previously a stale
                # copy of the last row, or an empty dict, was appended here.)
                continue
            ground_truth_xyz = objects_info[cls]

            x, y, w, h, conf = _box_to_pixels(agent_box, agentview)

            # Wrist-camera features default to 0 when that camera does not see the class.
            x_cam2, y_cam2, w_cam2, h_cam2, conf_cam2 = 0, 0, 0, 0, 0
            # NOTE(review): when several wrist boxes share the class, the LAST one wins.
            # This is kept unchanged so the features stay consistent with data the
            # calibration models were fitted on.
            for wrist_box in wrist_results.boxes:
                if int(wrist_box.cls) == cls_id:
                    x_cam2, y_cam2, w_cam2, h_cam2, conf_cam2 = _box_to_pixels(
                        wrist_box, wrist_image
                    )

            info_list.append({
                "px_cam1": x,
                "py_cam1": y,
                "w_cam1": w,
                "h_cam1": h,
                "conf_cam1": conf,
                "cls": cls,
                "px_cam2": x_cam2,
                "py_cam2": y_cam2,
                "w_cam2": w_cam2,
                "h_cam2": h_cam2,
                "conf_cam2": conf_cam2,
                "ee_x": ee_pos[0] if ee_pos is not None else None,
                "ee_y": ee_pos[1] if ee_pos is not None else None,
                "ee_z": ee_pos[2] if ee_pos is not None else None,
                "world_x": ground_truth_xyz[0],
                "world_y": ground_truth_xyz[1],
                "world_z": ground_truth_xyz[2],
            })
    return info_list
                

In [4]:
# Smoke run: load the YOLO model, then for every matching trace archive run
# the extraction on its FIRST episode only. Nothing is collected or displayed;
# `df_list1` stays empty and `ep_info1` just holds the last result.
df_list1 = []
yolo_model1 = YOLO("../PDDL/yolo_kitchen.pt")
data_dir1 = "/home/lorangpi/CyclicLxM/data/KitchenEnv_seed_0/train_yolo/traces"
# Keep only the archives whose file name ends with 'reach_place.zip'.
zip_files1 = list(filter(lambda name: name.endswith('reach_place.zip'), os.listdir(data_dir1)))
for zip_file1 in zip_files1:
    zip_path1 = os.path.join(data_dir1, zip_file1)
    zip_name1 = os.path.splitext(zip_file1)[0]
    yolo_data1 = load_from_zip(zip_path1)
    print(f"Processing {zip_file1} with {len(yolo_data1)} entries")
    yolo_data1 = transform_to_list_of_images_dict(yolo_data1)
    # Slicing to the first element replaces the unconditional `break`.
    for episode_id1, episode1 in enumerate(yolo_data1[:1]):
        print(f"expisode {episode_id1} with {len(episode1)} steps")
        ep_info1 = get_episode_preds_and_ground_truth(episode1, yolo_model1)

KeyboardInterrupt: 

In [5]:
# Collect, across every episode of every matching trace archive, one row per
# YOLO detection paired with the object's world position and the gripper
# position. The rows accumulate in `df_list`.
df_list = []
yolo_model = YOLO("../PDDL/yolo_kitchen.pt")
data_dir = "/home/lorangpi/CyclicLxM/data/KitchenEnv_seed_0/train_yolo/traces/"
# Keep only the archives whose file name ends with 'reach_place.zip'.
zip_files = list(filter(lambda name: name.endswith('reach_place.zip'), os.listdir(data_dir)))
for zip_file in zip_files:
    zip_path = os.path.join(data_dir, zip_file)
    zip_name = os.path.splitext(zip_file)[0]

    # Side effect: a <archive name>/videos directory is created next to the traces.
    videos_dir = os.path.join(data_dir, zip_name, 'videos')
    os.makedirs(videos_dir, exist_ok=True)

    yolo_data = load_from_zip(zip_path)

    # Side effect: ... and a videos/h264 sub-directory once the archive has loaded.
    h264_videos_dir = os.path.join(videos_dir, 'h264')
    os.makedirs(h264_videos_dir, exist_ok=True)

    print(f"Processing {zip_file} with {len(yolo_data)} entries")
    yolo_data = transform_to_list_of_images_dict(yolo_data)
    for episode_id, episode in enumerate(yolo_data):
        print(f"expisode {episode_id} with {len(episode)} steps")
        ep_info = get_episode_preds_and_ground_truth(episode, yolo_model)
        # Episodes that produced no rows contribute nothing.
        if ep_info:
            df_list.extend(ep_info)
        # (To limit the run while debugging, break once episode_id >= 10.)

Processing reach_place.zip with 33 entries
expisode 0 with 255 steps
expisode 1 with 250 steps
expisode 2 with 255 steps
expisode 3 with 254 steps
expisode 4 with 251 steps
expisode 5 with 254 steps
expisode 6 with 255 steps
expisode 7 with 252 steps
expisode 8 with 258 steps
expisode 9 with 255 steps
expisode 10 with 257 steps
expisode 11 with 259 steps
expisode 12 with 248 steps
expisode 13 with 248 steps
expisode 14 with 262 steps
expisode 15 with 259 steps
expisode 16 with 260 steps
expisode 17 with 255 steps
expisode 18 with 249 steps
expisode 19 with 251 steps
expisode 20 with 255 steps
expisode 21 with 249 steps
expisode 22 with 251 steps
expisode 23 with 251 steps
expisode 24 with 257 steps
expisode 25 with 250 steps
expisode 26 with 253 steps
expisode 27 with 250 steps
expisode 28 with 255 steps
expisode 29 with 251 steps
expisode 30 with 257 steps
expisode 31 with 252 steps
expisode 32 with 249 steps


In [6]:
# One DataFrame row per collected detection dict; show the first five rows.
df = pd.DataFrame(data=df_list)
df.iloc[:5]

Unnamed: 0,px_cam1,py_cam1,w_cam1,h_cam1,conf_cam1,cls,px_cam2,py_cam2,w_cam2,h_cam2,conf_cam2,ee_x,ee_y,ee_z,world_x,world_y,world_z
0,68,134,58,48,0.944306,pot,74,192,148,122,0.932665,-0.107792,-0.22029,0.94393,-0.119829,-0.159997,0.903693
1,80,90,11,17,0.911553,bread,0,0,0,0,0.0,-0.107792,-0.22029,0.94393,-0.375936,-0.162103,0.919784
2,68,134,58,48,0.944888,pot,73,193,145,124,0.940177,-0.108201,-0.220848,0.949371,-0.120193,-0.161145,0.906641
3,80,90,11,17,0.911856,bread,0,0,0,0,0.0,-0.108201,-0.220848,0.949371,-0.375936,-0.162103,0.919784
4,68,134,58,48,0.942343,pot,72,193,143,123,0.936066,-0.10914,-0.220597,0.955799,-0.121032,-0.162033,0.910119


In [13]:
# Class labels known to the YOLO model, in the order of its `names` mapping.
print([label for label in yolo_model.names.values()])

['bread', 'pot']


In [18]:
# Class-id -> label mapping exposed by the loaded YOLO model.
yolo_model.names

{0: 'bread', 1: 'pot'}

In [19]:
# Invert the model's id -> label mapping into label -> id.
inv_class_dict = dict(zip(yolo_model.names.values(), yolo_model.names.keys()))
print(inv_class_dict)

{'bread': 0, 'pot': 1}


In [20]:
# Add a numeric `cls_id` column by looking each label in `cls` up in inv_class_dict.
# The `cls` column itself is kept; `df` is modified in place.
# NOTE(review): with a dict argument, Series.map yields NaN for labels that are
# not keys of inv_class_dict — confirm every label is covered before training.
df['cls_id'] = df['cls'].map(inv_class_dict)
df.head(5)

Unnamed: 0,px_cam1,py_cam1,w_cam1,h_cam1,conf_cam1,cls,px_cam2,py_cam2,w_cam2,h_cam2,conf_cam2,ee_x,ee_y,ee_z,world_x,world_y,world_z,cls_id
0,68,134,58,48,0.944306,pot,74,192,148,122,0.932665,-0.107792,-0.22029,0.94393,-0.119829,-0.159997,0.903693,1
1,80,90,11,17,0.911553,bread,0,0,0,0,0.0,-0.107792,-0.22029,0.94393,-0.375936,-0.162103,0.919784,0
2,68,134,58,48,0.944888,pot,73,193,145,124,0.940177,-0.108201,-0.220848,0.949371,-0.120193,-0.161145,0.906641,1
3,80,90,11,17,0.911856,bread,0,0,0,0,0.0,-0.108201,-0.220848,0.949371,-0.375936,-0.162103,0.919784,0
4,68,134,58,48,0.942343,pot,72,193,143,123,0.936066,-0.10914,-0.220597,0.955799,-0.121032,-0.162033,0.910119,1


In [21]:
# Number of detection rows collected.
df.shape[0]

16177

In [22]:
from sklearn.ensemble import GradientBoostingRegressor

# Feature matrix: class id, agent-view box, wrist-view box, gripper position.
# The column order here is the order pixel_to_world_dual assembles its features in.
X_dual = df[["cls_id", "px_cam1", "py_cam1", "w_cam1", "h_cam1", "conf_cam1", "px_cam2", "py_cam2", "w_cam2", "h_cam2", "conf_cam2", "ee_x", "ee_y", "ee_z"]].values
# Targets: ground-truth world position, one column per axis.
Y_dual = df[["world_x", "world_y", "world_z"]].values

# Shared hyper-parameters for the three per-axis regressors. `random_state` is
# pinned so a Restart & Run All reproduces the same fitted models.
GBR_PARAMS = dict(n_estimators=200, max_depth=3, learning_rate=0.1, random_state=0)

reg_x_dual = GradientBoostingRegressor(**GBR_PARAMS)
reg_y_dual = GradientBoostingRegressor(**GBR_PARAMS)
reg_z_dual = GradientBoostingRegressor(**GBR_PARAMS)

# NOTE(review): the regressors are fitted on every row of `df`; no rows are held
# out, so any error computed on `df` afterwards is a training residual.
reg_x_dual.fit(X_dual, Y_dual[:, 0])
reg_y_dual.fit(X_dual, Y_dual[:, 1])
reg_z_dual.fit(X_dual, Y_dual[:, 2])
def pixel_to_world_dual(cls, px1, py1, w1, h1, conf1, px2, py2, w2, h2, conf2, ee_x, ee_y, ee_z):
    """
    Predict a world position from one dual-camera detection.

    The arguments are packed, in signature order, into a single-row feature
    array and passed to the module-level regressors ``reg_x_dual``,
    ``reg_y_dual`` and ``reg_z_dual``.

    Returns:
        tuple: ``(x, y, z)`` — the first (only) prediction of each regressor.
    """
    single_row = [cls, px1, py1, w1, h1, conf1, px2, py2, w2, h2, conf2, ee_x, ee_y, ee_z]
    features = np.array([single_row])
    axis_models = (reg_x_dual, reg_y_dual, reg_z_dual)
    return tuple(model.predict(features)[0] for model in axis_models)
# --- Example usage ---
# Feed the first row of `df` back through pixel_to_world_dual and print the result.
first_row = df.iloc[0]
cls = first_row["cls_id"]
px1, py1, w1, h1, conf1 = first_row[["px_cam1", "py_cam1", "w_cam1", "h_cam1", "conf_cam1"]]
px2, py2, w2, h2, conf2 = first_row[["px_cam2", "py_cam2", "w_cam2", "h_cam2", "conf_cam2"]]
ee_x, ee_y, ee_z = first_row[["ee_x", "ee_y", "ee_z"]]
example_prediction = pixel_to_world_dual(
    cls,
    px1, py1, w1, h1, conf1,
    px2, py2, w2, h2, conf2,
    ee_x, ee_y, ee_z,
)
print(example_prediction)

(-0.12079338838414694, -0.16070172883106865, 0.9046129247307965)


In [23]:
# Persist the three per-axis regressors together in one joblib file,
# keyed by "reg_x" / "reg_y" / "reg_z".
dual_cam_models = {
    "reg_x": reg_x_dual,
    "reg_y": reg_y_dual,
    "reg_z": reg_z_dual,
}
joblib.dump(dual_cam_models, "kitchen_dual_cam_calibration_models.pkl")

['kitchen_dual_cam_calibration_models.pkl']

In [24]:
# Add the dual-camera predictions to `df` as pred_x / pred_y / pred_z.
# Each regressor is called once on the whole feature matrix instead of once per
# row; the feature order matches the one pixel_to_world_dual uses, so the
# predicted values are the same.
dual_feature_cols = [
    "cls_id",
    "px_cam1", "py_cam1", "w_cam1", "h_cam1", "conf_cam1",
    "px_cam2", "py_cam2", "w_cam2", "h_cam2", "conf_cam2",
    "ee_x", "ee_y", "ee_z",
]
dual_features = df[dual_feature_cols].values
df["pred_x"] = reg_x_dual.predict(dual_features)
df["pred_y"] = reg_y_dual.predict(dual_features)
df["pred_z"] = reg_z_dual.predict(dual_features)

# Signed residuals: ground truth minus prediction.
df["diff_x"] = df["world_x"] - df["pred_x"]
df["diff_y"] = df["world_y"] - df["pred_y"]
df["diff_z"] = df["world_z"] - df["pred_z"]

# Mean and standard deviation of the signed residuals per axis.
# NOTE(review): `df` is the same data the regressors were fitted on, so these
# numbers describe training fit, not accuracy on unseen episodes.
mean_error_x = df["diff_x"].mean()
mean_error_y = df["diff_y"].mean()
mean_error_z = df["diff_z"].mean()
std_error_x = df["diff_x"].std()
std_error_y = df["diff_y"].std()
std_error_z = df["diff_z"].std()
print(f"Mean Error X: {mean_error_x}, Std Error X: {std_error_x}")
print(f"Mean Error Y: {mean_error_y}, Std Error Y: {std_error_y}")
print(f"Mean Error Z: {mean_error_z}, Std Error Z: {std_error_z}")

Mean Error X: 4.226561474410599e-17, Std Error X: 0.0038526837356097714
Mean Error Y: 1.2349918605507629e-17, Std Error Y: 0.0018548650474177726
Mean Error Z: -1.2635418252070073e-16, Std Error Z: 0.0024229917013723163
