In [1]:
import zipfile
import pickle
import os
import cv2
import numpy as np

def load_from_zip(zip_path):
    """
    Read and unpickle the ``data.pkl`` member of a zip archive.

    The archive is expected to be one produced by TrajectoryRecorder.save_buffer.

    Args:
        zip_path (str): Location of the .zip archive on disk.

    Returns:
        object: Whatever Python object was pickled into ``data.pkl``.

    Raises:
        KeyError: If the archive has no member named ``data.pkl``.
    """
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        raw_bytes = archive.read('data.pkl')
    # NOTE: unpickling executes arbitrary code; only use on archives you trust.
    return pickle.loads(raw_bytes)

def transform_to_list_of_images_dict(data):
    """
    Extract per-step observation dicts from recorded episodes.

    Each entry of ``data`` is indexed at position 0 to get a trajectory laid out as
    [act1, obs1, act2, obs2, ...]; only the odd positions (observations) are read.
    Observations that are None are skipped.

    For every kept observation the output dict holds:
        - "objects_pos": a ``.copy()`` of the observation's "objects_pos"
        - "agentview":   "image_0" reshaped to (256, 256, 3), flipped with
                         ``cv2.flip(..., 0)`` and converted RGB -> BGR (if present)
        - "wrist_image": same processing applied to "image_1" (if present)

    Returns:
        list[list[dict]]: one list of observation dicts per episode.
    """
    # source key in the recording -> key used in the output dict
    camera_keys = (("image_0", "agentview"), ("image_1", "wrist_image"))

    images_list = []
    for episode in data:
        trajectory = episode[0]
        frames = []
        # Observations sit at the odd indices of the trajectory.
        for obs in trajectory[1::2]:
            if obs is None:
                continue
            frame = {"objects_pos": obs["objects_pos"].copy()}
            for source_key, target_key in camera_keys:
                if source_key not in obs:
                    continue
                flipped = cv2.flip(obs[source_key].reshape(256, 256, 3), 0)
                frame[target_key] = cv2.cvtColor(flipped, cv2.COLOR_RGB2BGR)
            frames.append(frame)
        images_list.append(frames)
    return images_list

In [2]:
# yolo imports
from ultralytics import YOLO
from roboflow import Roboflow
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2
import joblib
import pandas as pd

In [3]:
def _box_center_size_px(box, image):
    """Return (x, y, w, h) of a detection box in pixels of `image`, each truncated to int."""
    x_norm, y_norm, w_norm, h_norm = box.xywhn.tolist()[0]
    img_h, img_w = image.shape[0], image.shape[1]
    return (
        int(x_norm * img_w),
        int(y_norm * img_h),
        int(w_norm * img_w),
        int(h_norm * img_h),
    )


def get_episode_preds_and_ground_truth(episode, yolo_model):
    """
    Pair YOLO detections from both cameras with ground-truth object positions.

    For every step that has both an "agentview" and a "wrist_image" entry, the model is
    run on each image. Each agent-view detection whose class name (with "_" removed) is
    a key of the step's "objects_pos" dict yields exactly one record. Detections whose
    class has no ground-truth entry are skipped.

    Args:
        episode: Iterable of per-step dicts with keys "agentview", "wrist_image" and
            "objects_pos". "objects_pos" must contain a "gripper" entry (it may be None).
        yolo_model: Callable as ``yolo_model(image, verbose=False)`` returning a sequence
            whose first element exposes ``.boxes``; also exposes ``.names`` (id -> name).

    Returns:
        list[dict]: One dict per matched agent-view detection with keys
        px/py/w/h/conf for cam1 and cam2, "cls", ee_x/ee_y/ee_z and world_x/world_y/world_z.
        When the wrist camera has no detection of the same class, the cam2 fields are 0.

    Raises:
        KeyError: If a step's "objects_pos" lacks the "gripper" key.
    """
    info_list = []
    for obs_step in episode:
        # Both views are required to build a dual-camera record.
        if "agentview" not in obs_step or "wrist_image" not in obs_step:
            continue
        agentview = obs_step["agentview"]
        wrist_image = obs_step["wrist_image"]
        agentview_results = yolo_model(agentview, verbose=False)[0]
        wrist_results = yolo_model(wrist_image, verbose=False)[0]
        objects_info = obs_step["objects_pos"]  # object name -> position

        for agent_box in agentview_results.boxes:
            cls_id = int(agent_box.cls)
            # Ground-truth keys carry no underscores, so strip them from the model name.
            cls = yolo_model.names[cls_id].replace("_", "")
            if cls not in objects_info:
                # No ground truth for this class: emit nothing. Previously the record of
                # the preceding detection (or an empty dict) was appended here.
                continue

            x, y, w, h = _box_center_size_px(agent_box, agentview)
            ground_truth_xyz = objects_info[cls]
            ee_pos = objects_info["gripper"]

            # Defaults used when the wrist camera does not see this class.
            x_cam2, y_cam2, w_cam2, h_cam2, conf_cam2 = 0, 0, 0, 0, 0
            # NOTE: if several wrist boxes share the class, the last one wins.
            for wrist_box in wrist_results.boxes:
                if int(wrist_box.cls) != cls_id:
                    continue
                x_cam2, y_cam2, w_cam2, h_cam2 = _box_center_size_px(wrist_box, wrist_image)
                conf_cam2 = float(wrist_box.conf)

            info_list.append({
                "px_cam1": x,
                "py_cam1": y,
                "w_cam1": w,
                "h_cam1": h,
                "conf_cam1": float(agent_box.conf),
                "cls": cls,
                "px_cam2": x_cam2,
                "py_cam2": y_cam2,
                "w_cam2": w_cam2,
                "h_cam2": h_cam2,
                "conf_cam2": conf_cam2,
                "ee_x": ee_pos[0] if ee_pos is not None else None,
                "ee_y": ee_pos[1] if ee_pos is not None else None,
                "ee_z": ee_pos[2] if ee_pos is not None else None,
                "world_x": ground_truth_xyz[0],
                "world_y": ground_truth_xyz[1],
                "world_z": ground_truth_xyz[2],
            })
    return info_list
                

In [None]:
# Gather, for every YOLO detection, the ground-truth world position of the detected
# object together with the end-effector position. Records accumulate in `df_list`.
yolo_model = YOLO("../PDDL/yolo_nutassembly.pt")
data_dir = "/home/lorangpi/CyclicLxM/data/NutAssembly_seed_0/train_yolo/traces"
df_list = []

# Only archives whose file name ends with 'reach_place.zip' are picked up.
zip_files = [name for name in os.listdir(data_dir) if name.endswith('reach_place.zip')]

for zip_file in zip_files:
    zip_path = os.path.join(data_dir, zip_file)
    zip_name = os.path.splitext(zip_file)[0]

    # Output folders <data_dir>/<zip_name>/videos[/h264] are created up front.
    videos_dir = os.path.join(data_dir, zip_name, 'videos')
    os.makedirs(videos_dir, exist_ok=True)
    yolo_data = load_from_zip(zip_path)
    h264_videos_dir = os.path.join(videos_dir, 'h264')
    os.makedirs(h264_videos_dir, exist_ok=True)

    print(f"Processing {zip_file} with {len(yolo_data)} entries")
    yolo_data = transform_to_list_of_images_dict(yolo_data)

    for episode_id, episode in enumerate(yolo_data):
        print(f"expisode {episode_id} with {len(episode)} steps")
        ep_info = get_episode_preds_and_ground_truth(episode, yolo_model)
        # Extending with an empty list is a no-op, so no emptiness check is needed.
        df_list.extend(ep_info)

Processing pick.zip with 27 entries
expisode 0 with 150 steps
expisode 1 with 153 steps
expisode 2 with 150 steps
expisode 3 with 155 steps
expisode 4 with 156 steps
expisode 5 with 150 steps
expisode 6 with 149 steps
expisode 7 with 151 steps
expisode 8 with 151 steps
expisode 9 with 151 steps
expisode 10 with 151 steps
Processing reach_pick.zip with 27 entries
expisode 0 with 181 steps
expisode 1 with 182 steps
expisode 2 with 183 steps
expisode 3 with 176 steps
expisode 4 with 179 steps
expisode 5 with 184 steps
expisode 6 with 194 steps
expisode 7 with 179 steps
expisode 8 with 192 steps
expisode 9 with 185 steps
expisode 10 with 189 steps
Processing reach_place.zip with 27 entries
expisode 0 with 146 steps
expisode 1 with 151 steps
expisode 2 with 155 steps
expisode 3 with 150 steps
expisode 4 with 147 steps
expisode 5 with 151 steps
expisode 6 with 163 steps
expisode 7 with 150 steps
expisode 8 with 159 steps
expisode 9 with 157 steps
expisode 10 with 187 steps
Processing place.z

In [5]:
# Preview only the first few records: the list holds one dict per detection, and
# displaying all of it floods the notebook output.
df_list[:3]

[{'px_cam1': 79,
  'py_cam1': 99,
  'w_cam1': 45,
  'h_cam1': 38,
  'conf_cam1': 0.9185290336608887,
  'cls': 'roundnut',
  'px_cam2': 151,
  'py_cam2': 152,
  'w_cam2': 46,
  'h_cam2': 57,
  'conf_cam2': 0.9521005153656006,
  'ee_x': 0.01074564284397464,
  'ee_y': 0.13323589969764552,
  'ee_z': 1.1559759483461147,
  'world_x': 0.03661059365796986,
  'world_y': -0.09740377454550436,
  'world_z': 0.8299884570834555},
 {'px_cam1': 179,
  'py_cam1': 103,
  'w_cam1': 56,
  'h_cam1': 34,
  'conf_cam1': 0.9157389402389526,
  'cls': 'squarenut',
  'px_cam2': 44,
  'py_cam2': 146,
  'w_cam2': 58,
  'h_cam2': 49,
  'conf_cam2': 0.9357568025588989,
  'ee_x': 0.01074564284397464,
  'ee_y': 0.13323589969764552,
  'ee_z': 1.1559759483461147,
  'world_x': 0.03473860578414316,
  'world_y': 0.10340704750126756,
  'world_z': 0.8299788753620365},
 {'px_cam1': 79,
  'py_cam1': 99,
  'w_cam1': 45,
  'h_cam1': 38,
  'conf_cam1': 0.9185612201690674,
  'cls': 'roundnut',
  'px_cam2': 151,
  'py_cam2': 151,
 

In [None]:
df = pd.DataFrame(df_list)
print(df.head(5))
print(len(df))
# Reverse the model's id -> name mapping into name -> id.
# The 'cls' column stores class names with "_" removed (see
# get_episode_preds_and_ground_truth), so the lookup keys must be normalized the same
# way; otherwise every class whose model name contains "_" would map to NaN.
inv_class_dict = {name.replace("_", ""): cls_id for cls_id, name in yolo_model.names.items()}
print(inv_class_dict)
# Add the numeric class id next to the class name.
df['cls_id'] = df['cls'].map(inv_class_dict)
print(len(df))

   px_cam1  py_cam1  w_cam1  h_cam1  conf_cam1        cls  px_cam2  py_cam2  \
0       79       99      45      38   0.918529   roundnut      151      152   
1      179      103      56      34   0.915739  squarenut       44      146   
2       79       99      45      38   0.918561   roundnut      151      151   
3      179      103      56      34   0.915739  squarenut       44      146   
4       79       99      45      38   0.918580   roundnut      151      152   

   w_cam2  h_cam2  conf_cam2      ee_x      ee_y      ee_z   world_x  \
0      46      57   0.952101  0.010746  0.133236  1.155976  0.036611   
1      58      49   0.935757  0.010746  0.133236  1.155976  0.034739   
2      46      57   0.953994  0.010746  0.133236  1.155976  0.036611   
3      58      48   0.935890  0.010746  0.133236  1.155976  0.034739   
4      47      57   0.954007  0.010746  0.133236  1.155976  0.036611   

    world_y   world_z  
0 -0.097404  0.829988  
1  0.103407  0.829979  
2 -0.097404  0.82998

In [7]:
from sklearn.ensemble import GradientBoostingRegressor
# Inputs: box centre/size/confidence from both cameras plus the end-effector position.
# Targets: the ground-truth world position of the detected object.
dual_feature_columns = [
    "px_cam1", "py_cam1", "w_cam1", "h_cam1", "conf_cam1",
    "px_cam2", "py_cam2", "w_cam2", "h_cam2", "conf_cam2",
    "ee_x", "ee_y", "ee_z",
]
X_dual = df[dual_feature_columns].values
Y_dual = df[["world_x", "world_y", "world_z"]].values

# One independent regressor per world axis, all sharing the same hyperparameters.
# random_state is pinned so that refitting on the same data reproduces the same model.
gbr_params = {"n_estimators": 200, "max_depth": 3, "learning_rate": 0.1, "random_state": 0}
reg_x_dual = GradientBoostingRegressor(**gbr_params)
reg_y_dual = GradientBoostingRegressor(**gbr_params)
reg_z_dual = GradientBoostingRegressor(**gbr_params)

reg_x_dual.fit(X_dual, Y_dual[:, 0])
reg_y_dual.fit(X_dual, Y_dual[:, 1])
reg_z_dual.fit(X_dual, Y_dual[:, 2])
def pixel_to_world_dual(px1, py1, w1, h1, conf1, px2, py2, w2, h2, conf2, ee_x, ee_y, ee_z):
    """
    Predict a world position from one dual-camera detection.

    The thirteen arguments are assembled, in order, into a single feature row and passed
    to the module-level regressors ``reg_x_dual``, ``reg_y_dual`` and ``reg_z_dual``.

    Returns:
        tuple: (x, y, z) — the first (only) prediction of each regressor.
    """
    feature_row = np.array([[
        px1, py1, w1, h1, conf1,
        px2, py2, w2, h2, conf2,
        ee_x, ee_y, ee_z,
    ]])
    axis_regressors = (reg_x_dual, reg_y_dual, reg_z_dual)
    return tuple(regressor.predict(feature_row)[0] for regressor in axis_regressors)
# --- Example usage ---
# Run the dual-camera predictor on the first row of `df` and print the (x, y, z) result.
first_row = df.iloc[0]
px1, py1, w1, h1, conf1 = first_row[["px_cam1", "py_cam1", "w_cam1", "h_cam1", "conf_cam1"]]
px2, py2, w2, h2, conf2 = first_row[["px_cam2", "py_cam2", "w_cam2", "h_cam2", "conf_cam2"]]
ee_x, ee_y, ee_z = first_row[["ee_x", "ee_y", "ee_z"]]
example_prediction = pixel_to_world_dual(
    px1, py1, w1, h1, conf1,
    px2, py2, w2, h2, conf2,
    ee_x, ee_y, ee_z,
)
print(example_prediction)

(0.03662168735129401, -0.09715792357140879, 0.8310022007968796)


In [8]:
# Replace pred columns with the dual cam predictions.
# All rows are predicted in one batch per axis. The previous row-wise apply() called
# all three regressors once per row for each of the three columns.
eval_feature_columns = [
    "px_cam1", "py_cam1", "w_cam1", "h_cam1", "conf_cam1",
    "px_cam2", "py_cam2", "w_cam2", "h_cam2", "conf_cam2",
    "ee_x", "ee_y", "ee_z",
]
eval_features = df[eval_feature_columns].to_numpy()
df["pred_x"] = reg_x_dual.predict(eval_features)
df["pred_y"] = reg_y_dual.predict(eval_features)
df["pred_z"] = reg_z_dual.predict(eval_features)

# Compute the differences between the world and predicted positions
df["diff_x"] = df["world_x"] - df["pred_x"]
df["diff_y"] = df["world_y"] - df["pred_y"]
df["diff_z"] = df["world_z"] - df["pred_z"]

# Compute the error metrics.
# NOTE: `df` is the same data the regressors were fitted on, so these are training-set
# errors, not an estimate of accuracy on unseen data.
mean_error_x = df["diff_x"].mean()
mean_error_y = df["diff_y"].mean()
mean_error_z = df["diff_z"].mean()
std_error_x = df["diff_x"].std()
std_error_y = df["diff_y"].std()
std_error_z = df["diff_z"].std()
print(f"Mean Error X: {mean_error_x}, Std Error X: {std_error_x}")
print(f"Mean Error Y: {mean_error_y}, Std Error Y: {std_error_y}")
print(f"Mean Error Z: {mean_error_z}, Std Error Z: {std_error_z}")

Mean Error X: 1.9395442388404577e-18, Std Error X: 0.0001581006049551298
Mean Error Y: -2.1788260255780467e-18, Std Error Y: 0.00023408488149310455
Mean Error Z: 1.3428682851521838e-17, Std Error Z: 0.0021298690581121014


In [9]:
# Persist the three per-axis regressors together in a single joblib file,
# keyed by "reg_x" / "reg_y" / "reg_z".
dual_cam_models = {
    "reg_x": reg_x_dual,
    "reg_y": reg_y_dual,
    "reg_z": reg_z_dual,
}
joblib.dump(dual_cam_models, "nutassembly_dual_cam_calibration_models.pkl")

['nutassembly_dual_cam_calibration_models.pkl']