# Post-Processing Predicted Camera Pose and CO3D Ground Truth Data

## Utility Functions

In [2]:
import pandas as pd

def parse_images_txt(file_path):
    """
    解析 images.txt 檔案，提取相機姿態資訊。
    """
    poses = []
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # 跳過前四行的註解和統計資訊
    i = 4
    while i < len(lines):
        # 這是包含姿態資訊的行
        pose_line = lines[i].strip().split()
        image_id = int(pose_line[0])
        qw, qx, qy, qz = map(float, pose_line[1:5])
        tx, ty, tz = map(float, pose_line[5:8])
        camera_id = int(pose_line[8])
        name = pose_line[9]

        poses.append({
            'name': name,
            'image_id': image_id,
            'qw': qw, 'qx': qx, 'qy': qy, 'qz': qz,
            'tx': tx, 'ty': ty, 'tz': tz
        })

        # 跳過下一行的 2D points 資訊
        i += 2

    return pd.DataFrame(poses)


import json
from typing import Optional

def parse_frame_annotations(file_path: str) -> Optional[pd.DataFrame]:
    """
    Parses a nested JSON file of frame annotations and flattens it into a DataFrame.

    This function specifically uses pandas.json_normalize to handle the nested
    structure, creating separate columns for nested data like 'image.path' and 'viewpoint.R'.

    Args:
        file_path (str): The path to the 'frame_annotations.json' file.

    Returns:
        Optional[pd.DataFrame]: A DataFrame containing the flattened data,
                                or None if an error occurs.
    """
    try:
        # Open and load the file's content first
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        # Use json_normalize to flatten the data into a clean table
        ground_truth_df = pd.json_normalize(data)
        return ground_truth_df

    except FileNotFoundError:
        print(f"❌ Error: The file was not found at '{file_path}'")
        return None
    except json.JSONDecodeError:
        print(f"❌ Error: The file at '{file_path}' is not a valid JSON.")
        return None
    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
        return None

In [12]:
PREDICTED_PATH = '/media/daniel/storage1/2.research/1.3d-reconstruct/results/CO3D/110_13051_23361/text/images.txt'
GT_PATH = '/media/daniel/storage1/2.research/1.3d-reconstruct/dataset/CO3D/apple/frame_annotations.json'
TARGET_SEQUENCE = "110_13051_23361" # The sequence name from the file path

predicted_df = parse_images_txt(PREDICTED_PATH)
ground_truth_df = parse_frame_annotations(GT_PATH)

if predicted_df is not None and ground_truth_df is not None:
    # --- Filter Ground Truth to the specific sequence ---
    print(f"\n--- Filtering for sequence: {TARGET_SEQUENCE} ---")
    filtered_gt_df = ground_truth_df[ground_truth_df['sequence_name'] == TARGET_SEQUENCE].copy()
    print(f"Found {len(filtered_gt_df)} matching frames in ground truth.")

    # --- Prepare the key for merging ---
    filtered_gt_df['filename'] = filtered_gt_df['image.path'].str.split('/').str[-1]

    # --- Perform a CLEAN merge ---
    merged_df = pd.merge(
            predicted_df,
            filtered_gt_df,
            left_on='name',
            right_on='filename'
        )

    # --- Sort the final result by frame number ---
    print("\n--- Sorting by frame number to ensure correct sequence ---")
    merged_df.sort_values(by='frame_number', inplace=True)
    merged_df.reset_index(drop=True, inplace=True)

    print("\n--- Final, Cleaned, and Sorted Data ---")
    print(merged_df[['name', 'frame_number', 'qw', 'qx', 'qy', 'qz', 'tx', 'ty', 'tz', 'viewpoint.R', 'viewpoint.T']].head())

    # merged_df.to_csv('merged_data.csv', index=False)

    # import matplotlib.pyplot as plt

    # Extract ground truth translation components
    # gt_T = pd.DataFrame(merged_df['viewpoint.T'].to_list(), columns=['gt_tx', 'gt_ty', 'gt_tz'])

    # plt.figure(figsize=(12, 6))
    # plt.title('Predicted vs. Ground Truth Camera Trajectory (X-axis)')
    # plt.xlabel('Frame Number')
    # plt.ylabel('X Translation')
    # plt.plot(merged_df['frame_number'], merged_df['tx'], label='Predicted TX', marker='.')
    # plt.plot(merged_df['frame_number'], gt_T['gt_tx'], label='Ground Truth TX', marker='.')
    # plt.legend()
    # plt.grid(True)
    # plt.show()


--- Filtering for sequence: 110_13051_23361 ---
Found 202 matching frames in ground truth.

--- Sorting by frame number to ensure correct sequence ---

--- Final, Cleaned, and Sorted Data ---
              name  frame_number        qw        qx        qy        qz  \
0  frame000001.jpg             1  0.999999 -0.001639  0.000317 -0.000082   
1  frame000002.jpg             2  1.000000 -0.000355  0.000677 -0.000270   
2  frame000003.jpg             3  0.999999  0.000557  0.000702 -0.000634   
3  frame000004.jpg             4  0.999998  0.001311  0.001161 -0.000914   
4  frame000005.jpg             5  0.999995  0.002528  0.001562 -0.001090   

         tx        ty        tz  \
0  0.055844 -1.761651  0.985386   
1  0.041557 -1.742804  0.996667   
2  0.030375 -1.726939  1.008236   
3  0.027887 -1.714112  1.013429   
4  0.032747 -1.702738  1.019127   

                                         viewpoint.R  \
0  [[-0.9983327388763428, -0.007844997569918633, ...   
1  [[-0.9983288645744324, -

## Convert Quaternion to Rotation Matrix

In [None]:
import numpy as np
from scipy.spatial.transform import Rotation as R

post_processed_df = merged_df[['name', 'frame_number', 'qw', 'qx', 'qy', 'qz', 'tx', 'ty', 'tz','viewpoint.R', 'viewpoint.T']].copy()

def quat_to_matrix(row):
    # 四元數順序為 [x, y, z, w]
    quaternion = [row['qx'], row['qy'], row['qz'], row['qw']]    
    # 進行轉換
    rotation = R.from_quat(quaternion)
    return rotation.as_matrix() # 返回 3x3 numpy array

def trans_to_vector(row):
    return np.array([row['tx'], row['ty'], row['tz']])

post_processed_df['pred_viewpoint_R'] = post_processed_df.apply(quat_to_matrix, axis=1)
post_processed_df['pred_viewpoint_T'] = post_processed_df.apply(trans_to_vector, axis=1)

print(post_processed_df[['name', 'pred_viewpoint_R']])
print(post_processed_df[['name', 'pred_viewpoint_T']])

post_processed_df[['name', 'frame_number', 'pred_viewpoint_R', 'pred_viewpoint_T', 'viewpoint.R', 'viewpoint.T']].to_csv('post_processed_df.csv', index=False)



                name                                   pred_viewpoint_R
0    frame000001.jpg  [[0.9999997855363876, 0.00016294812650218226, ...
1    frame000002.jpg  [[0.9999989362109638, 0.000539926128578973, 0....
2    frame000003.jpg  [[0.9999982091438151, 0.0012692536169403813, 0...
3    frame000004.jpg  [[0.9999956351318855, 0.001830993221596829, 0....
4    frame000005.jpg  [[0.9999927454323816, 0.002187878140049406, 0....
..               ...                                                ...
197  frame000198.jpg  [[0.9993601278158781, -0.02268173731189705, 0....
198  frame000199.jpg  [[0.9993914489637736, -0.02147576701304468, 0....
199  frame000200.jpg  [[0.9994587303140047, -0.020892163129824025, 0...
200  frame000201.jpg  [[0.9994978211021945, -0.020653738528080848, 0...
201  frame000202.jpg  [[0.999522353469453, -0.01900210169045953, 0.0...

[202 rows x 2 columns]
                name                                   pred_viewpoint_T
0    frame000001.jpg  [0.055843573628215

## Compute Predicted Camera Pose RRE, RTE

In [None]:
from itertools import combinations

# 獲取所有列的索引
indices = post_processed_df.index.tolist()

# 生成所有可能的索引對 (i, j)
all_pairs = list(combinations(indices, 2))

rre_errors = [] # 用來存放所有成對的 RRE 結果
rte_errors = [] # 用來存放所有成對的 RTE 結果

# 遍歷所有成對的組合
for i, j in all_pairs:
    # --- 獲取旋轉矩陣 (並確保是 numpy array) ---
    R_pred_i = np.array(post_processed_df.loc[i, 'pred_viewpoint_R'])
    R_pred_j = np.array(post_processed_df.loc[j, 'pred_viewpoint_R'])
    R_gt_i = np.array(post_processed_df.loc[i, 'viewpoint.R'])
    R_gt_j = np.array(post_processed_df.loc[j, 'viewpoint.R'])
    
    # --- 獲取平移向量 (並確保是 numpy array) ---
    T_pred_i = np.array(post_processed_df.loc[i, 'pred_viewpoint_T'])
    T_pred_j = np.array(post_processed_df.loc[j, 'pred_viewpoint_T'])
    T_gt_i = np.array(post_processed_df.loc[i, 'viewpoint.T'])
    T_gt_j = np.array(post_processed_df.loc[j, 'viewpoint.T'])

    # -----------------------------------------------------------------
    # RRE 計算
    # -----------------------------------------------------------------
    R_relative_pred = np.dot(R_pred_j, R_pred_i.T)
    R_relative_gt = np.dot(R_gt_j, R_gt_i.T)
    error_matrix = np.dot(R_relative_pred, R_relative_gt.T)
    trace = np.trace(error_matrix)
    angle_rad = np.arccos(np.clip((trace - 1) / 2, -1.0, 1.0))
    angle_deg = np.rad2deg(angle_rad)
    rre_errors.append(angle_deg)

    # -----------------------------------------------------------------
    # RTE 計算
    # -----------------------------------------------------------------
    # 1. 計算相機中心點 C = -R^T * T
    C_pred_i = -np.dot(R_pred_i.T, T_pred_i)
    C_pred_j = -np.dot(R_pred_j.T, T_pred_j)
    C_gt_i = -np.dot(R_gt_i.T, T_gt_i)
    C_gt_j = -np.dot(R_gt_j.T, T_gt_j)
    
    # 2. 計算相對平移向量 (兩個相機中心點的連線)
    T_relative_pred = C_pred_j - C_pred_i
    T_relative_gt = C_gt_j - C_gt_i
    
    # 3. 正規化向量 (轉換為單位向量) 以計算夾角
    # 加上一個極小值 epsilon 防止除以零的錯誤
    epsilon = 1e-8
    T_relative_pred_norm = T_relative_pred / (np.linalg.norm(T_relative_pred) + epsilon)
    T_relative_gt_norm = T_relative_gt / (np.linalg.norm(T_relative_gt) + epsilon)
    
    # 4. 計算兩個單位向量之間的夾角
    dot_product = np.dot(T_relative_pred_norm, T_relative_gt_norm)
    angle_rad_t = np.arccos(np.clip(dot_product, -1.0, 1.0))
    angle_deg_t = np.rad2deg(angle_rad_t)
    rte_errors.append(angle_deg_t)


# --- 統計 RRE 結果 ---
rre_series = pd.Series(rre_errors)
print("--- 所有成對 RRE 的統計結果 ---")
print(rre_series.describe())
accuracy_at_15_deg_r = (rre_series < 15).mean() * 100
print(f"\nRRE@15°: {accuracy_at_15_deg_r:.2f}%")

# --- 統計 RTE 結果 ---
rte_series = pd.Series(rte_errors)
print("\n--- 所有成對 RTE 的統計結果 ---")
print(rte_series.describe())
accuracy_at_15_deg_t = (rte_series < 15).mean() * 100
print(f"\nRTE@15°: {accuracy_at_15_deg_t:.2f}%")

--- 所有成對 RRE 的統計結果 ---
count    20301.000000
mean        71.346005
std         42.467144
min          0.115154
25%         32.343159
50%         75.267177
75%        112.042494
max        143.455819
dtype: float64

RRE@15°: 12.87%

--- 所有成對 RTE 的統計結果 ---
count    20301.000000
mean        13.994741
std         10.998033
min          0.047213
25%          4.458407
50%         11.548816
75%         20.188088
max         99.419709
dtype: float64

RTE@15°: 59.80%


## Compute Predicted Camera Pose AUC

In [57]:
def calculate_auc(rre_series, rte_series, max_threshold=30):
    """
    計算 AUC (mAA) 分數
    
    參數:
    rre_series (pd.Series): 包含所有成對 RRE 誤差的 Series
    rte_series (pd.Series): 包含所有成對 RTE 誤差的 Series
    max_threshold (int): 積分的最高閾值，論文中通常為 30
    
    返回:
    float: AUC 分數
    """
    
    accuracies = []
    # 步驟 1: 設定閾值範圍從 1 到 max_threshold
    thresholds = range(1, max_threshold + 1)
    
    # 步驟 2: 計算每個閾值的「綜合準確率」
    for t in thresholds:
        # a. 計算 RRE 在閾值 t 下的準確率
        acc_rre = (rre_series < t).mean()
        
        # b. 計算 RTE 在閾值 t 下的準確率
        acc_rte = (rte_series < t).mean()
        
        # c. 取兩者中的最小值
        acc_at_t = min(acc_rre, acc_rte)
        
        accuracies.append(acc_at_t)
        
    # 步驟 3: 計算所有準確率的平均值
    auc_score = np.mean(accuracies)
    
    return auc_score

# --- 計算並打印 AUC@30° ---
auc_30 = calculate_auc(rre_series, rte_series, max_threshold=30)

# 論文中的 AUC 分數通常乘以 100
print(f"\nAUC@30° (mAA): {auc_30 * 100:.2f}")


AUC@30° (mAA): 12.69
