In [23]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
import os
import json
import pandas as pd
import numpy as np
from datasets import load_dataset

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

In [25]:
vsibench = load_dataset("nyu-visionx/VSI-Bench")
df = vsibench["test"].to_pandas()

In [26]:
qdf = df[df["question_type"] == "object_rel_distance"].copy()
qdf.head(3)

Unnamed: 0,id,dataset,scene_name,question_type,question,ground_truth,options
1330,1334,arkitscenes,42446103,object_rel_distance,Measuring from the closest point of each objec...,A,"[A. chair, B. stool, C. stove, D. sofa]"
1331,1335,arkitscenes,42446103,object_rel_distance,Measuring from the closest point of each objec...,B,"[A. stove, B. table, C. stool, D. sofa]"
1332,1336,arkitscenes,42446103,object_rel_distance,Measuring from the closest point of each objec...,C,"[A. chair, B. tv, C. sofa, D. stove]"


In [27]:
qdf["question"].sample(3).values

array(['Measuring from the closest point of each object, which of these objects (refrigerator, bed, window, trash bin) is the closest to the door?',
       'Measuring from the closest point of each object, which of these objects (whiteboard, table, heater, chair) is the closest to the telephone?',
       'Measuring from the closest point of each object, which of these objects (table, cushion, sofa, clock) is the closest to the door?'],
      dtype=object)

In [28]:
# --- 1. Extract Target and GT Object Name ---
# Extract target object (last object mentioned in question)
qdf[["object_1", "object_2", "object_3", "object_4", "target_object"]] = qdf["question"].str.extract(r'which of these objects \((.*?), (.*?), (.*?), (.*?)\) is the closest to the (.*?)\?$')

# Extract GT object name from options
qdf["gt_idx"] = qdf["ground_truth"].apply(lambda x: "ABCD".index(x))
qdf["gt_option"] = qdf.apply(lambda row: row["options"][row["gt_idx"]], axis=1)
# Ensure no leading/trailing spaces in gt object name
qdf["gt_object"] = qdf["gt_option"].apply(lambda x: x.split(". ")[-1].strip())

qdf["tgt_gt_pair"] = qdf.apply(lambda row: "-".join(sorted([row["target_object"], row["gt_object"]])), axis=1)
qdf["tgt_gt_ord_pair"] = qdf.apply(lambda row: "-".join([row["target_object"], row["gt_object"]]), axis=1)

In [29]:
# -----------------------------------------------------------------------------
# 1.  Helper that injects frequency features, given training‑set counts
# -----------------------------------------------------------------------------

def add_frequency_features(df: pd.DataFrame,
                            gt_obj_counts: pd.Series,
                            global_pair_counts: pd.Series,
                            global_ordered_pair_counts: pd.Series) -> pd.DataFrame:
    """Return *new* DataFrame with leakage‑free frequency features added."""
    df = df.copy()
    for i in range(4):
        df[f"opt_{i}_option_freq"] = df[f"object_{i+1}"].map(gt_obj_counts).fillna(0)
        df[f"opt_{i}_tgt_option_pair_freq"] = df["tgt_gt_pair"].map(global_pair_counts).fillna(0)
        df[f"opt_{i}_tgt_option_ord_pair_freq"] = df["tgt_gt_ord_pair"].map(global_ordered_pair_counts).fillna(0)

    df["max_option_freq"] = df[[f"opt_{i}_option_freq" for i in range(4)]].max(axis=1)
    df["max_tgt_option_pair_freq"] = df[[f"opt_{i}_tgt_option_pair_freq" for i in range(4)]].max(axis=1)
    df["max_tgt_option_ord_pair_freq"] = df[[f"opt_{i}_tgt_option_ord_pair_freq" for i in range(4)]].max(axis=1)
    return df

In [30]:
import numpy as np # Added for mean/std calculation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer # Added make_scorer for cross_val_score
from sklearn.model_selection import StratifiedKFold, cross_val_score # Added KFold and cross_val_score
from sklearn.preprocessing import LabelEncoder


feature_cols = [
    'object_1', 'object_2', 'object_3', 'object_4', 'target_object',
    # 'option_prob', 'tgt_option_pair_prob', 'tgt_option_ord_pair_prob'
    'max_option_freq',
    'max_tgt_option_pair_freq', 'max_tgt_option_ord_pair_freq',
    'opt_0_option_freq', 'opt_0_tgt_option_pair_freq',
    'opt_0_tgt_option_ord_pair_freq', 'opt_1_option_freq',
    'opt_1_tgt_option_pair_freq', 'opt_1_tgt_option_ord_pair_freq',
    'opt_2_option_freq', 'opt_2_tgt_option_pair_freq',
    'opt_2_tgt_option_ord_pair_freq', 'opt_3_option_freq',
    'opt_3_tgt_option_pair_freq', 'opt_3_tgt_option_ord_pair_freq'
]
target_col = "ground_truth"

In [31]:
def evaluate_rf_bias_cv_no_leak(df: pd.DataFrame, n_splits: int = 5,
                                feature_cols=feature_cols,
                                target_col: str = target_col,
                                random_state: int = 42):
    """Return mean ± std accuracy of an RF trained with leakage‑free freq features."""

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(df, df[target_col]), 1):
        train_df = df.iloc[train_idx].copy()
        test_df  = df.iloc[test_idx].copy()

        # --- (i)  Re‑compute counts from TRAIN ONLY --------------------------
        gt_obj_counts = train_df["gt_object"].value_counts()
        global_pair_counts = train_df["tgt_gt_pair"].value_counts()
        global_ordered_pair_counts = train_df["tgt_gt_ord_pair"].value_counts()

        # --- (ii) Add leakage‑free freq features -----------------------------
        train_df = add_frequency_features(train_df, gt_obj_counts,
                                          global_pair_counts, global_ordered_pair_counts)
        test_df  = add_frequency_features(test_df,  gt_obj_counts,
                                          global_pair_counts, global_ordered_pair_counts)

        # --- (iii) Encode categorical columns --------------------------------
        X_train, X_test = train_df[feature_cols].copy(), test_df[feature_cols].copy()
        cat_cols = X_train.select_dtypes(include="object").columns
        encoders = {}
        for col in cat_cols:
            enc = LabelEncoder().fit(pd.concat([X_train[col], X_test[col]], axis=0).astype(str))
            X_train[col] = enc.transform(X_train[col].astype(str))
            X_test[col]  = enc.transform(X_test[col].astype(str))
            encoders[col] = enc

        y_train, y_test = train_df[target_col], test_df[target_col]

        # --- (iv) Fit + evaluate --------------------------------------------
        clf = RandomForestClassifier(n_estimators=100, random_state=random_state, n_jobs=-1)
        clf.fit(X_train, y_train)
        acc = clf.score(X_test, y_test)
        scores.append(acc)
        print(f"Fold {fold}: accuracy = {acc:.4f}")

    mean_acc, std_acc = np.mean(scores), np.std(scores)
    print(f"\nOverall: {mean_acc:.4f} ± {std_acc:.4f} (n_splits={n_splits})")
    return mean_acc, std_acc

evaluate_rf_bias_cv_no_leak(qdf, n_splits=5);

Fold 1: accuracy = 0.3803
Fold 2: accuracy = 0.3310
Fold 3: accuracy = 0.4085
Fold 4: accuracy = 0.3732
Fold 5: accuracy = 0.4014

Overall: 0.3789 ± 0.0272 (n_splits=5)


In [39]:
def evaluate_rf_bias_cv_no_leak(df: pd.DataFrame, n_splits: int = 5,
                                feature_cols=feature_cols,
                                target_col: str = target_col,
                                random_state: int = 42):
    """Return mean ± std accuracy and a DataFrame of RF feature importances."""

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []

    pbar = tqdm(enumerate(skf.split(df, df[target_col]), 1), total=n_splits, desc="CV Folds")
    for fold, (train_idx, test_idx) in pbar:
        train_df, test_df = df.iloc[train_idx].copy(), df.iloc[test_idx].copy()

        # (i) Re‑compute counts on *train* only ------------------------------
        gt_counts   = train_df["gt_object"].value_counts()
        pair_counts = train_df["tgt_gt_pair"].value_counts()
        ord_counts  = train_df["tgt_gt_ord_pair"].value_counts()

        # (ii) Inject leakage‑free features ---------------------------------
        train_df = add_frequency_features(train_df, gt_counts, pair_counts, ord_counts)
        test_df  = add_frequency_features(test_df,  gt_counts, pair_counts, ord_counts)

        # (iii) Encode categoricals -----------------------------------------
        X_train, X_test = train_df[feature_cols].copy(), test_df[feature_cols].copy()
        cat_cols = X_train.select_dtypes(include='object').columns
        encoders = {}
        for col in cat_cols:
            enc = LabelEncoder().fit(pd.concat([X_train[col], X_test[col]], axis=0).astype(str))
            X_train[col] = enc.transform(X_train[col].astype(str))
            X_test[col]  = enc.transform(X_test[col].astype(str))
            encoders[col] = enc

        y_train, y_test = train_df[target_col], test_df[target_col]

        # (iv) Fit and evaluate --------------------------------------------
        clf = RandomForestClassifier(n_estimators=100, random_state=random_state, n_jobs=-1)
        clf.fit(X_train, y_train)
        acc = clf.score(X_test, y_test)
        scores.append(acc)
        pbar.set_postfix({"avg_acc": f"{np.mean(scores):.2%}"})

    mean_acc, std_acc = np.mean(scores), np.std(scores)
    print(f"\nOverall: {mean_acc:.2%} ± {std_acc:.2%} (n_splits={n_splits})")
    print(f"Scores for each fold: {scores}")

    # ---------------------------------------------------------------------
    # 3.  Train final model on *all* data for feature importances (OK to see all rows)
    # ---------------------------------------------------------------------
    print("\nTraining final model on all data to get feature importances…")
    full_gt_counts   = df['gt_object'].value_counts()
    full_pair_counts = df['tgt_gt_pair'].value_counts()
    full_ord_counts  = df['tgt_gt_ord_pair'].value_counts()

    full_df = add_frequency_features(df, full_gt_counts, full_pair_counts, full_ord_counts)

    X_full = full_df[feature_cols].copy()
    y_full = full_df[target_col]

    cat_cols = X_full.select_dtypes(include='object').columns
    for col in cat_cols:
        enc = LabelEncoder().fit(X_full[col].astype(str))
        X_full[col] = enc.transform(X_full[col].astype(str))

    final_model = RandomForestClassifier(n_estimators=100, random_state=random_state, n_jobs=-1)
    final_model.fit(X_full, y_full)

    feature_importance_df = (
        pd.DataFrame({
            'feature': X_full.columns,
            'importance': final_model.feature_importances_
        })
        .sort_values('importance', ascending=False)
        .reset_index(drop=True)
    )

    print("\nFeature Importances (from final model trained on all data):")
    print(feature_importance_df)

    return mean_acc, std_acc, feature_importance_df

evaluate_rf_bias_cv_no_leak(qdf, n_splits=5);

CV Folds:   0%|          | 0/5 [00:00<?, ?it/s]

CV Folds: 100%|██████████| 5/5 [00:00<00:00,  9.44it/s, avg_acc=37.89%]



Overall: 37.89% ± 2.72% (n_splits=5)
Scores for each fold: [0.38028169014084506, 0.33098591549295775, 0.4084507042253521, 0.3732394366197183, 0.4014084507042254]

Training final model on all data to get feature importances…

Feature Importances (from final model trained on all data):
                           feature  importance
0                opt_2_option_freq    0.086204
1                opt_3_option_freq    0.084059
2                opt_1_option_freq    0.082356
3                         object_1    0.078862
4                opt_0_option_freq    0.076571
5                    target_object    0.076182
6                         object_2    0.075562
7                         object_3    0.075552
8                         object_4    0.075303
9                  max_option_freq    0.038726
10        max_tgt_option_pair_freq    0.029416
11  opt_2_tgt_option_ord_pair_freq    0.026300
12      opt_1_tgt_option_pair_freq    0.026260
13      opt_2_tgt_option_pair_freq    0.026083
14      o

In [40]:
evaluate_rf_bias_cv_no_leak(qdf, n_splits=50);

CV Folds: 100%|██████████| 50/50 [00:05<00:00,  9.33it/s, avg_acc=39.42%]



Overall: 39.42% ± 11.81% (n_splits=50)
Scores for each fold: [0.4666666666666667, 0.4, 0.3333333333333333, 0.4666666666666667, 0.4, 0.4666666666666667, 0.3333333333333333, 0.4, 0.4, 0.4, 0.35714285714285715, 0.21428571428571427, 0.35714285714285715, 0.35714285714285715, 0.35714285714285715, 0.2857142857142857, 0.35714285714285715, 0.35714285714285715, 0.21428571428571427, 0.2857142857142857, 0.5, 0.5714285714285714, 0.6428571428571429, 0.5714285714285714, 0.35714285714285715, 0.2857142857142857, 0.2857142857142857, 0.5, 0.5714285714285714, 0.2857142857142857, 0.35714285714285715, 0.7142857142857143, 0.5, 0.2857142857142857, 0.35714285714285715, 0.21428571428571427, 0.42857142857142855, 0.14285714285714285, 0.5, 0.5, 0.2857142857142857, 0.5714285714285714, 0.5714285714285714, 0.2857142857142857, 0.35714285714285715, 0.35714285714285715, 0.2857142857142857, 0.35714285714285715, 0.5, 0.35714285714285715]

Training final model on all data to get feature importances…

Feature Importances (

In [41]:
evaluate_rf_bias_cv_no_leak(qdf, n_splits=100);

CV Folds:   0%|          | 0/100 [00:00<?, ?it/s]

CV Folds: 100%|██████████| 100/100 [00:10<00:00,  9.22it/s, avg_acc=38.23%]



Overall: 38.23% ± 17.76% (n_splits=100)
Scores for each fold: [0.5, 0.5, 0.5, 0.25, 0.5, 0.5, 0.25, 0.25, 0.625, 0.5, 0.42857142857142855, 0.14285714285714285, 0.5714285714285714, 0.14285714285714285, 0.5714285714285714, 0.5714285714285714, 0.42857142857142855, 0.7142857142857143, 0.42857142857142855, 0.14285714285714285, 0.14285714285714285, 0.14285714285714285, 0.2857142857142857, 0.5714285714285714, 0.42857142857142855, 0.14285714285714285, 0.42857142857142855, 0.5714285714285714, 0.7142857142857143, 0.2857142857142857, 0.0, 0.42857142857142855, 0.14285714285714285, 0.14285714285714285, 0.42857142857142855, 0.5714285714285714, 0.14285714285714285, 0.2857142857142857, 0.14285714285714285, 0.5714285714285714, 0.2857142857142857, 0.5714285714285714, 0.14285714285714285, 0.42857142857142855, 0.2857142857142857, 0.14285714285714285, 0.2857142857142857, 0.5714285714285714, 0.5714285714285714, 0.2857142857142857, 0.2857142857142857, 0.5714285714285714, 0.42857142857142855, 0.2857142857142