In [2]:
# Reload imported modules before each cell runs, so edits to local .py files are picked up live.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import json
import pandas as pd
import numpy as np
from datasets import load_dataset

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

In [4]:
# Load the VSI-Bench dataset and materialise its "test" split as a pandas DataFrame.
vsibench = load_dataset("nyu-visionx/VSI-Bench")
test_split = vsibench["test"]
df = test_split.to_pandas()

In [26]:
# Keep only the relative-distance questions. The explicit copy makes qdf an
# independent frame, so the columns added to it later never touch `df`.
is_rel_distance = df["question_type"].eq("object_rel_distance")
qdf = df.loc[is_rel_distance].copy()
qdf.head(3)

Unnamed: 0,id,dataset,scene_name,question_type,question,ground_truth,options
1330,1334,arkitscenes,42446103,object_rel_distance,Measuring from the closest point of each objec...,A,"[A. chair, B. stool, C. stove, D. sofa]"
1331,1335,arkitscenes,42446103,object_rel_distance,Measuring from the closest point of each objec...,B,"[A. stove, B. table, C. stool, D. sofa]"
1332,1336,arkitscenes,42446103,object_rel_distance,Measuring from the closest point of each objec...,C,"[A. chair, B. tv, C. sofa, D. stove]"


In [27]:
# Peek at a few raw question strings. The fixed seed keeps this preview
# identical across Restart & Run All instead of changing on every execution.
qdf["question"].sample(3, random_state=42).values

array(['Measuring from the closest point of each object, which of these objects (sofa, refrigerator, table, fireplace) is the closest to the tv?',
       'Measuring from the closest point of each object, which of these objects (pan, trash can, kettle, crate) is the closest to the bed?',
       'Measuring from the closest point of each object, which of these objects (bowl, blanket, chair, ceiling light) is the closest to the printer?'],
      dtype=object)

In [28]:
# --- 1. Extract Target and GT Object Name ---
# Every question has the fixed shape
#   "... which of these objects (o1, o2, o3, o4) is the closest to the <target>?"
# so a single regex yields the four candidate objects and the target object.
question_pattern = r'which of these objects \((.*?), (.*?), (.*?), (.*?)\) is the closest to the (.*?)\?$'
object_cols = ["object_1", "object_2", "object_3", "object_4", "target_object"]
qdf[object_cols] = qdf["question"].str.extract(question_pattern)

# A question that does not fit the pattern leaves NaN in the extracted columns,
# which would otherwise only surface below as an obscure str/float comparison
# error inside sorted(). Fail here with a message that names the problem.
unparsed = qdf[object_cols].isna().any(axis=1)
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} question(s) did not match the expected pattern, "
        f"e.g. {qdf.loc[unparsed, 'question'].iloc[0]!r}"
    )

# The ground truth is an option letter: turn it into a position in `options`,
# pick that option, then strip its "<letter>. " prefix to get the object name.
qdf["gt_idx"] = qdf["ground_truth"].map("ABCD".index)
qdf["gt_option"] = [opts[idx] for opts, idx in zip(qdf["options"], qdf["gt_idx"])]
# Split once only, so an object name that itself contains ". " is kept whole.
qdf["gt_object"] = qdf["gt_option"].map(lambda opt: opt.split(". ", 1)[-1].strip())

# Target/answer pair keys: order-insensitive (sorted) and ordered (target first).
qdf["tgt_gt_pair"] = ["-".join(sorted(pair)) for pair in zip(qdf["target_object"], qdf["gt_object"])]
qdf["tgt_gt_ord_pair"] = qdf["target_object"] + "-" + qdf["gt_object"]

In [29]:
# --- 2. Calculate Global Frequencies of Correct Pairs ---
def get_counts(col, df=None):
    """Return a ``{value: row count}`` dict for the values of column ``col``.

    Args:
        col: Name of the column whose values are counted.
        df: DataFrame to count over. When omitted, the notebook-level ``qdf``
            is used (looked up at call time), which is what existing calls rely on.

    Returns:
        dict mapping each distinct value of ``col`` to the number of rows holding it.
    """
    frame = qdf if df is None else df
    # groupby(...).size() is already a Series indexed by the column's values,
    # so it converts to the dict directly.
    return frame.groupby(col).size().to_dict()

# Frequency tables over qdf: how often each object is the correct answer, and
# how often each target/answer pair occurs (order-insensitive, then ordered).
gt_obj_counts, global_pair_counts, global_ordered_pair_counts = (
    get_counts(column) for column in ("gt_object", "tgt_gt_pair", "tgt_gt_ord_pair")
)

In [31]:
# --- 3. Per-Question Option Frequencies (Bias Features) ---
# For every question, look up how often each candidate object (alone, and paired
# with the target) occurs as a correct answer in the frequency tables built above.
# NOTE: those tables are counted over all of qdf, so each question's own answer
# is included in the frequencies it receives here.
option_cols = ["object_1", "object_2", "object_3", "object_4"]
freq_kinds = ("option_freq", "tgt_option_pair_freq", "tgt_option_ord_pair_freq")

bias_info = []
for _, row in qdf.iterrows():
    target_obj = row["target_object"]

    # One dict of frequencies per candidate option, in option order.
    per_option = []
    for opt_obj in row[option_cols].tolist():
        tgt_option_pair = "-".join(sorted([target_obj, opt_obj]))
        tgt_option_ord_pair = "-".join([target_obj, opt_obj])
        per_option.append({
            "option_freq": gt_obj_counts.get(opt_obj, 0),
            "tgt_option_pair_freq": global_pair_counts.get(tgt_option_pair, 0),
            "tgt_option_ord_pair_freq": global_ordered_pair_counts.get(tgt_option_ord_pair, 0),
        })

    # Maxima first, then the per-option values, so the resulting column order
    # is: id, max_*, opt_0_*, opt_1_*, opt_2_*, opt_3_*.
    row_info = {"id": row["id"]}
    for kind in freq_kinds:
        row_info[f"max_{kind}"] = max(freqs[kind] for freqs in per_option)
    for i, freqs in enumerate(per_option):
        for kind in freq_kinds:
            row_info[f"opt_{i}_{kind}"] = freqs[kind]

    bias_info.append(row_info)

# One row of features per question, keyed by id.
bias_df = pd.DataFrame(bias_info)

# Re-running this cell must not pile up `_x`/`_y` duplicates: drop any feature
# columns left on qdf by a previous run before merging the fresh ones.
stale_cols = [col for col in bias_df.columns if col != "id" and col in qdf.columns]
# validate= makes a duplicated id fail loudly instead of silently multiplying rows.
qdf = pd.merge(qdf.drop(columns=stale_cols), bias_df, on="id", how="left", validate="one_to_one")

In [32]:
# Show the first rows of qdf after the merge to check that the frequency columns are present.
qdf.head()

Unnamed: 0,id,dataset,scene_name,question_type,question,ground_truth,options,object_1,object_2,object_3,...,opt_0_tgt_option_ord_pair_freq,opt_1_option_freq,opt_1_tgt_option_pair_freq,opt_1_tgt_option_ord_pair_freq,opt_2_option_freq,opt_2_tgt_option_pair_freq,opt_2_tgt_option_ord_pair_freq,opt_3_option_freq,opt_3_tgt_option_pair_freq,opt_3_tgt_option_ord_pair_freq
0,1334,arkitscenes,42446103,object_rel_distance,Measuring from the closest point of each objec...,A,"[A. chair, B. stool, C. stove, D. sofa]",chair,stool,stove,...,12,10,8,8,3,0,0,40,11,11
1,1335,arkitscenes,42446103,object_rel_distance,Measuring from the closest point of each objec...,B,"[A. stove, B. table, C. stool, D. sofa]",stove,table,stool,...,0,123,25,25,10,8,8,40,11,11
2,1336,arkitscenes,42446103,object_rel_distance,Measuring from the closest point of each objec...,C,"[A. chair, B. tv, C. sofa, D. stove]",chair,tv,sofa,...,1,10,8,0,40,6,6,3,0,0
3,1337,arkitscenes,42446103,object_rel_distance,Measuring from the closest point of each objec...,D,"[A. chair, B. table, C. tv, D. sofa]",chair,table,tv,...,1,123,2,2,10,8,0,40,6,6
4,1338,arkitscenes,42446103,object_rel_distance,Measuring from the closest point of each objec...,A,"[A. chair, B. stove, C. table, D. tv]",chair,stove,table,...,1,3,0,0,123,2,2,10,8,0


In [33]:
# List every column now on qdf (the display above elides the middle ones).
qdf.columns

Index(['id', 'dataset', 'scene_name', 'question_type', 'question',
       'ground_truth', 'options', 'object_1', 'object_2', 'object_3',
       'object_4', 'target_object', 'gt_idx', 'gt_option', 'gt_object',
       'tgt_gt_pair', 'tgt_gt_ord_pair', 'max_option_freq',
       'max_tgt_option_pair_freq', 'max_tgt_option_ord_pair_freq',
       'opt_0_option_freq', 'opt_0_tgt_option_pair_freq',
       'opt_0_tgt_option_ord_pair_freq', 'opt_1_option_freq',
       'opt_1_tgt_option_pair_freq', 'opt_1_tgt_option_ord_pair_freq',
       'opt_2_option_freq', 'opt_2_tgt_option_pair_freq',
       'opt_2_tgt_option_ord_pair_freq', 'opt_3_option_freq',
       'opt_3_tgt_option_pair_freq', 'opt_3_tgt_option_ord_pair_freq'],
      dtype='object')

In [None]:
import numpy as np # Added for mean/std calculation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer # Added make_scorer for cross_val_score
from sklearn.model_selection import StratifiedKFold, cross_val_score # Added KFold and cross_val_score
from sklearn.preprocessing import LabelEncoder


# Model inputs: the object names parsed from the question, followed by the
# frequency features (three maxima, then three values per option 0..3).
# The label is the correct option letter.
object_feature_cols = ['object_1', 'object_2', 'object_3', 'object_4', 'target_object']
freq_feature_kinds = ('option_freq', 'tgt_option_pair_freq', 'tgt_option_ord_pair_freq')

feature_cols = (
    object_feature_cols
    + [f"max_{kind}" for kind in freq_feature_kinds]
    + [f"opt_{i}_{kind}" for i in range(4) for kind in freq_feature_kinds]
)
target_col = "ground_truth"

def evaluate_rf_bias_cv(
    df: pd.DataFrame,
    n_splits: int = 5,
    feature_cols=feature_cols,
    target_col=target_col,
    n_estimators: int = 100,
    random_state: int = 42,
) -> "tuple[float, float, pd.DataFrame | None]":
    """
    Trains and evaluates a RandomForestClassifier using stratified k-fold
    cross-validation on non-visual features of object_rel_distance questions to
    predict the ground truth answer ('A'/'B'/'C'/'D').

    Note:
        In this notebook the ``*_freq`` feature columns are counted over every
        row of ``qdf`` (answers included) before the folds are split, so each
        validation row's own label contributes to its features. Treat the
        reported accuracy as an optimistic estimate.

    Args:
        df: DataFrame holding the feature columns and the target column.
        n_splits: Number of folds for StratifiedKFold cross-validation. Reduced
            automatically when the rarest class has fewer samples than this.
        feature_cols: Names of the columns used as model inputs (any sequence).
            Non-numeric columns are label-encoded to integer codes.
        target_col: Name of the column holding the class label.
        n_estimators: Number of trees in each random forest.
        random_state: Seed used for both the forests and the fold shuffling.

    Returns:
        A tuple containing:
        - mean_accuracy (float): Mean accuracy across the cross-validation folds,
            or NaN if cross-validation raised a ValueError.
        - std_accuracy (float): Standard deviation of accuracy across the folds,
            or NaN if cross-validation raised a ValueError.
        - feature_importance_df (pd.DataFrame | None): DataFrame with feature importances
            from a model trained on the full dataset, or None if calculation fails.

    Raises:
        ValueError: If required columns are missing, no rows remain after
            dropping missing values, or the rarest class has fewer than 2 samples.
    """
    print(f"--- Starting RF Bias Evaluation (CV with {n_splits} splits) ---")

    # --- 1. Validate Input and Select Rows ---
    # Accept any sequence of column names (list, tuple, Index), not only a list.
    feature_cols = list(feature_cols)
    required_cols = feature_cols + [target_col]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"Error: Input DataFrame is missing required columns: {missing_cols}")
        raise ValueError(f"Missing required columns: {missing_cols}")

    # Work on the relevant columns only; dropna() returns a new frame, so the
    # caller's DataFrame is never modified.
    data = df[required_cols].dropna()
    n_dropped = len(df) - len(data)
    if n_dropped > 0:
        print(f"Warning: Dropped {n_dropped} rows due to missing values in features or target.")

    if data.empty:
        print("Error: DataFrame is empty after dropping NA. Cannot proceed.")
        raise ValueError("DataFrame is empty after dropping NA.")

    X = data[feature_cols]
    y = data[target_col]

    print(f"Using {len(X)} samples for evaluation.")

    # --- 2. Preprocess Features ---
    X_encoded = X.copy()
    # Everything that is not numeric/bool needs integer codes before the forest
    # can consume it (covers object columns as well as dedicated string dtypes).
    categorical_cols = X.select_dtypes(exclude=["number", "bool"]).columns

    if not categorical_cols.empty:
        print(f"Encoding categorical features: {list(categorical_cols)}")
        for col in categorical_cols:
            # Fitted on the entire column before cross-validation; the codes
            # depend only on the feature values, not on the label.
            X_encoded[col] = LabelEncoder().fit_transform(X_encoded[col].astype(str))
    else:
        print("No categorical features found to encode.")

    # --- 3. Define Model and Cross-Validation Strategy ---
    def make_model():
        # Single place for the hyperparameters shared by the CV model and the final model.
        return RandomForestClassifier(n_estimators=n_estimators, random_state=random_state, n_jobs=-1)

    # Stratified folds need at least one sample of every class per fold.
    min_samples_per_class = y.value_counts().min()
    if min_samples_per_class < n_splits:
        print(f"Warning: The least populated class has only {min_samples_per_class} samples.")
        if min_samples_per_class < 2:
            print("Error: The least populated class has less than 2 samples. Cannot perform stratified CV.")
            raise ValueError("Insufficient samples in the smallest class for stratified CV.")
        print(f"Reducing n_splits from {n_splits} to {min_samples_per_class} to match the smallest class size.")
        n_splits = int(min_samples_per_class)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # --- 4. Perform Cross-Validation ---
    print(f"Performing {n_splits}-Fold Cross-Validation...")
    try:
        scores = cross_val_score(make_model(), X_encoded, y, cv=cv, scoring='accuracy', n_jobs=-1)
        mean_accuracy = np.mean(scores)
        std_accuracy = np.std(scores)
        print(f"Scores for each fold: {scores}")
        print(f"Mean Accuracy: {mean_accuracy:.4f}")
        print(f"Standard Deviation of Accuracy: {std_accuracy:.4f}")
    except ValueError as e:
        # Deliberately best-effort: report the failure and hand back NaN so the
        # notebook keeps running.
        print(f"Error during cross-validation: {e}")
        print("This might happen if a fold doesn't contain samples from all classes, especially with small datasets.")
        return np.nan, np.nan, None

    # --- 5. Train Final Model and Get Feature Importances ---
    feature_importance_df = None
    print("\nTraining final model on all data to get feature importances...")
    try:
        final_model = make_model()
        final_model.fit(X_encoded, y)
        feature_importance_df = (
            pd.DataFrame({'feature': X_encoded.columns, 'importance': final_model.feature_importances_})
            .sort_values('importance', ascending=False)
            .reset_index(drop=True)
        )
        print("\nFeature Importances (from final model trained on all data):")
        print(feature_importance_df)
    except Exception as e:
        # Importances are a by-product; a failure here must not discard the CV scores.
        print(f"Could not calculate feature importances: {e}")

    print("--- Evaluation Complete ---")
    return mean_accuracy, std_accuracy, feature_importance_df

# Baseline run with 5 folds; the trailing ';' hides the returned tuple from the cell output.
evaluate_rf_bias_cv(qdf, n_splits=5);

--- Starting RF Bias Evaluation (CV with 5 splits) ---
Using 710 samples for evaluation.
Encoding categorical features: ['object_1', 'object_2', 'object_3', 'object_4', 'target_object']
Performing 5-Fold Cross-Validation...
Scores for each fold: [0.64084507 0.61971831 0.66901408 0.68309859 0.69014085]
Mean Accuracy: 0.6606
Standard Deviation of Accuracy: 0.0265

Training final model on all data to get feature importances...

Feature Importances (from final model trained on all data):
                           feature  importance
0   opt_2_tgt_option_ord_pair_freq    0.097340
1   opt_3_tgt_option_ord_pair_freq    0.091505
2   opt_0_tgt_option_ord_pair_freq    0.088609
3   opt_1_tgt_option_ord_pair_freq    0.087946
4       opt_1_tgt_option_pair_freq    0.058820
5       opt_2_tgt_option_pair_freq    0.057737
6       opt_3_tgt_option_pair_freq    0.056519
7       opt_0_tgt_option_pair_freq    0.053885
8                opt_3_option_freq    0.040185
9                opt_1_option_freq    0.0

In [36]:
# Same evaluation with 50 folds: each validation fold then holds only ~14 of the
# 710 samples, so the per-fold scores are far noisier than in the 5-fold run.
evaluate_rf_bias_cv(qdf, n_splits=50);

--- Starting RF Bias Evaluation (CV with 50 splits) ---
Using 710 samples for evaluation.
Encoding categorical features: ['object_1', 'object_2', 'object_3', 'object_4', 'target_object']
Performing 50-Fold Cross-Validation...
Scores for each fold: [0.73333333 0.66666667 0.6        0.66666667 0.6        0.53333333
 0.73333333 0.66666667 0.66666667 0.8        0.5        0.78571429
 0.57142857 0.64285714 0.71428571 0.71428571 0.64285714 0.71428571
 0.71428571 0.64285714 0.57142857 0.71428571 0.85714286 0.71428571
 0.78571429 0.64285714 0.92857143 0.71428571 0.57142857 0.57142857
 0.71428571 0.71428571 0.85714286 0.71428571 0.71428571 0.71428571
 0.71428571 0.57142857 0.78571429 0.85714286 0.78571429 0.64285714
 0.64285714 0.64285714 1.         0.42857143 0.92857143 0.57142857
 0.64285714 0.78571429]
Mean Accuracy: 0.6962
Standard Deviation of Accuracy: 0.1105

Training final model on all data to get feature importances...

Feature Importances (from final model trained on all data):
      

In [37]:
# And with 100 folds (~7 samples per validation fold).
evaluate_rf_bias_cv(qdf, n_splits=100);

--- Starting RF Bias Evaluation (CV with 100 splits) ---
Using 710 samples for evaluation.
Encoding categorical features: ['object_1', 'object_2', 'object_3', 'object_4', 'target_object']
Performing 100-Fold Cross-Validation...
Scores for each fold: [0.75       0.75       0.5        0.625      0.625      0.875
 0.75       0.75       0.625      0.625      0.42857143 0.71428571
 0.57142857 0.71428571 0.85714286 0.28571429 0.71428571 0.85714286
 0.71428571 0.42857143 0.71428571 0.42857143 0.42857143 0.71428571
 0.71428571 0.42857143 0.85714286 0.57142857 0.71428571 0.57142857
 0.71428571 0.71428571 0.71428571 0.42857143 0.71428571 0.85714286
 0.71428571 0.85714286 0.28571429 0.42857143 0.28571429 0.57142857
 0.42857143 1.         0.85714286 0.14285714 0.42857143 0.71428571
 0.57142857 0.71428571 0.42857143 0.85714286 0.85714286 1.
 0.57142857 0.85714286 0.85714286 0.71428571 0.85714286 0.71428571
 0.85714286 1.         0.71428571 0.85714286 1.         0.57142857
 0.71428571 0.71428571 0.5