In [32]:
import pandas as pd
import ast
import numpy as np

# --- 1. Load and Preprocess the Data ---

# Load the new dataset from the provided CSV file name
# Read the W&B export and coerce the epoch count to a number; values that
# cannot be parsed become NaN (errors="coerce").
try:
    df = pd.read_csv("wandb_export_2025-09-18T12_21_56.746+08_00.csv")
    df["trainer.max_epochs"] = pd.to_numeric(df["trainer.max_epochs"], errors="coerce")
    print("Successfully loaded and processed the new CSV file.")
except FileNotFoundError:
    # The message has to name the exact file requested above; the previous
    # text dropped the month ("2025--18T...") and pointed at a different name.
    print(
        "Error: The file 'wandb_export_2025-09-18T12_21_56.746+08_00.csv' was not found."
    )
    exit()


# --- 2. Feature Engineering and Baseline Definition ---

# Create helper columns for analysis
# Short aliases for the dataset and prompt-type columns.
df["dataset"] = df["data_module.data.name"]
df["prompt_type"] = df["module.model.prompt_type"]

# Short config label: the third "_"-separated token of the run name.
# Assumes names look like {dataset}_{prompt}_{config}__{id} — TODO confirm
# that neither the dataset nor the prompt part ever contains an underscore.
df["config"] = df["Name"].apply(lambda run_name: run_name.split("_")[2])

# The trainable-module list is stored as text; parse it into a Python object
# and derive one boolean flag per module of interest.
df["trainable_modules_list"] = df["module.model.trainable_modules"].apply(
    ast.literal_eval
)
for flag_column, module_name in (
    ("has_memory", "memory_encoder"),
    ("has_image_encoder", "image_encoder"),
):
    df[flag_column] = df["trainable_modules_list"].apply(
        lambda modules, wanted=module_name: wanted in modules
    )

# Runs with max_epochs == 0 were never trained and act as the baseline;
# everything with a positive epoch count is a trained run.
epochs = df["trainer.max_epochs"]
df_baseline = df.loc[epochs == 0].copy()
df_trained = df.loc[epochs > 0].copy()


# --- 3. Establish Baseline for Comparison ---

# Extract baseline metrics.
# Baseline metric columns and the names they carry once attached to the
# trained runs.
metric_renames = {
    "eval/Dice": "Dice_baseline",
    "eval/mIoU": "mIoU_baseline",
    "eval/MAE": "MAE_baseline",
}
group_keys = ["dataset", "prompt_type"]

# One row of baseline metrics per (dataset, prompt_type).
# NOTE(review): if the export holds more than one baseline run for a pair,
# the join below repeats each trained row once per baseline — confirm the
# baseline is unique per pair.
baseline_metrics = df_baseline.set_index(group_keys)[list(metric_renames)].rename(
    columns=metric_renames
)

# Attach the baseline metrics to every trained run of the same pair, then
# sort so rows of the same dataset / prompt type sit together.
df_trained = df_trained.set_index(group_keys).join(baseline_metrics)
df_trained = df_trained.sort_index()


# --- 4. Calculate Improvement Over Baseline ---

# Calculate the percentage improvement of trained models over their untrained counterparts.
# Relative change of each trained metric against its baseline, in percent:
# (trained - baseline) / baseline * 100.
# For "MAE_reduction_%" the same formula is used, so a NEGATIVE value means
# the trained MAE is lower than the baseline.
for derived_column, trained_column, baseline_column in (
    ("Dice_improvement_%", "eval/Dice", "Dice_baseline"),
    ("mIoU_improvement_%", "eval/mIoU", "mIoU_baseline"),
    ("MAE_reduction_%", "eval/MAE", "MAE_baseline"),
):
    delta = df_trained[trained_column] - df_trained[baseline_column]
    df_trained[derived_column] = delta / df_trained[baseline_column] * 100

# NOTE(review): this replaces NaN with 0 in every column of the frame, so a
# run without a baseline is reported as "0 % change" — confirm that is wanted.
df_trained = df_trained.fillna(0)


# --- 5. Generate Summaries and Insights (with grouped output) ---

print("\n--- Trained Models DataFrame with Baselines and Improvements ---")
print("(Grouped by Dataset and Prompt Type)")

# Columns shown in the overview: the short config label followed by, for each
# metric, the trained value, the baseline value and the relative change.
display_cols = ["config"]
display_cols += ["eval/Dice", "Dice_baseline", "Dice_improvement_%"]
display_cols += ["eval/mIoU", "mIoU_baseline", "mIoU_improvement_%"]
display_cols += ["eval/MAE", "MAE_baseline", "MAE_reduction_%"]
print(df_trained.loc[:, display_cols])


print("\n\n--- Insight 1: What is the overall impact of training? ---")

# Mean relative change per dataset, averaged over every trained run.
improvement_columns = ["Dice_improvement_%", "mIoU_improvement_%", "MAE_reduction_%"]
training_impact = df_trained.groupby("dataset")[improvement_columns].mean()

print("Average improvement from training (vs. epoch 0 baseline):")
print(training_impact)

# NOTE(review): the summary below is fixed text, not derived from
# `training_impact`; re-check it whenever the export changes.
print(
    "\nSummary: Training provides a massive performance uplift. The 'endovis17' and 'endovis18' datasets,"
)
print(
    "which are more challenging, show a greater relative improvement from training compared to 'cholecseg8k'."
)


print("\n\n--- Insight 2: Among trained models, do memory modules help? ---")

# Mean Dice per dataset, one column per value of the `has_memory` flag.
memory_impact = (
    df_trained.groupby(["dataset", "has_memory"])["eval/Dice"].mean().unstack()
)
# Relabel by flag value rather than by position: a positional assignment
# raises a length mismatch when only one of the two groups is present and
# silently depends on the False/True column order.
memory_impact = memory_impact.rename(
    columns={False: "Without Memory", True: "With Memory"}
).rename_axis(columns=None)

print("Mean Dice Score for Trained Models:")
print(memory_impact)
# NOTE(review): fixed summary text, not derived from `memory_impact`.
print(
    "\nSummary: For trained models, including a memory module consistently improves the average Dice score across all datasets."
)


print(
    "\n\n--- Insight 3: Among trained models, does fine-tuning the Image Encoder help? ---"
)

# Mean Dice improvement per dataset, one column per value of the
# `has_image_encoder` flag.
image_encoder_impact = (
    df_trained.groupby(["dataset", "has_image_encoder"])["Dice_improvement_%"]
    .mean()
    .unstack()
)
# Relabel by flag value rather than by position: a positional assignment
# raises a length mismatch when only one of the two groups is present and
# silently depends on the False/True column order.
image_encoder_impact = image_encoder_impact.rename(
    columns={False: "Without Image Encoder", True: "With Image Encoder"}
).rename_axis(columns=None)

print("Average Dice Improvement (%) for Trained Models:")
print(image_encoder_impact)
# NOTE(review): fixed summary text, not derived from `image_encoder_impact`.
print(
    "\nSummary: Fine-tuning the image encoder provides another significant boost in performance on top of standard training,"
)
print("especially for the 'endovis17' and 'endovis18' datasets.")


print("\n\n--- Insight 4: Which prompt type is most effective for trained models? ---")

# Mean Dice for each (dataset, prompt type) pair, laid out with datasets as
# rows and prompt types as columns.
dice_by_pair = df_trained.groupby(["dataset", "prompt_type"])["eval/Dice"].mean()
prompt_performance = dice_by_pair.unstack()

print("Mean Dice Score by Prompt Type (Trained Models):")
print(prompt_performance)
# NOTE(review): fixed summary text, not derived from `prompt_performance`.
print("\nSummary: Even after training, 'mask' prompts deliver the highest performance.")
print(
    "'Point' prompts remain the least effective, confirming that prompt quality is crucial."
)


print(
    "\n\n--- Insight 5: What are the best overall trained configurations per dataset? ---"
)
# `df_trained` is indexed by (dataset, prompt_type), and that label repeats
# for every configuration. `idxmax` returns labels, so looking them up on the
# indexed frame would return every run sharing the winning label rather than
# the single best run. Flatten to a unique positional index first.
trained_flat = df_trained.reset_index()
best_row_labels = trained_flat.groupby("dataset")["eval/Dice"].idxmax()
best_configs = (
    trained_flat.loc[best_row_labels]
    .sort_values(by="dataset")
    .reset_index(drop=True)
)
print("The top-performing configurations (based on Dice score) are:")
print(best_configs[["dataset", "Name", "eval/Dice", "Dice_improvement_%", "config"]])


print(
    "\n\n--- Insight 6: What is the single best fine-tuning configuration overall? ---"
)
# Average Dice, average Dice improvement and run count per short config
# label, best mean Dice first.
overall_performance = (
    df_trained.groupby("config")
    .agg(
        mean_dice=("eval/Dice", "mean"),
        mean_dice_improvement=("Dice_improvement_%", "mean"),
        run_count=("Name", "count"),
    )
    .sort_values(by="mean_dice", ascending=False)
)

print(
    "Average performance by fine-tuning configuration (across all datasets and prompts):"
)
print(overall_performance)

if overall_performance.empty:
    # Without trained runs there is no first row to report.
    print("\nSummary: No trained runs are available, so no best configuration can be reported.")
else:
    best_config_name = overall_performance.index[0]
    best_config_stats = overall_performance.iloc[0]

    print(f"\nSummary: The best overall fine-tuning configuration is '{best_config_name}'.")
    print(
        f"On average, this configuration achieves a Dice score of {best_config_stats['mean_dice']:.4f} "
        f"and provides a {best_config_stats['mean_dice_improvement']:.2f}% improvement over the baseline."
    )
    # The interpretation below describes one specific configuration, so it is
    # printed only when that configuration is the computed winner; otherwise
    # it would contradict the line printed just above.
    if best_config_name == "mem+md+pe+ie":
        print(
            "This configuration (mem+md+pe+ie) includes the memory modules and fine-tunes all available encoders."
        )
        print(
            "This indicates that for the highest and most robust performance, training all available components is the most effective strategy."
        )

Successfully loaded and processed the new CSV file.

--- Trained Models DataFrame with Baselines and Improvements ---
(Grouped by Dataset and Prompt Type)
                               config  eval/Dice  Dice_baseline  Dice_improvement_%  eval/mIoU  mIoU_baseline  mIoU_improvement_%  eval/MAE  MAE_baseline  MAE_reduction_%
dataset     prompt_type                                                                                                                                                   
cholecseg8k box              md+pe+ie     0.8308         0.7963              4.3332     0.7627         0.7168              6.4015    1.4159        2.6365         -46.2979
            box          mem+md+pe+ie     0.8382         0.7963              5.2626     0.7696         0.7168              7.3631    1.4044        2.6365         -46.7328
            box                 md+pe     0.8147         0.7963              2.3040     0.7412         0.7168              3.4100    1.6169        2.6365        

In [33]:
# Bare expression: selects the overview columns of the trained-run frame so
# that the notebook shows them as this cell's output.
df_trained[display_cols]

Unnamed: 0_level_0,Unnamed: 1_level_0,config,eval/Dice,Dice_baseline,Dice_improvement_%,eval/mIoU,mIoU_baseline,mIoU_improvement_%,eval/MAE,MAE_baseline,MAE_reduction_%
dataset,prompt_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cholecseg8k,box,md+pe+ie,0.8308,0.7963,4.3332,0.7627,0.7168,6.4015,1.4159,2.6365,-46.2979
cholecseg8k,box,mem+md+pe+ie,0.8382,0.7963,5.2626,0.7696,0.7168,7.3631,1.4044,2.6365,-46.7328
cholecseg8k,box,md+pe,0.8147,0.7963,2.304,0.7412,0.7168,3.41,1.6169,2.6365,-38.6708
cholecseg8k,box,md,0.8146,0.7963,2.2928,0.7411,0.7168,3.388,1.6182,2.6365,-38.6242
cholecseg8k,box,mem+md+pe,0.817,0.7963,2.5932,0.7447,0.7168,3.8949,1.6192,2.6365,-38.5853
cholecseg8k,box,mem+md,0.817,0.7963,2.594,0.7447,0.7168,3.8978,1.6144,2.6365,-38.7669
cholecseg8k,box,mem,0.8025,0.7963,0.782,0.7265,0.7168,1.3524,1.9916,2.6365,-24.4598
cholecseg8k,mask,mem+md+pe+ie,0.8606,0.8515,1.067,0.8057,0.7932,1.5775,1.277,1.7016,-24.9505
cholecseg8k,mask,md+pe+ie,0.8577,0.8515,0.7271,0.8025,0.7932,1.1663,1.3093,1.7016,-23.0567
cholecseg8k,mask,md+pe,0.8547,0.8515,0.3784,0.7988,0.7932,0.6952,1.3564,1.7016,-20.2852


In [None]:
import pandas as pd
import ast
import numpy as np

# --- 1. Load and Preprocess the Data ---

# Read the W&B export; only the read itself is guarded, the post-processing
# runs in the `else` branch once the file has been loaded.
try:
    df = pd.read_csv('wandb_export_2025-09-18T12_21_56.746+08_00.csv')
except FileNotFoundError:
    print("Error: The file 'wandb_export_2025-09-18T12_21_56.746+08_00.csv' was not found.")
    exit()
else:
    # Epoch counts that cannot be parsed as numbers become NaN.
    df['trainer.max_epochs'] = pd.to_numeric(df['trainer.max_epochs'], errors='coerce')
    print("Successfully loaded and processed the new CSV file.")


# --- 2. Feature Engineering ---

df['dataset'] = df['data_module.data.name']
df['prompt_type'] = df['module.model.prompt_type']
# Every run keeps its own config label (third "_"-separated token of the run
# name). Collapsing each label that contains 'mem' to 'mem' at this point
# would also relabel the trained 'mem+md', 'mem+md+pe' and 'mem+md+pe+ie'
# runs, so they could no longer be told apart downstream.
df['config'] = df['Name'].apply(lambda x: x.split('_')[2])


# --- 3. Prepare Trained and Baseline DataFrames ---

# Separate the data into trained models and their baselines
df_trained = df[df['trainer.max_epochs'] > 0].copy()
df_baseline = df[df['trainer.max_epochs'] == 0].copy()

# The config name for 'mem' is different in the baseline vs trained runs, so
# it is standardized on the baseline side only.
df_baseline.loc[df_baseline['config'].str.contains('mem'), 'config'] = 'mem'

# Create the performance string for TRAINED runs: "Dice / mIoU"
df_trained['perf_str_trained'] = (
    df_trained['eval/Dice'].round(3).astype(str) + ' / ' +
    df_trained['eval/mIoU'].round(3).astype(str)
)

# Create the performance string for BASELINE runs: "(Dice / mIoU)"
df_baseline['perf_str_baseline'] = (
    '(' + df_baseline['eval/Dice'].round(3).astype(str) + ' / ' +
    df_baseline['eval/mIoU'].round(3).astype(str) + ')'
)


# --- 4. Combine and Pivot the Data ---

# Set a common index for joining
join_cols = ['dataset', 'prompt_type', 'config']
df_trained.set_index(join_cols, inplace=True)
df_baseline.set_index(join_cols, inplace=True)

# Join the trained and baseline data on their common dataset, prompt, and config
# We only need the performance string from the baseline
combined_df = df_trained.join(df_baseline[['perf_str_baseline']])

# Trained runs without a baseline of their own config fall back to the 'mem'
# baseline of the same (dataset, prompt_type).
is_mem_baseline = df_baseline.index.get_level_values('config') == 'mem'
mem_baseline = df_baseline.loc[is_mem_baseline, 'perf_str_baseline'].droplevel('config')
# Keep one baseline per pair so the positional lookup below is unambiguous.
mem_baseline = mem_baseline[~mem_baseline.index.duplicated(keep='first')]
fallback = mem_baseline.reindex(combined_df.index.droplevel('config')).to_numpy()
# Positional fill: `combined_df` may carry repeated index labels, which
# label-based alignment cannot handle.
combined_df['perf_str_baseline'] = combined_df['perf_str_baseline'].where(
    combined_df['perf_str_baseline'].notna(), fallback
)

# Create the final cell value, e.g., "0.861 / 0.806 (0.852 / 0.793)"
combined_df['final_perf_str'] = combined_df['perf_str_trained'] + ' ' + combined_df['perf_str_baseline']





Successfully loaded and processed the new CSV file.


Unnamed: 0_level_0,config,mem
dataset,prompt_type,Unnamed: 2_level_1
cholecseg8k,box,0.838 / 0.77 (0.796 / 0.717)
cholecseg8k,mask,0.861 / 0.806 (0.852 / 0.793)
cholecseg8k,point,0.735 / 0.65 (0.666 / 0.569)
endovis17,box,0.802 / 0.755 (0.803 / 0.748)
endovis17,mask,0.826 / 0.779 (0.817 / 0.767)
endovis17,point,0.825 / 0.778 (0.717 / 0.646)
endovis18,box,0.391 / 0.368 (0.385 / 0.351)
endovis18,mask,0.388 / 0.363 (0.387 / 0.356)
endovis18,point,0.344 / 0.322 (0.337 / 0.289)


In [46]:
import pandas as pd
import ast
import numpy as np

# --- 1. Load and Preprocess the Data ---

# Read the W&B export; only the read itself is guarded, the post-processing
# runs in the `else` branch once the file has been loaded.
try:
    df = pd.read_csv('wandb_export_2025-09-18T12_21_56.746+08_00.csv')
except FileNotFoundError:
    print("Error: The file 'wandb_export_2025-09-18T12_21_56.746+08_00.csv' was not found.")
    exit()
else:
    # Epoch counts that cannot be parsed as numbers become NaN.
    df['trainer.max_epochs'] = pd.to_numeric(df['trainer.max_epochs'], errors='coerce')
    print("Successfully loaded and processed the new CSV file.")


# --- 2. Feature Engineering ---

# Short aliases plus the config label (third "_"-separated token of the run name).
df['dataset'] = df['data_module.data.name']
df['prompt_type'] = df['module.model.prompt_type']
df['config'] = df['Name'].apply(lambda run_name: run_name.split('_')[2])


# --- 3. Prepare Trained and Baseline DataFrames ---

# Positive epoch count -> trained run; zero epochs -> untrained baseline.
max_epochs = df['trainer.max_epochs']
df_trained = df.loc[max_epochs > 0].copy()
df_baseline = df.loc[max_epochs == 0].copy()


def _format_metrics(frame):
    """Return "Dice / mIoU / MAE" text per row (3, 3 and 2 decimals)."""
    dice_text = frame['eval/Dice'].round(3).astype(str)
    miou_text = frame['eval/mIoU'].round(3).astype(str)
    mae_text = frame['eval/MAE'].round(2).astype(str)
    return dice_text + ' / ' + miou_text + ' / ' + mae_text


# Trained runs show the plain triple; baselines show it in parentheses.
df_trained['perf_str_trained'] = _format_metrics(df_trained)
df_baseline['perf_str_baseline'] = '(' + _format_metrics(df_baseline) + ')'


# --- 4. Combine and Pivot the Data ---

# Standardize baseline config names for joining
# Every baseline config that mentions 'mem' is relabelled to plain 'mem' so
# it can be matched against the trained 'mem' runs.
baseline_has_mem = df_baseline['config'].str.contains('mem')
df_baseline.loc[baseline_has_mem, 'config'] = 'mem'

join_cols = ['dataset', 'prompt_type', 'config']
df_trained = df_trained.set_index(join_cols)
df_baseline = df_baseline.set_index(join_cols)

# Exact match first: a baseline with the same dataset, prompt type and config.
combined_df = df_trained.join(df_baseline[['perf_str_baseline']]).reset_index()

# Fallback: the 'mem' baseline of the same (dataset, prompt_type), attached as
# an extra column and used wherever the exact match found nothing.
mem_baselines = df_baseline[df_baseline.index.get_level_values('config') == 'mem'].copy()
mem_baselines = mem_baselines.droplevel('config')

combined_df = pd.merge(
    combined_df,
    mem_baselines[['perf_str_baseline']],
    how='left',
    on=['dataset', 'prompt_type'],
    suffixes=('', '_mem_baseline'),
)

fallback_baseline = combined_df['perf_str_baseline_mem_baseline']
combined_df['perf_str_baseline'] = combined_df['perf_str_baseline'].fillna(fallback_baseline)

# Cell text: trained metrics, a space, then the baseline (empty if none found).
baseline_text = combined_df['perf_str_baseline'].fillna('')
combined_df['final_perf_str'] = combined_df['perf_str_trained'] + ' ' + baseline_text

# Configs become rows; (dataset, prompt_type) pairs become columns. When a
# cell has several candidates, the first one encountered is kept.
final_table = pd.pivot_table(
    combined_df,
    values='final_perf_str',
    index='config',
    columns=['dataset', 'prompt_type'],
    aggfunc='first',
)


# --- 5. Final Formatting for Publication ---

# Define a logical order for the rows (the index)
# Preferred top-to-bottom order of the config rows: configurations without
# memory first, then the memory-based ones.
row_order = ['md', 'md+pe', 'md+pe+ie', 'mem', 'mem+md', 'mem+md+pe', 'mem+md+pe+ie']

# Keep only the configs that actually occur in the table, in that order.
available_configs = set(final_table.index)
existing_rows = [name for name in row_order if name in available_configs]
final_table = final_table.reindex(index=existing_rows)

# Function to mark the best score in each COLUMN with an asterisk
def mark_best_dice_in_column(column):
    """Return a copy of *column* with the best trained Dice cell(s) starred.

    Each text cell is expected to start with the trained Dice score, i.e. the
    text before the first " / " of the part that precedes " (". Cells that
    are not text, or whose leading token is not a number, are ignored.

    Args:
        column: pd.Series of table cells (one table column).

    Returns:
        A new pd.Series with "*" appended to every cell whose trained Dice
        equals the column maximum. The input Series is left untouched, and a
        column without any parsable score is returned unchanged.
    """

    def _leading_dice_text(cell):
        # Non-text cells (e.g. NaN for a missing combination) carry no score.
        if not isinstance(cell, str):
            return None
        return cell.split(' (')[0].split(' / ')[0]

    # Element-wise parsing instead of the `.str` accessor, so an all-NaN
    # (float) column does not raise.
    dice_scores = pd.to_numeric(column.map(_leading_dice_text), errors='coerce')

    # Work on a copy: writing into the argument would alter the caller's data.
    marked = column.copy()
    if dice_scores.notna().any():
        # Star every tied maximum; scores are already rounded in the cell
        # text, so starring only one of several equal values would mislead.
        is_best = dice_scores == dice_scores.max()
        marked.loc[is_best] = marked.loc[is_best] + '*'
    return marked

# *** ACTION: Apply the function column-wise (axis=0) ***
# Star the best trained Dice in every column, then show missing
# config / column combinations as a dash.
final_table = final_table.apply(mark_best_dice_in_column, axis='index').fillna('-')


# --- 6. Display the Final Result ---

print("\n\n--- Comprehensive Performance Table for Publication (Transposed) ---")
print("\nTable 1: Performance of all model configurations across datasets and prompt types.")
print("Each cell shows: Trained Dice / mIoU / MAE (Baseline Dice / mIoU / MAE). Best trained Dice score per column is marked with *.")

# Lift the column limit and widen the console so the table is printed without
# truncation.
pd.options.display.max_columns = None
pd.options.display.width = 1000
print(final_table)

Successfully loaded and processed the new CSV file.


--- Comprehensive Performance Table for Publication (Transposed) ---

Table 1: Performance of all model configurations across datasets and prompt types.
Each cell shows: Trained Dice / mIoU / MAE (Baseline Dice / mIoU / MAE). Best trained Dice score per column is marked with *.
dataset                                       cholecseg8k                                                                                                                               endovis17                                                                                                                                 endovis18                                                                                            
prompt_type                                           box                                         mask                                        point                                           box                                          mask    