In [11]:
import pandas as pd

# Load the benchmark results; the file is read as tab-separated even though
# it carries a .csv name.
file_path = "data.csv"
df = pd.read_csv(file_path, sep="\t")

print(df.columns)

# Metric columns of both systems that have to be numeric for the aggregations.
float_columns = [
    "ours_Accuracy",
    "ours_Steps",
    "ours_Latency",
    "ours_Total_Tokens",
    "ours_Cost",
    "caesura_Accuracy",
    "caesura_Steps",
    "caesura_Latency",
    "caesura_Total_Tokens",
    "caesura_Cost",
]

# Every comma in string cells becomes a dot (presumably decimal commas in the
# export -- confirm against the raw file) before the columns are cast to float.
df[float_columns] = df[float_columns].replace(",", ".", regex=True).astype(float)
# Per-category summaries of both systems.
# The same aggregation is needed for three grouping keys (Output_Type,
# Modality, Parallel/Sequential), so it lives in one helper instead of three
# copy-pasted sections.


def _metric_aggregations(system):
    """Return the named-aggregation spec for one system's metric columns.

    Parameters
    ----------
    system : str
        Column prefix of the system, e.g. ``"ours"`` or ``"caesura"``.

    Returns
    -------
    dict
        Maps each output column name to a ``(source column, aggregate)`` pair.
        The insertion order is the column order of the summary table.
    """
    return {
        f"{system}_Accuracy_sum": (f"{system}_Accuracy", "sum"),
        f"{system}_Steps_sum": (f"{system}_Steps", "sum"),
        f"{system}_Avg_Steps": (f"{system}_Steps", "mean"),
        f"{system}_Min_Steps": (f"{system}_Steps", "min"),
        f"{system}_Max_Steps": (f"{system}_Steps", "max"),
        f"{system}_Latency_sum": (f"{system}_Latency", "sum"),
        f"{system}_Avg_Latency": (f"{system}_Latency", "mean"),
        f"{system}_Min_Latency": (f"{system}_Latency", "min"),
        f"{system}_Max_Latency": (f"{system}_Latency", "max"),
        f"{system}_Sum_Total_Tokens": (f"{system}_Total_Tokens", "sum"),
        # NOTE(review): the only mean that is not named `Avg_*`. The name is
        # kept so existing references to this column keep working.
        f"{system}_Total_Tokens_mean": (f"{system}_Total_Tokens", "mean"),
        f"{system}_Min_Total_Tokens": (f"{system}_Total_Tokens", "min"),
        f"{system}_Max_Total_Tokens": (f"{system}_Total_Tokens", "max"),
        f"{system}_Cost_sum": (f"{system}_Cost", "sum"),
    }


def summarize_by(frame, key, systems=("ours", "caesura")):
    """Aggregate the metric columns of every system per value of ``key``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the ``<system>_Accuracy``, ``<system>_Steps``,
        ``<system>_Latency``, ``<system>_Total_Tokens`` and ``<system>_Cost``
        columns as numeric values, plus the grouping column ``key``.
    key : str
        Name of the column to group by.
    systems : tuple of str, optional
        Column prefixes to summarise, in output order.

    Returns
    -------
    pandas.DataFrame
        One row per group with, in order: the grouping column, the aggregated
        metrics of each system, ``Total_Count`` (rows in the group) and one
        ``<system>_Avg_Accuracy`` column per system.

    Raises
    ------
    KeyError
        If ``key`` or one of the metric columns is missing from ``frame``.
    """
    grouped = frame.groupby(key)

    spec = {}
    for system in systems:
        spec.update(_metric_aggregations(system))

    # Named aggregation yields flat, final column names directly, so no
    # MultiIndex flattening or renaming pass is needed afterwards.
    summary = grouped.agg(**spec)

    # The group sizes share the group index with `summary`, so plain
    # assignment aligns them without a separate merge.
    summary["Total_Count"] = grouped.size()

    # Accuracy in percent: summed accuracy over ALL rows of the group. Rows
    # with a missing accuracy still count in the denominator, which is why
    # this is not simply the group mean.
    for system in systems:
        summary[f"{system}_Avg_Accuracy"] = (
            summary[f"{system}_Accuracy_sum"] / summary["Total_Count"] * 100
        )

    return summary.reset_index()


output_type_grouped = summarize_by(df, "Output_Type")
modality_grouped = summarize_by(df, "Modality")
plan_grouped = summarize_by(df, "Parallel/Sequential")

# Group sizes as standalone tables; these module-level names are kept
# available in case other cells reference them.
output_type_count = output_type_grouped[["Output_Type", "Total_Count"]].copy()
modality_count = modality_grouped[["Modality", "Total_Count"]].copy()
plan_count = plan_grouped[["Parallel/Sequential", "Total_Count"]].copy()








Index(['id', 'questions', 'Output_Type', 'Modality', 'Parallel/Sequential',
       'caesura_Total_Tokens', 'caesura_Latency', 'caesura_Cost',
       'ours_Total_Tokens', 'ours_Latency', 'ours_Cost', 'ours_Steps',
       'ours_Accuracy', 'caesura_Accuracy', 'caesura_Steps'],
      dtype='object')


In [12]:
# Show the complete per-modality summary: every aggregated metric of both
# systems, the group size and the accuracy percentages.
modality_grouped

Unnamed: 0,Modality,ours_Accuracy_sum,ours_Steps_sum,ours_Avg_Steps,ours_Min_Steps,ours_Max_Steps,ours_Latency_sum,ours_Avg_Latency,ours_Min_Latency,ours_Max_Latency,...,caesura_Min_Latency,caesura_Max_Latency,caesura_Sum_Total_Tokens,caesura_Total_Tokens_mean,caesura_Min_Total_Tokens,caesura_Max_Total_Tokens,caesura_Cost_sum,Total_Count,ours_Avg_Accuracy,caesura_Avg_Accuracy
0,Multiple,4.5,107.0,7.133333,4.0,10.0,2515.03,167.668667,5.79,334.36,...,45.172,917.989,268918.0,17927.866667,9400.0,45082.0,1.64873,15,30.0,6.666667
1,Single,12.0,96.0,6.4,3.0,16.0,525.09,35.006,5.9,119.0,...,35.393,228.466,214014.0,14267.6,9257.0,45055.0,1.33104,15,80.0,46.666667


In [13]:

# Columns reported for our system in the per-category tables: accuracy in
# percent plus the summed steps, tokens, latency and cost.
requires = ["ours_Avg_Accuracy", "ours_Steps_sum", "ours_Sum_Total_Tokens", "ours_Latency_sum", "ours_Cost_sum"]

# One selection per summary table: the grouping column followed by the metrics.
ours_type_columns = ["Output_Type"] + requires
ours_modality_columns = ["Modality"] + requires
ours_plan_columns = ["Parallel/Sequential"] + requires

# Results of our system per output type.
output_type_grouped[ours_type_columns]


Unnamed: 0,Output_Type,ours_Avg_Accuracy,ours_Steps_sum,ours_Sum_Total_Tokens,ours_Latency_sum,ours_Cost_sum
0,Data Structure,35.0,67.0,223528.0,1330.4,0.89
1,Plot,62.5,52.0,118431.0,798.97,0.48
2,Plot|Data Structure,100.0,14.0,21970.0,107.05,0.1
3,Plot|Plot,100.0,14.0,50108.0,308.92,0.22
4,Single Value,50.0,56.0,114092.0,494.78,0.395213


In [14]:
# Per-modality summary restricted to the grouping column and the metric
# columns listed in `ours_modality_columns`.
modality_grouped[ours_modality_columns]

Unnamed: 0,Modality,ours_Avg_Accuracy,ours_Steps_sum,ours_Sum_Total_Tokens,ours_Latency_sum,ours_Cost_sum
0,Multiple,30.0,107.0,368917.0,2515.03,1.485213
1,Single,80.0,96.0,159212.0,525.09,0.6


In [15]:
# Parallel/Sequential summary restricted to the grouping column and the
# metric columns listed in `ours_plan_columns`.
plan_grouped[ours_plan_columns]

Unnamed: 0,Parallel/Sequential,ours_Avg_Accuracy,ours_Steps_sum,ours_Sum_Total_Tokens,ours_Latency_sum,ours_Cost_sum
0,Parallel,75.0,40.0,146846.0,909.01,0.59
1,Sequential,50.0,163.0,381283.0,2131.11,1.495213


In [19]:
# Every caesura-related column of the summary tables (not used by the
# selections below).
caesura_columns = [col for col in modality_grouped.columns if 'caesura' in col]

# Columns reported for the caesura system in the per-category tables: accuracy
# in percent plus the summed steps, tokens, latency and cost.
requires = ["caesura_Avg_Accuracy", "caesura_Steps_sum", "caesura_Sum_Total_Tokens", "caesura_Latency_sum", "caesura_Cost_sum"]

# One selection per summary table: the grouping column followed by the metrics.
caesura_type_columns = ["Output_Type"] + requires
caesura_modality_columns = ["Modality"] + requires
caesura_plan_columns = ["Parallel/Sequential"] + requires

# NOTE(review): the cells that follow index with `ours_modality_columns` and
# `ours_plan_columns`, so the `ours_*` names are rebound to the caesura lists
# to keep those cells showing the caesura tables. This overwrites the lists of
# our own system: re-running an earlier `ours` cell after this one silently
# displays caesura numbers. Switch the later cells to the `caesura_*` names
# and then delete these three aliases.
ours_type_columns = caesura_type_columns
ours_modality_columns = caesura_modality_columns
ours_plan_columns = caesura_plan_columns

# Results of the caesura system per output type.
output_type_grouped[caesura_type_columns]


Unnamed: 0,Output_Type,caesura_Avg_Accuracy,caesura_Steps_sum,caesura_Sum_Total_Tokens,caesura_Latency_sum,caesura_Cost_sum
0,Data Structure,30.0,116.0,183454.0,2683.034,1.14497
1,Plot,25.0,79.0,112732.0,1856.663,0.69209
2,Plot|Data Structure,0.0,17.0,30161.0,125.423,0.185165
3,Plot|Plot,0.0,16.0,21508.0,108.874,0.1358
4,Single Value,37.5,88.0,135077.0,1047.238001,0.821745


In [20]:
# NOTE(review): despite its name, `ours_modality_columns` holds the caesura
# metric columns at this point (it was reassigned in the preceding cell), so
# this displays the caesura results per modality.
modality_grouped[ours_modality_columns]

Unnamed: 0,Modality,caesura_Avg_Accuracy,caesura_Steps_sum,caesura_Sum_Total_Tokens,caesura_Latency_sum,caesura_Cost_sum
0,Multiple,6.666667,164.0,268918.0,4847.951,1.64873
1,Single,46.666667,152.0,214014.0,973.281,1.33104


In [17]:
# NOTE(review): despite its name, `ours_plan_columns` holds the caesura metric
# columns at this point (it was reassigned in an earlier cell), so this
# displays the caesura results per Parallel/Sequential group. The execution
# count of this cell is lower than that of the cells above it, so confirm the
# output with Restart Kernel -> Run All.
plan_grouped[ours_plan_columns]

Unnamed: 0,Parallel/Sequential,caesura_Avg_Accuracy,caesura_Steps_sum,caesura_Sum_Total_Tokens,caesura_Latency_sum,caesura_Cost_sum
0,Parallel,0.0,55.0,83887.0,491.105,0.525735
1,Sequential,33.333333,261.0,399045.0,5330.127,2.454035
