# The following code allows to analyse the data collected from the VLM-Framework.

## Data loading

In [26]:
# First uploading the file to google colab (if needed)
from google.colab import files
uploaded = files.upload()

Saving metrics.csv to metrics.csv


In [33]:
# Reading the csv into a dataframe
import pandas as pd

df = pd.read_csv(
    "metrics.csv",
    sep=";",        # column separator
    decimal=","     # decimal separator
)

df.head()

Unnamed: 0,model_name,initial_cam_parameters,question,response_correctness,navigation_correctness,iterations,total_translation,total_rotation,elapsed_time
0,gpt-5.1,"{""translation"": [191950.98509773708, 87732.862...",Does this column have ram protection?,1,1,1,0.0,0,4.99
1,gpt-5.1,"{""translation"": [191950.98509773708, 87732.862...",Does this column have ram protection?,1,1,1,0.0,0,5.4
2,gpt-5.1,"{""translation"": [191950.98509773708, 87732.862...",Does this column have ram protection?,1,1,1,0.0,0,4.07
3,gpt-5.1,"{""translation"": [191950.98509773708, 87732.862...",Does this column have ram protection?,1,1,1,0.0,0,4.69
4,gpt-5.1,"{""translation"": [191950.98509773708, 87732.862...",Does this column have ram protection?,1,1,1,0.0,0,6.05


## Computing averages per question

In [38]:
# Preserving order of appearance from the CSV
df["model_name"] = pd.Categorical(df["model_name"], categories=df["model_name"].unique(), ordered=True)
df["question"]   = pd.Categorical(df["question"], categories=df["question"].unique(), ordered=True)

# Aggregating by question and model
aggregated_df = (
    df.groupby(["question", "model_name"], observed=True)
      .agg({
          "response_correctness": "mean",
          "navigation_correctness": "mean",
          "iterations": "mean",
          "total_translation": "mean",
          "total_rotation": "mean",
          "elapsed_time": "mean"
      })
      .reset_index()
      .sort_values(["question", "model_name"])
)


aggregated_df


Unnamed: 0,question,model_name,response_correctness,navigation_correctness,iterations,total_translation,total_rotation,elapsed_time
0,Does this column have ram protection?,gpt-5.1,1.0,1.0,1.0,0.0,0.0,5.04
1,Does this column have ram protection?\n,Qwen/Qwen3-VL-30B-A3B-Instruct:novita,0.8,1.0,3.4,280.0,0.0,18.304
2,How many robots are in this cell?,gpt-5.1,0.2,1.0,1.0,0.0,0.0,4.572
3,How many robots are in this cell?,Qwen/Qwen3-VL-30B-A3B-Instruct:novita,0.2,1.0,1.0,0.0,0.0,5.178
4,Would another robot fit within this first prod...,gpt-5.1,0.0,0.4,3.4,748.632,59.0,28.322
5,Would another robot fit within this first prod...,Qwen/Qwen3-VL-30B-A3B-Instruct:novita,0.0,0.0,4.8,380.0,4.0,28.364
6,Are the control cabinets behind the second col...,gpt-5.1,0.4,0.4,4.4,975.668,60.0,37.866
7,Are the control cabinets behind the second col...,Qwen/Qwen3-VL-30B-A3B-Instruct:novita,0.0,0.6,7.0,1013.43,1.0,40.224


## Computing total averages

In [35]:
# Computing averages per Model & across all questions
overall_df = (
    df.groupby(["model_name"])
      .agg({
          "response_correctness": "mean",
          "navigation_correctness": "mean",
          "iterations": "mean",
          "total_translation": "mean",
          "total_rotation": "mean",
          "elapsed_time": "mean"
      })
      .reset_index()
)

# This renames the question
overall_df["question"] = "ALL"

# Changing the column order to match the aggregated_df
overall_df = overall_df[aggregated_df.columns]

overall_df

  df.groupby(["model_name"])


Unnamed: 0,question,model_name,response_correctness,navigation_correctness,iterations,total_translation,total_rotation,elapsed_time
0,ALL,gpt-5.1,0.4,0.7,2.45,431.075,29.75,18.95
1,ALL,Qwen/Qwen3-VL-30B-A3B-Instruct:novita,0.25,0.65,4.05,418.3575,1.25,23.0175


## Aggregating and pivoting

In [36]:
# Combining both df
combined_df = pd.concat([aggregated_df, overall_df], ignore_index=True)

# Creating column labels (Q+model) and pivoting
combined_df["col_label"] = (
    combined_df["question"].astype(str) + " + " + combined_df["model_name"].astype(str)
)

pivot_df = (
    combined_df
    .set_index("col_label")
    .drop(columns=["question", "model_name"])
    .T
)

pivot_df

col_label,Does this column have ram protection? + gpt-5.1,Does this column have ram protection?\n + Qwen/Qwen3-VL-30B-A3B-Instruct:novita,How many robots are in this cell? + gpt-5.1,How many robots are in this cell? + Qwen/Qwen3-VL-30B-A3B-Instruct:novita,Would another robot fit within this first production cell on the left? + gpt-5.1,Would another robot fit within this first production cell on the left? + Qwen/Qwen3-VL-30B-A3B-Instruct:novita,Are the control cabinets behind the second column accessible for the worker? + gpt-5.1,Are the control cabinets behind the second column accessible for the worker? + Qwen/Qwen3-VL-30B-A3B-Instruct:novita,ALL + gpt-5.1,ALL + Qwen/Qwen3-VL-30B-A3B-Instruct:novita
response_correctness,1.0,0.8,0.2,0.2,0.0,0.0,0.4,0.0,0.4,0.25
navigation_correctness,1.0,1.0,1.0,1.0,0.4,0.0,0.4,0.6,0.7,0.65
iterations,1.0,3.4,1.0,1.0,3.4,4.8,4.4,7.0,2.45,4.05
total_translation,0.0,280.0,0.0,0.0,748.632,380.0,975.668,1013.43,431.075,418.3575
total_rotation,0.0,0.0,0.0,0.0,59.0,4.0,60.0,1.0,29.75,1.25
elapsed_time,5.04,18.304,4.572,5.178,28.322,28.364,37.866,40.224,18.95,23.0175
