In [4]:
import numpy as np
import pandas as pd

In [5]:
def evaluate_task_success(filename:str):
    df = pd.read_csv(filename, header=None, names=['dataset', 'train_or_val', 'task', 'start_timeframe', 'end_timeframe', 'action_loss', 'joint_loss', 'success'])
    #data = data.drop([0,1,2], axis=1)
    mean_action_loss = np.mean(df['action_loss'])
    mean_join_loss = np.mean(df['joint_loss'])
    success_rate = len(df.loc[df['success'] != -1]) / df.shape[0]
    return df, mean_action_loss, mean_join_loss, success_rate

In [3]:
df, mean_action_loss, mean_joint_loss, success_rate = evaluate_task_success('model2_eval-task_D_D-val-push_pink_block_right.csv')

In [4]:
df.head()

Unnamed: 0,dataset,train_or_val,task,start_timeframe,end_timeframe,action_loss,joint_loss,success
0,task_D_D,val,push_pink_block_right,404279,404343,0.322616,1.478312,-1
1,task_D_D,val,push_pink_block_right,414511,414575,0.455005,2.181839,-1
2,task_D_D,val,push_pink_block_right,414530,414573,0.147384,1.050131,-1
3,task_D_D,val,push_pink_block_right,404283,404347,0.384765,1.383245,-1
4,task_D_D,val,push_pink_block_right,39644,39708,0.516463,2.366415,-1


In [5]:
print('mean action loss ', mean_action_loss)
print('mean joint loss  ', mean_joint_loss)
print('success rate     ', success_rate)

mean action loss  0.2299818160689655
mean joint loss   0.8440425412926396
success rate      0.41379310344827586


In [6]:
tasks = ['push_pink_block_right',
        #'push_pink_block_left',
        'stack_block',
        'turn_on_lightbulb',
        'turn_off_lightbulb']

for task in tasks:
    filename = 'model2_eval-task_D_D-val-' + task + '.csv'
    df, mean_action_loss, mean_joint_loss, success_rate = evaluate_task_success(filename)
    
    print('task name:', task)
    print('\tmean action loss:', mean_action_loss)
    print('\tmean joint loss :', mean_joint_loss)
    print('\tsuccess rate    :', success_rate)

task name: push_pink_block_right
	mean action loss: 0.2299818160689655
	mean joint loss : 0.8440425412926396
	success rate    : 0.41379310344827586
task name: stack_block
	mean action loss: 0.35519965186206903
	mean joint loss : 1.041678197887064
	success rate    : 0.1724137931034483
task name: turn_on_lightbulb
	mean action loss: 0.22259436893939394
	mean joint loss : 0.7975435374527032
	success rate    : 0.5151515151515151
task name: turn_off_lightbulb
	mean action loss: 0.2136064796896552
	mean joint loss : 0.7038246218356655
	success rate    : 0.6896551724137931


In [7]:
import plotly.graph_objects as go

tasks = ['push_pink_block_right',
        #'push_pink_block_left',
        'stack_block',
        'turn_on_lightbulb',
        'turn_off_lightbulb']

table = pd.DataFrame(columns=["Task", "Mean Action Loss", "Joint Loss", "Success Rate"])

for task in tasks:
    print(task)
    filename = 'model2_eval-task_D_D-val-' + task + '.csv'
    df, mean_action_loss, mean_joint_loss, success_rate = evaluate_task_success(filename)
    #table.loc[len(df.index)] = [task, round(mean_action_loss, 2), round(mean_joint_loss, 2), round(success_rate, 2)]
    tb2 = {"Task": task, "Mean Action Loss": round(mean_action_loss, 2), 
           "Joint Loss": round(mean_joint_loss, 2), "Success Rate": round(success_rate, 2)}
    table = table.append(tb2, ignore_index = True) 
table

push_pink_block_right
stack_block
turn_on_lightbulb
turn_off_lightbulb


  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)


Unnamed: 0,Task,Mean Action Loss,Joint Loss,Success Rate
0,push_pink_block_right,0.23,0.84,0.41
1,stack_block,0.36,1.04,0.17
2,turn_on_lightbulb,0.22,0.8,0.52
3,turn_off_lightbulb,0.21,0.7,0.69


In [76]:
datasets = ['train', 'val']
sensors = ['rgb_static',
            'rgb_static_proprio',
            'rgb_static_rgb_gripper_proprio',
            'rgbd_static_rgbd_gripper_proprio',
]

table = pd.DataFrame(columns=["Dataset", "Sensors", "Success Rate", "Mean Action Loss", "Joint Loss"])

for sensor in sensors:
    for dataset in datasets:

        print(sensor)
        filename = 'sensor_combination_tests/model2_eval-task_D_D-' + dataset + '-stack_block-' + sensor + '.csv'
        df, mean_action_loss, mean_joint_loss, success_rate = evaluate_task_success(filename)
        #table.loc[len(df.index)] = [task, round(mean_action_loss, 2), round(mean_joint_loss, 2), round(success_rate, 2)]
        tb2 = {"Dataset": dataset, "Sensors": sensor, "Mean Action Loss": round(mean_action_loss, 2), 
               "Joint Loss": round(mean_joint_loss, 2), "Success Rate": round(success_rate, 2)}
        table = table.append(tb2, ignore_index = True) 
table

rgb_static
rgb_static
rgb_static_proprio
rgb_static_proprio
rgb_static_rgb_gripper_proprio
rgb_static_rgb_gripper_proprio
rgbd_static_rgbd_gripper_proprio
rgbd_static_rgbd_gripper_proprio


  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)


Unnamed: 0,Dataset,Sensors,Success Rate,Mean Action Loss,Joint Loss
0,train,rgb_static,0.05,0.22,0.82
1,val,rgb_static,0.0,0.25,0.63
2,train,rgb_static_proprio,0.01,0.38,1.0
3,val,rgb_static_proprio,0.0,0.36,0.66
4,train,rgb_static_rgb_gripper_proprio,0.3,0.26,0.75
5,val,rgb_static_rgb_gripper_proprio,0.17,0.25,0.61
6,train,rgbd_static_rgbd_gripper_proprio,0.3,0.28,0.8
7,val,rgbd_static_rgbd_gripper_proprio,0.24,0.32,0.79


In [18]:
!ls

eval_data.ipynb
eval_table.png
model2_eval-calvin_debug_dataset-val-turn_off_lightbulb.csv
model2_eval-task_D_D-train-stack_block-rgb_static.csv
model2_eval-task_D_D-val-push_pink_block_right.csv
model2_eval-task_D_D-val-stack_block.csv
model2_eval-task_D_D-val-turn_off_lightbulb.csv
model2_eval-task_D_D-val-turn_on_lightbulb.csv
model3_eval
sensor_combination_tests
table.tbl


In [38]:
# model3 evaluation
datasets = ['val']
categories = ['lights',
            'lights_no_pretrain',
            'push_pink_block',
            'turnonlightbulb_and_pushpinkblockleft',
]

table = pd.DataFrame(columns=["Dataset", "Category", "Task", "Mean Action Loss", "Joint Loss", "Success Rate"])

for dataset in datasets:
    for category in categories:
        print(category)
        filename = 'model3_eval/model3_eval-task_D_D-' + dataset + '-' + category + '.csv'
        df, mean_action_loss, mean_joint_loss, success_rate = evaluate_task_success(filename)
        #table.loc[len(df.index)] = [task, round(mean_action_loss, 2), round(mean_joint_loss, 2), round(success_rate, 2)]
        tb2 = {"Dataset": dataset, "Category": category, "Task":df ,"Mean Action Loss": round(mean_action_loss, 2), 
               "Joint Loss": round(mean_joint_loss, 2), "Success Rate": round(success_rate, 2)}
        table = table.append(tb2, ignore_index = True) 
table

lights
lights_no_pretrain
push_pink_block
turnonlightbulb_and_pushpinkblockleft


  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)
  table = table.append(tb2, ignore_index = True)


Unnamed: 0,Dataset,Category,Task,Mean Action Loss,Joint Loss,Success Rate
0,val,lights,dataset train_or_val task...,0.29,1.0,0.46
1,val,lights_no_pretrain,dataset train_or_val task...,0.31,0.76,0.25
2,val,push_pink_block,dataset train_or_val ta...,0.31,0.92,0.22
3,val,turnonlightbulb_and_pushpinkblockleft,dataset train_or_val tas...,0.27,0.76,0.4


In [77]:
# model3 evaluation
datasets = ['val']
categories = ['lights',
              'lights-dummy_language',
            'lights-no_pretrain_2',
            'push_pink_block',
            'turnonlightbulb_and_pushpinkblockleft',
]

table1 = pd.DataFrame(columns=["dataset", "train_or_val", "task", "action_loss", "joint_loss", "success"])
table2 = pd.DataFrame(columns=["Dataset", "Category", "Mean Action Loss", "Joint Loss", "Success Rate"])
for dataset in datasets:
    for category in categories:
        print(category)
        filename = 'model3_eval/model3_eval-task_D_D-' + dataset + '-' + category + '.csv'
        df, mean_action_loss, mean_joint_loss, success_rate = evaluate_task_success(filename)
        table1 = table1.append(df, ignore_index = True)
        #table.loc[len(df.index)] = [task, round(mean_action_loss, 2), round(mean_joint_loss, 2), round(success_rate, 2)]
        tb2 = {"Dataset": dataset, "Category": category ,"Mean Action Loss": round(mean_action_loss, 2), 
               "Joint Loss": round(mean_joint_loss, 2), "Success Rate": round(success_rate, 2)}
        table2 = table2.append(tb2, ignore_index = True) 

lights
lights-dummy_language
lights-no_pretrain_2
push_pink_block
turnonlightbulb_and_pushpinkblockleft


  table1 = table1.append(df, ignore_index = True)
  table2 = table2.append(tb2, ignore_index = True)
  table1 = table1.append(df, ignore_index = True)
  table2 = table2.append(tb2, ignore_index = True)
  table1 = table1.append(df, ignore_index = True)
  table2 = table2.append(tb2, ignore_index = True)
  table1 = table1.append(df, ignore_index = True)
  table2 = table2.append(tb2, ignore_index = True)
  table1 = table1.append(df, ignore_index = True)
  table2 = table2.append(tb2, ignore_index = True)


In [78]:
table1

Unnamed: 0,dataset,train_or_val,task,action_loss,joint_loss,success,start_timeframe,end_timeframe
0,task_D_D,val,turn_on_led,0.375690,2.002056,-1,35774.0,35722.0
1,task_D_D,val,turn_off_lightbulb,0.269369,0.125390,-1,232762.0,232688.0
2,task_D_D,val,turn_on_lightbulb,0.427134,0.732281,81,909.0,862.0
3,task_D_D,val,turn_on_led,0.244572,1.114252,49,24205.0,24131.0
4,task_D_D,val,turn_off_led,0.349401,0.389306,154,51091.0,51017.0
...,...,...,...,...,...,...,...,...
484,task_D_D,val,push_pink_block_left,0.405475,1.522289,-1,34324.0,34250.0
485,task_D_D,val,push_pink_block_left,0.394242,1.512292,-1,34323.0,34249.0
486,task_D_D,val,push_pink_block_left,0.358439,1.373189,-1,414653.0,414579.0
487,task_D_D,val,push_pink_block_left,0.378094,1.879979,-1,20047.0,19973.0


In [79]:
table2

Unnamed: 0,Dataset,Category,Mean Action Loss,Joint Loss,Success Rate
0,val,lights,0.29,1.0,0.46
1,val,lights-dummy_language,0.28,1.07,0.13
2,val,lights-no_pretrain_2,0.29,0.9,0.22
3,val,push_pink_block,0.31,0.92,0.22
4,val,turnonlightbulb_and_pushpinkblockleft,0.27,0.76,0.4


In [81]:
# per-task successes within each model
dataset = 'val'
categories = [('lights', ['turn_on_lightbulb', 'turn_off_lightbulb', 'turn_on_led', 'turn_off_led']),
              ('lights-dummy_language', ['turn_on_lightbulb', 'turn_off_lightbulb', 'turn_on_led', 'turn_off_led']),
              ('lights-no_pretrain_2', ['turn_on_lightbulb', 'turn_off_lightbulb', 'turn_on_led', 'turn_off_led']),
              ('push_pink_block', ['push_pink_block_left', 'push_pink_block_right']),
              ('turnonlightbulb_and_pushpinkblockleft', ['turn_on_lightbulb', 'push_pink_block_left'])]
metrics = ['success', 'action_loss', 'joint_loss']

for category, tasks in categories:
    print(category)
    filename = 'model3_eval/model3_eval-task_D_D-' + dataset + '-' + category + '.csv'
    df, mean_action_loss, mean_joint_loss, success_rate = evaluate_task_success(filename)
    for task in tasks:
        print('\t', task)
        for metric in metrics:
            print('\t\t', metric)
            if metric == 'success':
                print('\t\t\t', len(df.loc[(df['success'] != -1) & (df['task'] == task)]) / len(df.loc[df['task'] == task]))
            else:
                print('\t\t\t', df.loc[df['task'] == task, metric].mean())

lights
	 turn_on_lightbulb
		 success
			 0.7575757575757576
		 action_loss
			 0.2600033193939394
		 joint_loss
			 0.7798340424497039
	 turn_off_lightbulb
		 success
			 0.3103448275862069
		 action_loss
			 0.28316999724137937
		 joint_loss
			 0.9095177626077308
	 turn_on_led
		 success
			 0.4482758620689655
		 action_loss
			 0.29987787517241377
		 joint_loss
			 1.123987316000997
	 turn_off_led
		 success
			 0.3125
		 action_loss
			 0.32094859937500003
		 joint_loss
			 1.2037419168679797
lights-dummy_language
	 turn_on_lightbulb
		 success
			 0.0
		 action_loss
			 0.28203515575757576
		 joint_loss
			 1.1366807749590884
	 turn_off_lightbulb
		 success
			 0.0
		 action_loss
			 0.30645224689655176
		 joint_loss
			 1.2188439368390183
	 turn_on_led
		 success
			 0.3793103448275862
		 action_loss
			 0.2529824370344828
		 joint_loss
			 0.8758374909148641
	 turn_off_led
		 success
			 0.15625
		 action_loss
			 0.29751490625
		 joint_loss
			 1.0535696590041808
lights-no_pre

In [None]:
filename = 'model3_eval/model3_eval-task_D_D-' + dataset + '-' + category + '.csv'
df, mean_action_loss, mean_joint_loss, success_rate = evaluate_task_success(filename)

In [8]:
!pip install -U kaleido
df = table
fig = go.Figure(data=[go.Table(
    header=dict(values=list(df.columns),
                align='left'),
    cells=dict(values=[df["Task"], round(df["Mean Action Loss"], 2), round(df["Joint Loss"], 2), round(df["Success Rate"], 2)],
               align='left'))
])

fig.write_image("eval_table.png")



In [82]:
def bold_extreme_values(data, data_max=-1):

    if data == data_max:
        return "\\bfseries %s" % data

    return data
import os
# Specify in which columns to make the maximum bold
col_show_max = ["Mean Action Loss", "Joint Loss", "Success Rate"]
for col in col_show_max:
    df[col] = df[col].astype(float)
# Iterate through columns
for k in col_show_max:
    df[k] = df[k].apply(
        lambda data: bold_extreme_values(data, data_max=df[k].max()))

# Set column header to bold title case
df.columns = (df.columns.to_series()
              .apply(lambda r: "\\textbf".format(
                  r.replace("_", " ").title())))

# Write to file
with open(
    "table.tbl", "w") as f:

    format = "l" + \
        "@{\hskip 12pt}" +\
        4*"S[table-format = 2.2]"

    f.write(df.to_latex(index=False,
                      escape=False,
                      column_format=format)
            )


In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



Interesting Gifs:

/home/grail/willaria_research/hobbes/hobbes_models/hobbes_agent/recordings/sensor-eval-gifs/rgb_static/model2-val-(stack_block)-(416516-416560)-prediction.gif

Shows a stack blocks failure due to letting the block go
