In [1]:
import os
import json
import pandas as pd
from glob import glob


In [2]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order of everything built from `files` reproducible across runs.
files = sorted(glob('results/**/api-results.json', recursive=True))
print('Found {} files'.format(len(files)))


Found 117 files


In [3]:
# One row per discovered result file; later statements add columns to this frame.
df = pd.DataFrame(data=files, columns=['path'])

def load_json(path):
    """Read and parse the JSON document stored at ``path``.

    Args:
        path: Filesystem path of the JSON file.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so files with non-ASCII text load the same way everywhere.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def parse_path(path):
    """Extract the experiment coordinates encoded in a result-file path.

    The path is split into components and components 1, 2 and 3 are read as
    dataset, model and mode (component 0 is the top-level directory, e.g.
    ``results`` for paths matched by ``results/**/api-results.json``).

    Args:
        path: Relative path of a result file, using either ``/`` or the
            platform's native separator.

    Returns:
        A dict with the keys ``'dataset'``, ``'model'`` and ``'mode'``.

    Raises:
        IndexError: If the path has fewer than four components.
    """
    # glob returns paths with the native separator (backslash on Windows), so
    # normalise to '/' before splitting. On POSIX this is a no-op.
    parts = path.replace(os.sep, '/').split('/')
    return {
        'dataset': parts[1],
        'model': parts[2],
        'mode': parts[3],
    }

# Attach the parsed JSON payload of every result file.
df['data'] = df['path'].map(load_json)

# Expand the dataset/model/mode fields parsed from each path into columns.
_path_fields = pd.DataFrame(df['path'].map(parse_path).tolist(), index=df.index)
df = pd.concat([df, _path_fields], axis=1)

# Pull out the accuracy value, then keep only the rows of one dataset.
df["accuracy"] = df["data"].map(lambda payload: payload["accuracy"])
df = df.loc[df["dataset"] == "level-1-given-desc"]

In [4]:
# Pivot to one row per (dataset, model) with one accuracy column per mode.
_accuracy_long = df.set_index(['dataset', 'model', 'mode'])[["accuracy"]]
_viz_df = _accuracy_long.unstack()

# Render as percentages and highlight the largest value in each row.
(
    _viz_df.style
    .format("{:.2%}")
    .highlight_max(color='lightgreen', axis=1)
)


Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,accuracy,accuracy
Unnamed: 0_level_1,mode,code_as_action,json_as_action,text_as_action
dataset,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
level-1-given-desc,CodeLlama-13b-Instruct-hf,11.78%,7.77%,14.04%
level-1-given-desc,CodeLlama-34b-Instruct-hf,17.29%,12.03%,16.79%
level-1-given-desc,CodeLlama-7b-Instruct-hf,12.53%,12.03%,17.04%
level-1-given-desc,Llama-2-13b-chat-hf,38.10%,8.52%,37.34%
level-1-given-desc,Llama-2-70b-chat-hf,35.59%,14.29%,37.59%
level-1-given-desc,Llama-2-7b-chat-hf,28.82%,11.28%,25.81%
level-1-given-desc,Mistral-7B-Instruct-v0.1,2.51%,2.26%,3.01%
level-1-given-desc,claude-2,76.69%,59.40%,73.68%
level-1-given-desc,claude-instant-1,75.19%,64.91%,73.18%
level-1-given-desc,gemini-pro,70.43%,73.18%,71.18%


In [5]:
# Flat copy of the pivot for LaTeX export: sorted by index, 'dataset' removed,
# 'model' turned into a regular column.
# The 'dataset' index level is dropped *before* reset_index() so that no label
# has to be dropped from the MultiIndex columns afterwards (doing that without
# a `level` argument makes pandas emit a warning for this line).
_viz_latex_df = _viz_df.sort_index().droplevel('dataset').reset_index()
# Accuracies are later rendered as percentages by _process_row, with the best
# value per row in bold and the second best underlined.
def _process_row(row):
    sorted_values = sorted(row, reverse=True)
    max_value = sorted_values[0]
    second_max_value = sorted_values[1]
    
    def _format_value(value):
        formatted_value = "{:.2f}".format(value * 100)
        if value == max_value:
            return '$\\mathbf{' + str(formatted_value) + '}$'
        elif value == second_max_value:
            return '\\underline{$' + str(formatted_value) + '$}'
        else:
            return "$" + str(formatted_value) + "$"
    row = row.apply(_format_value)
    return row


# Count, per mode, how many rows have that mode as their highest accuracy.
# This must run before the accuracies are replaced by formatted strings below.
_last_row = _viz_latex_df["accuracy"].idxmax(axis=1).value_counts()
_best_counts = _last_row.to_frame().T
display(_best_counts.style.highlight_max(color='lightgreen', axis=1))
print(_best_counts.to_latex(escape=False))

# Swap the numeric cells for LaTeX-marked strings and typeset the model names.
_viz_latex_df["accuracy"] = _viz_latex_df["accuracy"].apply(_process_row, axis=1)
_viz_latex_df["model"] = _viz_latex_df["model"].map(lambda x: f"\\texttt{{{x}}}")

# escape=False keeps the LaTeX markup produced above intact.
_latex_table = _viz_latex_df.set_index(['model']).to_latex(
    escape=False,
    index=True,
    column_format='lrrr',
)
print(_latex_table)

  _viz_latex_df = _viz_df.sort_index().reset_index().drop(columns=['dataset'])


Unnamed: 0,code_as_action,json_as_action,text_as_action
0,8,5,4


\begin{tabular}{lrrr}
\toprule
{} &  code_as_action &  json_as_action &  text_as_action \\
\midrule
0 &               8 &               5 &               4 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrr}
\toprule
{} & \multicolumn{3}{l}{accuracy} \\
mode &       code_as_action &       json_as_action &       text_as_action \\
model                              &                      &                      &                      \\
\midrule
\texttt{CodeLlama-13b-Instruct-hf} &  \underline{$11.78$} &               $7.77$ &     $\mathbf{14.04}$ \\
\texttt{CodeLlama-34b-Instruct-hf} &     $\mathbf{17.29}$ &              $12.03$ &  \underline{$16.79$} \\
\texttt{CodeLlama-7b-Instruct-hf}  &  \underline{$12.53$} &              $12.03$ &     $\mathbf{17.04}$ \\
\texttt{Llama-2-13b-chat-hf}       &     $\mathbf{38.10}$ &               $8.52$ &  \underline{$37.34$} \\
\texttt{Llama-2-70b-chat-hf}       &  \underline{$35.59$} &              $14.29$ &     $\mathbf{37.59}$ \\
\texttt{Llama-2-7b

  print(_last_row.to_frame().T.to_latex(escape=False))
  _viz_latex_df.set_index(['model']).to_latex(escape=False, index=True, column_format='lrrr')


In [6]:
# Same pivot as above, shown as percentage points rounded to two decimals.
(
    df.set_index(['dataset', 'model', 'mode'])[["accuracy"]]
    .unstack()
    .mul(100)
    .round(2)
)


Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,accuracy,accuracy
Unnamed: 0_level_1,mode,code_as_action,json_as_action,text_as_action
dataset,model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
level-1-given-desc,CodeLlama-13b-Instruct-hf,11.78,7.77,14.04
level-1-given-desc,CodeLlama-34b-Instruct-hf,17.29,12.03,16.79
level-1-given-desc,CodeLlama-7b-Instruct-hf,12.53,12.03,17.04
level-1-given-desc,Llama-2-13b-chat-hf,38.1,8.52,37.34
level-1-given-desc,Llama-2-70b-chat-hf,35.59,14.29,37.59
level-1-given-desc,Llama-2-7b-chat-hf,28.82,11.28,25.81
level-1-given-desc,Mistral-7B-Instruct-v0.1,2.51,2.26,3.01
level-1-given-desc,claude-2,76.69,59.4,73.68
level-1-given-desc,claude-instant-1,75.19,64.91,73.18
level-1-given-desc,gemini-pro,70.43,73.18,71.18
