In [1]:
from pathlib import Path

# Directory that the cells below scan for eval-harness result files (*.json).
results_path = Path("/home/ubuntu/cape") / "eval_harness_outputs"

In [2]:
# load json file

def load_json(file):
    """Read a JSON file and return the decoded object.

    Args:
        file: Path (``str`` or path-like) of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    import json  # local import keeps this helper usable on its own

    # Decode explicitly as UTF-8 instead of the platform's locale encoding,
    # so files with non-ASCII content load the same way on every machine.
    with open(file, encoding="utf-8") as f:
        return json.load(f)

In [3]:
# Which metric to read for each benchmark: task name -> metric key.
# Both are used verbatim as lookup keys into a results file's "results" entry.
tasks = {
    "hellaswag": "acc_norm,none",
    "arc_challenge": "acc_norm,none",
    "truthfulqa": "acc,none",
    "mmlu": "acc,none",
    "winogrande": "acc,none",
    "gsm8k": "exact_match,get-answer",
}

In [4]:
# Every *.json file directly inside results_path.
# NOTE(review): the listing is not sorted here, yet later cells index files[0];
# consider sorted(...) if a stable order matters.
files = [*results_path.glob("*.json")]
files

[PosixPath('/home/ubuntu/cape/eval_harness_outputs/checkpoint-woven-pyramid-75:v1.json'),
 PosixPath('/home/ubuntu/cape/eval_harness_outputs/checkpoint-graceful-frog-87:v1.json'),
 PosixPath('/home/ubuntu/cape/eval_harness_outputs/checkpoint-zesty-flower-96:v1.json'),
 PosixPath('/home/ubuntu/cape/eval_harness_outputs/mixtral_baseline.json'),
 PosixPath('/home/ubuntu/cape/eval_harness_outputs/mixtral_baseline_instruct.json')]

In [5]:
# Parsed contents of every results file, in the same order as `files`.
raw_results = list(map(load_json, files))

In [6]:
def get_base_model_name(results):
    """Return the value of the first ``key=value`` entry of the run's model args.

    Args:
        results: Parsed results dict. ``results["config"]["model_args"]`` must
            be a string of comma-separated ``key=value`` entries
            (presumably starting with ``pretrained=<model path>`` -- confirm
            against the harness that wrote the file).

    Returns:
        str: Everything after the first "=" of the first comma-separated entry.

    Raises:
        KeyError: If "config" or "model_args" is missing.
        IndexError: If the first entry contains no "=".
    """
    first_arg = results["config"]["model_args"].split(",")[0]
    # Split on the first "=" only, so a value that itself contains "="
    # (e.g. a directory named "lr=0.002") is returned whole, not truncated.
    return first_arg.split("=", 1)[1]

# Quick check: display the value extracted from the first loaded results file.
get_base_model_name(raw_results[0])

'/home/ubuntu/cape/mixtral/artifacts/checkpoint-woven-pyramid-75:v1/merged/'

In [7]:
def get_final_results(res_file):
    """Load one results file and extract the selected metric for each task.

    Prints a short report (a banner, then one ``task[metric]:  value`` line per
    task) and returns the same numbers.

    Args:
        res_file: Path of a results JSON file (``pathlib.Path`` or ``str``).

    Returns:
        tuple: ``(model_name, scores)``. ``model_name`` is the file name
        without its final extension; ``scores`` maps every key of the
        module-level ``tasks`` dict to ``res["results"][task][metric]``.

    Raises:
        KeyError: If a task or its metric is absent from the file's "results".
    """
    res_file = Path(res_file)  # also accept plain string paths
    res = load_json(res_file)

    # The model is named after the file rather than via get_base_model_name(res).
    # ``stem`` removes only the final suffix, so dots inside the name
    # (e.g. "run.v1.5.json") are kept instead of cutting at the first dot.
    model_name = res_file.stem
    print("-"*50+f"\nfile: {res_file}\nResults for \"{model_name}\"\n"+"-"*50)

    model_results_eval_harness = {}
    for task, metric in tasks.items():
        model_results_eval_harness[task] = res["results"][task][metric]
        print(f"{task}[{metric}]:  {model_results_eval_harness[task]:.3f}")
    return model_name, model_results_eval_harness

# Try the extractor on the first results file; it prints a per-task report.
model_name, final_res = get_final_results(files[0])

--------------------------------------------------
file: /home/ubuntu/cape/eval_harness_outputs/checkpoint-woven-pyramid-75:v1.json
Results for "checkpoint-woven-pyramid-75:v1"
--------------------------------------------------
hellaswag[acc_norm,none]:  0.838
arc_challenge[acc_norm,none]:  0.611
truthfulqa[acc,none]:  0.466
mmlu[acc,none]:  0.659
winogrande[acc,none]:  0.769
gsm8k[exact_match,get-answer]:  0.633


In [16]:
# Display labels for specific model names; any name not listed here is
# left unchanged by rename_model().
rename_map = {
    "checkpoint-graceful-frog-87:v1": "proj_r=32_a=16_tok=32002*_seq=8k_lr=0.002",
    "checkpoint-woven-pyramid-75:v1": "proj_r=16_a=32_tok=32000_seq=8k_lr=0.002_v2",
    "checkpoint-zesty-flower-96:v1": "proj_r=16_a=32_tok=32000_seq=8k_lr=0.002",
}


def rename_model(model_name):
    """Return the label mapped to ``model_name``, or ``model_name`` itself if unmapped."""
    return rename_map.get(model_name, model_name)

In [17]:
import plotly.express as px
import pandas as pd

# Build one long-format table: one row per (model, task) pair, with columns
# value / metric / model, then save it as CSV.
# The per-file frames live in their own list so that `results_df` only ever
# names the finished DataFrame (never a half-filled list if a file fails).
per_model_frames = []

for res_file in files:
    model_name, final_res = get_final_results(res_file)

    df = pd.DataFrame(dict(
        # Materialise the dict views as plain lists before handing them to pandas.
        value=list(final_res.values()),
        metric=list(final_res.keys()),
        # A scalar here is repeated on every row of this model's frame.
        model=rename_model(model_name)))
    per_model_frames.append(df)

# ignore_index=True gives the combined table a unique 0..n-1 index instead of
# repeating each per-model frame's own 0..k-1 labels.
results_df = pd.concat(per_model_frames, ignore_index=True)
results_df.to_csv("results_eval_harness.csv", index=False)


--------------------------------------------------
file: /home/ubuntu/cape/eval_harness_outputs/checkpoint-woven-pyramid-75:v1.json
Results for "checkpoint-woven-pyramid-75:v1"
--------------------------------------------------
hellaswag[acc_norm,none]:  0.838
arc_challenge[acc_norm,none]:  0.611
truthfulqa[acc,none]:  0.466
mmlu[acc,none]:  0.659
winogrande[acc,none]:  0.769
gsm8k[exact_match,get-answer]:  0.633
--------------------------------------------------
file: /home/ubuntu/cape/eval_harness_outputs/checkpoint-graceful-frog-87:v1.json
Results for "checkpoint-graceful-frog-87:v1"
--------------------------------------------------
hellaswag[acc_norm,none]:  0.780
arc_challenge[acc_norm,none]:  0.538
truthfulqa[acc,none]:  0.427
mmlu[acc,none]:  0.635
winogrande[acc,none]:  0.717
gsm8k[exact_match,get-answer]:  0.451
--------------------------------------------------
file: /home/ubuntu/cape/eval_harness_outputs/checkpoint-zesty-flower-96:v1.json
Results for "checkpoint-zesty-flowe

In [18]:

# Polar line chart of results_df: one closed trace per model (colour),
# one angular position per metric, radial axis fixed to the range 0-0.9.
fig = px.line_polar(
    results_df,
    theta="metric",
    r="value",
    color="model",
    range_r=[0, 0.9],
    line_close=True,
    title="Fine-tuned model performance on each task",
)
# Optional tweak, currently disabled:
# fig.update_traces(fill="toself")
fig.show()

In [10]:
import wandb

# Weights & Biases project and entity that receive the evaluation outputs.
WANDB_PROJECT = "mixtral"
WANDB_ENTITY = "reviewco"

# Open a run with job type "eval_harness"; everything below is logged inside it.
with wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY, job_type="eval_harness"):
    # Add the results CSV written earlier in this notebook to an artifact and log it.
    at = wandb.Artifact("eval_harness", type="eval_harness")
    at.add_file("results_eval_harness.csv")
    wandb.log_artifact(at)

    # Wrap the polar chart in wandb.Plotly and log it under the "eval_harness" key.
    plotly_fig = wandb.Plotly(fig)
    wandb.log({"eval_harness":plotly_fig})

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m ([33mreviewco[0m). Use [1m`wandb login --relogin`[0m to force relogin




VBox(children=(Label(value='0.011 MB of 0.021 MB uploaded\r'), FloatProgress(value=0.5175184110689578, max=1.0…